From 5d0e6c3a650cbd8460f5b0392461327ca6ba1cff Mon Sep 17 00:00:00 2001 From: Victor Kuznetsov Date: Sat, 30 May 2026 18:00:39 -0700 Subject: [PATCH] fix: harden metadata parsers and engines; sync docs (full-repo review) Apply fixes from a full-repo review (code, tests, docs). Security / correctness: - Clamp attacker-controlled PNG/caBX chunk lengths to the remaining file size in metadata.py and noai/c2pa.py (a malformed length no longer drives a multi-GB read); skipped chunks seek instead of read. - noai/isobmff.strip_c2pa_boxes is now fail-safe on a malformed box: return the original bytes with a warning instead of silently truncating the tail, so metadata --remove can no longer emit a corrupt file. - doubao_engine._fixed_alpha_map clamps the glyph box to the image (no crash on degenerate width-vs-height). - watermark_remover._run_region_hires gates the phaseCorrelate offset on response and magnitude (a spurious shift no longer garbles text) and drops the generator after a CPU fallback (no MPS/CPU device mismatch). Robustness: - gemini_engine, doubao_engine, region_eraser normalize grayscale and RGBA inputs to BGR at the engine entry points. - image_io.imwrite returns False on an unwritable path (matches cv2). - invisible_engine guards a None imread result before use. - trustmark_detector._decoder uses a double-checked threading lock. - ctrlregen.tiling.tile_positions raises on overlap >= tile. - humanizer chromatic shift no longer wraps opposite-edge pixels. - identify OpenAI caveat keyed on the normalized vendor, not a substring. - Remove the dead "visible --detect-threshold" CLI option. - publish.yml verifies the release tag matches the package version. Docs: - README strength 0.05 to 0.10; .env.example HF_TOKEN marked optional; doubao_capture README updated to reverse-alpha-only; CLAUDE.md synced with the new behaviors and the batch command. 
Tests: new test_security_clamp.py for the read clamp and isobmff fail-safe; erase CLI coverage; integrity-clash rule 2 end-to-end; multi-tag EXIF survival and cross-format strip guards; channel/size, tiling, humanizer, and imwrite regressions. Full suite 493 passed, 2 skipped; ruff and pyright src/ clean. Co-Authored-By: Claude Opus 4.8 --- .env.example | 2 +- .github/workflows/publish.yml | 10 ++ CLAUDE.md | 24 ++-- README.md | 4 +- data/doubao_capture/README.md | 12 +- src/remove_ai_watermarks/cli.py | 2 - src/remove_ai_watermarks/doubao_engine.py | 22 ++- src/remove_ai_watermarks/gemini_engine.py | 13 +- src/remove_ai_watermarks/humanizer.py | 6 +- src/remove_ai_watermarks/identify.py | 2 +- src/remove_ai_watermarks/image_io.py | 8 +- src/remove_ai_watermarks/invisible_engine.py | 4 +- src/remove_ai_watermarks/metadata.py | 7 +- src/remove_ai_watermarks/noai/c2pa.py | 30 +++- .../noai/ctrlregen/tiling.py | 2 + src/remove_ai_watermarks/noai/isobmff.py | 21 +++ .../noai/watermark_remover.py | 23 +++- src/remove_ai_watermarks/region_eraser.py | 10 +- .../trustmark_detector.py | 9 +- tests/test_cli.py | 75 ++++++++++ tests/test_doubao_engine.py | 25 ++++ tests/test_humanizer.py | 20 +++ tests/test_identify.py | 63 +++++++++ tests/test_image_io.py | 5 + tests/test_metadata.py | 60 ++++++++ tests/test_noai.py | 7 +- tests/test_region_eraser.py | 17 +++ tests/test_security_clamp.py | 130 ++++++++++++++++++ tests/test_tiling.py | 10 ++ 29 files changed, 580 insertions(+), 43 deletions(-) create mode 100644 tests/test_security_clamp.py diff --git a/.env.example b/.env.example index 9fceac5..2431293 100644 --- a/.env.example +++ b/.env.example @@ -1,3 +1,3 @@ -# HuggingFace token (required for invisible watermark removal) +# HuggingFace token (optional; only needed for gated/private models) # Get yours at: https://huggingface.co/settings/tokens HF_TOKEN= diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index e5fdc10..a645460 100644 --- 
a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -17,6 +17,16 @@ jobs: - uses: astral-sh/setup-uv@v7 + - name: Verify release tag matches package version + run: | + tag="${{ github.event.release.tag_name }}" + version="$(grep -m1 '^version = ' pyproject.toml | sed -E 's/^version = "([^"]+)"/\1/')" + if [ "${tag#v}" != "$version" ]; then + echo "Release tag '$tag' does not match pyproject.toml version '$version'" >&2 + exit 1 + fi + echo "Release tag '$tag' matches package version '$version'" + - name: Build package run: uv build diff --git a/CLAUDE.md b/CLAUDE.md index b31e1e8..c2aebf5 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -10,10 +10,11 @@ You are a **principal Python engineer** maintaining a CLI tool and library for r - `uv run remove-ai-watermarks identify <image>` — provenance verdict (platform + watermark inventory + confidence); `--json` for machine output, `--no-visible` to skip the cv2 sparkle detector - `uv run remove-ai-watermarks metadata <image> --check` — inspect AI metadata (C2PA, EXIF, PNG chunks) - `uv run remove-ai-watermarks metadata <image> --remove -o <output>` — strip all AI metadata +- `uv run remove-ai-watermarks batch <directory>` — process every supported image in a directory (output defaults to `<directory>_clean/`, set with `-o`). `--mode visible|invisible|metadata|all` (default `visible`); the invisible/all path reuses the same `--strength`/`--steps`/`--pipeline`/`--device`/`--max-resolution`/`--seed`/`--hf-token` knobs as `invisible`, `--inpaint/--no-inpaint` for the visible pass, and `--humanize` for the Analog Humanizer ## Test and lint -- **CI** (`.github/workflows/test.yml`): runs on push to `main` + every PR. A `lint` job (ubuntu: `ruff check` + `ruff format --check`) plus a `test` matrix (ubuntu/macos/windows x py3.10/3.12) that does `uv sync --frozen --extra dev` then `pytest`. The matrix installs only core + dev (no `gpu` extra), so the GPU/model-running tests skip there and it exercises the metadata/identify/visible/cv2-eraser surface on all three OSes. 
Keep `uv.lock` valid (don't break `--frozen`) when editing `pyproject.toml`. `publish.yml` stays release-only. +- **CI** (`.github/workflows/test.yml`): runs on push to `main` + every PR. A `lint` job (ubuntu: `ruff check` + `ruff format --check`) plus a `test` matrix (ubuntu/macos/windows x py3.10/3.12) that does `uv sync --frozen --extra dev` then `pytest`. The matrix installs only core + dev (no `gpu` extra), so the GPU/model-running tests skip there and it exercises the metadata/identify/visible/cv2-eraser surface on all three OSes. Keep `uv.lock` valid (don't break `--frozen`) when editing `pyproject.toml`. `publish.yml` stays release-only and now verifies the release tag matches the `pyproject.toml` version (fails the build on a mismatch) before building. - `bash maintain.sh` — uv-outdated, uv-secure, ruff check/fix, ruff format, pyright, pytest -n auto - **Strict pyright is clean across `src/` (0 errors).** The cv2/torch/diffusers boundary files (`gemini_engine`, `region_eraser`, `doubao_engine`, `face_protector`, `humanizer`, `invisible_engine`, `noai/watermark_remover`, and the whole `noai/ctrlregen/` subpackage) carry a documented per-file `# pyright:` relax pragma (or, for `ctrlregen`, a `tool.pyright.executionEnvironments` entry) that turns off only the unknown-type / untyped-third-party rules — those libs ship no usable types, so strict typing there fights the ecosystem. Pure-logic files stay fully strict; `typings/piexif/__init__.pyi` is a local stub so `metadata.py`/`extractor.py` resolve piexif. Public ndarray-returning signatures on the relaxed engines are still annotated `NDArray[Any]` so strict consumers (`cli.py`) stay clean. When touching a relaxed file, prefer fixing real issues over widening the pragma; keep the pragma scoped to genuinely-untyped boundaries. (`uv-secure` is clean since idna was bumped 3.11 -> 3.16, fixing GHSA-65pc-fj4g-8rjx.) 
- **Full-project `uv run pyright` (no path) OOMs/crashes node on this ML-heavy repo** (emits a `libnode` stack frame, no summary) — a known environment limit, not a code error. Gate with `uv run --extra dev --extra gpu pyright src/` (completes, authoritative) or scope to changed files; also run `uv run ruff check` and `uv run pytest` directly. @@ -26,23 +27,24 @@ You are a **principal Python engineer** maintaining a CLI tool and library for r - GPU/ML modules (invisible_engine, ctrlregen, watermark_remover) are optional — guard imports with `is_available()` checks - Optional detection extras: `detect` (imwatermark — open SD/SDXL/FLUX watermark) and `trustmark` (Adobe TrustMark decoder; pulls torch + downloads weights). Both are guarded by `is_available()` and skipped by `identify` when absent. -- Tests for the *model-running* paths are limited to availability checks (multi-GB downloads). But the **pure helpers inside ML-adjacent modules are unit-tested without any download** and must stay that way: `_target_size` (native-vs-downscale, `test_invisible_engine.py`), the MPS->CPU fallback control flow via mocked pipelines (`test_img2img_runner.py`, 100% cover), and the tiling math `tile_positions`/`make_blend_weight`/`resize_center_crop` (`test_tiling.py`; `pytest.importorskip("torch")` since `tiling.py` imports torch at module top). Don't skip these as "ML, needs a model" — only `run_tiled`/`remove_watermark`/the diffusion bodies do. +- Tests for the *model-running* paths are limited to availability checks (multi-GB downloads). 
But the **pure helpers inside ML-adjacent modules are unit-tested without any download** and must stay that way: `_target_size` (native-vs-downscale, `test_invisible_engine.py`), the MPS->CPU fallback control flow via mocked pipelines (`test_img2img_runner.py`, 100% cover), and the tiling math `tile_positions` (now raises `ValueError` when not `0 <= overlap < tile`)/`make_blend_weight`/`resize_center_crop` (`test_tiling.py`; `pytest.importorskip("torch")` since `tiling.py` imports torch at module top). Don't skip these as "ML, needs a model" — only `run_tiled`/`remove_watermark`/the diffusion bodies do. ## Key modules -- `noai/c2pa.py` — PNG chunk parser; use `extract_c2pa_chunk(path)` to get raw caBX payload, `has_c2pa_metadata(path)` to detect. Do not reimplement chunk parsing. `extract_c2pa_info(path)` sets `synthid_watermark`/`synthid_vendors` when the manifest is signed by a SynthID-using vendor, and `soft_binding`/`soft_binding_vendors` when a `c2pa.soft-binding` `alg` names a forensic-watermark vendor (`soft_binding_vendors_in(buffer)` is the shared byte-scan, used by both the PNG parser and the non-PNG binary path). +- `noai/c2pa.py` — PNG chunk parser; use `extract_c2pa_chunk(path)` to get raw caBX payload, `has_c2pa_metadata(path)` to detect. Do not reimplement chunk parsing. `extract_c2pa_info(path)` sets `synthid_watermark`/`synthid_vendors` when the manifest is signed by a SynthID-using vendor, and `soft_binding`/`soft_binding_vendors` when a `c2pa.soft-binding` `alg` names a forensic-watermark vendor (`soft_binding_vendors_in(buffer)` is the shared byte-scan, used by both the PNG parser and the non-PNG binary path). PNG/caBX chunk reads are clamped to the remaining file size (`safe_length = min(length, remaining)`; skipped chunks use seek) so a malformed huge `length` cannot drive a multi-GB allocation (shared safety discipline matching `isobmff.scan_c2pa_region`). 
- `noai/constants.py` — PNG_SIGNATURE, C2PA_CHUNK_TYPE, C2PA_SIGNATURES, C2PA_ISSUERS, `SYNTHID_C2PA_ISSUERS` (issuers that pair SynthID with C2PA: Google, OpenAI), and `C2PA_SOFT_BINDINGS` (soft-binding `alg` prefix → forensic-watermark vendor: Adobe TrustMark, Digimarc, Imatag, Steg.AI, Microsoft, ...). Add a new issuer/binding here, not inline. -- `metadata.py` — `scan_head(path, size=1MB)` is the shared input for every C2PA/AIGC/IPTC byte scan: first `size` bytes plus the payloads of any provenance metadata found beyond that window — for ISOBMFF, the late provenance boxes from `isobmff.scan_c2pa_region` (catches a manifest after a large `mdat`); for **PNG**, the late `tEXt`/`iTXt`/`zTXt`/`eXIf`/`iCCP` chunks from `_png_late_metadata` (catches an XMP/EXIF packet appended after a large `IDAT`, e.g. a TC260 AIGC label at ~2.7 MB). Behavior-neutral (`f.read(size)`) for non-ISOBMFF inputs and for any file that fits within `size`. Use it instead of `open().read(1MB)` for any new marker scan. `synthid_source(path)` returns the vendor name(s) if the C2PA manifest implies a SynthID pixel watermark, else None. Format-agnostic: PNG via the caBX parser, JPEG/WebP/AVIF/HEIF/JXL via a binary scan (C2PA marker + SynthID issuer + AI-source marker). `get_ai_metadata` surfaces the verdict, and `metadata --check` prints it as a callout. Both `get_ai_metadata` and `has_ai_metadata` guard the PIL open with `except Exception` (HEIC/unknown formats raise non-OSError) and fall through to the binary scan. `xai_signature(path)` detects xAI/Grok's EXIF-only scheme (`ImageDescription` = `Signature: ` + UUID `Artist`); it feeds `has_ai_metadata`, `get_ai_metadata` (key `xai_signature`), and `identify`. `iptc_ai_system(path)` detects the IPTC Photo Metadata 2025.1 AI-disclosure XMP properties (`IPTC_AI_FIELD_MARKERS` = `AISystemUsed`/`AISystemVersionUsed`/`AIPromptInformation`/`AIPromptWriterName`) and returns the `AISystemUsed` generator name (or `"fields present"`). 
`remove_ai_metadata` routes **ISOBMFF video** (`.mp4`/`.mov`/`.m4v`) through the same `isobmff.strip_c2pa_boxes` as AVIF/HEIF (MP4 is ISOBMFF), and `_scrub_ai_exif` removes the xAI signature + AI-generator EXIF tags on JPEG output. -- `identify.py` — `identify(path)` aggregates every locally-readable signal (C2PA issuer→platform, C2PA soft-binding forensic-watermark vendor, IPTC "Made with AI" + IPTC 2025.1 `AISystemUsed`, embedded SD/ComfyUI params, SynthID proxy, xAI/Grok EXIF signature via `metadata.xai_signature`, the China TC260 AIGC label via `metadata.aigc_label`, the HuggingFace `hf-job-id` job marker via `metadata.huggingface_job`, the Samsung Galaxy AI editing marker via `metadata.samsung_genai`, visible Gemini sparkle, open invisible watermark, Adobe TrustMark via `trustmark_detector`) into one `ProvenanceReport`. `is_ai_generated` is True or None (never asserted False — stripped metadata is not proof of clean origin). The `hf_job`, visible-sparkle, and Samsung `samsung_genai` signals are **medium** confidence: each lifts an otherwise-Unknown verdict to a tentative AI (`hf_only` / `visible_only` / `samsung_only`, parallel branches) but is excluded from the high-confidence `ai_from_metadata` set, so none overrides a hard metadata signal. Visible-sparkle is promoted only at confidence ≥ `_SPARKLE_THRESHOLD` (0.5; corpus-tuned to separate Gemini sparkles ≥0.56 from non-sparkle ≤0.49). The cv2 dependency lives in `gemini_engine.detect_sparkle_confidence`, not here. **C2PA platform attribution is device-token-first, issuer-scan fallback** (`_device_platform` scans manifest bytes for `_DEVICE_C2PA_PLATFORM` tokens, then `_attribute_platform`/`_ISSUER_PLATFORM`). 
**Why, verified on real signed files 2026-05-26:** the old issuer-only byte-scan matched ANY issuer substring anywhere, so multi-entity manifests mis-attributed -- Leica→"Truepic" (a signing authority in the trust chain), Nikon→"Adobe Firefly" (XMP-toolkit "Adobe" + the sample's "Adobe_MAX" name), Pixel→"Google (Gemini)" ("Google LLC" cert org), Truepic→"Google". A distinctive device token wins instead. **Token distinctiveness is load-bearing:** bare `b"Truepic"` mis-fires (it appears in unrelated trust chains -- it mis-attributed the OpenAI `chatgpt-1.png` fixture), so the token is the specific `b"Truepic_Lens"` from the Lens SDK claim generator; likewise `b"Pixel Camera"` (cert CN) not bare `b"Pixel"`. `_DEVICE_C2PA_PLATFORM` lists ONLY tokens **verified against a real C2PA file**: Leica (`lc_c2pa`/`Leica Camera`), Nikon (`NIKON`), Pixel (`Pixel Camera` -- from a real Pixel 10 Pro file attached to c2pa-rs issue #1609/#1554), Sony (`sony.sig`/`sony.cert` -- Sony's own C2PA assertion namespace, verified on a real Sony PXW-Z300 file; NOT bare "Sony" which is a common EXIF Make), Truepic (`Truepic_Lens`). Canon/Bria have **no public direct-download C2PA sample** (checked exhaustively: GitHub issue/PR attachments, contentcredentials gallery, HF datasets -- all upload-to-verify or token-gated; Canon's only public file was a self-signed hobbyist CR3, not factory), so they stay unmapped until a real file is captured (same fixture discipline as Grok/Doubao). The Sony sample is video (MP4) -- our ISOBMFF C2PA path detects it; Sony Alpha stills likely share the `sony.*` namespace but are not separately verified. **Samsung Galaxy + ASUS Gallery live in a separate `_SIGNER_C2PA_PLATFORM` (scanned after `_device_platform`, before the issuer fallback), NOT in `_DEVICE_C2PA_PLATFORM`** — verified on real signed files 2026-05-29. 
Reason: a Galaxy phone stamps BOTH its device cert AND a `trainedAlgorithmicMedia`/genAIType AI marker on a Generative-Edit image, so treating it as a "genuine camera capture" would false-fire integrity-clash rule 2 on every Galaxy AI edit. The signer tokens (`b"Samsung Galaxy"` cert org — distinct from the EXIF `SM-xxxx` model string on ordinary Samsung photos; `b"com.asus.gallery"` claim generator) only resolve the platform label; the AI verdict still comes from the source-type / genAIType. ASUS Gallery is a C2PA-signed edit with no AI marker, so it attributes the platform without asserting `is_ai`. **Samsung's `genAIType` (in the proprietary `PhotoEditor_Re_Edit_Data` JSON) is an undocumented Galaxy-AI editing marker** (`metadata.samsung_genai`, gated on the `PhotoEditor_Re_Edit_Data` container; non-zero value = AI tool used, values {1,5} observed): medium-confidence because the field has no public spec (verified 2026-05-29: absent from C2PA spec + Samsung docs), but it co-occurred with `trainedAlgorithmicMedia` in 3/3 verified files that record a source-type and was the SOLE AI marker on a Galaxy S24 file that omits the source type. Camera C2PA marks capture authenticity, not AI (Pixel carries `computationalCapture`, not `trainedAlgorithmicMedia`), so these never set `is_ai` -- that stays driven by digital-source-type. `c2pa.cbor_text_after` (now public) is best-effort for the `generator` detail string only and can be None when the manifest keys it `claim_generator_info` (Pixel). **Issuer→generator mapping is `is_ai`-gated** (`_attribute_platform(issuers, is_ai=c2pa_is_ai)`): a specific AI-generator platform is named only when the digital-source-type is `trainedAlgorithmicMedia`; on a non-AI source an issuer substring is treated as incidental (an "Adobe XMP" toolkit string in an *unmapped* Canon/Sony capture would otherwise mislabel it "Adobe Firefly"), so it degrades to the neutral "C2PA signer: X" label. 
Real Firefly/OpenAI/Google output carries the AI source-type, so it is unaffected (verified: chatgpt-1.png→OpenAI, firefly-1.png→Adobe Firefly still attribute). `_attribute_platform` defaults `is_ai=True` so the mapping stays unit-testable in isolation. Add capture-camera tokens to `_DEVICE_C2PA_PLATFORM`, editing-app/AI-device signer tokens to `_SIGNER_C2PA_PLATFORM`, generator/issuer platforms to `_ISSUER_PLATFORM`, not inline. For non-PNG containers (JPEG/WebP/AVIF/HEIF/JXL) the caBX parser returns nothing, so issuer (`_issuers_in`) and generator (`_ai_tools_in`, reusing `C2PA_AI_TOOLS`) are recovered by binary-scanning the first MB. EXIF `Software` / `Make` / `Artist` / `ImageDescription` and XMP `CreatorTool` generator tags are read by `metadata.exif_generator` (PIL+piexif for any format PIL opens incl. AVIF, plus a container-agnostic XMP raw-byte scan that also covers HEIF/JXL), matched against `AI_GENERATOR_TOKENS` so ordinary editors (plain "Adobe Photoshop") and real-camera `Make` ("Apple"/"Canon") are not flagged. **Ideogram tags its output with EXIF `Make="Ideogram AI"`** (verified on a real download 2026-05-24) — that's why `Make` is read. **Integrity-clash detection** (`_integrity_clashes`, surfaced as `ProvenanceReport.integrity_clashes`, printed in red by `identify` and serialized to `--json`): contradictions between independent generator stamps are a laundering/spoofing tell. Two rules: (1) two or more distinct AI-origin vendors named by independent signals (e.g. C2PA OpenAI + EXIF `Make="Ideogram AI"`), and (2) a camera-capture C2PA device (`_DEVICE_C2PA_PLATFORM`) coexisting with any AI-generation marker. Vendor normalization is `_vendor_of` over `_AI_VENDOR_TOKENS` (so a C2PA "Google (Gemini)" issuer and a SynthID-Google proxy agree, while different vendors clash). 
**High-precision by design:** only hard generator stamps feed it (C2PA-issuer when source is AI, SynthID, EXIF/XMP generator, IPTC `AISystemUsed`, xAI, AIGC); the fuzzy visible sparkle and the open invisible watermark are **excluded** (the latter can be a by-product of our own SDXL removal pass). The c2pa vendor is classified from the issuer attribution / generator, NOT the resolved `platform` (a camera label like "Google Pixel" would mis-normalize to "Google"). All real single-origin fixtures (chatgpt/firefly/doubao/grok/mj) verified to produce **zero** clashes (false-positive guard in `test_identify.py::TestRealSamplesHaveNoClash`). +- `metadata.py` — `scan_head(path, size=1MB)` is the shared input for every C2PA/AIGC/IPTC byte scan: first `size` bytes plus the payloads of any provenance metadata found beyond that window — for ISOBMFF, the late provenance boxes from `isobmff.scan_c2pa_region` (catches a manifest after a large `mdat`); for **PNG**, the late `tEXt`/`iTXt`/`zTXt`/`eXIf`/`iCCP` chunks from `_png_late_metadata` (catches an XMP/EXIF packet appended after a large `IDAT`, e.g. a TC260 AIGC label at ~2.7 MB). Behavior-neutral (`f.read(size)`) for non-ISOBMFF inputs and for any file that fits within `size`. Use it instead of `open().read(1MB)` for any new marker scan. `synthid_source(path)` returns the vendor name(s) if the C2PA manifest implies a SynthID pixel watermark, else None. Format-agnostic: PNG via the caBX parser, JPEG/WebP/AVIF/HEIF/JXL via a binary scan (C2PA marker + SynthID issuer + AI-source marker). `get_ai_metadata` surfaces the verdict, and `metadata --check` prints it as a callout. Both `get_ai_metadata` and `has_ai_metadata` guard the PIL open with `except Exception` (HEIC/unknown formats raise non-OSError) and fall through to the binary scan. `xai_signature(path)` detects xAI/Grok's EXIF-only scheme (`ImageDescription` = `Signature: ` + UUID `Artist`); it feeds `has_ai_metadata`, `get_ai_metadata` (key `xai_signature`), and `identify`. 
`iptc_ai_system(path)` detects the IPTC Photo Metadata 2025.1 AI-disclosure XMP properties (`IPTC_AI_FIELD_MARKERS` = `AISystemUsed`/`AISystemVersionUsed`/`AIPromptInformation`/`AIPromptWriterName`) and returns the `AISystemUsed` generator name (or `"fields present"`). `remove_ai_metadata` routes **ISOBMFF video** (`.mp4`/`.mov`/`.m4v`) through the same `isobmff.strip_c2pa_boxes` as AVIF/HEIF (MP4 is ISOBMFF), and `_scrub_ai_exif` removes the xAI signature + AI-generator EXIF tags on JPEG output. `strip_c2pa_boxes` is **fail-safe** on a malformed box: it returns the original bytes unchanged with a logged warning instead of truncating the tail to EOF (detection-only `scan_c2pa_region` still stops at a malformed box). `_png_late_metadata` clamps each late-chunk read to the remaining file size (`safe_length = min(length, remaining)`) so a malformed `length` cannot drive a multi-GB allocation. +- `identify.py` — the OpenAI rollout caveat is keyed on `_vendor_of(synthid) == "OpenAI"` (not a raw substring over the issuer + verdict blob). `identify(path)` aggregates every locally-readable signal (C2PA issuer→platform, C2PA soft-binding forensic-watermark vendor, IPTC "Made with AI" + IPTC 2025.1 `AISystemUsed`, embedded SD/ComfyUI params, SynthID proxy, xAI/Grok EXIF signature via `metadata.xai_signature`, the China TC260 AIGC label via `metadata.aigc_label`, the HuggingFace `hf-job-id` job marker via `metadata.huggingface_job`, the Samsung Galaxy AI editing marker via `metadata.samsung_genai`, visible Gemini sparkle, open invisible watermark, Adobe TrustMark via `trustmark_detector`) into one `ProvenanceReport`. `is_ai_generated` is True or None (never asserted False — stripped metadata is not proof of clean origin). 
The `hf_job`, visible-sparkle, and Samsung `samsung_genai` signals are **medium** confidence: each lifts an otherwise-Unknown verdict to a tentative AI (`hf_only` / `visible_only` / `samsung_only`, parallel branches) but is excluded from the high-confidence `ai_from_metadata` set, so none overrides a hard metadata signal. Visible-sparkle is promoted only at confidence ≥ `_SPARKLE_THRESHOLD` (0.5; corpus-tuned to separate Gemini sparkles ≥0.56 from non-sparkle ≤0.49). The cv2 dependency lives in `gemini_engine.detect_sparkle_confidence`, not here. **C2PA platform attribution is device-token-first, issuer-scan fallback** (`_device_platform` scans manifest bytes for `_DEVICE_C2PA_PLATFORM` tokens, then `_attribute_platform`/`_ISSUER_PLATFORM`). **Why, verified on real signed files 2026-05-26:** the old issuer-only byte-scan matched ANY issuer substring anywhere, so multi-entity manifests mis-attributed -- Leica→"Truepic" (a signing authority in the trust chain), Nikon→"Adobe Firefly" (XMP-toolkit "Adobe" + the sample's "Adobe_MAX" name), Pixel→"Google (Gemini)" ("Google LLC" cert org), Truepic→"Google". A distinctive device token wins instead. **Token distinctiveness is load-bearing:** bare `b"Truepic"` mis-fires (it appears in unrelated trust chains -- it mis-attributed the OpenAI `chatgpt-1.png` fixture), so the token is the specific `b"Truepic_Lens"` from the Lens SDK claim generator; likewise `b"Pixel Camera"` (cert CN) not bare `b"Pixel"`. `_DEVICE_C2PA_PLATFORM` lists ONLY tokens **verified against a real C2PA file**: Leica (`lc_c2pa`/`Leica Camera`), Nikon (`NIKON`), Pixel (`Pixel Camera` -- from a real Pixel 10 Pro file attached to c2pa-rs issue #1609/#1554), Sony (`sony.sig`/`sony.cert` -- Sony's own C2PA assertion namespace, verified on a real Sony PXW-Z300 file; NOT bare "Sony" which is a common EXIF Make), Truepic (`Truepic_Lens`). 
Canon/Bria have **no public direct-download C2PA sample** (checked exhaustively: GitHub issue/PR attachments, contentcredentials gallery, HF datasets -- all upload-to-verify or token-gated; Canon's only public file was a self-signed hobbyist CR3, not factory), so they stay unmapped until a real file is captured (same fixture discipline as Grok/Doubao). The Sony sample is video (MP4) -- our ISOBMFF C2PA path detects it; Sony Alpha stills likely share the `sony.*` namespace but are not separately verified. **Samsung Galaxy + ASUS Gallery live in a separate `_SIGNER_C2PA_PLATFORM` (scanned after `_device_platform`, before the issuer fallback), NOT in `_DEVICE_C2PA_PLATFORM`** — verified on real signed files 2026-05-29. Reason: a Galaxy phone stamps BOTH its device cert AND a `trainedAlgorithmicMedia`/genAIType AI marker on a Generative-Edit image, so treating it as a "genuine camera capture" would false-fire integrity-clash rule 2 on every Galaxy AI edit. The signer tokens (`b"Samsung Galaxy"` cert org — distinct from the EXIF `SM-xxxx` model string on ordinary Samsung photos; `b"com.asus.gallery"` claim generator) only resolve the platform label; the AI verdict still comes from the source-type / genAIType. ASUS Gallery is a C2PA-signed edit with no AI marker, so it attributes the platform without asserting `is_ai`. **Samsung's `genAIType` (in the proprietary `PhotoEditor_Re_Edit_Data` JSON) is an undocumented Galaxy-AI editing marker** (`metadata.samsung_genai`, gated on the `PhotoEditor_Re_Edit_Data` container; non-zero value = AI tool used, values {1,5} observed): medium-confidence because the field has no public spec (verified 2026-05-29: absent from C2PA spec + Samsung docs), but it co-occurred with `trainedAlgorithmicMedia` in 3/3 verified files that record a source-type and was the SOLE AI marker on a Galaxy S24 file that omits the source type. 
Camera C2PA marks capture authenticity, not AI (Pixel carries `computationalCapture`, not `trainedAlgorithmicMedia`), so these never set `is_ai` -- that stays driven by digital-source-type. `c2pa.cbor_text_after` (now public) is best-effort for the `generator` detail string only and can be None when the manifest keys it `claim_generator_info` (Pixel). **Issuer→generator mapping is `is_ai`-gated** (`_attribute_platform(issuers, is_ai=c2pa_is_ai)`): a specific AI-generator platform is named only when the digital-source-type is `trainedAlgorithmicMedia`; on a non-AI source an issuer substring is treated as incidental (an "Adobe XMP" toolkit string in an *unmapped* Canon/Sony capture would otherwise mislabel it "Adobe Firefly"), so it degrades to the neutral "C2PA signer: X" label. Real Firefly/OpenAI/Google output carries the AI source-type, so it is unaffected (verified: chatgpt-1.png→OpenAI, firefly-1.png→Adobe Firefly still attribute). `_attribute_platform` defaults `is_ai=True` so the mapping stays unit-testable in isolation. Add capture-camera tokens to `_DEVICE_C2PA_PLATFORM`, editing-app/AI-device signer tokens to `_SIGNER_C2PA_PLATFORM`, generator/issuer platforms to `_ISSUER_PLATFORM`, not inline. For non-PNG containers (JPEG/WebP/AVIF/HEIF/JXL) the caBX parser returns nothing, so issuer (`_issuers_in`) and generator (`_ai_tools_in`, reusing `C2PA_AI_TOOLS`) are recovered by binary-scanning the first MB. EXIF `Software` / `Make` / `Artist` / `ImageDescription` and XMP `CreatorTool` generator tags are read by `metadata.exif_generator` (PIL+piexif for any format PIL opens incl. AVIF, plus a container-agnostic XMP raw-byte scan that also covers HEIF/JXL), matched against `AI_GENERATOR_TOKENS` so ordinary editors (plain "Adobe Photoshop") and real-camera `Make` ("Apple"/"Canon") are not flagged. **Ideogram tags its output with EXIF `Make="Ideogram AI"`** (verified on a real download 2026-05-24) — that's why `Make` is read. 
**Integrity-clash detection** (`_integrity_clashes`, surfaced as `ProvenanceReport.integrity_clashes`, printed in red by `identify` and serialized to `--json`): contradictions between independent generator stamps are a laundering/spoofing tell. Two rules: (1) two or more distinct AI-origin vendors named by independent signals (e.g. C2PA OpenAI + EXIF `Make="Ideogram AI"`), and (2) a camera-capture C2PA device (`_DEVICE_C2PA_PLATFORM`) coexisting with any AI-generation marker. Vendor normalization is `_vendor_of` over `_AI_VENDOR_TOKENS` (so a C2PA "Google (Gemini)" issuer and a SynthID-Google proxy agree, while different vendors clash). **High-precision by design:** only hard generator stamps feed it (C2PA-issuer when source is AI, SynthID, EXIF/XMP generator, IPTC `AISystemUsed`, xAI, AIGC); the fuzzy visible sparkle and the open invisible watermark are **excluded** (the latter can be a by-product of our own SDXL removal pass). The c2pa vendor is classified from the issuer attribution / generator, NOT the resolved `platform` (a camera label like "Google Pixel" would mis-normalize to "Google"). All real single-origin fixtures (chatgpt/firefly/doubao/grok/mj) verified to produce **zero** clashes (false-positive guard in `test_identify.py::TestRealSamplesHaveNoClash`). - `watermark_registry.py` — **single catalog of known visible watermarks**, the unified "find known marks in their usual places, recognize, remove" entry. **Reverse-alpha only by policy**: a mark is listed only once a real alpha map has been captured for it, and removal inverts that map (`original = (wm - a*logo)/(1-a)`, exact recovery) — no inpaint/heuristic removal here (arbitrary-region inpainting lives in `region_eraser`/`erase`). Each `KnownMark` ties a key to {usual `location`, `in_auto` flag, `recovery` (="reverse-alpha"), a `detect` adapter → uniform `MarkDetection`, a `remove` adapter}. Entries today: `gemini` (bottom-right sparkle) and `doubao` (bottom-right "豆包AI生成"). 
`detect_marks` scans all; `best_auto_mark` picks the highest-confidence detection. **Cross-engine confidences aren't directly comparable**, so the gemini adapter applies the corpus-validated 0.5 sparkle threshold (`_GEMINI_AUTO_MIN_CONF`) for its `detected` flag — otherwise the gemini engine's loose internal threshold weakly fires (~0.36) on the Doubao text and hijacks `auto`. `cli.cmd_visible` is registry-driven: `--mark auto` → `best_auto_mark`, `--mark <key>` → that mark; `--mark` choices come from `mark_keys()`. `_doubao_remove` applies reverse-alpha only when the mark is detected AND `reverse_alpha_available` (resolution in the alpha band); outside that, removal is **skipped** (not inpainted). Add a new visible mark = one `KnownMark` entry + its engine (with a captured alpha map); do not re-add per-mark `if` branches in the CLI. **Alpha-on-save policy (issue #30):** `cli._write_bgr_with_alpha` rejoins the input's alpha plane **unchanged** — it must NOT zero alpha in the watermark bbox. Reverse-alpha (and `erase` inpaint) recover real pixels there, so zeroing alpha punched a transparent hole that renders as a solid **white box** on any non-transparent viewer (Gemini app exports are opaque RGBA, so every user hit it; regression-guarded by `test_visible_keeps_alpha_opaque_in_watermark_region`). The registry `remove()` still returns its region (used for `inpaint_residual` positioning), but the CLI no longer uses it to clear alpha. -- `gemini_engine.py` — visible Gemini-sparkle remover/detector (cv2/numpy, no GPU). `detect_sparkle_confidence(path)` is the file-level entry point used by `identify.py`. -- `doubao_engine.py` — visible Doubao "豆包AI生成" remover/detector (cv2/numpy, no GPU), **reverse-alpha only**. `DoubaoEngine.locate` anchors a bottom-right box by **geometry** (mark scales with image WIDTH), `extract_mask` pulls the light low-saturation glyphs (the detection candidate). 
`detect` is **reverse-alpha-consistent**: it matches the bundled alpha glyph silhouette (`assets/doubao_alpha.png`, the exact shape we invert) against the candidate via zero-mean normalized correlation (`_template_match_score`, cv2 `TM_CCOEFF_NORMED`), gated at `DETECT_NCC_THRESHOLD` 0.4 over a small `DETECT_MIN_COVERAGE` floor. Keying on glyph SHAPE (not coverage/structure heuristics) fixed #23: corpus FP fell to 7/1243 (0.6%); old coverage-only fired on ~28%. **Removal is exact reverse-alpha** (`remove_watermark_reverse_alpha`): `original = (wm - a*logo)/(1-a)` from the bundled alpha map + `_ALPHA_LOGO_BGR` (near-white ~253) + `_ALPHA_*_FRAC` geometry. The alpha map + logo were **solved from real black+gray Doubao captures** (`data/doubao_capture/captures/`, gitignored): on black `captured = a*logo`, the black/gray pair solves `a` per-pixel without assuming the logo colour (white capture cross-validates: mark → flat fill). The single captured alpha map (at width 2048) **generalizes to any resolution**: at (near) the captured width (`_ALPHA_NATIVE_BAND` of `_ALPHA_NATIVE_WIDTH`) `_fixed_alpha_map` places it by exact width-relative geometry (pixel-exact recovery, ~0.9 mean error — the whole point of reverse-alpha); off that width it **tries BOTH placements -- fixed geometry AND `_aligned_alpha_map`'s `TM_CCOEFF_NORMED` scale+position search (`_ALPHA_ALIGN_SEARCH`) -- and keeps whichever leaves the least residual mark** (re-`detect` confidence on the bare reverse-alpha). On a faint/busy-background mark the NCC peak wanders a few px and geometry wins; on a clear mark alignment wins -- no magic threshold, it just picks the better removal. Verified **56/56 real detected-Doubao removed clean across all corpus resolutions** (2048 fixed 27/27, 1773 22/22, plus 1185/1187/1535/1672); a single fixed-vs-aligned choice left 2/56 busy-background residuals, try-both fixed them. 
`reverse_alpha_available` is just "asset present"; the registry still gates removal on `detect` so a clean corner is never touched. **Residual inpaint is off-native-only:** at the captured width the fixed-geometry recovery is exact, so it is returned untouched -- inpainting over exactly-recovered interior pixels only swaps them for a cv2 hallucination (measured worse, native textured-bg error vs true bg **1.6 reverse-alpha-only vs 2.6 with the old always-on full-footprint inpaint**; regression-guarded by `test_native_returns_exact_reverse_alpha_no_inpaint`). Off-native the NCC alignment is only sub-pixel-approximate, so the interior is no longer exact and a residual inpaint over the glyph footprint cleans the seam (costs nothing there and reliably clears the mark). The shipped third-party `_refs/zhengsuanfa_doubao_alpha_120x20.png` is NOT a usable alpha (≈0.85 everywhere → blacks out on inversion; wrong resolution/version), verified 2026-05-29. There is no inpaint-based removal here (removed 2026-05-29; arbitrary-region inpainting is `region_eraser`/`erase`). -- `region_eraser.py` — universal region eraser (`erase` CLI). `erase(image, boxes=|mask=, backend=)`: `boxes_to_mask` → `cv2.inpaint` (`cv2` backend, default, no deps) or big-LaMa via onnxruntime (`lama` backend, extra `lama`, `Carve/LaMa-ONNX` Apache-2.0 model downloaded on first use, never bundled). `erase_lama` crops a padded region around the mask, runs LaMa at its fixed 512² input, pastes only masked pixels back (untouched areas stay pixel-exact). Lazy `_get_lama_session` singleton; `lama_available()` guards the optional import. **LaMa-ONNX costs ~3.5-4 GB peak RAM and ~5-6 s/call on CPU** (FFC working set, not arena — `enable_cpu_mem_arena=False` does not help), so it does NOT fit a minimal droplet; the cv2 backend (tens of MB, ~30 ms) does. LaMa quality at low RAM = serverless/GPU, mirroring how raiw.cc offloads SDXL to fal. 
+- `gemini_engine.py` — visible Gemini-sparkle remover/detector (cv2/numpy, no GPU). `detect_sparkle_confidence(path)` is the file-level entry point used by `identify.py`. The public entry points normalize a grayscale (2D) or RGBA (4-channel) input to BGR up front so a non-BGR image does not crash the cv2 pipeline. +- `doubao_engine.py` — visible Doubao "豆包AI生成" remover/detector (cv2/numpy, no GPU), **reverse-alpha only**. `DoubaoEngine.locate` anchors a bottom-right box by **geometry** (mark scales with image WIDTH), `extract_mask` pulls the light, low-chroma glyphs (the detection candidate) using a per-pixel channel-spread proxy `sat = roi.max(axis=2) - roi.min(axis=2)` (no HSV conversion). `detect` is **reverse-alpha-consistent**: it matches the bundled alpha glyph silhouette (`assets/doubao_alpha.png`, the exact shape we invert) against the candidate via zero-mean normalized correlation (`_template_match_score`, cv2 `TM_CCOEFF_NORMED`), gated at `DETECT_NCC_THRESHOLD` 0.4 over a small `DETECT_MIN_COVERAGE` floor. Keying on glyph SHAPE (not coverage/structure heuristics) fixed #23: corpus FP fell to 7/1243 (0.6%); old coverage-only fired on ~28%. **Removal is exact reverse-alpha** (`remove_watermark_reverse_alpha`): `original = (wm - a*logo)/(1-a)` from the bundled alpha map + `_ALPHA_LOGO_BGR` (near-white ~253) + `_ALPHA_*_FRAC` geometry. The alpha map + logo were **solved from real black+gray Doubao captures** (`data/doubao_capture/captures/`, gitignored): on black `captured = a*logo`, the black/gray pair solves `a` per-pixel without assuming the logo colour (white capture cross-validates: mark → flat fill). 
The single captured alpha map (at width 2048) **generalizes to any resolution**: at (near) the captured width (`_ALPHA_NATIVE_BAND` of `_ALPHA_NATIVE_WIDTH`) `_fixed_alpha_map` clamps the glyph box to the image bounds (no crash on a degenerate width-vs-height) and places it by exact width-relative geometry (pixel-exact recovery, ~0.9 mean error — the whole point of reverse-alpha); off that width it **tries BOTH placements -- fixed geometry AND `_aligned_alpha_map`'s `TM_CCOEFF_NORMED` scale+position search (`_ALPHA_ALIGN_SEARCH`) -- and keeps whichever leaves the least residual mark** (re-`detect` confidence on the bare reverse-alpha). On a faint/busy-background mark the NCC peak wanders a few px and geometry wins; on a clear mark alignment wins -- no magic threshold, it just picks the better removal. Verified **56/56 real detected-Doubao removed clean across all corpus resolutions** (2048 fixed 27/27, 1773 22/22, plus 1185/1187/1535/1672); a single fixed-vs-aligned choice left 2/56 busy-background residuals, try-both fixed them. `reverse_alpha_available` is just "asset present"; the registry still gates removal on `detect` so a clean corner is never touched. **Residual inpaint is off-native-only:** at the captured width the fixed-geometry recovery is exact, so it is returned untouched -- inpainting over exactly-recovered interior pixels only swaps them for a cv2 hallucination (measured worse, native textured-bg error vs true bg **1.6 reverse-alpha-only vs 2.6 with the old always-on full-footprint inpaint**; regression-guarded by `test_native_returns_exact_reverse_alpha_no_inpaint`). Off-native the NCC alignment is only sub-pixel-approximate, so the interior is no longer exact and a residual inpaint over the glyph footprint cleans the seam (costs nothing there and reliably clears the mark). 
The shipped third-party `_refs/zhengsuanfa_doubao_alpha_120x20.png` is NOT a usable alpha (≈0.85 everywhere → blacks out on inversion; wrong resolution/version), verified 2026-05-29. There is no inpaint-based removal here (removed 2026-05-29; arbitrary-region inpainting is `region_eraser`/`erase`). +- `region_eraser.py` — universal region eraser (`erase` CLI). `erase(image, boxes=|mask=, backend=)` normalizes grayscale (2D) and RGBA (4-channel) inputs up front (`erase_cv2` splits off any alpha plane and re-attaches it on the result): `boxes_to_mask` → `cv2.inpaint` (`cv2` backend, default, no deps) or big-LaMa via onnxruntime (`lama` backend, extra `lama`, `Carve/LaMa-ONNX` Apache-2.0 model downloaded on first use, never bundled). `erase_lama` crops a padded region around the mask, runs LaMa at its fixed 512² input, pastes only masked pixels back (untouched areas stay pixel-exact). Lazy `_get_lama_session` singleton; `lama_available()` guards the optional import. **LaMa-ONNX costs ~3.5-4 GB peak RAM and ~5-6 s/call on CPU** (FFC working set, not arena — `enable_cpu_mem_arena=False` does not help), so it does NOT fit a minimal droplet; the cv2 backend (tens of MB, ~30 ms) does. LaMa quality at low RAM = serverless/GPU, mirroring how raiw.cc offloads SDXL to fal. - `invisible_watermark.py` — `detect_invisible_watermark(path)` decodes the OPEN DWT-DCT watermarks (public decoder, no key) embedded by Stable Diffusion / SDXL / FLUX via the `imwatermark` library. Known fixed patterns (verified against upstream source) live in `_BITS_48` (SDXL 48-bit, FLUX.2 48-bit) and `_SD1_STRING` ("StableDiffusionV1", SD 1.x/2.x). Optional dep (extra `detect`); returns None when absent. 
The `detect` extra pulls **torch** transitively (invisible-watermark declares torch a hard dep, and `WatermarkDecoder` eagerly imports `rivaGan` -> `torch` at import time), so detection needs torch present even though dwtDct runs CPU-only on cv2/numpy/pywavelets — no GPU and no separate `gpu` extra required. **Unlike SynthID this is locally detectable**, but the watermark is fragile (does not survive JPEG re-encode/resize — verified gone after JPEG q90), so it confirms origin only on pristine files. Add new known patterns here. The file carries a top-of-module pyright pragma because imwatermark/cv2 ship no type stubs. -- `trustmark_detector.py` — `detect_trustmark(path)` decodes the OPEN, keyless **Adobe TrustMark** watermark (the soft binding behind Adobe Durable Content Credentials, `alg` `com.adobe.trustmark.P`) via the optional `trustmark` package (extra `trustmark`; pulls torch, downloads model weights on first use). Mirrors `invisible_watermark.py` (lazy singleton, top-of-module pyright pragma, returns None when absent). It detects *provenance*, not AI origin as such (TrustMark also marks human-authored content), so `identify` lists it as a watermark without setting `is_ai_generated`. Other soft-binding vendors (Digimarc/Imatag/Steg.AI/...) have no public decoder — they are only *named* via the `C2PA_SOFT_BINDINGS` scan, not decoded. **False-positive gate (added 2026-05-29):** TrustMark's `wm_present` is a BCH error-correction validity flag that spuriously validates on a content-correlated fraction of un-watermarked images — AI-generated textures trip it far more than camera photos (verified 2026-05-29 on real files: it fires on Gemini/OpenAI/Doubao output that *cannot* carry Adobe's watermark, with a random-bytes decoded secret, while signal-free camera photos did not trip it). 
A genuine TrustMark is a *durable* soft binding engineered to survive re-encoding, so `detect_trustmark` re-decodes after a mild JPEG round-trip (`_survives_reencode`, `_REENCODE_QUALITY` 95) and requires the same schema both times; every observed false positive collapsed (none survived even q95), so the gate is the durability property the watermark guarantees. The second decode runs only on the rare initial hit, so the cost is negligible. Do NOT remove the gate to "catch more" — a lone TrustMark hit without it is almost always content noise. -- `text_protector.py` — text-region protection for the `invisible` SDXL img2img pass (issue #21: CJK/small text deforms at watermark-removal strengths). `is_available()` gates on `cv2.dnn.TextDetectionModel_DB`; `TextProtector.detect_text_boxes(bgr)` runs the **PP-OCRv3 DB** ONNX detector (~2.4 MB, Apache-2.0, opencv_zoo, returns rotated quad polygons) — downloaded+cached to `~/.cache/remove-ai-watermarks` on first use via atomic temp-rename, never bundled, **no torch (cv2.dnn only)**. **Detection is script-agnostic** (DB segments text *regions*, not characters), so Latin / Cyrillic / CJK / Hangul / Arabic / digits all detect identically — language was never the recall lever, **resolution was**. `_detection_input_size(h, w)` (pure, unit-tested) detects at the **native long side capped at `_DET_MAX_LONG_SIDE` (1536), never upscaled**: the old fixed 736 downscaled large canvases so small text fell below the detector and was missed (issue #14, e.g. ~16 px text on a 2048 image). `scripts/text_detection_benchmark.py` measures recall across scripts × sizes × canvas: the cap fix lifts overall hit-rate 0.91 → 1.00 (worst cell 2048/16 px: 0.06 → 1.00) at ~100 ms CPU. Very large canvases with tiny text may still need tiling (documented limit, not built). `build_change_map(boxes, h, w, preserve=0.9, feather=15)` paints a Differential-Diffusion change map. 
**Polarity (verified empirically):** white(1.0)=PRESERVE original pixels, black(0.0)=MAX change; map is black bg + `preserve` inside text polygons, Gaussian-feathered edges, clipped to [0,1]. `preserve` stays below a hard 1.0 freeze by default so text still scrubs lightly (SynthID survives cropping). **Default text protection is `watermark_remover._run_region_hires`, NOT the differential change map.** Differential Diffusion froze text in latent space (`preserve`<1.0), so the watermark survived *inside* text — violating the "remove SynthID everywhere" requirement; and the SDXL VAE's 8px latent cell softens sub-8px strokes regardless of `preserve` (architectural limit, confirmed by the DD authors — see `docs/text-protection-research.md`). `_run_region_hires` instead: (1) scrubs the whole image (plain img2img), (2) RE-scrubs each detected text block at HIGH resolution and feather-composites it back. `merge_text_regions(boxes,h,w)` groups boxes into local blocks; each crop is upscaled by an **integer** factor (`_REGION_HIRES_SCALE` 3, capped so a region stays under `_REGION_MAX_MEGAPIXELS` 1.3 to avoid OOM; skipped if it can't reach 2x — very large text areas then fall back to the global scrub, tiling is the future fix), img2img-scrubbed, downscaled, **phase-correlated back to the original crop to null the ~1-2px round-trip offset** (a sub-pixel shift garbles the composite even when text is crisp; integer scale alone did NOT fix it because the diffusion pipeline rounds dims to a multiple of 8), then `feather_paste`d. Every pixel is regenerated, so the watermark is removed everywhere AND small text stays crisp (high-res strokes span >1 latent cell). Validated on synthetic 18px multilingual text: text-region SSIM 0.28 (plain) → 0.48 (region-hires), visually garbled → readable across Latin/Cyrillic/CJK, residual shift ~0.5px. 
Gated to the SDXL `DEFAULT_MODEL_ID` + detector (`_can_protect_text`); no text → plain global scrub (text-free inputs pay only the cheap cv2 detection). CLI off-switch `--no-protect-text` on `invisible`/`all`. `merge_text_regions` + `feather_paste` are pure, unit-tested without a model (`tests/test_text_protector.py`). **MUST still be confirmed by the SynthID oracle** (openai.com/verify / Gemini app) that a region-rescrubbed text zone reads watermark-free before trusting it in prod. The legacy `_run_differential` / `build_change_map` / `_load_differential_pipeline` (community `pipeline_stable_diffusion_xl_differential_img2img`, `custom_revision="0.38.0"`) remain in the file but are no longer the default; the diff pipeline upcasts the VAE to fp32 internally, so do **not** add `upcast_vae()`/`enable_attention_slicing` there (NaN/black on fp16 MPS). `build_change_map` is still unit-tested. +- `trustmark_detector.py` — `detect_trustmark(path)` decodes the OPEN, keyless **Adobe TrustMark** watermark (the soft binding behind Adobe Durable Content Credentials, `alg` `com.adobe.trustmark.P`) via the optional `trustmark` package (extra `trustmark`; pulls torch, downloads model weights on first use). Mirrors `invisible_watermark.py` (lazy singleton guarded by a double-checked `threading.Lock` so concurrent callers do not double-download the weights, top-of-module pyright pragma, returns None when absent). It detects *provenance*, not AI origin as such (TrustMark also marks human-authored content), so `identify` lists it as a watermark without setting `is_ai_generated`. Other soft-binding vendors (Digimarc/Imatag/Steg.AI/...) have no public decoder — they are only *named* via the `C2PA_SOFT_BINDINGS` scan, not decoded. 
**False-positive gate (added 2026-05-29):** TrustMark's `wm_present` is a BCH error-correction validity flag that spuriously validates on a content-correlated fraction of un-watermarked images — AI-generated textures trip it far more than camera photos (verified 2026-05-29 on real files: it fires on Gemini/OpenAI/Doubao output that *cannot* carry Adobe's watermark, with a random-bytes decoded secret, while signal-free camera photos did not trip it). A genuine TrustMark is a *durable* soft binding engineered to survive re-encoding, so `detect_trustmark` re-decodes after a mild JPEG round-trip (`_survives_reencode`, `_REENCODE_QUALITY` 95) and requires the same schema both times; every observed false positive collapsed (none survived even q95), so the gate is the durability property the watermark guarantees. The second decode runs only on the rare initial hit, so the cost is negligible. Do NOT remove the gate to "catch more" — a lone TrustMark hit without it is almost always content noise. +- `text_protector.py` — text-region protection for the `invisible` SDXL img2img pass (issue #21: CJK/small text deforms at watermark-removal strengths). `is_available()` gates on `cv2.dnn.TextDetectionModel_DB`; `TextProtector.detect_text_boxes(bgr)` runs the **PP-OCRv3 DB** ONNX detector (~2.4 MB, Apache-2.0, opencv_zoo, returns rotated quad polygons) — downloaded+cached to `~/.cache/remove-ai-watermarks` on first use via atomic temp-rename, never bundled, **no torch (cv2.dnn only)**. **Detection is script-agnostic** (DB segments text *regions*, not characters), so Latin / Cyrillic / CJK / Hangul / Arabic / digits all detect identically — language was never the recall lever, **resolution was**. `_detection_input_size(h, w)` (pure, unit-tested) detects at the **native long side capped at `_DET_MAX_LONG_SIDE` (1536), never upscaled**: the old fixed 736 downscaled large canvases so small text fell below the detector and was missed (issue #14, e.g. ~16 px text on a 2048 image). 
`scripts/text_detection_benchmark.py` measures recall across scripts × sizes × canvas: the cap fix lifts overall hit-rate 0.91 → 1.00 (worst cell 2048/16 px: 0.06 → 1.00) at ~100 ms CPU. Very large canvases with tiny text may still need tiling (documented limit, not built). `build_change_map(boxes, h, w, preserve=0.9, feather=15)` paints a Differential-Diffusion change map. **Polarity (verified empirically):** white(1.0)=PRESERVE original pixels, black(0.0)=MAX change; map is black bg + `preserve` inside text polygons, Gaussian-feathered edges, clipped to [0,1]. `preserve` stays below a hard 1.0 freeze by default so text still scrubs lightly (SynthID survives cropping). **Default text protection is `watermark_remover._run_region_hires`, NOT the differential change map.** Differential Diffusion froze text in latent space (`preserve`<1.0), so the watermark survived *inside* text — violating the "remove SynthID everywhere" requirement; and the SDXL VAE's 8px latent cell softens sub-8px strokes regardless of `preserve` (architectural limit, confirmed by the DD authors — see `docs/text-protection-research.md`). `_run_region_hires` instead: (1) scrubs the whole image (plain img2img), (2) RE-scrubs each detected text block at HIGH resolution and feather-composites it back. 
`merge_text_regions(boxes,h,w)` groups boxes into local blocks; each crop is upscaled by `_REGION_HIRES_SCALE` 3.0 (applied as an integer factor via int(...), capped so a region stays under `_REGION_MAX_MEGAPIXELS` 1.3 to avoid OOM; skipped if it can't reach 2x — very large text areas then fall back to the global scrub, tiling is the future fix), img2img-scrubbed, downscaled, **phase-correlated back to the original crop to null the ~1-2px round-trip offset** (a sub-pixel shift garbles the composite even when text is crisp; integer scale alone did NOT fix it because the diffusion pipeline rounds dims to a multiple of 8; the shift is applied only on a confident, small correlation -- `response > 0.3` and `|shift| < 4` -- so a spurious large offset on a flat crop no longer garbles the composite), then `feather_paste`d. After a CPU fallback the generator is dropped before the per-region passes to avoid an MPS-vs-CPU generator device mismatch. Every pixel is regenerated, so the watermark is removed everywhere AND small text stays crisp (high-res strokes span >1 latent cell). Validated on synthetic 18px multilingual text: text-region SSIM 0.28 (plain) → 0.48 (region-hires), visually garbled → readable across Latin/Cyrillic/CJK, residual shift ~0.5px. Gated to the SDXL `DEFAULT_MODEL_ID` + detector (`_can_protect_text`); no text → plain global scrub (text-free inputs pay only the cheap cv2 detection). CLI off-switch `--no-protect-text` on `invisible`/`all`. `merge_text_regions` + `feather_paste` are pure, unit-tested without a model (`tests/test_text_protector.py`). **MUST still be confirmed by the SynthID oracle** (openai.com/verify / Gemini app) that a region-rescrubbed text zone reads watermark-free before trusting it in prod.
The legacy `_run_differential` / `build_change_map` / `_load_differential_pipeline` (community `pipeline_stable_diffusion_xl_differential_img2img`, `custom_revision="0.38.0"`) remain in the file but are no longer the default; the diff pipeline upcasts the VAE to fp32 internally, so do **not** add `upcast_vae()`/`enable_attention_slicing` there (NaN/black on fp16 MPS). `build_change_map` is still unit-tested. - `face_protector.py` — YOLO detect + soft-blend pattern; mirror this for any "protect region during diffusion" features -- `image_io.py` — Unicode-safe cv2 IO (issue #17). `imread(path, flags=None)` / `imwrite(path, img)` wrap `np.fromfile`+`cv2.imdecode` / `cv2.imencode`+`tofile` so non-ASCII paths work on Windows -- bare `cv2.imread`/`cv2.imwrite` use the platform ANSI code-page API there and fail (empty decode + `can't open/read file`) on Chinese/Cyrillic/accented filenames. `imread` keeps `cv2.imread` semantics (defaults to `IMREAD_COLOR`, returns `None` on missing/empty/undecodable). **Every cv2 file read/write in the package routes through here; do not call `cv2.imread`/`cv2.imwrite` directly.** macOS/Linux already accept UTF-8 paths, so it is behavior-neutral there (the bug only reproduces on Windows). cv2/numpy are imported lazily inside the functions, so the module is cheap to import in a bare env. +- `humanizer.py` — optional post-process "humanize" effects (cv2/numpy). The chromatic-shift step replicates the border instead of wrapping opposite-edge pixels, so a shifted channel no longer bleeds the far edge into the near one. +- `image_io.py` — Unicode-safe cv2 IO (issue #17). `imread(path, flags=None)` / `imwrite(path, img)` wrap `np.fromfile`+`cv2.imdecode` / `cv2.imencode`+`tofile` so non-ASCII paths work on Windows -- bare `cv2.imread`/`cv2.imwrite` use the platform ANSI code-page API there and fail (empty decode + `can't open/read file`) on Chinese/Cyrillic/accented filenames. 
`imread` keeps `cv2.imread` semantics (defaults to `IMREAD_COLOR`, returns `None` on missing/empty/undecodable). **Every cv2 file read/write in the package routes through here; do not call `cv2.imread`/`cv2.imwrite` directly.** `imwrite` returns `False` on an unwritable path (`OSError` caught) instead of raising, matching `cv2.imwrite` semantics. macOS/Linux already accept UTF-8 paths, so it is behavior-neutral there (the bug only reproduces on Windows). cv2/numpy are imported lazily inside the functions, so the module is cheap to import in a bare env. ### Doubao clean-reverse-alpha distillation (re-investigated 2026-05-29) diff --git a/README.md b/README.md index 894c2e7..0553d73 100644 --- a/README.md +++ b/README.md @@ -98,11 +98,11 @@ The removal pipeline (default profile, SDXL): ```text image → encode to latent space (VAE) at native resolution → add controlled noise (forward diffusion) - → denoise (reverse diffusion, ~50 steps at strength 0.05) + → denoise (reverse diffusion, ~50 steps at strength 0.10) → decode back to pixels (VAE) ``` -By default the image is processed at its **native resolution** with no pre-downscale, matching the hosted raiw.cc backend (fal `fast-sdxl`, which is `stabilityai/stable-diffusion-xl-base-1.0` — the same checkpoint the CLI defaults to). At strength ~0.05 SDXL img2img does not need the input shrunk, and the old forced downscale-to-1024 then upscale-back round-trip was the main quality loss. Pass `--max-resolution N` to cap the long side only when a very large image runs out of GPU/MPS memory (it reintroduces that lossy round-trip). +- Native resolution avoids shrinking the input to 1024 px first; that down-then-up round-trip was the main quality loss (issue #10). Use `--max-resolution N` only to cap GPU/MPS memory on very large inputs. SDXL is the default since May 2026: empirically defeats SynthID v2 on Gemini 3 Pro outputs, where the older SD-1.5 pipeline at 768 px did not. 
The SD-1.5 path was removed once it was verified not to handle v2. Note the scope: this defeats the SynthID *verifier*, which is not the same as being forensically indistinguishable from a real photo. Recent work ([arXiv:2605.09203](https://arxiv.org/abs/2605.09203)) shows watermark-removal pipelines leave detectable traces, so a separate "this image was processed" classifier can still flag the output. diff --git a/data/doubao_capture/README.md b/data/doubao_capture/README.md index ec3a8d1..e55adbb 100644 --- a/data/doubao_capture/README.md +++ b/data/doubao_capture/README.md @@ -1,5 +1,11 @@ # Doubao visible watermark capture +> **Status (completed 2026-05-29):** the capture described below was carried out (black + gray +> Doubao captures) and the exact alpha map was solved. Removal is now **reverse-alpha only**: at the +> captured native width recovery is pixel-exact and inpaint is OFF; a residual inpaint runs off-native +> only. See the `doubao_engine.py` notes in the root `CLAUDE.md`. The text below is kept as the +> historical capture plan. + Goal: capture the Doubao "豆包AI生成" visible watermark over known flat backgrounds so we can build a per-pixel alpha map and a reverse-alpha-blend remover, the same way the Gemini sparkle engine works (`src/remove_ai_watermarks/gemini_engine.py`). @@ -16,8 +22,10 @@ engine works (`src/remove_ai_watermarks/gemini_engine.py`). - Size **scales with resolution**. Third-party numbers (~90x18 at <=1024, ~180x40 at >1024) are approximate and calibrated for ~1024-1280 outputs; at 2048 the strip is much larger. A shipped third-party alpha map is only 120x20, too small for our 2K/4K target -> capture fresh. -- In practice clean inversion leaves residue on textured backgrounds, so the remover pairs the alpha - map with inpainting (our Gemini engine already does gradient-masked inpainting for residual edges). 
+- The planning assumption was that clean inversion leaves residue on textured backgrounds, so the + remover would pair the alpha map with inpainting. After the capture this turned out unnecessary at + the native width (recovery is pixel-exact there and inpaint is off); the shipped remover is + reverse-alpha only, with a residual inpaint applied off-native only. ## Use doubao.com specifically diff --git a/src/remove_ai_watermarks/cli.py b/src/remove_ai_watermarks/cli.py index f270e24..f32ac6c 100644 --- a/src/remove_ai_watermarks/cli.py +++ b/src/remove_ai_watermarks/cli.py @@ -160,7 +160,6 @@ def main(ctx: click.Context, verbose: bool) -> None: ) @click.option("--inpaint-strength", type=float, default=0.85, help="Inpainting blend strength (0.0-1.0).") @click.option("--detect/--no-detect", default=True, help="Detect watermark before removal.") -@click.option("--detect-threshold", type=float, default=0.25, help="Detection confidence threshold.") @click.option( "--mark", type=click.Choice(["auto", *watermark_registry.mark_keys()]), @@ -178,7 +177,6 @@ def cmd_visible( inpaint_method: Literal["ns", "telea", "gaussian"], inpaint_strength: float, detect: bool, - detect_threshold: float, mark: str, strip_metadata: bool, ) -> None: diff --git a/src/remove_ai_watermarks/doubao_engine.py b/src/remove_ai_watermarks/doubao_engine.py index e80a5ca..e50cb91 100644 --- a/src/remove_ai_watermarks/doubao_engine.py +++ b/src/remove_ai_watermarks/doubao_engine.py @@ -222,7 +222,14 @@ class DoubaoEngine: """ h, w = image.shape[:2] x, y, bw, bh = loc.bbox - roi = image[y : y + bh, x : x + bw].astype(np.float32) + # Normalize the ROI to 3-channel BGR: a 2D grayscale or 4-channel BGRA + # input would otherwise break the axis=2 channel reductions below. 
+ roi = image[y : y + bh, x : x + bw] + if roi.ndim == 2: + roi = cv2.cvtColor(roi, cv2.COLOR_GRAY2BGR) + elif roi.shape[2] == 4: + roi = cv2.cvtColor(roi, cv2.COLOR_BGRA2BGR) + roi = roi.astype(np.float32) luma = roi.mean(axis=2) sat = roi.max(axis=2) - roi.min(axis=2) @@ -290,7 +297,12 @@ class DoubaoEngine: if at is None: return None h, w = image.shape[:2] - gw, gh = max(1, int(_ALPHA_WIDTH_FRAC * w)), max(1, int(_ALPHA_HEIGHT_FRAC * w)) + # Glyph box scales with WIDTH; on a wide/short image the height-from-width + # box can exceed the image height. Clamp both dims so the slice assignment + # below cannot overflow (a degenerate 2048x1 input otherwise raised + # ValueError on the broadcast). Normal images are unaffected. + gw = min(w, max(1, int(_ALPHA_WIDTH_FRAC * w))) + gh = min(h, max(1, int(_ALPHA_HEIGHT_FRAC * w))) ax = max(0, w - int(_ALPHA_MARGIN_RIGHT_FRAC * w) - gw) ay = max(0, h - int(_ALPHA_MARGIN_BOTTOM_FRAC * w) - gh) amap = np.zeros((h, w), np.float32) @@ -353,6 +365,12 @@ class DoubaoEngine: inpaint there costs nothing and reliably clears the mark). Call only when :meth:`reverse_alpha_available` and the mark is detected. """ + # Normalize to 3-channel BGR so a 2D grayscale or 4-channel BGRA input + # does not break the reverse-alpha math (which assumes a 3-channel logo). 
+ if image.ndim == 2: + image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR) + elif image.shape[2] == 4: + image = cv2.cvtColor(image, cv2.COLOR_BGRA2BGR) at_native = abs(image.shape[1] / _ALPHA_NATIVE_WIDTH - 1.0) <= _ALPHA_NATIVE_BAND if at_native: amap = self._fixed_alpha_map(image) diff --git a/src/remove_ai_watermarks/gemini_engine.py b/src/remove_ai_watermarks/gemini_engine.py index d64c152..5ef7048 100644 --- a/src/remove_ai_watermarks/gemini_engine.py +++ b/src/remove_ai_watermarks/gemini_engine.py @@ -329,16 +329,21 @@ class GeminiEngine: """ result = image.copy() - # Handle alpha channel - if result.shape[2] == 4: + # Normalize to 3-channel BGR up front: 2D grayscale (no channel axis) and + # 4-channel BGRA both reach this public entry point and would otherwise + # crash on the channel-count checks / downstream 3-channel math. + if result.ndim == 2: + result = cv2.cvtColor(result, cv2.COLOR_GRAY2BGR) + elif result.shape[2] == 4: result = cv2.cvtColor(result, cv2.COLOR_BGRA2BGR) elif result.shape[2] == 1: result = cv2.cvtColor(result, cv2.COLOR_GRAY2BGR) size = force_size or get_watermark_size(result.shape[1], result.shape[0]) - # Detect dynamic position & size - detection = self.detect_watermark(image, force_size=size) + # Detect dynamic position & size (on the normalized 3-channel image so a + # grayscale/BGRA input does not crash the detector). + detection = self.detect_watermark(result, force_size=size) if not detection.detected: logger.debug( diff --git a/src/remove_ai_watermarks/humanizer.py b/src/remove_ai_watermarks/humanizer.py index 4d14666..c835ed7 100644 --- a/src/remove_ai_watermarks/humanizer.py +++ b/src/remove_ai_watermarks/humanizer.py @@ -36,10 +36,14 @@ def apply_analog_humanizer(image: NDArray, grain_intensity: float = 4.0, chromat b, g, r = cv2.split(image) # 1. Chromatic Aberration - # Shift R channel left, B channel right + # Shift R channel left, B channel right. 
np.roll is circular, so it wraps + # the opposite edge into a thin colored fringe at the L/R borders; replicate + # the original edge columns there to keep the intended offset interior-only. if chromatic_shift > 0: r = np.roll(r, -chromatic_shift, axis=1) + r[:, -chromatic_shift:] = r[:, -chromatic_shift - 1 : -chromatic_shift] b = np.roll(b, chromatic_shift, axis=1) + b[:, :chromatic_shift] = b[:, chromatic_shift : chromatic_shift + 1] merged = cv2.merge((b, g, r)) diff --git a/src/remove_ai_watermarks/identify.py b/src/remove_ai_watermarks/identify.py index 2b54ba6..81fa929 100644 --- a/src/remove_ai_watermarks/identify.py +++ b/src/remove_ai_watermarks/identify.py @@ -431,7 +431,7 @@ def identify(image_path: Path, *, check_visible: bool = True, check_invisible: b if synthid: watermarks.append(f"SynthID pixel watermark ({synthid})") caveats.append(_SYNTHID_CAVEAT) - if "OpenAI" in (" ".join(issuers) + synthid): + if _vendor_of(synthid) == "OpenAI": caveats.append(_OPENAI_CAVEAT) if v := _vendor_of(synthid): ai_vendor_claims["synthid"] = v diff --git a/src/remove_ai_watermarks/image_io.py b/src/remove_ai_watermarks/image_io.py index 78cfd47..fdb3dc8 100644 --- a/src/remove_ai_watermarks/image_io.py +++ b/src/remove_ai_watermarks/image_io.py @@ -54,7 +54,8 @@ def imwrite(path: str | Path, img: NDArray[Any]) -> bool: The output format is taken from the path extension (e.g. ``.png``), exactly like ``cv2.imwrite``. Returns ``True`` on success, ``False`` if the codec - rejects the image. + rejects the image or the path cannot be written (matching ``cv2.imwrite``, + which returns ``False`` rather than raising on an unwritable path). 
""" import cv2 @@ -62,5 +63,8 @@ def imwrite(path: str | Path, img: NDArray[Any]) -> bool: ok, buf = cv2.imencode(ext, img) if not ok: return False - buf.tofile(str(path)) + try: + buf.tofile(str(path)) + except OSError: + return False return True diff --git a/src/remove_ai_watermarks/invisible_engine.py b/src/remove_ai_watermarks/invisible_engine.py index 146feac..4ae6531 100644 --- a/src/remove_ai_watermarks/invisible_engine.py +++ b/src/remove_ai_watermarks/invisible_engine.py @@ -73,7 +73,7 @@ class InvisibleEngine: """ # SDXL base is the default since May 2026: empirically defeats SynthID v2 - # at strength=0.05 / steps=50 / native ~1024px. See CLAUDE.md "Known + # at strength=0.10 / steps=50 / native ~1024px. See CLAUDE.md "Known # limitations" for the regression evidence ruling out SD-1.5 pipelines. DEFAULT_MODEL_ID = "stabilityai/stable-diffusion-xl-base-1.0" CTRLREGEN_MODEL_ID = "yepengliu/ctrlregen" @@ -227,6 +227,8 @@ class InvisibleEngine: from remove_ai_watermarks import image_io out_cv = image_io.imread(out_path, cv2.IMREAD_COLOR) + if out_cv is None: + return out_path if protect_faces and original_faces: if self._progress_callback: diff --git a/src/remove_ai_watermarks/metadata.py b/src/remove_ai_watermarks/metadata.py index b6379b6..54c6179 100644 --- a/src/remove_ai_watermarks/metadata.py +++ b/src/remove_ai_watermarks/metadata.py @@ -190,6 +190,8 @@ def _png_late_metadata(image_path: Path, window: int) -> bytes: with open(image_path, "rb") as f: if f.read(8) != b"\x89PNG\r\n\x1a\n": return b"" + f.seek(0, 2) + file_size = f.tell() pos = 8 while True: f.seek(pos) @@ -201,9 +203,12 @@ def _png_late_metadata(image_path: Path, window: int) -> bytes: if chunk_type == b"IEND": break data_start = pos + 8 + # Clamp the attacker-controlled 32-bit length to the bytes that + # actually remain, so a malformed huge length can't allocate GBs. 
+ safe_length = max(0, min(length, file_size - data_start)) if chunk_type in _PNG_META_CHUNKS and data_start >= window: f.seek(data_start) - out += f.read(length) + out += f.read(safe_length) pos = data_start + length + 4 # data + CRC except OSError as exc: logger.debug("PNG late-metadata scan failed on %s: %s", image_path, exc) diff --git a/src/remove_ai_watermarks/noai/c2pa.py b/src/remove_ai_watermarks/noai/c2pa.py index a0fa632..4bb1e0f 100644 --- a/src/remove_ai_watermarks/noai/c2pa.py +++ b/src/remove_ai_watermarks/noai/c2pa.py @@ -55,6 +55,9 @@ def has_c2pa_metadata(image_path: Path) -> bool: if signature != PNG_SIGNATURE: return False + file_size = f.seek(0, 2) + f.seek(8) + while True: chunk_header = f.read(8) if len(chunk_header) < 8: @@ -62,9 +65,12 @@ def has_c2pa_metadata(image_path: Path) -> bool: length = struct.unpack(">I", chunk_header[:4])[0] chunk_type = chunk_header[4:8] + # Clamp the attacker-controlled 32-bit length to the bytes that + # actually remain, so a malformed huge length can't allocate GBs. + safe_length = max(0, min(length, file_size - f.tell())) if chunk_type == C2PA_CHUNK_TYPE: - chunk_data = f.read(length) + chunk_data = f.read(safe_length) # Check for any C2PA signature for sig in C2PA_SIGNATURES: if sig in chunk_data: @@ -74,7 +80,7 @@ def has_c2pa_metadata(image_path: Path) -> bool: return True f.read(4) else: - f.read(length + 4) + f.seek(safe_length + 4, 1) if chunk_type == b"IEND": break @@ -108,6 +114,9 @@ def extract_c2pa_info(image_path: Path) -> dict[str, Any]: if signature != PNG_SIGNATURE: return c2pa_info + file_size = f.seek(0, 2) + f.seek(8) + while True: chunk_header = f.read(8) if len(chunk_header) < 8: @@ -115,13 +124,16 @@ def extract_c2pa_info(image_path: Path) -> dict[str, Any]: length = struct.unpack(">I", chunk_header[:4])[0] chunk_type = chunk_header[4:8] + # Clamp the attacker-controlled 32-bit length to the bytes that + # actually remain, so a malformed huge length can't allocate GBs. 
+ safe_length = max(0, min(length, file_size - f.tell())) if chunk_type == C2PA_CHUNK_TYPE: - chunk_data = f.read(length) + chunk_data = f.read(safe_length) _parse_c2pa_chunk(chunk_data, c2pa_info) f.read(4) else: - f.read(length + 4) + f.seek(safe_length + 4, 1) if chunk_type == b"IEND": break @@ -278,6 +290,9 @@ def extract_c2pa_chunk(image_path: Path) -> bytes | None: if signature != PNG_SIGNATURE: return None + file_size = f.seek(0, 2) + f.seek(8) + while True: chunk_header = f.read(8) if len(chunk_header) < 8: @@ -285,9 +300,12 @@ def extract_c2pa_chunk(image_path: Path) -> bytes | None: length = struct.unpack(">I", chunk_header[:4])[0] chunk_type = chunk_header[4:8] + # Clamp the attacker-controlled 32-bit length to the bytes that + # actually remain, so a malformed huge length can't allocate GBs. + safe_length = max(0, min(length, file_size - f.tell())) if chunk_type == C2PA_CHUNK_TYPE: - chunk_data = f.read(length) + chunk_data = f.read(safe_length) crc = f.read(4) # Check for any C2PA signature @@ -299,7 +317,7 @@ def extract_c2pa_chunk(image_path: Path) -> bytes | None: if b"jumb" in chunk_data.lower() or b"c2pa" in chunk_data.lower(): return chunk_header + chunk_data + crc else: - f.read(length + 4) + f.seek(safe_length + 4, 1) if chunk_type == b"IEND": break diff --git a/src/remove_ai_watermarks/noai/ctrlregen/tiling.py b/src/remove_ai_watermarks/noai/ctrlregen/tiling.py index 52a7f5c..63d86fa 100644 --- a/src/remove_ai_watermarks/noai/ctrlregen/tiling.py +++ b/src/remove_ai_watermarks/noai/ctrlregen/tiling.py @@ -20,6 +20,8 @@ from PIL import Image def tile_positions(total: int, tile: int, overlap: int) -> list[int]: """Compute evenly-spaced tile start positions covering *total* pixels.""" + if not (0 <= overlap < tile): + raise ValueError(f"overlap must satisfy 0 <= overlap < tile (got overlap={overlap}, tile={tile})") if total <= tile: return [0] n = max(2, math.ceil((total - overlap) / (tile - overlap))) diff --git 
a/src/remove_ai_watermarks/noai/isobmff.py b/src/remove_ai_watermarks/noai/isobmff.py index a86f238..c2d0d6d 100644 --- a/src/remove_ai_watermarks/noai/isobmff.py +++ b/src/remove_ai_watermarks/noai/isobmff.py @@ -17,6 +17,7 @@ Reference: ISO/IEC 14496-12 (ISOBMFF) and C2PA 2.1 spec §11. from __future__ import annotations +import logging import re import struct from typing import TYPE_CHECKING @@ -32,6 +33,8 @@ from remove_ai_watermarks.metadata import ( IPTC_AI_MARKERS, ) +log = logging.getLogger(__name__) + # Top-level box types that may carry AI provenance. ``uuid`` boxes are checked # against ``C2PA_UUID`` / AI-label markers before being stripped; ``jumb`` boxes # are always stripped (JPEG-XL uses them exclusively for JUMBF). @@ -126,6 +129,8 @@ def scan_c2pa_region(path: str | Path, *, max_total: int = 4 * 1024 * 1024) -> b else: size = size32 if size < (payload_off - pos) or pos + size > file_size: + # Detection-only: a malformed box halts the walk, so a manifest + # placed after it is missed (best-effort scan; no resync). break if box_type in C2PA_BOX_TYPES: f.seek(payload_off) @@ -162,7 +167,9 @@ def strip_c2pa_boxes(data: bytes) -> tuple[bytes, int]: out = bytearray() stripped = 0 + consumed = 0 for start, end, box_type, payload_off in _iter_top_level_boxes(data): + consumed = end if box_type == b"uuid": # uuid boxes carry the 16-byte UUID immediately after the type. is_c2pa = payload_off + 16 <= end and data[payload_off : payload_off + 16] == C2PA_UUID @@ -174,6 +181,20 @@ def strip_c2pa_boxes(data: bytes) -> tuple[bytes, int]: stripped += 1 continue out.extend(data[start:end]) + + # Fail-safe: the walker returns early on a malformed box (bad size, or a box + # that runs past EOF), so anything after it was never visited. Emitting `out` + # would silently truncate the file from the bad box to EOF -- worse than not + # stripping. If the walk did not consume the whole input, return it unchanged. 
+ if consumed != len(data): + log.warning( + "ISOBMFF box walk stopped at offset %d of %d (malformed box); " + "returning input unchanged to avoid truncation", + consumed, + len(data), + ) + return data, 0 + return bytes(out), stripped diff --git a/src/remove_ai_watermarks/noai/watermark_remover.py b/src/remove_ai_watermarks/noai/watermark_remover.py index d0498a4..adb3c28 100644 --- a/src/remove_ai_watermarks/noai/watermark_remover.py +++ b/src/remove_ai_watermarks/noai/watermark_remover.py @@ -272,6 +272,12 @@ def _make_seed_generator(device: str, seed: int) -> Any: return torch.Generator().manual_seed(seed) # type: ignore +def _generator_device(generator: Any) -> str: + """Best-effort device type of a ``torch.Generator`` (e.g. ``"cpu"``, ``"mps"``).""" + device = getattr(generator, "device", None) + return getattr(device, "type", str(device)) if device is not None else "cpu" + + # Keep legacy name available for backwards compatibility _detect_model_profile_from_id = detect_model_profile @@ -677,6 +683,14 @@ class WatermarkRemover: base = self._run_img2img(init_image, strength, num_inference_steps, guidance_scale, generator) + # The base pass may have fallen back from MPS to CPU (it flips + # self.device). The generator was built for the original device, and + # diffusers rejects a device-mismatched generator ("Expected a 'cpu' + # device generator but found 'mps'"), so drop it for the per-region + # passes -- they then seed from the global RNG, which is fine here. + if generator is not None and self.device == "cpu" and _generator_device(generator) != "cpu": + generator = None + bgr = cv2.cvtColor(np.array(init_image), cv2.COLOR_RGB2BGR) try: boxes = text_protector.TextProtector().detect_text_boxes(bgr) @@ -718,8 +732,13 @@ class WatermarkRemover: # the composite even though the text is crisp. 
cg = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY).astype(np.float32) dg = cv2.cvtColor(down, cv2.COLOR_BGR2GRAY).astype(np.float32) - (sx, sy), _resp = cv2.phaseCorrelate(cg, dg) - if abs(sx) > 0.1 or abs(sy) > 0.1: + (sx, sy), resp = cv2.phaseCorrelate(cg, dg) + # Only correct for the real 1-2px round-trip shift. On a near-flat / + # low-contrast crop phaseCorrelate returns a spurious large offset at + # a tiny response (e.g. (19,19) at resp ~0.005); warping by that + # garbles the composite -- the exact failure this was meant to + # prevent. Gate on both a confident response and a plausible offset. + if resp > 0.3 and abs(sx) < 4 and abs(sy) < 4 and (abs(sx) > 0.1 or abs(sy) > 0.1): m = np.float32([[1, 0, -sx], [0, 1, -sy]]) down = cv2.warpAffine(down, m, (w, h), flags=cv2.INTER_LINEAR, borderMode=cv2.BORDER_REPLICATE) out_bgr = text_protector.feather_paste(out_bgr, down, x, y) diff --git a/src/remove_ai_watermarks/region_eraser.py b/src/remove_ai_watermarks/region_eraser.py index b3cc6ee..cfbf95c 100644 --- a/src/remove_ai_watermarks/region_eraser.py +++ b/src/remove_ai_watermarks/region_eraser.py @@ -67,8 +67,16 @@ def erase_cv2( method: Literal["telea", "ns"] = "telea", radius: int = 6, ) -> NDArray[Any]: - """Inpaint ``mask`` with classical cv2 inpainting (CPU, no extra deps).""" + """Inpaint ``mask`` with classical cv2 inpainting (CPU, no extra deps). + + Accepts 1-/3-channel BGR (passed straight to ``cv2.inpaint``) and 4-channel + BGRA: ``cv2.inpaint`` rejects 4 channels, so the alpha plane is split off, + the BGR is inpainted, and alpha is re-attached unchanged. 
+ """ flag = cv2.INPAINT_TELEA if method == "telea" else cv2.INPAINT_NS + if image_bgr.ndim == 3 and image_bgr.shape[2] == 4: + bgr = cv2.inpaint(image_bgr[:, :, :3], mask, radius, flag) + return np.dstack([bgr, image_bgr[:, :, 3]]) return cv2.inpaint(image_bgr, mask, radius, flag) diff --git a/src/remove_ai_watermarks/trustmark_detector.py b/src/remove_ai_watermarks/trustmark_detector.py index d3249bc..ed32a49 100644 --- a/src/remove_ai_watermarks/trustmark_detector.py +++ b/src/remove_ai_watermarks/trustmark_detector.py @@ -22,6 +22,7 @@ signal, not proof of AI origin. from __future__ import annotations import logging +import threading from typing import TYPE_CHECKING, Any if TYPE_CHECKING: @@ -32,7 +33,9 @@ log = logging.getLogger(__name__) # Adobe ships Variant P in production (com.adobe.trustmark.P). _MODEL_TYPE = "P" # Lazily constructed singleton -- model load + first-use download is expensive. +# Guarded by a lock so concurrent callers don't double-construct/double-download. _tm: Any = None +_tm_lock = threading.Lock() def is_available() -> bool: @@ -45,9 +48,11 @@ def is_available() -> bool: def _decoder() -> Any: global _tm if _tm is None: - from trustmark import TrustMark + with _tm_lock: + if _tm is None: + from trustmark import TrustMark - _tm = TrustMark(verbose=False, model_type=_MODEL_TYPE) + _tm = TrustMark(verbose=False, model_type=_MODEL_TYPE) return _tm diff --git a/tests/test_cli.py b/tests/test_cli.py index 08e0339..98dc520 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -541,3 +541,78 @@ class TestGpuHintMarkup: with patch("remove_ai_watermarks.invisible_engine.is_available", return_value=False): result = runner.invoke(main, ["all", str(sample_png)]) assert "remove-ai-watermarks[gpu]" in result.output + + +class TestEraseCommand: + """Tests for the 'erase' universal region eraser subcommand.""" + + def test_erase_help(self, runner): + result = runner.invoke(main, ["erase", "--help"]) + assert result.exit_code == 0 + assert 
"--region" in result.output + assert "--backend" in result.output + + def test_erase_single_region(self, runner, sample_png, tmp_path): + output = tmp_path / "erased.png" + result = runner.invoke( + main, + ["erase", str(sample_png), "--region", "10,10,40,40", "-o", str(output)], + ) + assert result.exit_code == 0, result.output + assert output.exists() + + def test_erase_two_regions(self, runner, sample_png, tmp_path): + output = tmp_path / "erased2.png" + result = runner.invoke( + main, + [ + "erase", + str(sample_png), + "--region", + "10,10,30,30", + "--region", + "120,120,30,30", + "-o", + str(output), + ], + ) + assert result.exit_code == 0, result.output + assert output.exists() + # The banner reports the region count it processed. + assert "2 region(s)" in result.output + + def test_erase_default_output_name(self, runner, sample_png): + result = runner.invoke(main, ["erase", str(sample_png), "--region", "10,10,40,40"]) + assert result.exit_code == 0, result.output + assert sample_png.with_stem(sample_png.stem + "_clean").exists() + + def test_erase_malformed_region_exits_nonzero(self, runner, sample_png, tmp_path): + output = tmp_path / "x.png" + # Only three values: click.BadParameter -> non-zero exit, no output file. + result = runner.invoke( + main, + ["erase", str(sample_png), "--region", "1,2,3", "-o", str(output)], + ) + assert result.exit_code != 0 + assert not output.exists() + + def test_erase_nonexistent_file(self, runner): + result = runner.invoke(main, ["erase", "/nonexistent/file.png", "--region", "0,0,10,10"]) + assert result.exit_code != 0 + + def test_erase_lama_backend_without_onnxruntime(self, runner, sample_png, tmp_path): + # The LaMa backend needs onnxruntime; without it the CLI must surface a + # clear error and exit non-zero rather than crash. When onnxruntime IS + # installed there is no missing-dep path to exercise, so skip. 
+ from remove_ai_watermarks.region_eraser import lama_available + + if lama_available(): + pytest.skip("onnxruntime installed; missing-dep error path not reachable") + output = tmp_path / "y.png" + result = runner.invoke( + main, + ["erase", str(sample_png), "--region", "10,10,40,40", "--backend", "lama", "-o", str(output)], + ) + assert result.exit_code != 0 + assert "onnxruntime" in result.output.lower() + assert not output.exists() diff --git a/tests/test_doubao_engine.py b/tests/test_doubao_engine.py index 8d27c58..f55c2fa 100644 --- a/tests/test_doubao_engine.py +++ b/tests/test_doubao_engine.py @@ -161,3 +161,28 @@ class TestReverseAlpha: assert float(np.abs(wm.astype(np.float32)[mark] - 100.0).mean()) > 15 # mark visible out = eng.remove_watermark_reverse_alpha(wm).astype(np.float32) assert float(np.abs(out[mark] - 100.0).mean()) < max_err + + +class TestDegenerateAndChannelInputs: + """Removal must not crash on degenerate sizes or non-3-channel inputs.""" + + @pytest.mark.parametrize(("w", "h"), [(2048, 1), (1, 2048), (2048, 8)]) + def test_wide_short_does_not_raise(self, w, h): + """A wide/short image at native width makes the width-derived glyph box + taller than the image; the slice assignment must not ValueError.""" + eng = DoubaoEngine() + img = np.zeros((h, w, 3), np.uint8) + out = eng.remove_watermark_reverse_alpha(img) + assert out.shape == img.shape + + def test_grayscale_2d_does_not_raise(self): + eng = DoubaoEngine() + gray = np.zeros((2048, 2048), np.uint8) + out = eng.remove_watermark_reverse_alpha(gray) + assert out.shape == (2048, 2048, 3) + + def test_bgra_4channel_does_not_raise(self): + eng = DoubaoEngine() + bgra = np.zeros((2048, 2048, 4), np.uint8) + out = eng.remove_watermark_reverse_alpha(bgra) + assert out.shape == (2048, 2048, 3) diff --git a/tests/test_humanizer.py b/tests/test_humanizer.py index 0f92fdf..c868b60 100644 --- a/tests/test_humanizer.py +++ b/tests/test_humanizer.py @@ -50,3 +50,23 @@ def test_invalid_shape(): img[0, 
0] = 50 result = apply_analog_humanizer(img) assert np.array_equal(img, result) + + +def test_chromatic_shift_does_not_wrap_opposite_edge(): + # On a horizontal gradient (dark left, bright right), a circular np.roll + # would wrap the bright right edge into the R channel's left border and the + # dark left edge into the B channel's right border, producing a colored + # fringe. After the fix the border columns must replicate their own edge. + ramp = np.linspace(0, 255, 64, dtype=np.uint8) + gray = np.broadcast_to(ramp, (32, 64)) + img = np.stack([gray, gray, gray], axis=2).copy() # B, G, R + + shift = 3 + result = apply_analog_humanizer(img, grain_intensity=0.0, chromatic_shift=shift) + + # B (index 0) rolled right -> its left border must stay dark (near 0), + # NOT wrap the bright right edge. + assert result[:, :shift, 0].max() < 60 + # R (index 2) rolled left -> its right border must stay bright (near 255), + # NOT wrap the dark left edge. + assert result[:, -shift:, 2].min() > 195 diff --git a/tests/test_identify.py b/tests/test_identify.py index 17991e9..6807026 100644 --- a/tests/test_identify.py +++ b/tests/test_identify.py @@ -389,6 +389,56 @@ class TestIdentifyCaveats: assert len(r.caveats) == len(set(r.caveats)) +class TestOpenAiCaveatVendorScoped: + """The OpenAI rollout caveat keys on the normalized SynthID vendor, not a raw + "OpenAI" substring over the issuer + verdict blob -- so a Google-SynthID + manifest with an incidental "OpenAI" byte elsewhere is not mislabeled, while + a genuine OpenAI manifest still gets the hedge. 
+ """ + + @staticmethod + def _png_chunk(ctype: bytes, data: bytes) -> bytes: + import struct + import zlib + + return struct.pack(">I", len(data)) + ctype + data + struct.pack(">I", zlib.crc32(ctype + data) & 0xFFFFFFFF) + + def _png(self, tmp_path: Path, name: str, *extra: bytes) -> Path: + import struct + import zlib + + ihdr = struct.pack(">IIBBBBB", 1, 1, 8, 6, 0, 0, 0) + body = ( + b"\x89PNG\r\n\x1a\n" + + self._png_chunk(b"IHDR", ihdr) + + self._png_chunk(b"IDAT", zlib.compress(b"\x00" * 6, 9)) + + b"".join(extra) + + self._png_chunk(b"IEND", b"") + ) + path = tmp_path / name + path.write_bytes(body) + return path + + def test_google_synthid_with_incidental_openai_byte_no_caveat(self, tmp_path: Path): + # Google C2PA/SynthID manifest in caBX; the byte "OpenAI" lives in a + # separate tEXt chunk (e.g. a trust-chain note), not as a SynthID vendor. + png = self._png( + tmp_path, + "g.png", + self._png_chunk(b"caBX", b"jumbc2pa Google ... trainedAlgorithmicMedia"), + self._png_chunk(b"tEXt", b"note\x00signed via OpenAI trust chain"), + ) + r = identify(png, check_visible=False, check_invisible=False) + assert any("SynthID pixel watermark (likely present (Google" in w for w in r.watermarks) + assert not any("before the rollout" in c for c in r.caveats) + + def test_openai_synthid_still_gets_caveat(self, tmp_path: Path): + png = self._png(tmp_path, "oa.png", self._png_chunk(b"caBX", b"jumbc2pa OpenAI ... trainedAlgorithmicMedia")) + r = identify(png, check_visible=False, check_invisible=False) + assert any("SynthID pixel watermark (likely present (OpenAI" in w for w in r.watermarks) + assert any("before the rollout" in c for c in r.caveats) + + class TestReportSerializable: def test_report_is_json_serializable(self, tmp_png_with_ai_metadata: Path): # The CLI --json path relies on asdict + json.dumps(default=str). 
@@ -657,6 +707,19 @@ class TestIntegrityClashEndToEnd: r = identify(path, check_visible=False, check_invisible=False) assert r.integrity_clashes == [] + def test_camera_device_plus_ai_marker_clash(self, tmp_path: Path): + # Integrity-clash rule #2: a camera-capture C2PA device token (Pixel + # Camera) coexisting with an independent AI-generation marker (a China + # TC260 AIGC label) -- a genuine camera capture is not AI-generated, so + # the provenance is inconsistent (a laundering / spoofing tell). + path = self._c2pa_jpeg( + tmp_path, + b'Pixel Camera ... {"Label":"1","ContentProducer":"BYTEDANCE001"}', + ) + r = identify(path, check_visible=False, check_invisible=False) + assert r.platform == "Google Pixel (camera, C2PA capture)" + assert any("Camera-capture C2PA credentials" in c and "AI-generation markers" in c for c in r.integrity_clashes) + def test_clash_serializes_to_json(self, tmp_path: Path): path = self._c2pa_jpeg(tmp_path, b"OpenAI ... trainedAlgorithmicMedia ... TC260:AIGC label") r = identify(path, check_visible=False, check_invisible=False) diff --git a/tests/test_image_io.py b/tests/test_image_io.py index 9ff65ed..c23c8bd 100644 --- a/tests/test_image_io.py +++ b/tests/test_image_io.py @@ -72,3 +72,8 @@ class TestFailureSemantics: path = tmp_path / "garbage.png" path.write_bytes(b"not an image") assert image_io.imread(path) is None + + def test_imwrite_to_missing_directory_returns_false(self, tmp_path: Path) -> None: + # An unwritable path must return False (cv2.imwrite contract), not raise. 
+ path = tmp_path / "no-such-dir" / "out.png" + assert image_io.imwrite(path, _make_bgr()) is False diff --git a/tests/test_metadata.py b/tests/test_metadata.py index d04279d..bfc146e 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -445,6 +445,31 @@ class TestRemoveAiMetadata: assert isinstance(result, Path) assert result == output + def _sd_png(self, tmp_path: Path) -> Path: + img = Image.new("RGB", (32, 32), color=(80, 80, 80)) + pnginfo = PngInfo() + pnginfo.add_text("parameters", "Steps: 20, Sampler: Euler") + img.save(tmp_path / "sd.png", pnginfo=pnginfo) + return tmp_path / "sd.png" + + def test_png_to_jpeg_strips_ai(self, tmp_path): + # Cross-format output: the AI text chunk must not survive the PNG->JPEG + # re-encode, by detection AND by raw bytes. + out = tmp_path / "clean.jpg" + remove_ai_metadata(self._sd_png(tmp_path), out) + assert not has_ai_metadata(out) + body = out.read_bytes() + assert b"parameters" not in body + assert b"Steps" not in body + + def test_png_to_webp_strips_ai(self, tmp_path): + out = tmp_path / "clean.webp" + remove_ai_metadata(self._sd_png(tmp_path), out) + assert not has_ai_metadata(out) + body = out.read_bytes() + assert b"parameters" not in body + assert b"Steps" not in body + def _img_with_software(tmp_path: Path, fmt: str, software: str) -> Path: """Write a tiny image carrying an EXIF Software tag.""" @@ -617,6 +642,41 @@ class TestRemoveAiExif: kept = piexif.load(Image.open(out).info["exif"])["0th"] assert kept.get(piexif.ImageIFD.Make) == b"Apple" + def test_xai_pair_stripped_but_genuine_camera_tags_kept(self, tmp_path: Path): + # An image carrying BOTH the xAI Signature pair (ImageDescription = + # "Signature: " + UUID Artist) AND genuine non-AI camera tags. + # The scrub must delete only the xAI pair, leaving the camera tags intact. 
+ sig = "Signature: " + "A" * 120 + artist = "12345678-1234-1234-1234-123456789abc" + exif = piexif.dump( + { + "0th": { + piexif.ImageIFD.ImageDescription: sig.encode(), + piexif.ImageIFD.Artist: artist.encode(), + piexif.ImageIFD.Make: b"Canon", + piexif.ImageIFD.Model: b"EOS R5", + }, + "Exif": {piexif.ExifIFD.DateTimeOriginal: b"2024:01:01 12:00:00"}, + "GPS": {piexif.GPSIFD.GPSLatitudeRef: b"N"}, + "1st": {}, + } + ) + src = tmp_path / "grok_plus_cam.jpg" + Image.new("RGB", (32, 32)).save(src, exif=exif) + out = tmp_path / "scrubbed.jpg" + remove_ai_metadata(src, out) + + # xAI signature pair is gone (xai_signature returns a bool, not None). + assert xai_signature(out) is False + kept = piexif.load(Image.open(out).info["exif"]) + assert kept["0th"].get(piexif.ImageIFD.ImageDescription) is None + assert kept["0th"].get(piexif.ImageIFD.Artist) is None + # Genuine camera tags are preserved. + assert kept["0th"].get(piexif.ImageIFD.Make) == b"Canon" + assert kept["0th"].get(piexif.ImageIFD.Model) == b"EOS R5" + assert kept["Exif"].get(piexif.ExifIFD.DateTimeOriginal) == b"2024:01:01 12:00:00" + assert kept["GPS"].get(piexif.GPSIFD.GPSLatitudeRef) == b"N" + class TestAIGCLabel: """China TC260 AIGC labeling (Doubao and other China-served generators).""" diff --git a/tests/test_noai.py b/tests/test_noai.py index 1435ce9..37238d3 100644 --- a/tests/test_noai.py +++ b/tests/test_noai.py @@ -328,9 +328,12 @@ class TestISOBMFF: def test_truncated_largesize_terminates_safely(self): # size32==1 promises a 64-bit largesize, but the box ends after 8 bytes; # iteration must stop rather than read the missing largesize past EOF. - cleaned, stripped = strip_c2pa_boxes(FTYP + b"\x00\x00\x00\x01uuid") + # The walk halts before EOF, so the fail-safe returns the input unchanged + # (emitting only FTYP would silently truncate the file). 
+ data = FTYP + b"\x00\x00\x00\x01uuid" + cleaned, stripped = strip_c2pa_boxes(data) assert stripped == 0 - assert cleaned == FTYP + assert cleaned == data class TestC2PAInvalidSignature: diff --git a/tests/test_region_eraser.py b/tests/test_region_eraser.py index 181b378..09ceefb 100644 --- a/tests/test_region_eraser.py +++ b/tests/test_region_eraser.py @@ -66,6 +66,23 @@ class TestEraseCv2: assert np.array_equal(img, out) +class TestNonBgrInputs: + """cv2.inpaint rejects 4-channel BGRA and 2D-only entry points must work.""" + + def test_grayscale_2d_does_not_raise(self): + gray = np.full((100, 100), 120, np.uint8) + out = erase(gray, boxes=[(40, 40, 20, 20)], backend="cv2") + assert out.shape == gray.shape + + def test_bgra_preserves_alpha_and_does_not_raise(self): + bgra = np.full((100, 100, 4), 120, np.uint8) + bgra[..., 3] = 200 # opaque-ish alpha plane + out = erase(bgra, boxes=[(40, 40, 20, 20)], backend="cv2", dilate=0) + assert out.shape == bgra.shape + # alpha plane is carried through unchanged + assert np.array_equal(out[..., 3], bgra[..., 3]) + + class TestLamaBackend: def test_lama_raises_when_unavailable(self): img = np.full((100, 100, 3), 50, np.uint8) diff --git a/tests/test_security_clamp.py b/tests/test_security_clamp.py new file mode 100644 index 0000000..9e3c194 --- /dev/null +++ b/tests/test_security_clamp.py @@ -0,0 +1,130 @@ +"""Regression guards for malformed-length DoS and removal-truncation bugs. + +The following verified bugs are locked in here: + +1. PNG C2PA parsers (``c2pa.has_c2pa_metadata`` / ``extract_c2pa_info`` and + ``metadata._png_late_metadata`` via ``scan_head``) used the raw 32-bit chunk + ``length`` field directly in ``f.read(length)``. A crafted file can declare + ``length = 0x7FFFFFFF`` (~2 GiB) on a 60-byte file, forcing a multi-GB + allocation. The fix clamps ``length`` to the bytes actually remaining. + +2. 
ISOBMFF ``strip_c2pa_boxes`` truncated the file from a malformed box to EOF + (the box walker returns early), so ``remove_ai_metadata`` could emit a + shorter file and report success. The fix returns the input unchanged when the + walk does not reach EOF. +""" + +from __future__ import annotations + +import struct +import tracemalloc + +from remove_ai_watermarks import metadata +from remove_ai_watermarks.noai import c2pa, isobmff + +PNG_SIG = b"\x89PNG\r\n\x1a\n" +_HUGE = 0x7FFFFFFF # ~2 GiB declared length on a tiny file + + +def _png_with_huge_c2pa_chunk() -> bytes: + """A ~60-byte 'PNG' whose caBX chunk header lies about its length.""" + header = struct.pack(">I", _HUGE) + c2pa.C2PA_CHUNK_TYPE + body = b"jumbc2pa-not-really" # far shorter than the declared length + return PNG_SIG + header + body + + +class TestPngLengthClampNoAlloc: + """Clamping makes the parsers read only the real bytes, not the lie.""" + + def test_has_c2pa_metadata_is_bounded(self, tmp_path): + path = tmp_path / "evil.png" + path.write_bytes(_png_with_huge_c2pa_chunk()) + + tracemalloc.start() + try: + # Must return quickly without allocating gigabytes and without raising. 
+ c2pa.has_c2pa_metadata(path) + _, peak = tracemalloc.get_traced_memory() + finally: + tracemalloc.stop() + assert peak < 50 * 1024 * 1024 # < 50 MB locks in the clamp + + def test_extract_c2pa_info_is_bounded(self, tmp_path): + path = tmp_path / "evil.png" + path.write_bytes(_png_with_huge_c2pa_chunk()) + + tracemalloc.start() + try: + c2pa.extract_c2pa_info(path) + _, peak = tracemalloc.get_traced_memory() + finally: + tracemalloc.stop() + assert peak < 50 * 1024 * 1024 + + def test_extract_c2pa_chunk_is_bounded(self, tmp_path): + path = tmp_path / "evil.png" + path.write_bytes(_png_with_huge_c2pa_chunk()) + + tracemalloc.start() + try: + c2pa.extract_c2pa_chunk(path) + _, peak = tracemalloc.get_traced_memory() + finally: + tracemalloc.stop() + assert peak < 50 * 1024 * 1024 + + def test_png_late_metadata_scan_is_bounded(self, tmp_path): + # A PNG with a real IDAT pushing the late-scan window past 1 MB, then a + # tEXt chunk lying about its length. scan_head() -> _png_late_metadata(). + idat = b"\x00" * (1024 * 1024 + 16) + text_header = struct.pack(">I", _HUGE) + b"tEXt" + blob = ( + PNG_SIG + + struct.pack(">I", len(idat)) + + b"IDAT" + + idat + + b"\x00\x00\x00\x00" # fake CRC + + text_header + + b"AIGC short" + ) + path = tmp_path / "evil_late.png" + path.write_bytes(blob) + + tracemalloc.start() + try: + metadata.scan_head(path) + _, peak = tracemalloc.get_traced_memory() + finally: + tracemalloc.stop() + # head itself is ~1 MB; the clamp keeps the late read tiny. Generous cap. 
+ assert peak < 50 * 1024 * 1024 + + +def _box(box_type: bytes, payload: bytes) -> bytes: + return struct.pack(">I", 8 + len(payload)) + box_type + payload + + +class TestIsobmffStripFailSafe: + def test_well_formed_file_still_strips_uuid(self): + ftyp = _box(b"ftyp", b"isom\x00\x00\x00\x00mp42") + c2pa_box = _box(b"uuid", isobmff.C2PA_UUID + b"manifest-bytes") + mdat = _box(b"mdat", b"\x00" * 32) + data = ftyp + c2pa_box + mdat + + cleaned, stripped = isobmff.strip_c2pa_boxes(data) + assert stripped == 1 + assert len(cleaned) == len(data) - len(c2pa_box) + assert isobmff.C2PA_UUID not in cleaned + + def test_malformed_box_does_not_truncate_tail(self): + ftyp = _box(b"ftyp", b"isom\x00\x00\x00\x00mp42") + c2pa_box = _box(b"uuid", isobmff.C2PA_UUID + b"manifest-bytes") + # A box claiming ~2 GiB before EOF: the walker returns early here. + bad_box = struct.pack(">I", _HUGE) + b"free" + b"\x00" * 16 + data = ftyp + c2pa_box + bad_box + + cleaned, stripped = isobmff.strip_c2pa_boxes(data) + # Fail-safe: input returned unchanged, nothing stripped, no truncation. + assert stripped == 0 + assert cleaned == data + assert len(cleaned) == len(data) diff --git a/tests/test_tiling.py b/tests/test_tiling.py index 5e14d64..83ee609 100644 --- a/tests/test_tiling.py +++ b/tests/test_tiling.py @@ -43,6 +43,16 @@ class TestTilePositions: # 1024 wide, 512 tile, no overlap -> two tiles butting at 512. assert tile_positions(1024, 512, 0) == [0, 512] + def test_overlap_equal_to_tile_raises(self): + # overlap == tile makes the stride denominator (tile - overlap) zero; + # reject up front instead of dividing by zero. + with pytest.raises(ValueError, match="overlap"): + tile_positions(2000, 512, 512) + + def test_overlap_greater_than_tile_raises(self): + with pytest.raises(ValueError, match="overlap"): + tile_positions(2000, 512, 600) + class TestMakeBlendWeight: def test_zero_overlap_is_all_ones(self):