From 223cbcf1716d891144cb3b60360536266125be14 Mon Sep 17 00:00:00 2001 From: Victor Kuznetsov Date: Thu, 28 May 2026 12:40:17 -0700 Subject: [PATCH] feat(metadata): detect China TC260 AIGC PNG chunk and HuggingFace hf-job-id aigc_label now reads the TC260 label from a raw-JSON `AIGC` PNG tEXt chunk (as Doubao/ByteDance write it, with no namespaced XMP marker) in addition to the `` XMP block, via a shared _parse helper gated on a TC260 field so a generic AIGC key cannot false-positive. New huggingface_job() reads the hf-job-id PNG chunk; identify surfaces it as a medium-confidence hf_job signal (parallel to the visible sparkle, never overriding a hard metadata verdict). Both wired into has_ai_metadata/get_ai_metadata; the PNG save whitelist already strips them on removal. Found by auditing 646 corpus originals: 28 AIGC and 3 hf-job files the library previously reported as Unknown. Co-Authored-By: Claude Opus 4.7 --- CLAUDE.md | 6 +- README.md | 6 +- src/remove_ai_watermarks/identify.py | 30 +++++++- src/remove_ai_watermarks/metadata.py | 100 ++++++++++++++++++++++++--- tests/test_identify.py | 72 +++++++++++++++++++ tests/test_metadata.py | 82 ++++++++++++++++++++++ 6 files changed, 280 insertions(+), 16 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 00421d2..094caed 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -16,6 +16,7 @@ You are a **principal Python engineer** maintaining a CLI tool and library for r - `bash maintain.sh` — uv-outdated, uv-secure, ruff check/fix, ruff format, pyright, pytest -n auto - `maintain.sh` may not finish fully green (pre-existing, not per-change): strict pyright carries debt in `remove_ai_metadata` / `cli.py` (untyped piexif/PIL/click/rich). (`uv-secure` is clean since idna was bumped 3.11 -> 3.16, fixing GHSA-65pc-fj4g-8rjx.) To gate a change, run `uv run ruff check`, `uv run pyright `, `uv run pytest` directly. - Run `uv run` from the repo root — from another cwd it falls back to a bare env without numpy/cv2/torch. 
+- To add a dev tool (pytest/ruff/pyright) into the env, use `uv sync --frozen --extra dev --extra gpu`, **never `uv pip install`** — `uv pip install` re-resolves and rewrites `uv.lock`, which silently bumped `transformers` to a build incompatible with the pinned `diffusers` (`cannot import name 'Qwen3VLForConditionalGeneration'`) and broke every `identify`/metadata import. Recovery: `git checkout uv.lock && uv sync --frozen --extra gpu --extra dev`. The `gpu` extra holds `diffusers`/`transformers`/`torch`, so a bare `uv sync` (no extras) removes them and `noai/__init__` (eager pipeline import) then fails. `maintain.sh`'s `uv sync --all-extras` also pulls the heavy `trustmark`/`lama` wheels (pytorch-lightning, onnxruntime) — fine on a good connection, but on flaky DNS sync only `--extra gpu --extra dev` and run the lint/test steps by hand. - Metadata/C2PA tests assert against real committed fixtures in `data/samples/` (`chatgpt-*.png` = OpenAI C2PA, `firefly-1.png` = Adobe, `mj-*` = Midjourney IPTC, `doubao-1.png` = ByteDance Doubao with the China TC260 `` XMP label **and** a visible "豆包AI生成" text mark bottom-right; `grok-1.jpg` = xAI Grok with its EXIF-only `Signature:` blob + UUID `Artist` and no C2PA/SynthID/IPTC); synthetic byte blobs cover the JPEG/ISOBMFF format paths. The "non-AI / clean photo" control is no longer in `data/samples/` -- the `clean_photo` conftest fixture serves a verified-negative image from the corpus `neg/` set (skips if the corpus is absent). - SynthID reference corpus: `scripts/synthid_corpus.py` ingests labeled images into `data/synthid_corpus/`. The labeled `images/` (`pos/` `neg/` `cleaned/`) are **committed** (public repo -- review every image for private content before adding; `manifest.csv` is kept in sync with the files on disk, one row per tracked image); only the synthetic `refs/` calibration fills are gitignored. See its README for the collection protocol and verification oracles. 
@@ -30,7 +31,7 @@ You are a **principal Python engineer** maintaining a CLI tool and library for r - `noai/c2pa.py` — PNG chunk parser; use `extract_c2pa_chunk(path)` to get raw caBX payload, `has_c2pa_metadata(path)` to detect. Do not reimplement chunk parsing. `extract_c2pa_info(path)` sets `synthid_watermark`/`synthid_vendors` when the manifest is signed by a SynthID-using vendor, and `soft_binding`/`soft_binding_vendors` when a `c2pa.soft-binding` `alg` names a forensic-watermark vendor (`soft_binding_vendors_in(buffer)` is the shared byte-scan, used by both the PNG parser and the non-PNG binary path). - `noai/constants.py` — PNG_SIGNATURE, C2PA_CHUNK_TYPE, C2PA_SIGNATURES, C2PA_ISSUERS, `SYNTHID_C2PA_ISSUERS` (issuers that pair SynthID with C2PA: Google, OpenAI), and `C2PA_SOFT_BINDINGS` (soft-binding `alg` prefix → forensic-watermark vendor: Adobe TrustMark, Digimarc, Imatag, Steg.AI, Microsoft, ...). Add a new issuer/binding here, not inline. - `metadata.py` — `scan_head(path, size=1MB)` is the shared input for every C2PA/AIGC/IPTC byte scan: first `size` bytes plus, for ISOBMFF, the late provenance-box payloads from `isobmff.scan_c2pa_region` (catches a manifest after a large `mdat`); behavior-neutral (`f.read(size)`) for non-ISOBMFF. Use it instead of `open().read(1MB)` for any new marker scan. `synthid_source(path)` returns the vendor name(s) if the C2PA manifest implies a SynthID pixel watermark, else None. Format-agnostic: PNG via the caBX parser, JPEG/WebP/AVIF/HEIF/JXL via a binary scan (C2PA marker + SynthID issuer + AI-source marker). `get_ai_metadata` surfaces the verdict, and `metadata --check` prints it as a callout. Both `get_ai_metadata` and `has_ai_metadata` guard the PIL open with `except Exception` (HEIC/unknown formats raise non-OSError) and fall through to the binary scan. 
`xai_signature(path)` detects xAI/Grok's EXIF-only scheme (`ImageDescription` = `Signature: ` + UUID `Artist`); it feeds `has_ai_metadata`, `get_ai_metadata` (key `xai_signature`), and `identify`. `iptc_ai_system(path)` detects the IPTC Photo Metadata 2025.1 AI-disclosure XMP properties (`IPTC_AI_FIELD_MARKERS` = `AISystemUsed`/`AISystemVersionUsed`/`AIPromptInformation`/`AIPromptWriterName`) and returns the `AISystemUsed` generator name (or `"fields present"`). `remove_ai_metadata` routes **ISOBMFF video** (`.mp4`/`.mov`/`.m4v`) through the same `isobmff.strip_c2pa_boxes` as AVIF/HEIF (MP4 is ISOBMFF), and `_scrub_ai_exif` removes the xAI signature + AI-generator EXIF tags on JPEG output. -- `identify.py` — `identify(path)` aggregates every locally-readable signal (C2PA issuer→platform, C2PA soft-binding forensic-watermark vendor, IPTC "Made with AI" + IPTC 2025.1 `AISystemUsed`, embedded SD/ComfyUI params, SynthID proxy, xAI/Grok EXIF signature via `metadata.xai_signature`, visible Gemini sparkle, open invisible watermark, Adobe TrustMark via `trustmark_detector`) into one `ProvenanceReport`. `is_ai_generated` is True or None (never asserted False — stripped metadata is not proof of clean origin). Visible-sparkle is promoted only at confidence ≥ `_SPARKLE_THRESHOLD` (0.5; corpus-tuned to separate Gemini sparkles ≥0.56 from non-sparkle ≤0.49). The cv2 dependency lives in `gemini_engine.detect_sparkle_confidence`, not here. **C2PA platform attribution is device-token-first, issuer-scan fallback** (`_device_platform` scans manifest bytes for `_DEVICE_C2PA_PLATFORM` tokens, then `_attribute_platform`/`_ISSUER_PLATFORM`). 
**Why, verified on real signed files 2026-05-26:** the old issuer-only byte-scan matched ANY issuer substring anywhere, so multi-entity manifests mis-attributed -- Leica→"Truepic" (a signing authority in the trust chain), Nikon→"Adobe Firefly" (XMP-toolkit "Adobe" + the sample's "Adobe_MAX" name), Pixel→"Google (Gemini)" ("Google LLC" cert org), Truepic→"Google". A distinctive device token wins instead. **Token distinctiveness is load-bearing:** bare `b"Truepic"` mis-fires (it appears in unrelated trust chains -- it mis-attributed the OpenAI `chatgpt-1.png` fixture), so the token is the specific `b"Truepic_Lens"` from the Lens SDK claim generator; likewise `b"Pixel Camera"` (cert CN) not bare `b"Pixel"`. `_DEVICE_C2PA_PLATFORM` lists ONLY tokens **verified against a real C2PA file**: Leica (`lc_c2pa`/`Leica Camera`), Nikon (`NIKON`), Pixel (`Pixel Camera` -- from a real Pixel 10 Pro file attached to c2pa-rs issue #1609/#1554), Sony (`sony.sig`/`sony.cert` -- Sony's own C2PA assertion namespace, verified on a real Sony PXW-Z300 file; NOT bare "Sony" which is a common EXIF Make), Truepic (`Truepic_Lens`). Canon/Samsung/Bria have **no public direct-download C2PA sample** (checked exhaustively: GitHub issue/PR attachments, contentcredentials gallery, HF datasets -- all upload-to-verify or token-gated; Canon's only public file was a self-signed hobbyist CR3, not factory), so they stay unmapped until a real file is captured (same fixture discipline as Grok/Doubao). The Sony sample is video (MP4) -- our ISOBMFF C2PA path detects it; Sony Alpha stills likely share the `sony.*` namespace but are not separately verified. Camera C2PA marks capture authenticity, not AI (Pixel carries `computationalCapture`, not `trainedAlgorithmicMedia`), so these never set `is_ai` -- that stays driven by digital-source-type. `c2pa.cbor_text_after` (now public) is best-effort for the `generator` detail string only and can be None when the manifest keys it `claim_generator_info` (Pixel). 
**Issuer→generator mapping is `is_ai`-gated** (`_attribute_platform(issuers, is_ai=c2pa_is_ai)`): a specific AI-generator platform is named only when the digital-source-type is `trainedAlgorithmicMedia`; on a non-AI source an issuer substring is treated as incidental (an "Adobe XMP" toolkit string in an *unmapped* Canon/Sony capture would otherwise mislabel it "Adobe Firefly"), so it degrades to the neutral "C2PA signer: X" label. Real Firefly/OpenAI/Google output carries the AI source-type, so it is unaffected (verified: chatgpt-1.png→OpenAI, firefly-1.png→Adobe Firefly still attribute). `_attribute_platform` defaults `is_ai=True` so the mapping stays unit-testable in isolation. Add device tokens to `_DEVICE_C2PA_PLATFORM`, generator/issuer platforms to `_ISSUER_PLATFORM`, not inline. For non-PNG containers (JPEG/WebP/AVIF/HEIF/JXL) the caBX parser returns nothing, so issuer (`_issuers_in`) and generator (`_ai_tools_in`, reusing `C2PA_AI_TOOLS`) are recovered by binary-scanning the first MB. EXIF `Software` / `Make` / `Artist` / `ImageDescription` and XMP `CreatorTool` generator tags are read by `metadata.exif_generator` (PIL+piexif for any format PIL opens incl. AVIF, plus a container-agnostic XMP raw-byte scan that also covers HEIF/JXL), matched against `AI_GENERATOR_TOKENS` so ordinary editors (plain "Adobe Photoshop") and real-camera `Make` ("Apple"/"Canon") are not flagged. **Ideogram tags its output with EXIF `Make="Ideogram AI"`** (verified on a real download 2026-05-24) — that's why `Make` is read. **Integrity-clash detection** (`_integrity_clashes`, surfaced as `ProvenanceReport.integrity_clashes`, printed in red by `identify` and serialized to `--json`): contradictions between independent generator stamps are a laundering/spoofing tell. Two rules: (1) two or more distinct AI-origin vendors named by independent signals (e.g. 
C2PA OpenAI + EXIF `Make="Ideogram AI"`), and (2) a camera-capture C2PA device (`_DEVICE_C2PA_PLATFORM`) coexisting with any AI-generation marker. Vendor normalization is `_vendor_of` over `_AI_VENDOR_TOKENS` (so a C2PA "Google (Gemini)" issuer and a SynthID-Google proxy agree, while different vendors clash). **High-precision by design:** only hard generator stamps feed it (C2PA-issuer when source is AI, SynthID, EXIF/XMP generator, IPTC `AISystemUsed`, xAI, AIGC); the fuzzy visible sparkle and the open invisible watermark are **excluded** (the latter can be a by-product of our own SDXL removal pass). The c2pa vendor is classified from the issuer attribution / generator, NOT the resolved `platform` (a camera label like "Google Pixel" would mis-normalize to "Google"). All real single-origin fixtures (chatgpt/firefly/doubao/grok/mj) verified to produce **zero** clashes (false-positive guard in `test_identify.py::TestRealSamplesHaveNoClash`). +- `identify.py` — `identify(path)` aggregates every locally-readable signal (C2PA issuer→platform, C2PA soft-binding forensic-watermark vendor, IPTC "Made with AI" + IPTC 2025.1 `AISystemUsed`, embedded SD/ComfyUI params, SynthID proxy, xAI/Grok EXIF signature via `metadata.xai_signature`, the China TC260 AIGC label via `metadata.aigc_label`, the HuggingFace `hf-job-id` job marker via `metadata.huggingface_job`, visible Gemini sparkle, open invisible watermark, Adobe TrustMark via `trustmark_detector`) into one `ProvenanceReport`. `is_ai_generated` is True or None (never asserted False — stripped metadata is not proof of clean origin). The `hf_job` and visible-sparkle signals are **medium** confidence: each lifts an otherwise-Unknown verdict to a tentative AI (`hf_only` / `visible_only`, parallel branches) but is excluded from the high-confidence `ai_from_metadata` set, so neither overrides a hard metadata signal. 
Visible-sparkle is promoted only at confidence ≥ `_SPARKLE_THRESHOLD` (0.5; corpus-tuned to separate Gemini sparkles ≥0.56 from non-sparkle ≤0.49). The cv2 dependency lives in `gemini_engine.detect_sparkle_confidence`, not here. **C2PA platform attribution is device-token-first, issuer-scan fallback** (`_device_platform` scans manifest bytes for `_DEVICE_C2PA_PLATFORM` tokens, then `_attribute_platform`/`_ISSUER_PLATFORM`). **Why, verified on real signed files 2026-05-26:** the old issuer-only byte-scan matched ANY issuer substring anywhere, so multi-entity manifests mis-attributed -- Leica→"Truepic" (a signing authority in the trust chain), Nikon→"Adobe Firefly" (XMP-toolkit "Adobe" + the sample's "Adobe_MAX" name), Pixel→"Google (Gemini)" ("Google LLC" cert org), Truepic→"Google". A distinctive device token wins instead. **Token distinctiveness is load-bearing:** bare `b"Truepic"` mis-fires (it appears in unrelated trust chains -- it mis-attributed the OpenAI `chatgpt-1.png` fixture), so the token is the specific `b"Truepic_Lens"` from the Lens SDK claim generator; likewise `b"Pixel Camera"` (cert CN) not bare `b"Pixel"`. `_DEVICE_C2PA_PLATFORM` lists ONLY tokens **verified against a real C2PA file**: Leica (`lc_c2pa`/`Leica Camera`), Nikon (`NIKON`), Pixel (`Pixel Camera` -- from a real Pixel 10 Pro file attached to c2pa-rs issue #1609/#1554), Sony (`sony.sig`/`sony.cert` -- Sony's own C2PA assertion namespace, verified on a real Sony PXW-Z300 file; NOT bare "Sony" which is a common EXIF Make), Truepic (`Truepic_Lens`). Canon/Samsung/Bria have **no public direct-download C2PA sample** (checked exhaustively: GitHub issue/PR attachments, contentcredentials gallery, HF datasets -- all upload-to-verify or token-gated; Canon's only public file was a self-signed hobbyist CR3, not factory), so they stay unmapped until a real file is captured (same fixture discipline as Grok/Doubao). 
The Sony sample is video (MP4) -- our ISOBMFF C2PA path detects it; Sony Alpha stills likely share the `sony.*` namespace but are not separately verified. Camera C2PA marks capture authenticity, not AI (Pixel carries `computationalCapture`, not `trainedAlgorithmicMedia`), so these never set `is_ai` -- that stays driven by digital-source-type. `c2pa.cbor_text_after` (now public) is best-effort for the `generator` detail string only and can be None when the manifest keys it `claim_generator_info` (Pixel). **Issuer→generator mapping is `is_ai`-gated** (`_attribute_platform(issuers, is_ai=c2pa_is_ai)`): a specific AI-generator platform is named only when the digital-source-type is `trainedAlgorithmicMedia`; on a non-AI source an issuer substring is treated as incidental (an "Adobe XMP" toolkit string in an *unmapped* Canon/Sony capture would otherwise mislabel it "Adobe Firefly"), so it degrades to the neutral "C2PA signer: X" label. Real Firefly/OpenAI/Google output carries the AI source-type, so it is unaffected (verified: chatgpt-1.png→OpenAI, firefly-1.png→Adobe Firefly still attribute). `_attribute_platform` defaults `is_ai=True` so the mapping stays unit-testable in isolation. Add device tokens to `_DEVICE_C2PA_PLATFORM`, generator/issuer platforms to `_ISSUER_PLATFORM`, not inline. For non-PNG containers (JPEG/WebP/AVIF/HEIF/JXL) the caBX parser returns nothing, so issuer (`_issuers_in`) and generator (`_ai_tools_in`, reusing `C2PA_AI_TOOLS`) are recovered by binary-scanning the first MB. EXIF `Software` / `Make` / `Artist` / `ImageDescription` and XMP `CreatorTool` generator tags are read by `metadata.exif_generator` (PIL+piexif for any format PIL opens incl. AVIF, plus a container-agnostic XMP raw-byte scan that also covers HEIF/JXL), matched against `AI_GENERATOR_TOKENS` so ordinary editors (plain "Adobe Photoshop") and real-camera `Make` ("Apple"/"Canon") are not flagged. 
**Ideogram tags its output with EXIF `Make="Ideogram AI"`** (verified on a real download 2026-05-24) — that's why `Make` is read. **Integrity-clash detection** (`_integrity_clashes`, surfaced as `ProvenanceReport.integrity_clashes`, printed in red by `identify` and serialized to `--json`): contradictions between independent generator stamps are a laundering/spoofing tell. Two rules: (1) two or more distinct AI-origin vendors named by independent signals (e.g. C2PA OpenAI + EXIF `Make="Ideogram AI"`), and (2) a camera-capture C2PA device (`_DEVICE_C2PA_PLATFORM`) coexisting with any AI-generation marker. Vendor normalization is `_vendor_of` over `_AI_VENDOR_TOKENS` (so a C2PA "Google (Gemini)" issuer and a SynthID-Google proxy agree, while different vendors clash). **High-precision by design:** only hard generator stamps feed it (C2PA-issuer when source is AI, SynthID, EXIF/XMP generator, IPTC `AISystemUsed`, xAI, AIGC); the fuzzy visible sparkle and the open invisible watermark are **excluded** (the latter can be a by-product of our own SDXL removal pass). The c2pa vendor is classified from the issuer attribution / generator, NOT the resolved `platform` (a camera label like "Google Pixel" would mis-normalize to "Google"). All real single-origin fixtures (chatgpt/firefly/doubao/grok/mj) verified to produce **zero** clashes (false-positive guard in `test_identify.py::TestRealSamplesHaveNoClash`). - `gemini_engine.py` — visible Gemini-sparkle remover/detector (cv2/numpy, no GPU). `detect_sparkle_confidence(path)` is the file-level entry point used by `identify.py`. - `doubao_engine.py` — visible Doubao "豆包AI生成" remover/detector (cv2/numpy, no GPU). 
`DoubaoEngine.locate` anchors a bottom-right box by **geometry** (mark scales with image WIDTH, fractions in module constants; no bundled template), `extract_mask` pulls the light low-saturation glyphs with a **polarity-aware white top-hat** (brighter-than-blurred-local-bg, so white-paper documents are left untouched instead of smeared), `detect` thresholds glyph coverage (`DETECT_MIN_COVERAGE` 0.16 separates real marks ≥0.20 from corner noise, which stays ≤0.06 on large images but can spike to ~0.15 on tiny ones), `remove_watermark` inpaints (cv2 Telea/NS) and **bails when coverage > `MAX_INPAINT_COVERAGE` 0.50** (dense-text background → would smear). Wired into `visible --mark` via `cli._run_doubao_if_selected`. **Logo is near-white (~253), not the gray some third-party tools assume.** Best on photo/illustration backgrounds; high-contrast edges leave faint residue (cv2-inpaint limit). Clean per-pixel reverse-alpha (Gemini-style) is the future upgrade but needs a captured/distilled alpha map — see below. - `region_eraser.py` — universal region eraser (`erase` CLI). `erase(image, boxes=|mask=, backend=)`: `boxes_to_mask` → `cv2.inpaint` (`cv2` backend, default, no deps) or big-LaMa via onnxruntime (`lama` backend, extra `lama`, `Carve/LaMa-ONNX` Apache-2.0 model downloaded on first use, never bundled). `erase_lama` crops a padded region around the mask, runs LaMa at its fixed 512² input, pastes only masked pixels back (untouched areas stay pixel-exact). Lazy `_get_lama_session` singleton; `lama_available()` guards the optional import. **LaMa-ONNX costs ~3.5-4 GB peak RAM and ~5-6 s/call on CPU** (FFC working set, not arena — `enable_cpu_mem_arena=False` does not help), so it does NOT fit a minimal droplet; the cv2 backend (tens of MB, ~30 ms) does. LaMa quality at low RAM = serverless/GPU, mirroring how raiw.cc offloads SDXL to fal. 
@@ -51,7 +52,8 @@ Who embeds what, and whether it is locally detectable (so we know which gaps are - **C2PA / IPTC (covered by the issuer/marker scan):** OpenAI, Google, Adobe Firefly, Microsoft (Designer + **Bing Image Creator** — collected 2026-05-24; Bing now runs Microsoft's own **MAI-Image** model, signs C2PA as "Microsoft", NOT OpenAI/DALL-E), and **Stability AI** (collected from Brand Studio / DreamStudio successor; signs C2PA as "Stability AI Ltd", no SynthID, no imwatermark on its current Stable Image model — issuer added to `C2PA_ISSUERS`). Still unsampled: Canva (its downloads are re-encoded design *exports* that strip C2PA, so a Canva "positive" is inconclusive — skipped), Getty, Shutterstock. Midjourney embeds NO C2PA and no invisible watermark (our `mj-*` sample carried only the IPTC tag). - **EXIF/XMP generator tag (caught by `exif_generator`):** **Ideogram** writes EXIF `Make="Ideogram AI"` (collected 2026-05-24 — no C2PA, no SynthID, no imwatermark; the Make tag is the only signal). - **xAI / Grok — its own EXIF signature scheme, NOT C2PA (DETECTED by `metadata.xai_signature`, built 2026-05-26).** Grok JPEG downloads (Aurora model) carry **no C2PA, no XMP, no SynthID, no IPTC** — only EXIF `Artist` = a UUID and EXIF `ImageDescription` = `Signature: ` (a crypto signature, unverifiable locally without xAI's public key). This empirically kills the earlier unverified "xAI signs C2PA as xAI" lead — xAI is not even a C2PA member. `exif_generator` misses it (neither field holds an `AI_GENERATOR_TOKENS` token), so a dedicated detector `xai_signature(path)` matches the pair (`ImageDescription ~ ^Signature: [A-Za-z0-9+/=]{64,}` AND UUID `Artist`); wired into `has_ai_metadata`, `get_ai_metadata` (key `xai_signature`), and `identify` (signal `xai_signature`, platform "xAI (Grok / Aurora)"). 
**Format confirmed stable across n=3 genuine generations:** exactly three EXIF tags (`Artist`, `ExifOffset`, `ImageDescription`), `Signature:` prefix constant, base64 payload 300-1004 chars. Two capture facts: (a) the `Artist` UUID **equals the public image id** in the asset URL (`https://imagine-public.x.ai/imagine-public/images/.jpg`), so it is NOT a private per-user secret — only the `Signature` blob is; (b) the Grok web-UI image is a re-encoded **WebP with no signature** — the EXIF survives only in the *original* JPEG (download button or that public tokenless URL), which is why screenshots / re-encodes are metadata-stripped. A real fixture `data/samples/grok-1.jpg` plus **synthetic** JPEG fixtures (fake UUID + fake `Signature:` blob) cover the detector; never add a real Grok image carrying private content (the repo is public). **Stripped on removal too:** `remove_ai_metadata` now calls `_scrub_ai_exif` on the JPEG EXIF, which deletes the xAI Signature+UUID-Artist pair **and** any `Software`/`Make`/`Artist`/`ImageDescription` tag holding an `AI_GENERATOR_TOKENS` token (so Ideogram's `Make="Ideogram AI"` is scrubbed too), while keeping genuine camera/editor EXIF. The shared `_is_xai_signature_pair` helper (module-level compiled regexes) is the single source of truth for the pattern, used by both `xai_signature` and `_scrub_ai_exif`. (AVIF/HEIF/JXL still strip only C2PA boxes via `isobmff`, not EXIF — unchanged.) -- **China TC260 AIGC label (caught by `AIGC_MARKERS` / `metadata.aigc_label`, surfaced by `identify` as the `aigc` signal):** China-served generators embed an XMP `{"Label":"1","ContentProducer":...}` block — China's mandatory AI-content labeling (TC260 namespace `tc260.org.cn/ns/AIGC`). **Doubao** (ByteDance) uses it (verified on the real #13 sample 2026-05-25; `ContentProducer` `001191110102MACQD9K64010000`, no C2PA/SynthID/imwatermark — the XMP block is the only signal; GitHub attachment upload did NOT strip it). 
The same standard is mandatory for Jimeng/Kling/Qwen/Ernie etc., so the one marker covers the whole China-AIGC-labeled ecosystem. `aigc_label` json-decodes the (HTML-entity-encoded) block; container-agnostic raw-byte scan. +- **China TC260 AIGC label (caught by `AIGC_MARKERS` / `metadata.aigc_label`, surfaced by `identify` as the `aigc` signal):** China-served generators embed an XMP `{"Label":"1","ContentProducer":...}` block — China's mandatory AI-content labeling (TC260 namespace `tc260.org.cn/ns/AIGC`). **Doubao** (ByteDance) uses it (verified on the real #13 sample 2026-05-25; `ContentProducer` `001191110102MACQD9K64010000`, no C2PA/SynthID/imwatermark — the XMP block is the only signal; GitHub attachment upload did NOT strip it). The same standard is mandatory for Jimeng/Kling/Qwen/Ernie etc., so the one marker covers the whole China-AIGC-labeled ecosystem. `aigc_label` reads **two serializations** through a shared `_parse` helper: the HTML-entity-encoded XMP `<TC260:AIGC>` block (container-agnostic raw-byte scan, any JSON object accepted) **and** a raw-JSON PNG `AIGC` tEXt chunk — Doubao also writes the label this way, with no namespaced marker at all (confirmed on the corpus 2026-05-28, `ContentProducer="doubao"`). The PNG-chunk path is gated on at least one TC260 field (`_TC260_FIELDS`) so a generic `AIGC` key cannot false-positive. In `identify`, `aigc` fires on the parsed label **or** the `AIGC_MARKERS` byte scan (the latter preserves the laundering-tell case where the JSON payload is truncated). +- **HuggingFace-hosted job (caught by `metadata.huggingface_job`, surfaced by `identify` as the `hf_job` signal, MEDIUM confidence):** HuggingFace Jobs / Spaces stamp generated PNGs with an `hf-job-id` tEXt chunk holding the job UUID (3 on the corpus 2026-05-28, no other signal). 
It marks the *hosting job*, not a model — most commonly diffusion output — so it lifts an Unknown verdict to a tentative AI via `hf_only` (parallel to the visible sparkle) but never overrides a hard metadata signal; `_HF_JOB_CAVEAT` states the limit (job, not model; not proof of AI pixels). Stripped on removal (the PNG save whitelist keeps only `STANDARD_METADATA_KEYS`, so `hf-job-id` and the `AIGC` chunk are both dropped). The exact writer is not authoritatively documented (HF Jobs are generic GPU jobs), hence medium not high. - **No detectable signal on download (correctly reported `unknown`):** **Recraft** (PNG export is a re-encoded design export — strips everything), **Krea hosting FLUX 2** (no imwatermark despite FLUX — the host omits the encoder, same as Stability's hosted SDXL), and Midjourney (embeds nothing). Lesson: the imwatermark detector only fires on *pristine* output from a pipeline that runs the encoder (diffusers default, official BFL), not from re-hosts (Krea/Stability) or re-encoded exports (Recraft/Canva). - **Invisible but NOT locally detectable (proprietary, API/oracle only — same wall as SynthID):** Amazon Titan Image Generator + Nova Canvas (Bedrock `DetectGeneratedContent` API), Kakao (new SynthID image adopter, May 2026), NVIDIA Cosmos (SynthID video). No local detector possible; treat like SynthID. - **C2PA 2.4 "Durable Content Credentials" (April 2026; verified against the spec) raise the bar for metadata stripping.** 2.4 defines soft bindings (an invisible watermark or a content fingerprint) plus a server-side manifest repository and a new `c2pa.repository-receipt` assertion. Per the spec: "if a C2PA manifest is removed from an asset, but a copy of that manifest remains in a provenance store elsewhere, the manifest and asset may be matched using available soft bindings." 
So our local `metadata --remove` deletes the *embedded* manifest, but a fingerprint/watermark soft binding can still re-link the image to its manifest in a repository server-side. Stripping the file is becoming necessary-but-not-sufficient against durable provenance. (Our parsers target the stable embedded-manifest format documented in C2PA 2.1 §11; that format is unchanged in 2.4 -- the new pieces are repository/soft-binding infra, not the on-file box layout, so no parser change is implied.) Spec: https://spec.c2pa.org/specifications/specifications/2.4/specs/C2PA_Specification.html We now READ the soft-binding `alg` (`C2PA_SOFT_BINDINGS` / `soft_binding_vendors_in`) to name the forensic-watermark vendor, and locally DECODE the one open scheme, Adobe TrustMark (`trustmark_detector`); the rest (Digimarc/Imatag/Steg.AI/...) stay name-only (proprietary decoders). diff --git a/README.md b/README.md index 93fcb3f..56fff98 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ If this tool saves you time, consider [sponsoring its development](https://githu - **Smart Face Protection** — automatic extraction and blending of human faces to prevent AI distortion - **Batch processing** — process entire directories - **Detection** — three-stage NCC watermark detection with confidence scoring -- **Provenance detection (`identify`)** — aggregate C2PA issuer, the C2PA soft-binding forensic-watermark vendor (Adobe TrustMark, Digimarc, Imatag, ...), IPTC "Made with AI" plus the IPTC 2025.1 `AISystemUsed` field, embedded SD/ComfyUI params, EXIF/XMP generator tags, the xAI/Grok EXIF signature, the SynthID metadata proxy, the visible sparkle, the open SD/SDXL/FLUX invisible watermark, and (with the `trustmark` extra) the open Adobe TrustMark watermark into one origin-platform + watermark-inventory verdict (`--json` for machine output) +- **Provenance detection (`identify`)** — aggregate C2PA issuer, the C2PA soft-binding forensic-watermark vendor (Adobe TrustMark, Digimarc, Imatag, 
...), IPTC "Made with AI" plus the IPTC 2025.1 `AISystemUsed` field, embedded SD/ComfyUI params, EXIF/XMP generator tags, the xAI/Grok EXIF signature, the China TC260 AIGC label (XMP or PNG chunk), the HuggingFace `hf-job-id` job marker, the SynthID metadata proxy, the visible sparkle, the open SD/SDXL/FLUX invisible watermark, and (with the `trustmark` extra) the open Adobe TrustMark watermark into one origin-platform + watermark-inventory verdict (`--json` for machine output) ## Examples @@ -48,13 +48,13 @@ If this tool saves you time, consider [sponsoring its development](https://githu | **xAI Grok (Aurora)** | — | — | ✅ EXIF signature scheme (no C2PA): `Signature:` blob + UUID `Artist` | Detected (`identify`); metadata strip | | **Midjourney** | — | — | ✅ EXIF + XMP (prompt, model, seed) | Metadata strip | | **Meta AI** | — | — | ✅ IPTC "Made with AI" (digitalSourceType) | Metadata strip (removes the label) | -| **Doubao** (ByteDance) / China AIGC generators | ✅ "豆包AI生成" text strip (bottom-right) | — | ✅ TC260 `<TC260:AIGC>` XMP label (China's mandatory AI labeling) | Locate + mask + inpaint (cv2, CPU) + metadata strip | +| **Doubao** (ByteDance) / China AIGC generators | ✅ "豆包AI生成" text strip (bottom-right) | — | ✅ TC260 AIGC label — `<TC260:AIGC>` XMP **or** `AIGC` PNG chunk (China's mandatory AI labeling) | Locate + mask + inpaint (cv2, CPU) + metadata strip | | **StableSignature** (Meta) | — | ✅ In-model watermark | — | Diffusion regeneration | | **TreeRing** | — | ✅ Latent space watermark | — | Diffusion regeneration | > Visible overlays are used by Google Gemini / Nano Banana (sparkle logo) and by Doubao / China AIGC generators (the mandated "...AI生成" corner text). Both are removed deterministically on CPU. Other services rely on invisible watermarks and/or metadata; our diffusion-based regeneration works against any invisible watermark in pixel or frequency domain. For a visible mark from any other source (any position, any colour), use the universal `erase --region` command. 
-> **Detection:** `remove-ai-watermarks identify ` reports the origin platform and watermark inventory for all the signals above — C2PA issuer, the C2PA soft-binding forensic-watermark vendor (TrustMark / Digimarc / Imatag / ...), IPTC "Made with AI" plus the IPTC 2025.1 `AISystemUsed` field, the China TC260 AIGC label, embedded generation params, EXIF/XMP generator tags, the xAI/Grok EXIF signature, the SynthID metadata proxy, the visible sparkle, and (with the `[detect]` / `[trustmark]` extras) the open SD/SDXL/FLUX and Adobe TrustMark invisible watermarks. SynthID and the proprietary soft-binding watermarks (Digimarc etc.) have no local decoder, so they are reported by metadata proxy / vendor name only. +> **Detection:** `remove-ai-watermarks identify ` reports the origin platform and watermark inventory for all the signals above — C2PA issuer, the C2PA soft-binding forensic-watermark vendor (TrustMark / Digimarc / Imatag / ...), IPTC "Made with AI" plus the IPTC 2025.1 `AISystemUsed` field, the China TC260 AIGC label (XMP or PNG chunk), the HuggingFace `hf-job-id` job marker, embedded generation params, EXIF/XMP generator tags, the xAI/Grok EXIF signature, the SynthID metadata proxy, the visible sparkle, and (with the `[detect]` / `[trustmark]` extras) the open SD/SDXL/FLUX and Adobe TrustMark invisible watermarks. SynthID and the proprietary soft-binding watermarks (Digimarc etc.) have no local decoder, so they are reported by metadata proxy / vendor name only. 
## How it works diff --git a/src/remove_ai_watermarks/identify.py b/src/remove_ai_watermarks/identify.py index 2ec6291..51cc055 100644 --- a/src/remove_ai_watermarks/identify.py +++ b/src/remove_ai_watermarks/identify.py @@ -31,6 +31,7 @@ from remove_ai_watermarks.metadata import ( aigc_label, exif_generator, get_ai_metadata, + huggingface_job, iptc_ai_system, scan_head, xai_signature, @@ -89,6 +90,11 @@ _INVISIBLE_WM_CAVEAT = ( "The open invisible watermark is fragile: it does not survive JPEG re-encoding " "or resizing, so it confirms origin only on a pristine (un-re-encoded) file." ) +_HF_JOB_CAVEAT = ( + "The hf-job-id tag marks a HuggingFace-hosted job (commonly diffusion " + "generation) but names neither the model nor the content type, so it is a " + "medium-confidence signal, not proof the pixels are AI-generated." +) @dataclass @@ -423,9 +429,14 @@ def identify(image_path: Path, *, check_visible: bool = True, check_invisible: b ai_vendor_claims["iptc_ai_system"] = v # ── China TC260 AIGC label (Doubao and other China-served gens) ── - aigc = any(m in head for m in AIGC_MARKERS) + # Fire on either the namespaced byte marker (``TC260:AIGC`` / the TC260 ns + # URL, present in XMP and as a laundering tell even when the JSON payload is + # truncated) OR the parsed label, which additionally catches the raw-JSON + # PNG ``AIGC`` tEXt chunk that carries no namespaced marker at all. 
+ aigc_data = aigc_label(image_path) + aigc = aigc_data is not None or any(m in head for m in AIGC_MARKERS) if aigc: - producer = (aigc_label(image_path) or {}).get("ContentProducer", "") + producer = (aigc_data or {}).get("ContentProducer", "") signals.append(Signal("aigc", f"TC260 AIGC label{f' (producer {producer})' if producer else ''}", "high")) watermarks.append("China AIGC label (TC260 standard)") if platform is None: @@ -461,6 +472,18 @@ def identify(image_path: Path, *, check_visible: bool = True, check_invisible: b platform = "xAI (Grok / Aurora)" ai_vendor_claims["xai"] = "xAI" + # ── HuggingFace-hosted job marker (hf-job-id PNG text chunk) ───── + # Marks the hosting job, not a model -- medium confidence (commonly diffusion + # output). Like the visible sparkle, it lifts an otherwise-Unknown verdict to + # a tentative AI, but never overrides a high-confidence metadata signal. + hf_job = huggingface_job(image_path) + if hf_job: + signals.append(Signal("hf_job", f"HuggingFace job {hf_job}", "medium")) + watermarks.append("HuggingFace-hosted job (hf-job-id)") + caveats.append(_HF_JOB_CAVEAT) + if platform is None: + platform = "HuggingFace-hosted job (model not identified)" + # ── Open invisible watermark (SD / SDXL / FLUX, dwtDct) ────────── # Public decoder, no key -- a definitive embedded signal on pristine files. 
if check_invisible and (scheme := _invisible_watermark(image_path)) is not None: @@ -503,11 +526,12 @@ def identify(image_path: Path, *, check_visible: bool = True, check_invisible: b platform = "Google Gemini family (visible sparkle detected)" visible_only = any(s.name == "visible_sparkle" for s in signals) and not ai_from_metadata + hf_only = bool(hf_job) and not ai_from_metadata if ai_from_metadata: is_ai: bool | None = True confidence = "high" - elif visible_only: + elif visible_only or hf_only: is_ai = True confidence = "medium" else: diff --git a/src/remove_ai_watermarks/metadata.py b/src/remove_ai_watermarks/metadata.py index d2f44d9..7363730 100644 --- a/src/remove_ai_watermarks/metadata.py +++ b/src/remove_ai_watermarks/metadata.py @@ -108,6 +108,27 @@ AIGC_MARKERS: tuple[bytes, ...] = ( b"TC260:AIGC", ) +# TC260 AIGC-label JSON fields (the standard's labeling object). Doubao writes +# the same object as a PNG ``tEXt`` chunk keyed ``AIGC`` (raw JSON, not XMP), so +# a JSON object carrying at least one of these is accepted as a valid TC260 +# label even when the namespaced XMP element is absent. +_TC260_FIELDS: frozenset[str] = frozenset( + { + "Label", + "ContentProducer", + "ProduceID", + "ContentPropagator", + "PropagateID", + "ReservedCode1", + "ReservedCode2", + } +) + +# HuggingFace-hosted GPU jobs (Jobs / Spaces) stamp generated PNGs with this +# ``tEXt`` chunk key holding the job UUID. It marks the hosting job, not a +# specific model -- a medium-confidence AI signal (commonly diffusion output). +_HF_JOB_KEY: str = "hf-job-id" + STANDARD_METADATA_KEYS: frozenset[str] = frozenset( [ "Author", @@ -202,31 +223,90 @@ def has_ai_metadata(image_path: Path) -> bool: # IPTC 2025.1 AI-disclosure XMP properties (their presence flags AI content). 
if any(marker in data for marker in IPTC_AI_FIELD_MARKERS): return True + # China TC260 AIGC label as a PNG text chunk (the byte scan above catches + # only the XMP form; the raw-JSON tEXt chunk needs the PIL-based parse). + if aigc_label(image_path): + return True + # HuggingFace-hosted job marker (hf-job-id PNG text chunk). + if huggingface_job(image_path): + return True # xAI / Grok: no C2PA/IPTC/XMP -- only the EXIF Signature + UUID-Artist pair. return xai_signature(image_path) def aigc_label(image_path: Path) -> dict[str, str] | None: - """Parse a China TC260 ```` AI-labeling block, if present. + """Parse a China TC260 AI-labeling block, if present. + + Two serializations are recognized: + + - a PNG ``tEXt``/``iTXt`` chunk keyed ``AIGC`` carrying the raw JSON object + (as written by Doubao / ByteDance), read via PIL; and + - an XMP ``{...}`` block (HTML-entity encoded text), + found by a container-agnostic raw-byte scan (PNG/JPEG/WebP alike). Returns the decoded JSON (e.g. ``{"Label": "1", "ContentProducer": ...}``) - or None. The block is XMP text (HTML-entity encoded), so it is found by a - container-agnostic raw-byte scan and works for PNG/JPEG/WebP alike. + or None. The PNG-chunk key ``AIGC`` is generic, so a JSON object there is + accepted only if it carries at least one known TC260 field (``_TC260_FIELDS``); + the namespaced XMP element is unambiguous, so any JSON object is accepted. """ import html import json - import re + from typing import cast + def _parse(text: str, *, require_tc260_field: bool) -> dict[str, str] | None: + try: + parsed = json.loads(text) + except ValueError: + return None + if not isinstance(parsed, dict): + return None + fields = {str(k): str(v) for k, v in cast("dict[object, object]", parsed).items()} + if require_tc260_field and not (_TC260_FIELDS & fields.keys()): + return None + return fields + + # PNG tEXt chunk keyed "AIGC" with raw JSON (Doubao and other China gens). 
+ # The key is generic, so require a TC260 field to avoid a false positive. + try: + from PIL import Image + + with Image.open(image_path) as img: + value = img.info.get("AIGC") + except Exception as exc: + logger.debug("PIL could not open %s for AIGC chunk scan: %s", image_path, exc) + value = None + if isinstance(value, str) and (result := _parse(value, require_tc260_field=True)): + return result + + # XMP {...} block (namespaced element, unambiguous). data = scan_head(image_path) match = re.search(rb"(.*?)", data, re.DOTALL) if not match: return None - raw = html.unescape(match.group(1).decode("utf-8", "replace")) + return _parse(html.unescape(match.group(1).decode("utf-8", "replace")), require_tc260_field=False) + + +def huggingface_job(image_path: Path) -> str | None: + """Return the HuggingFace job id if the image carries an ``hf-job-id`` PNG + text chunk, else None. + + HuggingFace-hosted GPU jobs (Jobs / Spaces) stamp generated PNGs with an + ``hf-job-id`` ``tEXt`` chunk holding the job's UUID. It identifies the + *hosting job*, not a specific model, and is most commonly seen on diffusion- + generation output -- a medium-confidence AI signal, not proof of AI pixels + on its own. + """ try: - parsed = json.loads(raw) - except ValueError: + from PIL import Image + + with Image.open(image_path) as img: + value = img.info.get(_HF_JOB_KEY) + except Exception as exc: + logger.debug("PIL could not open %s for hf-job-id scan: %s", image_path, exc) return None - return {str(k): str(v) for k, v in parsed.items()} if isinstance(parsed, dict) else None + if isinstance(value, str) and value.strip(): + return value.strip() + return None def iptc_ai_system(image_path: Path) -> str | None: @@ -500,6 +580,10 @@ def get_ai_metadata(image_path: Path) -> dict[str, str]: # IPTC 2025.1 AI-disclosure XMP fields (Iptc4xmpExt:AISystemUsed etc.). 
if system := iptc_ai_system(image_path): result.setdefault("ai_system", f"IPTC 2025.1 AI disclosure ({system})") + + # HuggingFace-hosted job marker (hf-job-id PNG text chunk). + if job := huggingface_job(image_path): + result.setdefault("huggingface_job", f"HuggingFace-hosted job ({job})") return result diff --git a/tests/test_identify.py b/tests/test_identify.py index fb9104c..019e761 100644 --- a/tests/test_identify.py +++ b/tests/test_identify.py @@ -201,6 +201,78 @@ class TestIdentifyLocalParams: assert r.signals == [] +# ── China TC260 AIGC label as a PNG text chunk (Doubao) ───────────── + + +class TestIdentifyAigcPngChunk: + """The raw-JSON ``AIGC`` PNG chunk (no namespaced XMP marker) is a high- + confidence AI verdict, same as the XMP form.""" + + def _aigc_chunk_png(self, tmp_path: Path) -> Path: + from PIL import Image + from PIL.PngImagePlugin import PngInfo + + p = tmp_path / "doubao_chunk.png" + pnginfo = PngInfo() + pnginfo.add_text("AIGC", json.dumps({"Label": "1", "ContentProducer": "doubao"})) + Image.new("RGB", (32, 32)).save(p, pnginfo=pnginfo) + return p + + def test_png_chunk_detected_high(self, tmp_path: Path): + r = identify(self._aigc_chunk_png(tmp_path), check_visible=False) + assert r.is_ai_generated is True + assert r.confidence == "high" + assert r.platform is not None + assert "AIGC" in r.platform + signal = next(s for s in r.signals if s.name == "aigc") + assert "doubao" in signal.detail + + +# ── HuggingFace-hosted job marker (medium confidence) ─────────────── + + +class TestIdentifyHuggingFaceJob: + """The hf-job-id chunk lifts an otherwise-Unknown verdict to a tentative + (medium) AI, never overriding a high-confidence metadata signal.""" + + def _hf_png(self, tmp_path: Path) -> Path: + from PIL import Image + from PIL.PngImagePlugin import PngInfo + + p = tmp_path / "hfjob.png" + pnginfo = PngInfo() + pnginfo.add_text("hf-job-id", "ec8380a6-2091-423a-b835-209420f99ee1") + Image.new("RGB", (32, 32)).save(p, pnginfo=pnginfo) + 
return p + + def test_hf_job_promotes_to_medium(self, tmp_path: Path): + r = identify(self._hf_png(tmp_path), check_visible=False) + assert r.is_ai_generated is True + assert r.confidence == "medium" + assert r.platform is not None + assert "HuggingFace" in r.platform + signal = next(s for s in r.signals if s.name == "hf_job") + assert signal.confidence == "medium" + + def test_hf_job_caveat_present(self, tmp_path: Path): + r = identify(self._hf_png(tmp_path), check_visible=False) + assert any("hf-job-id" in c for c in r.caveats) + + def test_metadata_keeps_high_even_with_hf_job(self, tmp_png_with_ai_metadata: Path): + # A high-confidence metadata verdict is not downgraded by an hf-job hit. + from PIL import Image + from PIL.PngImagePlugin import PngInfo + + img = Image.open(tmp_png_with_ai_metadata) + pnginfo = PngInfo() + for k, v in img.text.items(): + pnginfo.add_text(k, v) + pnginfo.add_text("hf-job-id", "ec8380a6-2091-423a-b835-209420f99ee1") + img.save(tmp_png_with_ai_metadata, pnginfo=pnginfo) + r = identify(tmp_png_with_ai_metadata, check_visible=False) + assert r.confidence == "high" + + # ── Visible-sparkle fallback (mocked detector) ────────────────────── diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 245c515..b93fe0a 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -554,6 +554,88 @@ class TestAIGCLabel: assert "aigc_label" in meta assert "TC260" in meta["aigc_label"] + def _aigc_chunk_png(self, tmp_path: Path, producer: str = "doubao") -> Path: + """Doubao writes the TC260 object as a PNG ``tEXt`` chunk keyed ``AIGC`` + with raw JSON (no XMP, no namespaced marker).""" + import json + + p = tmp_path / "doubao_chunk.png" + pnginfo = PngInfo() + pnginfo.add_text( + "AIGC", + json.dumps({"Label": "1", "ContentProducer": producer, "ProduceID": "abc123"}), + ) + Image.new("RGB", (32, 32)).save(p, pnginfo=pnginfo) + return p + + def test_parses_png_text_chunk_form(self, tmp_path: Path): + from 
remove_ai_watermarks.metadata import aigc_label + + info = aigc_label(self._aigc_chunk_png(tmp_path)) + assert info is not None + assert info["Label"] == "1" + assert info["ContentProducer"] == "doubao" + + def test_png_chunk_without_tc260_field_ignored(self, tmp_path: Path): + """A generic ``AIGC`` chunk with no TC260 field must not false-positive.""" + import json + + from remove_ai_watermarks.metadata import aigc_label + + p = tmp_path / "unrelated.png" + pnginfo = PngInfo() + pnginfo.add_text("AIGC", json.dumps({"unrelated": "value"})) + Image.new("RGB", (32, 32)).save(p, pnginfo=pnginfo) + assert aigc_label(p) is None + + def test_has_ai_metadata_detects_png_chunk_form(self, tmp_path: Path): + assert has_ai_metadata(self._aigc_chunk_png(tmp_path)) + + def test_remove_strips_png_chunk_form(self, tmp_path: Path): + from remove_ai_watermarks.metadata import aigc_label, remove_ai_metadata + + out = tmp_path / "clean.png" + remove_ai_metadata(self._aigc_chunk_png(tmp_path), out) + assert aigc_label(out) is None + assert not has_ai_metadata(out) + + +class TestHuggingFaceJob: + """HuggingFace-hosted job marker (``hf-job-id`` PNG text chunk).""" + + def _hf_png(self, tmp_path: Path, job_id: str = "ec8380a6-2091-423a-b835-209420f99ee1") -> Path: + p = tmp_path / "hfjob.png" + pnginfo = PngInfo() + pnginfo.add_text("hf-job-id", job_id) + Image.new("RGB", (32, 32)).save(p, pnginfo=pnginfo) + return p + + def test_returns_job_id(self, tmp_path: Path): + from remove_ai_watermarks.metadata import huggingface_job + + assert huggingface_job(self._hf_png(tmp_path)) == "ec8380a6-2091-423a-b835-209420f99ee1" + + def test_none_when_absent(self, tmp_clean_png): + from remove_ai_watermarks.metadata import huggingface_job + + assert huggingface_job(tmp_clean_png) is None + + def test_has_ai_metadata_detects_hf_job(self, tmp_path: Path): + assert has_ai_metadata(self._hf_png(tmp_path)) + + def test_get_ai_metadata_surfaces_hf_job(self, tmp_path: Path): + meta = 
get_ai_metadata(self._hf_png(tmp_path)) + assert "huggingface_job" in meta + assert "ec8380a6" in meta["huggingface_job"] + + def test_remove_strips_hf_job(self, tmp_path: Path): + from remove_ai_watermarks.metadata import huggingface_job, remove_ai_metadata + + out = tmp_path / "clean.png" + remove_ai_metadata(self._hf_png(tmp_path), out) + assert huggingface_job(out) is None + assert not has_ai_metadata(out) + @pytest.mark.skipif(not (SAMPLES_DIR / "doubao-1.png").exists(), reason="doubao sample not present") class TestAIGCRealSample: