From c7f0d71f906bae5ab952cdf248bb1ab81b44ff3a Mon Sep 17 00:00:00 2001 From: test-user Date: Mon, 25 May 2026 12:29:51 -0700 Subject: [PATCH] feat(identify): detect China TC260 AIGC label (Doubao et al.) China-served generators embed an XMP {"Label":"1",...} block (China's mandatory AI-content labeling, TC260 standard). Doubao (ByteDance) uses it -- verified on the real #13 sample. It's none of C2PA / SynthID / imwatermark / IPTC, so identify() previously returned unknown. - metadata: AIGC_MARKERS + aigc_label() (json-decodes the HTML-entity-encoded block); has_ai_metadata + get_ai_metadata now surface it. - identify: new 'aigc' signal -> is_ai True, platform 'China AIGC-labeled generator (TC260; e.g. Doubao)', carries the ContentProducer code. - Container-agnostic raw-byte scan, so it covers the whole China-AIGC ecosystem (Jimeng/Kling/Qwen/Ernie share the standard). - Tests: synthetic TC260 block (metadata + identify). Docs updated. Addresses #13. Co-Authored-By: Claude Opus 4.7 (1M context) --- CLAUDE.md | 3 +- README.md | 3 +- pyproject.toml | 7 +++-- src/remove_ai_watermarks/identify.py | 15 +++++++++- src/remove_ai_watermarks/metadata.py | 42 +++++++++++++++++++++++++++ tests/test_identify.py | 32 +++++++++++++++++++++ tests/test_metadata.py | 43 ++++++++++++++++++++++++++++ 7 files changed, 140 insertions(+), 5 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 7e57349..5e0dd92 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -29,7 +29,7 @@ You are a **principal Python engineer** maintaining a CLI tool and library for r - `metadata.py` — `synthid_source(path)` returns the vendor name(s) if the C2PA manifest implies a SynthID pixel watermark, else None. Format-agnostic: PNG via the caBX parser, JPEG/WebP/AVIF/HEIF/JXL via a binary scan (C2PA marker + SynthID issuer + AI-source marker). `get_ai_metadata` surfaces the verdict, and `metadata --check` prints it as a callout. 
Both `get_ai_metadata` and `has_ai_metadata` guard the PIL open with `except Exception` (HEIC/unknown formats raise non-OSError) and fall through to the binary scan. - `identify.py` — `identify(path)` aggregates every locally-readable signal (C2PA issuer→platform, IPTC "Made with AI", embedded SD/ComfyUI params, SynthID proxy, visible Gemini sparkle) into one `ProvenanceReport`. `is_ai_generated` is True or None (never asserted False — stripped metadata is not proof of clean origin). Visible-sparkle is promoted only at confidence ≥ `_SPARKLE_THRESHOLD` (0.5; corpus-tuned to separate Gemini sparkles ≥0.56 from non-sparkle ≤0.49). The cv2 dependency lives in `gemini_engine.detect_sparkle_confidence`, not here. Add platform mappings to `_ISSUER_PLATFORM`, not inline. For non-PNG containers (JPEG/WebP/AVIF/HEIF/JXL) the caBX parser returns nothing, so issuer (`_issuers_in`) and generator (`_ai_tools_in`, reusing `C2PA_AI_TOOLS`) are recovered by binary-scanning the first MB. EXIF `Software` / `Make` / `Artist` / `ImageDescription` and XMP `CreatorTool` generator tags are read by `metadata.exif_generator` (PIL+piexif for any format PIL opens incl. AVIF, plus a container-agnostic XMP raw-byte scan that also covers HEIF/JXL), matched against `AI_GENERATOR_TOKENS` so ordinary editors (plain "Adobe Photoshop") and real-camera `Make` ("Apple"/"Canon") are not flagged. **Ideogram tags its output with EXIF `Make="Ideogram AI"`** (verified on a real download 2026-05-24) — that's why `Make` is read. - `gemini_engine.py` — visible Gemini-sparkle remover/detector (cv2/numpy, no GPU). `detect_sparkle_confidence(path)` is the file-level entry point used by `identify.py`. -- `invisible_watermark.py` — `detect_invisible_watermark(path)` decodes the OPEN DWT-DCT watermarks (public decoder, no key) embedded by Stable Diffusion / SDXL / FLUX via the `imwatermark` library. 
Known fixed patterns (verified against upstream source) live in `_BITS_48` (SDXL 48-bit, FLUX.2 48-bit) and `_SD1_STRING` ("StableDiffusionV1", SD 1.x/2.x). Optional dep (extra `detect`); returns None when absent. **Unlike SynthID this is locally detectable**, but the watermark is fragile (does not survive JPEG re-encode/resize — verified gone after JPEG q90), so it confirms origin only on pristine files. Add new known patterns here. The file carries a top-of-module pyright pragma because imwatermark/cv2 ship no type stubs. +- `invisible_watermark.py` — `detect_invisible_watermark(path)` decodes the OPEN DWT-DCT watermarks (public decoder, no key) embedded by Stable Diffusion / SDXL / FLUX via the `imwatermark` library. Known fixed patterns (verified against upstream source) live in `_BITS_48` (SDXL 48-bit, FLUX.2 48-bit) and `_SD1_STRING` ("StableDiffusionV1", SD 1.x/2.x). Optional dep (extra `detect`); returns None when absent. The `detect` extra pulls **torch** transitively (invisible-watermark declares torch a hard dep, and `WatermarkDecoder` eagerly imports `rivaGan` -> `torch` at import time), so detection needs torch present even though dwtDct runs CPU-only on cv2/numpy/pywavelets — no GPU and no separate `gpu` extra required. **Unlike SynthID this is locally detectable**, but the watermark is fragile (does not survive JPEG re-encode/resize — verified gone after JPEG q90), so it confirms origin only on pristine files. Add new known patterns here. The file carries a top-of-module pyright pragma because imwatermark/cv2 ship no type stubs. - `face_protector.py` — YOLO detect + soft-blend pattern; mirror this for any "protect region during diffusion" features ## Watermarking landscape (research 2026-05-24) @@ -38,6 +38,7 @@ Who embeds what, and whether it is locally detectable (so we know which gaps are - **Locally detectable (open decoder, no key/API):** Stable Diffusion / SDXL / FLUX via `imwatermark` DWT-DCT (now covered by `invisible_watermark.py`). 
FLUX uses the same library (`black-forest-labs/flux2` `src/flux2/watermark.py`, 48-bit `0b001010101111111010000111100111001111010100101110`); SDXL is the diffusers `WATERMARK_MESSAGE` (`0b101100111110110010010000011110111011000110011110`). Caveat: fragile to re-encoding. - **C2PA / IPTC (covered by the issuer/marker scan):** OpenAI, Google, Adobe Firefly, Microsoft (Designer + **Bing Image Creator** — collected 2026-05-24; Bing now runs Microsoft's own **MAI-Image** model, signs C2PA as "Microsoft", NOT OpenAI/DALL-E), and **Stability AI** (collected from Brand Studio / DreamStudio successor; signs C2PA as "Stability AI Ltd", no SynthID, no imwatermark on its current Stable Image model — issuer added to `C2PA_ISSUERS`). Still unsampled: Canva (its downloads are re-encoded design *exports* that strip C2PA, so a Canva "positive" is inconclusive — skipped), Getty, Shutterstock. Midjourney embeds NO C2PA and no invisible watermark (our `mj-*` sample carried only the IPTC tag). - **EXIF/XMP generator tag (caught by `exif_generator`):** **Ideogram** writes EXIF `Make="Ideogram AI"` (collected 2026-05-24 — no C2PA, no SynthID, no imwatermark; the Make tag is the only signal). +- **China TC260 AIGC label (caught by `AIGC_MARKERS` / `metadata.aigc_label`, surfaced by `identify` as the `aigc` signal):** China-served generators embed an XMP `{"Label":"1","ContentProducer":...}` block — China's mandatory AI-content labeling (TC260 namespace `tc260.org.cn/ns/AIGC`). **Doubao** (ByteDance) uses it (verified on the real #13 sample 2026-05-25; `ContentProducer` `001191110102MACQD9K64010000`, no C2PA/SynthID/imwatermark — the XMP block is the only signal; GitHub attachment upload did NOT strip it). The same standard is mandatory for Jimeng/Kling/Qwen/Ernie etc., so the one marker covers the whole China-AIGC-labeled ecosystem. `aigc_label` json-decodes the (HTML-entity-encoded) block; container-agnostic raw-byte scan. 
- **No detectable signal on download (correctly reported `unknown`):** **Recraft** (PNG export is a re-encoded design export — strips everything), **Krea hosting FLUX 2** (no imwatermark despite FLUX — the host omits the encoder, same as Stability's hosted SDXL), and Midjourney (embeds nothing). Lesson: the imwatermark detector only fires on *pristine* output from a pipeline that runs the encoder (diffusers default, official BFL), not from re-hosts (Krea/Stability) or re-encoded exports (Recraft/Canva). - **Invisible but NOT locally detectable (proprietary, API/oracle only — same wall as SynthID):** Amazon Titan Image Generator + Nova Canvas (Bedrock `DetectGeneratedContent` API), Kakao (new SynthID image adopter, May 2026), NVIDIA Cosmos (SynthID video). No local detector possible; treat like SynthID. diff --git a/README.md b/README.md index 18fad26..de565b0 100644 --- a/README.md +++ b/README.md @@ -38,12 +38,13 @@ Strips SynthID, C2PA Content Credentials, EXIF/XMP "Made with AI" labels, and vi | **Microsoft Designer / Bing Image Creator** | — | ✅ SynthID via DALL-E backend (Designer) | ✅ C2PA (Bing runs MAI-Image, signed "Microsoft") | Metadata strip | | **Midjourney** | — | — | ✅ EXIF + XMP (prompt, model, seed) | Metadata strip | | **Meta AI** | — | — | ✅ IPTC "Made with AI" (digitalSourceType) | Metadata strip (removes the label) | +| **Doubao** (ByteDance) / China AIGC generators | — | — | ✅ TC260 `<TC260:AIGC>` XMP label (China's mandatory AI labeling) | Metadata strip | | **StableSignature** (Meta) | — | ✅ In-model watermark | — | Diffusion regeneration | | **TreeRing** | — | ✅ Latent space watermark | — | Diffusion regeneration | > Visible watermarks (logo overlays) are currently used only by Google Gemini / Nano Banana. Other services rely on invisible watermarks and/or metadata. Our diffusion-based regeneration works against any invisible watermark in pixel or frequency domain. 
-> **Detection:** `remove-ai-watermarks identify ` reports the origin platform and watermark inventory for all the signals above — C2PA issuer, IPTC "Made with AI", embedded generation params, EXIF/XMP generator tags, the SynthID metadata proxy, the visible sparkle, and (with the `[detect]` extra) the open SD/SDXL/FLUX invisible watermark. The SynthID *pixel* watermark has no local decoder, so it is reported as a metadata proxy only. +> **Detection:** `remove-ai-watermarks identify ` reports the origin platform and watermark inventory for all the signals above — C2PA issuer, IPTC "Made with AI", the China TC260 AIGC label, embedded generation params, EXIF/XMP generator tags, the SynthID metadata proxy, the visible sparkle, and (with the `[detect]` extra) the open SD/SDXL/FLUX invisible watermark. The SynthID *pixel* watermark has no local decoder, so it is reported as a metadata proxy only. ## How it works diff --git a/pyproject.toml b/pyproject.toml index 9b022ac..849eec7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,8 +28,11 @@ gpu = [ ] # Open invisible-watermark (imwatermark) decoder for detecting the DWT-DCT # watermarks embedded by Stable Diffusion / SDXL / FLUX. Optional because it -# pulls non-headless opencv; identify() guards the import and skips the signal -# when absent. +# pulls non-headless opencv AND torch (invisible-watermark declares torch a hard +# dependency, and WatermarkDecoder eagerly imports rivaGan -> torch at import +# time, so the dwtDct-only detect path still needs torch present even though it +# never runs on GPU). So `detect` alone pulls torch -- no need to add `gpu` for +# detection. identify() guards the import and skips the signal when absent. 
detect = [ "invisible-watermark>=0.2.0", ] diff --git a/src/remove_ai_watermarks/identify.py b/src/remove_ai_watermarks/identify.py index 0fb58b4..c23cc89 100644 --- a/src/remove_ai_watermarks/identify.py +++ b/src/remove_ai_watermarks/identify.py @@ -24,8 +24,10 @@ from typing import TYPE_CHECKING from remove_ai_watermarks.metadata import ( AI_METADATA_KEYS, + AIGC_MARKERS, C2PA_UUID, IPTC_AI_MARKERS, + aigc_label, exif_generator, get_ai_metadata, ) @@ -219,6 +221,15 @@ def identify(image_path: Path, *, check_visible: bool = True, check_invisible: b if platform is None: platform = "Made-with-AI tag (e.g. Meta AI); platform not specified" + # ── China TC260 AIGC label (Doubao and other China-served gens) ── + aigc = any(m in head for m in AIGC_MARKERS) + if aigc: + producer = (aigc_label(image_path) or {}).get("ContentProducer", "") + signals.append(Signal("aigc", f"TC260 AIGC label{f' (producer {producer})' if producer else ''}", "high")) + watermarks.append("China AIGC label (TC260 standard)") + if platform is None: + platform = "China AIGC-labeled generator (TC260; e.g. 
Doubao)" + # ── Local diffusion parameters (Stable Diffusion / ComfyUI) ────── local_keys = sorted(k for k in meta if k.lower() in _LOCAL_GEN_KEYS) if local_keys: @@ -247,7 +258,9 @@ def identify(image_path: Path, *, check_visible: bool = True, check_invisible: b # ── Verdict so far (metadata + embedded watermark) ────────────── invisible_wm = any(s.name == "invisible_watermark" for s in signals) exif_gen = any(s.name == "exif_generator" for s in signals) - ai_from_metadata = bool((has_c2pa and (c2pa_is_ai or synthid)) or iptc or local_keys or invisible_wm or exif_gen) + ai_from_metadata = bool( + (has_c2pa and (c2pa_is_ai or synthid)) or iptc or aigc or local_keys or invisible_wm or exif_gen + ) # ── Visible Gemini sparkle (fallback for stripped-metadata case) ─ if check_visible and (conf := _visible_sparkle(image_path)) is not None and conf >= _SPARKLE_THRESHOLD: diff --git a/src/remove_ai_watermarks/metadata.py b/src/remove_ai_watermarks/metadata.py index 6263967..a21594d 100644 --- a/src/remove_ai_watermarks/metadata.py +++ b/src/remove_ai_watermarks/metadata.py @@ -73,6 +73,17 @@ IPTC_AI_MARKERS: tuple[bytes, ...] = ( b"compositeWithTrainedAlgorithmicMedia", ) +# China's mandatory AI-content labeling (TC260, the national cybersecurity +# standards committee). AI generators serving China embed an XMP block in the +# TC260 namespace -- ``{"Label":"1",...}``. Doubao (ByteDance) uses +# this; the same standard is mandatory for Jimeng, Kling, Qwen, Ernie, etc., +# so the marker covers the whole China-AIGC-labeled ecosystem. Container- +# agnostic (XMP is text), so a raw-byte scan catches it in PNG/JPEG/etc. +AIGC_MARKERS: tuple[bytes, ...] 
= ( + b"tc260.org.cn/ns/AIGC", + b"TC260:AIGC", +) + STANDARD_METADATA_KEYS: frozenset[str] = frozenset( [ "Author", @@ -139,9 +150,35 @@ def has_ai_metadata(image_path: Path) -> bool: return True if C2PA_UUID in data: return True + if any(marker in data for marker in AIGC_MARKERS): + return True return any(marker in data for marker in IPTC_AI_MARKERS) +def aigc_label(image_path: Path) -> dict[str, str] | None: + """Parse a China TC260 ``<TC260:AIGC>`` AI-labeling block, if present. + + Returns the decoded JSON (e.g. ``{"Label": "1", "ContentProducer": ...}``) + or None. The block is XMP text (HTML-entity encoded), so it is found by a + container-agnostic raw-byte scan and works for PNG/JPEG/WebP alike. + """ + import html + import json + import re + + with open(image_path, "rb") as f: + data = f.read(1024 * 1024) + match = re.search(rb"<TC260:AIGC>(.*?)</TC260:AIGC>", data, re.DOTALL) + if not match: + return None + raw = html.unescape(match.group(1).decode("utf-8", "replace")) + try: + parsed = json.loads(raw) + except ValueError: + return None + return {str(k): str(v) for k, v in parsed.items()} if isinstance(parsed, dict) else None + + def synthid_source(image_path: Path) -> str | None: """Return the vendor name(s) if the image carries a SynthID pixel watermark. @@ -286,6 +323,11 @@ def get_ai_metadata(image_path: Path) -> dict[str, str]: # fall back to the format-agnostic source check for the SynthID verdict. if "synthid_watermark" not in result and (vendor := synthid_source(image_path)): result.setdefault("synthid_watermark", synthid_verdict(vendor)) + + # China TC260 AI-content label (Doubao and other China-served generators). 
+ if aigc := aigc_label(image_path): + producer = aigc.get("ContentProducer", "") + result["aigc_label"] = f"China AIGC label (TC260){f'; producer {producer}' if producer else ''}" return result diff --git a/tests/test_identify.py b/tests/test_identify.py index ff31f5a..9ef7a33 100644 --- a/tests/test_identify.py +++ b/tests/test_identify.py @@ -317,3 +317,35 @@ class TestIdentifyInvisibleWatermark: def test_check_invisible_false_skips(self, tmp_path: Path): r = identify(self._sdxl_watermarked(tmp_path), check_visible=False, check_invisible=False) assert not any(s.name == "invisible_watermark" for s in r.signals) + + +class TestIdentifyAIGC: + """China TC260 AIGC label is detected and attributed (e.g. Doubao).""" + + def _aigc_png(self, tmp_path: Path) -> Path: + from PIL import Image + + p = tmp_path / "doubao.png" + Image.new("RGB", (32, 32)).save(p) + xmp = ( + '' + '' + "<TC260:AIGC>{&quot;Label&quot;:&quot;1&quot;,&quot;ContentProducer&quot;:&quot;BYTEDANCE001&quot;}</TC260:AIGC>" + "" + ) + with open(p, "ab") as f: + f.write(xmp.encode()) + return p + + def test_aigc_detected(self, tmp_path: Path): + r = identify(self._aigc_png(tmp_path), check_visible=False) + assert r.is_ai_generated is True + assert r.platform is not None + assert "AIGC" in r.platform or "TC260" in r.platform + assert any("AIGC" in w for w in r.watermarks) + + def test_aigc_signal_carries_producer(self, tmp_path: Path): + r = identify(self._aigc_png(tmp_path), check_visible=False) + sig = next(s for s in r.signals if s.name == "aigc") + assert "BYTEDANCE001" in sig.detail diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 268ab45..7dafa8a 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -390,3 +390,46 @@ class TestExifGenerator: def test_clean_image_is_none(self, tmp_clean_png: Path): assert exif_generator(tmp_clean_png) is None + + +class TestAIGCLabel: + """China TC260 AIGC labeling (Doubao and other China-served generators).""" + + def _aigc_png(self, tmp_path: Path, label: str = "1", producer: str = 
"TESTPRODUCER001") -> Path: + from remove_ai_watermarks.metadata import aigc_label # noqa: F401 (import-time guard) + + p = tmp_path / "doubao.png" + Image.new("RGB", (32, 32)).save(p) + # XMP is HTML-entity encoded in real files; aigc_label must unescape it. + xmp = ( + '' + '' + f"<TC260:AIGC>{{&quot;Label&quot;:&quot;{label}&quot;,&quot;ContentProducer&quot;:&quot;{producer}&quot;}}</TC260:AIGC>" + "" + ) + with open(p, "ab") as f: + f.write(xmp.encode()) + return p + + def test_parses_label_and_producer(self, tmp_path: Path): + from remove_ai_watermarks.metadata import aigc_label + + info = aigc_label(self._aigc_png(tmp_path)) + assert info is not None + assert info["Label"] == "1" + assert info["ContentProducer"] == "TESTPRODUCER001" + + def test_none_when_absent(self, tmp_clean_png): + from remove_ai_watermarks.metadata import aigc_label + + assert aigc_label(tmp_clean_png) is None + + def test_has_ai_metadata_detects_aigc(self, tmp_path: Path): + assert has_ai_metadata(self._aigc_png(tmp_path)) + + def test_get_ai_metadata_surfaces_aigc(self, tmp_path: Path): + meta = get_ai_metadata(self._aigc_png(tmp_path)) + assert "aigc_label" in meta + assert "TC260" in meta["aigc_label"]