From ba94de82752b0d985585d9fb7dfbea22a54ea08c Mon Sep 17 00:00:00 2001 From: test-user Date: Tue, 26 May 2026 14:26:20 -0700 Subject: [PATCH] feat: strip AI-provenance EXIF tags on removal (v0.5.6) remove_ai_metadata now scrubs AI tags from the JPEG EXIF instead of passing the block through wholesale. Closes the v0.5.5 follow-up: the xAI/Grok Signature + UUID-Artist pair was detected but not removed. - metadata._scrub_ai_exif(): deletes the xAI signature pair and any Software/Make/Artist/ImageDescription tag carrying an AI_GENERATOR_TOKENS token (so Ideogram's Make="Ideogram AI" is scrubbed too), keeping genuine camera/editor EXIF intact. - Shared _is_xai_signature_pair / _exif_text helpers (module-level compiled regexes) are now the single source of truth, used by both xai_signature and _scrub_ai_exif. - Tests: Grok signature stripped on JPEG output, Ideogram Make stripped, real-camera Make ("Apple") preserved. 325 passing. Co-Authored-By: Claude Opus 4.7 (1M context) --- CLAUDE.md | 2 +- pyproject.toml | 2 +- src/remove_ai_watermarks/__init__.py | 2 +- src/remove_ai_watermarks/metadata.py | 82 +++++++++++++++++++++++----- tests/test_metadata.py | 38 +++++++++++++ uv.lock | 2 +- 6 files changed, 110 insertions(+), 18 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index d214851..93544e9 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -38,7 +38,7 @@ Who embeds what, and whether it is locally detectable (so we know which gaps are - **Locally detectable (open decoder, no key/API):** Stable Diffusion / SDXL / FLUX via `imwatermark` DWT-DCT (now covered by `invisible_watermark.py`). FLUX uses the same library (`black-forest-labs/flux2` `src/flux2/watermark.py`, 48-bit `0b001010101111111010000111100111001111010100101110`); SDXL is the diffusers `WATERMARK_MESSAGE` (`0b101100111110110010010000011110111011000110011110`). Caveat: fragile to re-encoding. - **C2PA / IPTC (covered by the issuer/marker scan):** OpenAI, Google, Adobe Firefly, Microsoft (Designer + **Bing Image Creator** — collected 2026-05-24; Bing now runs Microsoft's own **MAI-Image** model, signs C2PA as "Microsoft", NOT OpenAI/DALL-E), and **Stability AI** (collected from Brand Studio / DreamStudio successor; signs C2PA as "Stability AI Ltd", no SynthID, no imwatermark on its current Stable Image model — issuer added to `C2PA_ISSUERS`). Still unsampled: Canva (its downloads are re-encoded design *exports* that strip C2PA, so a Canva "positive" is inconclusive — skipped), Getty, Shutterstock. Midjourney embeds NO C2PA and no invisible watermark (our `mj-*` sample carried only the IPTC tag). - **EXIF/XMP generator tag (caught by `exif_generator`):** **Ideogram** writes EXIF `Make="Ideogram AI"` (collected 2026-05-24 — no C2PA, no SynthID, no imwatermark; the Make tag is the only signal). -- **xAI / Grok — its own EXIF signature scheme, NOT C2PA (DETECTED by `metadata.xai_signature`, built 2026-05-26).** Grok JPEG downloads (Aurora model) carry **no C2PA, no XMP, no SynthID, no IPTC** — only EXIF `Artist` = a UUID and EXIF `ImageDescription` = `Signature: ` (a crypto signature, unverifiable locally without xAI's public key). This empirically kills the earlier unverified "xAI signs C2PA as xAI" lead — xAI is not even a C2PA member. `exif_generator` misses it (neither field holds an `AI_GENERATOR_TOKENS` token), so a dedicated detector `xai_signature(path)` matches the pair (`ImageDescription ~ ^Signature: [A-Za-z0-9+/=]{64,}` AND UUID `Artist`); wired into `has_ai_metadata`, `get_ai_metadata` (key `xai_signature`), and `identify` (signal `xai_signature`, platform "xAI (Grok / Aurora)"). **Format confirmed stable across n=3 genuine generations:** exactly three EXIF tags (`Artist`, `ExifOffset`, `ImageDescription`), `Signature:` prefix constant, base64 payload 300-1004 chars. Two capture facts: (a) the `Artist` UUID **equals the public image id** in the asset URL (`https://imagine-public.x.ai/imagine-public/images/.jpg`), so it is NOT a private per-user secret — only the `Signature` blob is; (b) the Grok web-UI image is a re-encoded **WebP with no signature** — the EXIF survives only in the *original* JPEG (download button or that public tokenless URL), which is why screenshots / re-encodes are metadata-stripped. Tests use a **synthetic** JPEG fixture (fake UUID + fake `Signature:` blob), never a real Grok image (its repo is public). Not yet *stripped* on removal: `remove_ai_metadata` preserves the EXIF block wholesale for JPEG, so the signature is detected but not scrubbed — a follow-up. +- **xAI / Grok — its own EXIF signature scheme, NOT C2PA (DETECTED by `metadata.xai_signature`, built 2026-05-26).** Grok JPEG downloads (Aurora model) carry **no C2PA, no XMP, no SynthID, no IPTC** — only EXIF `Artist` = a UUID and EXIF `ImageDescription` = `Signature: ` (a crypto signature, unverifiable locally without xAI's public key). This empirically kills the earlier unverified "xAI signs C2PA as xAI" lead — xAI is not even a C2PA member. `exif_generator` misses it (neither field holds an `AI_GENERATOR_TOKENS` token), so a dedicated detector `xai_signature(path)` matches the pair (`ImageDescription ~ ^Signature: [A-Za-z0-9+/=]{64,}` AND UUID `Artist`); wired into `has_ai_metadata`, `get_ai_metadata` (key `xai_signature`), and `identify` (signal `xai_signature`, platform "xAI (Grok / Aurora)"). **Format confirmed stable across n=3 genuine generations:** exactly three EXIF tags (`Artist`, `ExifOffset`, `ImageDescription`), `Signature:` prefix constant, base64 payload 300-1004 chars. Two capture facts: (a) the `Artist` UUID **equals the public image id** in the asset URL (`https://imagine-public.x.ai/imagine-public/images/.jpg`), so it is NOT a private per-user secret — only the `Signature` blob is; (b) the Grok web-UI image is a re-encoded **WebP with no signature** — the EXIF survives only in the *original* JPEG (download button or that public tokenless URL), which is why screenshots / re-encodes are metadata-stripped. A real fixture `data/samples/grok-1.jpg` plus **synthetic** JPEG fixtures (fake UUID + fake `Signature:` blob) cover the detector; never add a real Grok image carrying private content (the repo is public). **Stripped on removal too:** `remove_ai_metadata` now calls `_scrub_ai_exif` on the JPEG EXIF, which deletes the xAI Signature+UUID-Artist pair **and** any `Software`/`Make`/`Artist`/`ImageDescription` tag holding an `AI_GENERATOR_TOKENS` token (so Ideogram's `Make="Ideogram AI"` is scrubbed too), while keeping genuine camera/editor EXIF. The shared `_is_xai_signature_pair` helper (module-level compiled regexes) is the single source of truth for the pattern, used by both `xai_signature` and `_scrub_ai_exif`. (AVIF/HEIF/JXL still strip only C2PA boxes via `isobmff`, not EXIF — unchanged.) - **China TC260 AIGC label (caught by `AIGC_MARKERS` / `metadata.aigc_label`, surfaced by `identify` as the `aigc` signal):** China-served generators embed an XMP `{"Label":"1","ContentProducer":...}` block — China's mandatory AI-content labeling (TC260 namespace `tc260.org.cn/ns/AIGC`). **Doubao** (ByteDance) uses it (verified on the real #13 sample 2026-05-25; `ContentProducer` `001191110102MACQD9K64010000`, no C2PA/SynthID/imwatermark — the XMP block is the only signal; GitHub attachment upload did NOT strip it). The same standard is mandatory for Jimeng/Kling/Qwen/Ernie etc., so the one marker covers the whole China-AIGC-labeled ecosystem. `aigc_label` json-decodes the (HTML-entity-encoded) block; container-agnostic raw-byte scan. - **No detectable signal on download (correctly reported `unknown`):** **Recraft** (PNG export is a re-encoded design export — strips everything), **Krea hosting FLUX 2** (no imwatermark despite FLUX — the host omits the encoder, same as Stability's hosted SDXL), and Midjourney (embeds nothing). Lesson: the imwatermark detector only fires on *pristine* output from a pipeline that runs the encoder (diffusers default, official BFL), not from re-hosts (Krea/Stability) or re-encoded exports (Recraft/Canva). - **Invisible but NOT locally detectable (proprietary, API/oracle only — same wall as SynthID):** Amazon Titan Image Generator + Nova Canvas (Bedrock `DetectGeneratedContent` API), Kakao (new SynthID image adopter, May 2026), NVIDIA Cosmos (SynthID video). No local detector possible; treat like SynthID. diff --git a/pyproject.toml b/pyproject.toml index 4fbdccf..82e89c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "remove-ai-watermarks" -version = "0.5.5" +version = "0.5.6" description = "Remove visible and invisible AI watermarks from images (Gemini / Nano Banana, ChatGPT, Stable Diffusion)" readme = "README.md" requires-python = ">=3.10" diff --git a/src/remove_ai_watermarks/__init__.py b/src/remove_ai_watermarks/__init__.py index 081f75f..72037a6 100644 --- a/src/remove_ai_watermarks/__init__.py +++ b/src/remove_ai_watermarks/__init__.py @@ -1,3 +1,3 @@ """Remove-AI-Watermarks: Unified tool for removing visible and invisible AI watermarks.""" -__version__ = "0.5.5" +__version__ = "0.5.6" diff --git a/src/remove_ai_watermarks/metadata.py b/src/remove_ai_watermarks/metadata.py index a888f88..7517c5e 100644 --- a/src/remove_ai_watermarks/metadata.py +++ b/src/remove_ai_watermarks/metadata.py @@ -10,6 +10,7 @@ from __future__ import annotations import contextlib import logging +import re from typing import TYPE_CHECKING if TYPE_CHECKING: @@ -277,6 +278,24 @@ def exif_generator(image_path: Path) -> str | None: return None +# xAI / Grok EXIF signature scheme. A 64+ char base64 blob after "Signature:" +# is far beyond any incidental description text, and the UUID Artist makes the +# pair xAI-specific -- both required keeps the false-positive rate near zero. +_XAI_SIGNATURE_RE = re.compile(r"Signature:\s*[A-Za-z0-9+/=]{64,}") +_UUID_RE = re.compile(r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", re.IGNORECASE) + + +def _is_xai_signature_pair(description: str, artist: str) -> bool: + """True if an EXIF (ImageDescription, Artist) pair is xAI/Grok's scheme.""" + return _XAI_SIGNATURE_RE.match(description) is not None and _UUID_RE.fullmatch(artist) is not None + + +def _exif_text(ifd: dict, tag: int) -> str: + """Decode a piexif 0th-IFD byte tag to a stripped string ('' if absent).""" + value = ifd.get(tag) + return value.decode("latin1", "replace").strip() if isinstance(value, bytes) else "" + + def xai_signature(image_path: Path) -> bool: """Detect xAI / Grok's EXIF provenance signature scheme. @@ -289,8 +308,6 @@ def xai_signature(image_path: Path) -> bool: validity. It survives only on the *original* JPEG download -- the web-UI image is a re-encoded WebP that drops EXIF. """ - import re - try: import piexif from PIL import Image @@ -304,19 +321,52 @@ def xai_signature(image_path: Path) -> bool: logger.debug("xAI-signature EXIF read failed for %s: %s", image_path, exc) return False - def _text(tag: int) -> str: - value = tags.get(tag) - return value.decode("latin1", "replace").strip() if isinstance(value, bytes) else "" - - description = _text(piexif.ImageIFD.ImageDescription) - artist = _text(piexif.ImageIFD.Artist) - # A 64+ char base64 blob after "Signature:" is far beyond any incidental - # description text, and the UUID Artist makes the pair xAI-specific. - has_signature = re.match(r"Signature:\s*[A-Za-z0-9+/=]{64,}", description) is not None - is_uuid = ( - re.fullmatch(r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", artist, re.IGNORECASE) is not None + return _is_xai_signature_pair( + _exif_text(tags, piexif.ImageIFD.ImageDescription), _exif_text(tags, piexif.ImageIFD.Artist) ) - return has_signature and is_uuid + + +def _scrub_ai_exif(exif_dict: dict) -> list[str]: + """Delete AI-provenance tags from a piexif dict's ``0th`` IFD, in place. + + Removes (a) the xAI/Grok signature pair (``ImageDescription`` "Signature: ..." + + UUID ``Artist``) and (b) any ``Software`` / ``Make`` / ``Artist`` / + ``ImageDescription`` tag whose value carries an ``AI_GENERATOR_TOKENS`` token + (Ideogram's ``Make``, Firefly's ``Software``, etc.). Mirrors the detection in + ``xai_signature`` / ``exif_generator`` so removal scrubs exactly what + ``identify`` flags, while leaving genuine camera/editor EXIF intact. Returns + the names of the removed tags (for logging). + """ + import piexif + + from remove_ai_watermarks.noai.constants import AI_GENERATOR_TOKENS + + ifd = exif_dict.get("0th") + if not ifd: + return [] + + drop: dict[int, str] = {} + + # (a) xAI / Grok: the Signature blob and the UUID Artist go together. + if _is_xai_signature_pair( + _exif_text(ifd, piexif.ImageIFD.ImageDescription), _exif_text(ifd, piexif.ImageIFD.Artist) + ): + drop[piexif.ImageIFD.ImageDescription] = "ImageDescription" + drop[piexif.ImageIFD.Artist] = "Artist" + + # (b) Known AI generator token in any of the text tags. + for tag, name in ( + (piexif.ImageIFD.Software, "Software"), + (piexif.ImageIFD.Make, "Make"), + (piexif.ImageIFD.Artist, "Artist"), + (piexif.ImageIFD.ImageDescription, "ImageDescription"), + ): + if any(token in _exif_text(ifd, tag).lower() for token in AI_GENERATOR_TOKENS): + drop[tag] = name + + for tag in drop: + ifd.pop(tag, None) + return list(drop.values()) def get_ai_metadata(image_path: Path) -> dict[str, str]: @@ -456,6 +506,10 @@ def remove_ai_metadata( save_kwargs["pnginfo"] = pnginfo if exif_data and save_kwargs["format"] == "JPEG": + # Scrub AI-provenance EXIF tags (xAI/Grok signature, generator tokens) + # while keeping genuine camera/editor EXIF; PNG output drops EXIF entirely. + if removed := _scrub_ai_exif(exif_data): + logger.info("Scrubbed AI EXIF tag(s): %s", ", ".join(removed)) with contextlib.suppress(Exception): save_kwargs["exif"] = piexif.dump(exif_data) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 47b417b..37c0664 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -471,6 +471,44 @@ class TestXaiSignature: assert has_ai_metadata(_grok_jpeg(tmp_path)) is True +class TestRemoveAiExif: + """remove_ai_metadata scrubs AI-provenance EXIF tags but keeps genuine EXIF.""" + + def test_grok_signature_stripped_on_jpeg_output(self, tmp_path: Path): + src = _grok_jpeg(tmp_path) + assert xai_signature(src) is True + out = tmp_path / "clean.jpg" + remove_ai_metadata(src, out) + assert xai_signature(out) is False + assert has_ai_metadata(out) is False + + def test_generator_make_token_stripped(self, tmp_path: Path): + # Ideogram's EXIF Make="Ideogram AI" must be scrubbed on removal. + exif = piexif.dump({"0th": {piexif.ImageIFD.Make: b"Ideogram AI"}, "Exif": {}, "GPS": {}, "1st": {}}) + src = tmp_path / "ideogram.jpg" + Image.new("RGB", (64, 64)).save(src, exif=exif) + out = tmp_path / "clean.jpg" + remove_ai_metadata(src, out) + assert exif_generator(out) is None + + def test_real_camera_exif_preserved(self, tmp_path: Path): + # A real-camera Make ("Apple") carries no AI token and must survive. + exif = piexif.dump( + { + "0th": {piexif.ImageIFD.Make: b"Apple", piexif.ImageIFD.Model: b"iPhone 15"}, + "Exif": {}, + "GPS": {}, + "1st": {}, + } + ) + src = tmp_path / "photo.jpg" + Image.new("RGB", (64, 64)).save(src, exif=exif) + out = tmp_path / "out.jpg" + remove_ai_metadata(src, out) + kept = piexif.load(Image.open(out).info["exif"])["0th"] + assert kept.get(piexif.ImageIFD.Make) == b"Apple" + + class TestAIGCLabel: """China TC260 AIGC labeling (Doubao and other China-served generators).""" diff --git a/uv.lock b/uv.lock index 94daf18..74b324c 100644 --- a/uv.lock +++ b/uv.lock @@ -2150,7 +2150,7 @@ wheels = [ [[package]] name = "remove-ai-watermarks" -version = "0.5.5" +version = "0.5.6" source = { editable = "." } dependencies = [ { name = "click" },