From ad3b8ee248333ed286b1a787acb65e3394eafc62 Mon Sep 17 00:00:00 2001 From: test-user Date: Sun, 24 May 2026 17:56:39 -0700 Subject: [PATCH] feat(identify): read EXIF Software / XMP CreatorTool generator tags Closes the documented gap where EXIF/XMP fields inside AVIF/HEIF/JXL went unparsed. metadata.exif_generator extracts the EXIF Software/Artist tag (via PIL+piexif, which opens AVIF natively) and the XMP CreatorTool (via a container-agnostic raw-byte scan that also covers HEIF/JXL that PIL can't open), and matches against AI_GENERATOR_TOKENS so only generator names (Firefly, DALL-E, Midjourney, ComfyUI, ...) fire -- a plain 'Adobe Photoshop' or 'GIMP' tag is not flagged. identify() surfaces it as a high-confidence signal and uses it for platform attribution when no C2PA names a platform, so an AVIF/HEIF whose only AI signal is an EXIF/XMP generator tag is now caught. Validated with synthesized fixtures (the 'no positive fixtures' blocker was self-imposed): real AVIF and JPEG written with EXIF Software via PIL, plus an XMP CreatorTool raw-scan fixture. Zero false positives across the 109-image corpus (real iPhone photos carry no AI generator token). Co-Authored-By: Claude Opus 4.7 (1M context) --- CLAUDE.md | 4 +- README.md | 7 ++-- src/remove_ai_watermarks/identify.py | 12 +++++- src/remove_ai_watermarks/metadata.py | 46 ++++++++++++++++++++++ src/remove_ai_watermarks/noai/constants.py | 26 ++++++++++++ tests/test_identify.py | 17 ++++++++ tests/test_metadata.py | 43 ++++++++++++++++++++ 7 files changed, 149 insertions(+), 6 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 769f641..ccfbe77 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -27,7 +27,7 @@ You are a **principal Python engineer** maintaining a CLI tool and library for r - `noai/c2pa.py` — PNG chunk parser; use `extract_c2pa_chunk(path)` to get raw caBX payload, `has_c2pa_metadata(path)` to detect. Do not reimplement chunk parsing. 
`extract_c2pa_info(path)` sets `synthid_watermark`/`synthid_vendors` when the manifest is signed by a SynthID-using vendor. - `noai/constants.py` — PNG_SIGNATURE, C2PA_CHUNK_TYPE, C2PA_SIGNATURES, C2PA_ISSUERS, and `SYNTHID_C2PA_ISSUERS` (issuers that pair SynthID with C2PA: Google, OpenAI). Add a new issuer here, not inline. - `metadata.py` — `synthid_source(path)` returns the vendor name(s) if the C2PA manifest implies a SynthID pixel watermark, else None. Format-agnostic: PNG via the caBX parser, JPEG/WebP/AVIF/HEIF/JXL via a binary scan (C2PA marker + SynthID issuer + AI-source marker). `get_ai_metadata` surfaces the verdict, and `metadata --check` prints it as a callout. Both `get_ai_metadata` and `has_ai_metadata` guard the PIL open with `except Exception` (HEIC/unknown formats raise non-OSError) and fall through to the binary scan. -- `identify.py` — `identify(path)` aggregates every locally-readable signal (C2PA issuer→platform, IPTC "Made with AI", embedded SD/ComfyUI params, SynthID proxy, visible Gemini sparkle) into one `ProvenanceReport`. `is_ai_generated` is True or None (never asserted False — stripped metadata is not proof of clean origin). Visible-sparkle is promoted only at confidence ≥ `_SPARKLE_THRESHOLD` (0.5; corpus-tuned to separate Gemini sparkles ≥0.56 from non-sparkle ≤0.49). The cv2 dependency lives in `gemini_engine.detect_sparkle_confidence`, not here. Add platform mappings to `_ISSUER_PLATFORM`, not inline. For non-PNG containers (JPEG/WebP/AVIF/HEIF/JXL) the caBX parser returns nothing, so issuer (`_issuers_in`) and generator (`_ai_tools_in`, reusing `C2PA_AI_TOOLS`) are recovered by binary-scanning the first MB; EXIF/XMP *fields* inside ISOBMFF are still not parsed (no positive fixtures to validate against). +- `identify.py` — `identify(path)` aggregates every locally-readable signal (C2PA issuer→platform, IPTC "Made with AI", embedded SD/ComfyUI params, SynthID proxy, visible Gemini sparkle) into one `ProvenanceReport`. 
`is_ai_generated` is True or None (never asserted False — stripped metadata is not proof of clean origin). Visible-sparkle is promoted only at confidence ≥ `_SPARKLE_THRESHOLD` (0.5; corpus-tuned to separate Gemini sparkles ≥0.56 from non-sparkle ≤0.49). The cv2 dependency lives in `gemini_engine.detect_sparkle_confidence`, not here. Add platform mappings to `_ISSUER_PLATFORM`, not inline. For non-PNG containers (JPEG/WebP/AVIF/HEIF/JXL) the caBX parser returns nothing, so issuer (`_issuers_in`) and generator (`_ai_tools_in`, reusing `C2PA_AI_TOOLS`) are recovered by binary-scanning the first MB. EXIF `Software` / XMP `CreatorTool` generator tags are read by `metadata.exif_generator` (PIL+piexif for any format PIL opens incl. AVIF, plus a container-agnostic XMP raw-byte scan that also covers HEIF/JXL), matched against `AI_GENERATOR_TOKENS` so ordinary editors (plain "Adobe Photoshop") are not flagged. - `gemini_engine.py` — visible Gemini-sparkle remover/detector (cv2/numpy, no GPU). `detect_sparkle_confidence(path)` is the file-level entry point used by `identify.py`. - `invisible_watermark.py` — `detect_invisible_watermark(path)` decodes the OPEN DWT-DCT watermarks (public decoder, no key) embedded by Stable Diffusion / SDXL / FLUX via the `imwatermark` library. Known fixed patterns (verified against upstream source) live in `_BITS_48` (SDXL 48-bit, FLUX.2 48-bit) and `_SD1_STRING` ("StableDiffusionV1", SD 1.x/2.x). Optional dep (extra `detect`); returns None when absent. **Unlike SynthID this is locally detectable**, but the watermark is fragile (does not survive JPEG re-encode/resize — verified gone after JPEG q90), so it confirms origin only on pristine files. Add new known patterns here. The file carries a top-of-module pyright pragma because imwatermark/cv2 ship no type stubs. 
- `face_protector.py` — YOLO detect + soft-blend pattern; mirror this for any "protect region during diffusion" features @@ -44,7 +44,7 @@ Who embeds what, and whether it is locally detectable (so we know which gaps are - `invisible` pipeline downscales to model-native resolution (1024 px for SDXL) before diffusion. Degrades fine text in infographics. Tracked; fix is tile-based diffusion. - Pyright first run is slow (2-3 min) due to ML deps (torch/diffusers/transformers stubs); full-project `uv run pyright` can stall for many minutes — scope it to changed files. - `ultralytics` monkey-patches `PIL.Image.open` and tries to autoload `pi_heif`. When `pi_heif` is missing, opening files raises `ModuleNotFoundError`, not `UnidentifiedImageError`. Code that opens user-supplied or unknown-format files should `except Exception`, not just `OSError`/`UnidentifiedImageError`. -- Metadata detection for AVIF/HEIF/JPEG-XL relies on a binary scan for `C2PA_UUID` + `IPTC_AI_MARKERS`. C2PA removal in those containers is implemented via `noai/isobmff.py` (top-level ``uuid`` / ``jumb`` box stripper, no re-encoding). EXIF/XMP boxes inside those containers are not yet scrubbed. +- Metadata detection for AVIF/HEIF/JPEG-XL relies on a binary scan for `C2PA_UUID` + `IPTC_AI_MARKERS`, plus EXIF `Software` / XMP `CreatorTool` generator tags via `metadata.exif_generator` (validated with synthesized AVIF/JPEG fixtures + an XMP raw-scan fixture). C2PA removal in those containers is implemented via `noai/isobmff.py` (top-level ``uuid`` / ``jumb`` box stripper, no re-encoding). EXIF/XMP boxes inside those containers are read for detection but not yet **scrubbed** on removal. - **SynthID detection is metadata-only.** There is no reliable *local* detector of the SynthID *pixel* watermark — Google's decoder is proprietary, no public spec or API (only a waitlisted portal). 
We detect SynthID by its C2PA companion (`synthid_source` / `SYNTHID_C2PA_ISSUERS`), which is reliable while the manifest is intact but says nothing once C2PA is stripped. **Surface-dependent blind spot (verified 2026-05-24):** the same Google model emits different metadata per surface -- the Gemini *app* wraps outputs in Google C2PA, but the *API/playground* (AI Studio, Nano Banana / gemini-2.5-flash-image) emits the SynthID *pixel* watermark (confirmed via the Gemini-app oracle) + the visible sparkle but **no C2PA/IPTC at all**, so `synthid_source` returns None despite SynthID being present. Only the pixel oracle or the visible-sparkle detector catches those. (Meta AI is another surface mismatch: it writes the IPTC `digitalSourceType=trainedAlgorithmicMedia` marker, not C2PA and not SynthID.) Google→SynthID is long-standing; OpenAI→SynthID is confirmed by OpenAI's Help Center (ChatGPT/Codex/API "include both C2PA metadata and SynthID watermarks", updated 2026-05-21) but time-gated (pre-rollout OpenAI images carry C2PA without SynthID), so the OpenAI verdict is hedged "likely". Oracles: Gemini app "Verify with SynthID" (Google), openai.com/verify (OpenAI). The spectral phase-coherence approach from `github.com/aloshdenny/reverse-SynthID` was evaluated (May 2026) and **does not work for real-content detection**: on its own shipped codebook + validation set, watermarked and cleaned images were indistinguishable (conf within noise, cleaned often higher); it only fires on pure-black 1024x1024 reference images at exact resolution (the controlled case it was calibrated on). The README's "90% / conf=0.91" reproduces only in that lab condition. Do not build a production detector on it; if revisited, it is experimental/diagnostic only and needs a per-resolution, per-model reference corpus. 
A from-scratch gpt-image pilot (2026-05-24) confirmed this independently: 5 independent solid-black gpt-image outputs share a near-identical fixed signature (pairwise residual correlation **0.92**, avg-template retains 97% energy), so the watermark/carrier IS strongly present and consistent on flat content — but the carrier frequencies extracted from it do NOT discriminate real content (carrier-to-random ratio: cleaned 1.86 > watermarked 1.53; a non-gpt-image image scored highest at 3.67). The signature drowns in content texture. Net: a perfectly consistent solid-color signature still yields no real-content pixel detector with magnitude/carrier methods. A corpus discrimination test (2026-05-24, `scripts/synthid_pixel_probe.py`, raw zero-mean residual NCC) independently re-confirms this: at matched resolution, SynthID positives do NOT cluster apart from negatives (within-Gemini 0.07; at 1024 px pos-vs-neg >= pos-vs-pos). The only high correlations were near-duplicate *content* (5 ChatGPT renders of one prompt at ~0.92, while a distinct ChatGPT image scored ~0 against them) — content, not a carrier. The probe is solid-fills-only and EXPERIMENTAL/DIAGNOSTIC; do not use it on real content. - **External AI-vs-real classifier models are out of scope (decided 2026-05-24).** Generic HuggingFace detectors (`Organika/sdxl-detector` Swin Transformer, `umm-maybe/AI-image-detector`, and fine-tunes) exist and report ~0.98 on their *own* SDXL-vs-real validation sets, but they are per-generator and the model cards themselves note degraded accuracy off-distribution; they are untested on gpt-image / Gemini Nano Banana (the metadata-stripped surfaces we care about), and our own light SDXL pass would likely defeat them the same way it defeats SynthID. Detection here stays local + signal-based (metadata + visible sparkle); do not add a bundled classifier dependency. - **SynthID v2 vs default pipeline:** the SDXL-based default profile (since May 2026) defeats SynthID v2. 
**Verified end-to-end (May 2026):** local SDXL run on a Gemini 3 Pro output, checked via the Gemini app's "Verify with SynthID" feature, returned "no SynthID watermark detected". Also confirmed against **OpenAI's** SynthID (2026-05-23): a fresh ChatGPT/gpt-image output read "SynthID detected" on openai.com/verify before the local SDXL run and "SynthID not detected" after (corpus regression chain: pos `4ef377bd` -> cleaned `47188e88`). The same configuration is used in raiw-app production (`fal-ai/fast-sdxl` at native ~1024 px, strength 0.05, steps 50). SD-1.5 dreamshaper at 768 px was previously the default and does NOT defeat v2 — verified empirically against the same feature (strength 0.04, 0.10, and elastic warp α∈{5,8} all flagged positive). That SD-1.5 path was removed; only `default` (SDXL) and `ctrlregen` profiles remain. diff --git a/README.md b/README.md index 9fe3d23..8c359f1 100644 --- a/README.md +++ b/README.md @@ -188,9 +188,10 @@ remove-ai-watermarks batch ./images/ --mode all ```bash # Identify provenance: where an image was made + its watermark inventory. -# Aggregates C2PA, IPTC "Made with AI", embedded SD/ComfyUI params, the -# SynthID proxy, the visible Gemini sparkle, and (with the [detect] extra) the -# open SD/SDXL/FLUX invisible watermark into one verdict. Reports "unknown" +# Aggregates C2PA, IPTC "Made with AI", embedded SD/ComfyUI params, EXIF/XMP +# generator tags (incl. inside AVIF/HEIF), the SynthID proxy, the visible Gemini +# sparkle, and (with the [detect] extra) the open SD/SDXL/FLUX invisible +# watermark into one verdict. Reports "unknown" # (never "clean") when no signal is found, since stripped metadata is not proof # of a clean origin. Add --json for machine-readable output. 
remove-ai-watermarks identify image.png diff --git a/src/remove_ai_watermarks/identify.py b/src/remove_ai_watermarks/identify.py index 355c37c..0fb58b4 100644 --- a/src/remove_ai_watermarks/identify.py +++ b/src/remove_ai_watermarks/identify.py @@ -26,6 +26,7 @@ from remove_ai_watermarks.metadata import ( AI_METADATA_KEYS, C2PA_UUID, IPTC_AI_MARKERS, + exif_generator, get_ai_metadata, ) from remove_ai_watermarks.noai.c2pa import extract_c2pa_info @@ -226,6 +227,14 @@ def identify(image_path: Path, *, check_visible: bool = True, check_invisible: b if platform is None: platform = "Stable Diffusion / local pipeline (Automatic1111, ComfyUI, InvokeAI)" + # ── EXIF Software / XMP CreatorTool generator (cross-format) ───── + # Catches a generator tag (incl. inside AVIF/HEIF/JXL) when there is no C2PA. + if generator_tag := exif_generator(image_path): + signals.append(Signal("exif_generator", f"EXIF/XMP generator: {generator_tag}", "high")) + watermarks.append(f"Embedded generator tag: {generator_tag}") + if platform is None: + platform = f"{generator_tag} (EXIF/XMP generator tag)" + # ── Open invisible watermark (SD / SDXL / FLUX, dwtDct) ────────── # Public decoder, no key -- a definitive embedded signal on pristine files. 
if check_invisible and (scheme := _invisible_watermark(image_path)) is not None:
@@ -237,7 +246,8 @@ def identify(image_path: Path, *, check_visible: bool = True, check_invisible: b

     # ── Verdict so far (metadata + embedded watermark) ──────────────
     invisible_wm = any(s.name == "invisible_watermark" for s in signals)
-    ai_from_metadata = bool((has_c2pa and (c2pa_is_ai or synthid)) or iptc or local_keys or invisible_wm)
+    exif_gen = any(s.name == "exif_generator" for s in signals)
+    ai_from_metadata = bool((has_c2pa and (c2pa_is_ai or synthid)) or iptc or local_keys or invisible_wm or exif_gen)

     # ── Visible Gemini sparkle (fallback for stripped-metadata case) ─
     if check_visible and (conf := _visible_sparkle(image_path)) is not None and conf >= _SPARKLE_THRESHOLD:
diff --git a/src/remove_ai_watermarks/metadata.py b/src/remove_ai_watermarks/metadata.py
index ebd6e6d..1995afe 100644
--- a/src/remove_ai_watermarks/metadata.py
+++ b/src/remove_ai_watermarks/metadata.py
@@ -183,6 +183,52 @@ def synthid_source(image_path: Path) -> str | None:
     return ", ".join(matched) if matched else None


+def exif_generator(image_path: Path) -> str | None:
+    """Return the first embedded text tag that names a known AI generator, else None.
+
+    Candidates, in order: the EXIF 0th-IFD ``Software`` / ``Artist`` / ``ImageDescription``
+    tags (PIL + piexif, any container PIL opens), then every XMP ``CreatorTool`` value found
+    by a raw scan of the first MiB (also covers HEIF/JPEG-XL that PIL can't open). A value
+    matches when it contains an ``AI_GENERATOR_TOKENS`` substring, so plain "Adobe Photoshop"
+    is not flagged -- but "imagen" also hits "Imagenomic"; NOTE(review): confirm acceptable.
+    """
+    import re
+
+    from remove_ai_watermarks.noai.constants import AI_GENERATOR_TOKENS
+
+    candidates: list[str] = []
+
+    # EXIF Software / Artist / ImageDescription (0th IFD) via PIL exif bytes.
+    try:
+        import piexif
+        from PIL import Image
+
+        with Image.open(image_path) as img:
+            exif_bytes = img.info.get("exif")
+        if exif_bytes:
+            tags = piexif.load(exif_bytes).get("0th", {})
+            for tag in (piexif.ImageIFD.Software, piexif.ImageIFD.Artist, piexif.ImageIFD.ImageDescription):
+                value = tags.get(tag)
+                if isinstance(value, bytes):
+                    candidates.append(value.decode("latin1", "replace"))
+    except Exception as exc:  # unopenable format / malformed EXIF
+        logger.debug("EXIF generator read failed for %s: %s", image_path, exc)
+
+    # XMP CreatorTool: container-agnostic raw scan (covers HEIF/JXL). XMP text is normally UTF-8.
+    try:
+        with open(image_path, "rb") as f:
+            head = f.read(1024 * 1024)
+        for match in re.finditer(rb"CreatorTool[>\"'=\s]{1,4}([^<\"']{1,80})", head):
+            candidates.append(match.group(1).decode("utf-8", "replace"))
+    except Exception as exc:
+        logger.debug("XMP CreatorTool scan failed for %s: %s", image_path, exc)
+
+    for value in candidates:
+        if any(token in value.lower() for token in AI_GENERATOR_TOKENS):
+            return value.replace("\x00", "").strip()  # EXIF ASCII values may carry NUL padding
+    return None
+
+
 def get_ai_metadata(image_path: Path) -> dict[str, str]:
     """Extract AI-related metadata from an image.

diff --git a/src/remove_ai_watermarks/noai/constants.py b/src/remove_ai_watermarks/noai/constants.py
index 8dec2e8..b1026af 100644
--- a/src/remove_ai_watermarks/noai/constants.py
+++ b/src/remove_ai_watermarks/noai/constants.py
@@ -122,6 +122,32 @@ C2PA_AI_TOOLS = {
     b"Firefly": "Firefly",
 }

+# Lowercased substrings that mark an AI generator when found in an EXIF
+# ``Software`` / XMP ``CreatorTool`` value. Conservative on purpose: plain
+# editors like "Adobe Photoshop" or "GIMP" must NOT match (no AI token), so only
+# generator names land here. Add new generators here, not inline.
+AI_GENERATOR_TOKENS: frozenset[str] = frozenset(
+    {
+        "firefly",
+        "dall-e",
+        "dalle",
+        "midjourney",
+        "stable diffusion",
+        "stable-diffusion",
+        "stablediffusion",
+        "comfyui",
+        "automatic1111",
+        "invokeai",
+        "imagen",
+        "gpt-image",
+        "nightcafe",
+        "ideogram",
+        "leonardo",
+        "flux",
+        "dreamstudio",
+    }
+)
+
 # C2PA action types
 C2PA_ACTIONS = {
     b"c2pa.created": "created",
diff --git a/tests/test_identify.py b/tests/test_identify.py
index e725bdc..ff31f5a 100644
--- a/tests/test_identify.py
+++ b/tests/test_identify.py
@@ -267,6 +267,23 @@ class TestReportSerializable:
         assert "is_ai_generated" in dumped


+class TestIdentifyExifGenerator:
+    """identify() flags an image as AI and names its platform from an EXIF AI-generator tag alone (AVIF case)."""
+
+    def test_avif_firefly_software_attributed(self, tmp_path: Path):
+        import piexif
+        from PIL import Image
+
+        exif = piexif.dump({"0th": {piexif.ImageIFD.Software: b"Adobe Firefly"}, "Exif": {}, "GPS": {}, "1st": {}})
+        path = tmp_path / "firefly.avif"
+        Image.new("RGB", (64, 64), (90, 80, 70)).save(path, exif=exif)
+        r = identify(path, check_visible=False)
+        assert r.is_ai_generated is True
+        assert r.platform is not None
+        assert "Firefly" in r.platform
+        assert any("generator tag" in w for w in r.watermarks)
+
+
 # ── Open invisible watermark (SD/SDXL/FLUX) integration ─────────────

 from remove_ai_watermarks.invisible_watermark import is_available as _wm_available  # noqa: E402
diff --git a/tests/test_metadata.py b/tests/test_metadata.py
index 43f7da3..100e3b9 100644
--- a/tests/test_metadata.py
+++ b/tests/test_metadata.py
@@ -4,12 +4,14 @@ from __future__ import annotations

 from pathlib import Path

+import piexif
 import pytest
 from PIL import Image
 from PIL.PngImagePlugin import PngInfo

 from remove_ai_watermarks.metadata import (
     _is_ai_key,
+    exif_generator,
     get_ai_metadata,
     has_ai_metadata,
     remove_ai_metadata,
@@ -332,3 +334,44 @@ class TestRemoveAiMetadata:
         result = remove_ai_metadata(tmp_clean_png, output)
         assert isinstance(result, Path)
assert result == output
+
+
+def _img_with_software(tmp_path: Path, fmt: str, software: str) -> Path:
+    """Write a tiny image carrying an EXIF Software tag."""
+    exif = piexif.dump({"0th": {piexif.ImageIFD.Software: software.encode()}, "Exif": {}, "GPS": {}, "1st": {}})
+    path = tmp_path / f"img.{fmt}"
+    Image.new("RGB", (64, 64), (100, 90, 80)).save(path, exif=exif)
+    return path
+
+
+class TestExifGenerator:
+    """exif_generator extracts AI-tool names from EXIF/XMP across formats."""
+
+    def test_avif_software_ai_tool_detected(self, tmp_path: Path):
+        path = _img_with_software(tmp_path, "avif", "Adobe Firefly")
+        assert exif_generator(path) == "Adobe Firefly"
+
+    def test_jpeg_software_ai_tool_detected(self, tmp_path: Path):
+        path = _img_with_software(tmp_path, "jpg", "ComfyUI v1.2")
+        result = exif_generator(path)
+        assert result is not None
+        assert "ComfyUI" in result
+
+    def test_plain_editor_not_flagged(self, tmp_path: Path):
+        # An ordinary editor tag carries no AI token and must not be flagged.
+        path = _img_with_software(tmp_path, "jpg", "Adobe Photoshop 25.0")
+        assert exif_generator(path) is None
+
+    def test_xmp_creatortool_scan_covers_unopenable(self, tmp_path: Path):
+        # PIL can't open this fake HEIF; the raw scan must find the CreatorTool element in the XMP packet.
+        path = tmp_path / "fake.heic"
+        path.write_bytes(
+            b"\x00\x00\x00\x18ftypheic\x00\x00\x00\x00"
+            b"<x:xmpmeta><xmp:CreatorTool>Midjourney v7</xmp:CreatorTool></x:xmpmeta>"
+        )
+        result = exif_generator(path)
+        assert result is not None
+        assert "Midjourney" in result
+
+    def test_clean_image_is_none(self, tmp_clean_png: Path):
+        assert exif_generator(tmp_clean_png) is None