From ede35a3db503ccfb76563c94664e597df73e1d99 Mon Sep 17 00:00:00 2001 From: test-user Date: Sun, 24 May 2026 18:38:56 -0700 Subject: [PATCH] feat(metadata): read EXIF Make tag; collect Ideogram/Recraft/Krea-FLUX Collected live samples from three popular generators we lacked: - Ideogram tags its downloads with EXIF Make="Ideogram AI" (no C2PA, no SynthID, no imwatermark) -- the Make tag is its only signal. exif_generator only read Software/Artist/ImageDescription, so it missed this; now reads Make too. Real cameras put "Apple"/"Canon" in Make (no AI token), so this stays low-false-positive. 4 originals ingested. - Recraft (PNG export) and Krea hosting FLUX 2: downloads carry NO detectable signal -- no C2PA/EXIF/IPTC, and notably no imwatermark despite Krea running FLUX. identify correctly reports 'unknown'. Both ingested as neg fixtures. Lesson recorded in CLAUDE.md: the imwatermark detector fires only on pristine output from a pipeline that runs the encoder (diffusers default, official BFL), not from re-hosts (Krea/Stability) or re-encoded exports (Recraft/Canva). Co-Authored-By: Claude Opus 4.7 (1M context) --- CLAUDE.md | 4 +++- data/synthid_corpus/manifest.csv | 6 ++++++ src/remove_ai_watermarks/metadata.py | 10 +++++++++- tests/test_metadata.py | 15 +++++++++++++++ 4 files changed, 33 insertions(+), 2 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index ccfbe77..15bf905 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -27,7 +27,7 @@ You are a **principal Python engineer** maintaining a CLI tool and library for r - `noai/c2pa.py` — PNG chunk parser; use `extract_c2pa_chunk(path)` to get raw caBX payload, `has_c2pa_metadata(path)` to detect. Do not reimplement chunk parsing. `extract_c2pa_info(path)` sets `synthid_watermark`/`synthid_vendors` when the manifest is signed by a SynthID-using vendor. - `noai/constants.py` — PNG_SIGNATURE, C2PA_CHUNK_TYPE, C2PA_SIGNATURES, C2PA_ISSUERS, and `SYNTHID_C2PA_ISSUERS` (issuers that pair SynthID with C2PA: Google, OpenAI). 
Add a new issuer here, not inline. - `metadata.py` — `synthid_source(path)` returns the vendor name(s) if the C2PA manifest implies a SynthID pixel watermark, else None. Format-agnostic: PNG via the caBX parser, JPEG/WebP/AVIF/HEIF/JXL via a binary scan (C2PA marker + SynthID issuer + AI-source marker). `get_ai_metadata` surfaces the verdict, and `metadata --check` prints it as a callout. Both `get_ai_metadata` and `has_ai_metadata` guard the PIL open with `except Exception` (HEIC/unknown formats raise non-OSError) and fall through to the binary scan. -- `identify.py` — `identify(path)` aggregates every locally-readable signal (C2PA issuer→platform, IPTC "Made with AI", embedded SD/ComfyUI params, SynthID proxy, visible Gemini sparkle) into one `ProvenanceReport`. `is_ai_generated` is True or None (never asserted False — stripped metadata is not proof of clean origin). Visible-sparkle is promoted only at confidence ≥ `_SPARKLE_THRESHOLD` (0.5; corpus-tuned to separate Gemini sparkles ≥0.56 from non-sparkle ≤0.49). The cv2 dependency lives in `gemini_engine.detect_sparkle_confidence`, not here. Add platform mappings to `_ISSUER_PLATFORM`, not inline. For non-PNG containers (JPEG/WebP/AVIF/HEIF/JXL) the caBX parser returns nothing, so issuer (`_issuers_in`) and generator (`_ai_tools_in`, reusing `C2PA_AI_TOOLS`) are recovered by binary-scanning the first MB. EXIF `Software` / XMP `CreatorTool` generator tags are read by `metadata.exif_generator` (PIL+piexif for any format PIL opens incl. AVIF, plus a container-agnostic XMP raw-byte scan that also covers HEIF/JXL), matched against `AI_GENERATOR_TOKENS` so ordinary editors (plain "Adobe Photoshop") are not flagged. +- `identify.py` — `identify(path)` aggregates every locally-readable signal (C2PA issuer→platform, IPTC "Made with AI", embedded SD/ComfyUI params, SynthID proxy, visible Gemini sparkle) into one `ProvenanceReport`. 
`is_ai_generated` is True or None (never asserted False — stripped metadata is not proof of clean origin). Visible-sparkle is promoted only at confidence ≥ `_SPARKLE_THRESHOLD` (0.5; corpus-tuned to separate Gemini sparkles ≥0.56 from non-sparkle ≤0.49). The cv2 dependency lives in `gemini_engine.detect_sparkle_confidence`, not here. Add platform mappings to `_ISSUER_PLATFORM`, not inline. For non-PNG containers (JPEG/WebP/AVIF/HEIF/JXL) the caBX parser returns nothing, so issuer (`_issuers_in`) and generator (`_ai_tools_in`, reusing `C2PA_AI_TOOLS`) are recovered by binary-scanning the first MB. EXIF `Software` / `Make` / `Artist` / `ImageDescription` and XMP `CreatorTool` generator tags are read by `metadata.exif_generator` (PIL+piexif for any format PIL opens incl. AVIF, plus a container-agnostic XMP raw-byte scan that also covers HEIF/JXL), matched against `AI_GENERATOR_TOKENS` so ordinary editors (plain "Adobe Photoshop") and real-camera `Make` ("Apple"/"Canon") are not flagged. **Ideogram tags its output with EXIF `Make="Ideogram AI"`** (verified on a real download 2026-05-24) — that's why `Make` is read. - `gemini_engine.py` — visible Gemini-sparkle remover/detector (cv2/numpy, no GPU). `detect_sparkle_confidence(path)` is the file-level entry point used by `identify.py`. - `invisible_watermark.py` — `detect_invisible_watermark(path)` decodes the OPEN DWT-DCT watermarks (public decoder, no key) embedded by Stable Diffusion / SDXL / FLUX via the `imwatermark` library. Known fixed patterns (verified against upstream source) live in `_BITS_48` (SDXL 48-bit, FLUX.2 48-bit) and `_SD1_STRING` ("StableDiffusionV1", SD 1.x/2.x). Optional dep (extra `detect`); returns None when absent. **Unlike SynthID this is locally detectable**, but the watermark is fragile (does not survive JPEG re-encode/resize — verified gone after JPEG q90), so it confirms origin only on pristine files. Add new known patterns here. 
The file carries a top-of-module pyright pragma because imwatermark/cv2 ship no type stubs. - `face_protector.py` — YOLO detect + soft-blend pattern; mirror this for any "protect region during diffusion" features @@ -37,6 +37,8 @@ You are a **principal Python engineer** maintaining a CLI tool and library for r Who embeds what, and whether it is locally detectable (so we know which gaps are fillable). See `identify.py` for what we read. - **Locally detectable (open decoder, no key/API):** Stable Diffusion / SDXL / FLUX via `imwatermark` DWT-DCT (now covered by `invisible_watermark.py`). FLUX uses the same library (`black-forest-labs/flux2` `src/flux2/watermark.py`, 48-bit `0b001010101111111010000111100111001111010100101110`); SDXL is the diffusers `WATERMARK_MESSAGE` (`0b101100111110110010010000011110111011000110011110`). Caveat: fragile to re-encoding. - **C2PA / IPTC (covered by the issuer/marker scan):** OpenAI, Google, Adobe Firefly, Microsoft (Designer + **Bing Image Creator** — collected 2026-05-24; Bing now runs Microsoft's own **MAI-Image** model, signs C2PA as "Microsoft", NOT OpenAI/DALL-E), and **Stability AI** (collected from Brand Studio / DreamStudio successor; signs C2PA as "Stability AI Ltd", no SynthID, no imwatermark on its current Stable Image model — issuer added to `C2PA_ISSUERS`). Still unsampled: Canva (its downloads are re-encoded design *exports* that strip C2PA, so a Canva "positive" is inconclusive — skipped), Getty, Shutterstock. Midjourney embeds NO C2PA and no invisible watermark (our `mj-*` sample carried only the IPTC tag). +- **EXIF/XMP generator tag (caught by `exif_generator`):** **Ideogram** writes EXIF `Make="Ideogram AI"` (collected 2026-05-24 — no C2PA, no SynthID, no imwatermark; the Make tag is the only signal). 
+- **No detectable signal on download (correctly reported `unknown`):** **Recraft** (PNG export is a re-encoded design export — strips everything), **Krea hosting FLUX 2** (no imwatermark despite FLUX — the host omits the encoder, same as Stability's hosted SDXL), and Midjourney (no C2PA and no invisible watermark; our `mj-*` sample carried only the IPTC tag). Lesson: the imwatermark detector only fires on *pristine* output from a pipeline that runs the encoder (diffusers default, official BFL), not from re-hosts (Krea/Stability) or re-encoded exports (Recraft/Canva). - **Invisible but NOT locally detectable (proprietary, API/oracle only — same wall as SynthID):** Amazon Titan Image Generator + Nova Canvas (Bedrock `DetectGeneratedContent` API), Kakao (new SynthID image adopter, May 2026), NVIDIA Cosmos (SynthID video). No local detector possible; treat like SynthID. ## Known limitations diff --git a/data/synthid_corpus/manifest.csv b/data/synthid_corpus/manifest.csv index 9fa589c..5f3b1e4 100644 --- a/data/synthid_corpus/manifest.csv +++ b/data/synthid_corpus/manifest.csv @@ -110,3 +110,9 @@ f3ebe4683fc40aa2a0b80cc3ced3dc8062385ae32b488f4b33cb2330949e658d,f3ebe468-leonar 12775f4c0aec1ecd615e88c4941a788a053d0258d08d31ad34d330d3e21969bd,12775f4c-grok_original.jpg,neg,Grok (xAI),aurora,784,1168,jpeg,,,none,2026-05-24T22:47:46Z,full-res original via Download button; xAI non-adopter; NO C2PA/IPTC/SynthID (has benign EXIF block) 74b605deb102d74443a182f70fbea34a6a470bed381d62ae0bfa0083a6ecc5a3,74b605de-_e1f7984d-98e7-4cdc-91d2-f72314a4bc56.jpeg,neg,Bing Image Creator,MAI-Image-2e,1024,1024,jpeg,,,c2pa-metadata,2026-05-25T00:10:28Z,C2PA issuer Microsoft; non-SynthID; issuer-coverage fixture f9bb6b1039ee228221d2eead7daebd46422a36a4a8959a74fa0198e1aa19f9b8,f9bb6b10-a_red_ceramic_coffee_mug_on_a_wooden_table_soft_morning_light_photorealistic_20260525000725_04.png,neg,Stability Brand Studio,Stable Image,1024,1024,png,Stability AI,,c2pa-metadata,2026-05-25T00:10:31Z,C2PA issuer Stability AI; non-SynthID; no imwatermark 
+23a06e0067930a9ad09a803f634414eb8ca0234fc0bbaca8d33f138745175e2e,23a06e00-a-red-ceramic-coffee-mug-on-a-wooden-table--soft-m.png,neg,Recraft V4.1,recraft-v4,896,1152,png,,,none,2026-05-25T01:21:35Z,"PNG export, no C2PA/EXIF/IPTC/imwatermark; clean download" +4f3fea84c24fda8d72a6e7bfbae31827348d9edc755ec9a5e8c835860130de57,4f3fea84-a_red_ceramic_coffee_mug_on_a_wooden_table_soft_morning_light_photorealistic_67qgxtyin9kjk5yb5mph_0.png,neg,Krea (FLUX 2 host),flux-2-dev via Krea,832,1248,png,,,none,2026-05-25T01:21:38Z,no imwatermark despite FLUX; Krea host strips/omits; clean download +7bdff2c3131cdc21783282a8a95c88f4b14ad38494a1041d0d68691af7fa5cd8,7bdff2c3-0001_1_a-red-ceramic-coffee-mug-sits-on-weather_D3BRZfOyUwuh5Emjtu8v4g_F7yI1GFGTayDA1NAUmkraw_cover.jpeg,neg,Ideogram,ideogram-3,1024,1024,jpeg,,,none,2026-05-25T01:36:38Z,EXIF Make='Ideogram AI'; no C2PA/SynthID; caught via exif_generator +ca2162624607b8f030c63cb1371d6afdd3f0d7af3d9b17514f7ae64a2e8f378c,ca216262-0001_2_a-red-ceramic-coffee-mug-sits-on-weather_rs0Lt1beXV-Mwjs33vWKVw_F7yI1GFGTayDA1NAUmkraw.jpeg,neg,Ideogram,ideogram-3,1024,1024,jpeg,,,none,2026-05-25T01:36:38Z,EXIF Make='Ideogram AI'; no C2PA/SynthID; caught via exif_generator +8a90f2ddf83ceedd7944531d27248191c79dc321ec76ea1f36df6cff9f6b8916,8a90f2dd-0001_3_a-red-ceramic-coffee-mug-sits-on-weather_Lm96lBwcXsmdeCfCpm52vw_F7yI1GFGTayDA1NAUmkraw.jpeg,neg,Ideogram,ideogram-3,1024,1024,jpeg,,,none,2026-05-25T01:36:38Z,EXIF Make='Ideogram AI'; no C2PA/SynthID; caught via exif_generator +87fca198d5c436b27a16685972802a2ff0a310a941248c9ec8d81560d7f387a1,87fca198-0001_4_a-red-ceramic-coffee-mug-sits-on-weather_pGmlgcH4XqSaq41hc0a9AQ_F7yI1GFGTayDA1NAUmkraw.jpeg,neg,Ideogram,ideogram-3,1024,1024,jpeg,,,none,2026-05-25T01:36:38Z,EXIF Make='Ideogram AI'; no C2PA/SynthID; caught via exif_generator diff --git a/src/remove_ai_watermarks/metadata.py b/src/remove_ai_watermarks/metadata.py index 1995afe..6263967 100644 --- a/src/remove_ai_watermarks/metadata.py +++ 
b/src/remove_ai_watermarks/metadata.py @@ -207,7 +207,15 @@ def exif_generator(image_path: Path) -> str | None: exif_bytes = img.info.get("exif") if exif_bytes: tags = piexif.load(exif_bytes).get("0th", {}) - for tag in (piexif.ImageIFD.Software, piexif.ImageIFD.Artist, piexif.ImageIFD.ImageDescription): + # Make catches camera-style tags AI tools reuse (Ideogram writes + # Make="Ideogram AI"); real cameras put "Apple"/"Canon" there, which + # carry no AI token, so this stays low-false-positive. + for tag in ( + piexif.ImageIFD.Software, + piexif.ImageIFD.Make, + piexif.ImageIFD.Artist, + piexif.ImageIFD.ImageDescription, + ): value = tags.get(tag) if isinstance(value, bytes): candidates.append(value.decode("latin1", "replace")) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 100e3b9..268ab45 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -362,6 +362,21 @@ class TestExifGenerator: path = _img_with_software(tmp_path, "jpg", "Adobe Photoshop 25.0") assert exif_generator(path) is None + def test_make_tag_ai_tool_detected(self, tmp_path: Path): + # Ideogram tags its output with EXIF Make="Ideogram AI" (verified on a + # real download), so the Make tag must be read too. + exif = piexif.dump({"0th": {piexif.ImageIFD.Make: b"Ideogram AI"}, "Exif": {}, "GPS": {}, "1st": {}}) + path = tmp_path / "ideogram.jpg" + Image.new("RGB", (64, 64)).save(path, exif=exif) + assert exif_generator(path) == "Ideogram AI" + + def test_camera_make_not_flagged(self, tmp_path: Path): + # A real camera Make ("Apple") carries no AI token -> not flagged. + exif = piexif.dump({"0th": {piexif.ImageIFD.Make: b"Apple"}, "Exif": {}, "GPS": {}, "1st": {}}) + path = tmp_path / "iphone.jpg" + Image.new("RGB", (64, 64)).save(path, exif=exif) + assert exif_generator(path) is None + def test_xmp_creatortool_scan_covers_unopenable(self, tmp_path: Path): # PIL can't open this fake HEIF; the raw XMP CreatorTool scan still works. path = tmp_path / "fake.heic"