Files
remove-ai-watermarks/src/remove_ai_watermarks/identify.py
T
test-user 31f0a82906 feat(metadata): detect C2PA/AIGC/IPTC manifests after a large mdat in MP4 (v0.6.8)
Provenance detection no longer relies on a fixed first-MB read. In a streaming /
non-faststart MP4 the C2PA manifest sits AFTER a multi-megabyte mdat, beyond the
1 MB scan window, so it was missed.

- isobmff.scan_c2pa_region(path): a file-seeking top-level box walker that
  returns the payloads of uuid/jumb (provenance) boxes, seeking past mdat by
  size without reading it -- works on multi-GB files. Returns b"" for
  non-ISOBMFF or on read error. Mirrors the box-size encoding of the existing
  in-memory _iter_top_level_boxes (largesize / size==0).
- metadata.scan_head(path, size): the shared input for every C2PA/AIGC/IPTC
  byte scan -- first `size` bytes plus, for ISOBMFF, the late provenance-box
  payloads. Behavior-neutral (f.read(size)) for non-ISOBMFF inputs.
- Routed all six metadata scan sites (has_ai_metadata, aigc_label,
  iptc_ai_system, synthid_source, exif_generator XMP, get_ai_metadata
  soft-binding) and identify's head read through scan_head.

6 new tests: late box found by scan_c2pa_region / scan_head, the fixed window
provably misses it, non-ISOBMFF -> b"", front-placed (faststart) regression.

The remaining gap stays documented: EXIF/XMP stored as items inside the meta
box (AVIF/HEIF stills) still needs meta-box surgery or exiftool.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-27 13:42:29 -07:00

534 lines
25 KiB
Python

"""Image provenance: identify where an image was made and what watermarks it carries.
Aggregates every locally-readable signal into a single :class:`ProvenanceReport`:
- **C2PA Content Credentials** (issuer, claim generator, digital source type) ->
the signing platform (OpenAI, Google, Adobe, Microsoft).
- **IPTC ``digitalSourceType``** "Made with AI" marker (Meta, X, others).
- **PNG text / EXIF generation parameters** (Stable Diffusion, ComfyUI, InvokeAI).
- **SynthID metadata proxy** -- a C2PA companion from a SynthID-using vendor
(Google / OpenAI) implies the invisible pixel watermark.
- **Visible Gemini sparkle** (optional; needs cv2/numpy, no GPU).
Hard limit: a stripped image (re-encoded, screenshotted, social-media upload)
loses all metadata, and the SynthID *pixel* watermark is not locally decodable
(proprietary decoder). Absence of signals is therefore reported as ``Unknown``,
never as "clean". See CLAUDE.md "SynthID detection is metadata-only".
"""
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from typing import TYPE_CHECKING
from remove_ai_watermarks.metadata import (
AI_METADATA_KEYS,
AIGC_MARKERS,
C2PA_UUID,
IPTC_AI_FIELD_MARKERS,
IPTC_AI_MARKERS,
aigc_label,
exif_generator,
get_ai_metadata,
iptc_ai_system,
scan_head,
xai_signature,
)
from remove_ai_watermarks.noai.c2pa import cbor_text_after, extract_c2pa_info, soft_binding_vendors_in
from remove_ai_watermarks.noai.constants import C2PA_AI_TOOLS, C2PA_ISSUERS
if TYPE_CHECKING:
from pathlib import Path
log = logging.getLogger(__name__)
# Size (1 MiB) of the leading byte window handed to scan_head() in identify();
# the binary C2PA issuer / IPTC / AIGC marker scans run over that buffer.
_SCAN_BYTES = 1024 * 1024
# Visible-sparkle confidence above which the signal is trusted as provenance.
# Stricter than the removal default (0.25): on the corpus, Gemini-family
# sparkles score >= 0.56 while non-sparkle images top out at 0.49, so 0.5
# cleanly separates them and avoids false positives when sparkle is the only
# signal (e.g. an OpenAI image scored 0.37 -- below threshold, correctly dropped).
_SPARKLE_THRESHOLD = 0.5
# Issuer (C2PA signer) -> human-readable generating platform. Ordered: when a
# manifest names several issuers (Microsoft Designer signs as "OpenAI,
# Microsoft"), the first match wins so the product, not the backend, is named.
# Needles are substring-matched against the space-joined issuer list.
_ISSUER_PLATFORM: tuple[tuple[str, str], ...] = (
    # Microsoft signs both Designer and Bing Image Creator; Bing now runs its
    # own MAI-Image model (not DALL-E), so the label stays model-neutral.
    ("Microsoft", "Microsoft (Bing Image Creator / Designer)"),
    ("Adobe", "Adobe Firefly"),
    ("OpenAI", "OpenAI (ChatGPT / gpt-image / DALL-E / Sora)"),
    ("Google", "Google (Gemini / Imagen)"),
    ("Stability AI", "Stability AI (Stable Image / DreamStudio)"),
)
# PNG-text / EXIF keys that indicate a local diffusion pipeline (vs. a hosted
# platform's C2PA). Subset of AI_METADATA_KEYS; excludes the C2PA/Software keys.
# The intersection keeps only names AI_METADATA_KEYS actually declares; metadata
# keys are lowercased before being tested against this set.
_LOCAL_GEN_KEYS = frozenset(
    AI_METADATA_KEYS & {"parameters", "prompt", "negative_prompt", "workflow", "comfyui", "invokeai_metadata", "dream"}
)
# User-facing caveat texts attached to a report. _STRIP_CAVEAT is appended to
# every report; the others are added only when their signal is present.
_STRIP_CAVEAT = (
    "Absence of metadata is not proof the image is clean: C2PA, EXIF, and PNG "
    "text chunks are stripped by re-encoding, screenshots, or social-media upload."
)
_SYNTHID_CAVEAT = (
    "SynthID is a metadata proxy here; the pixel watermark is not locally "
    "verifiable (proprietary decoder). Confirm via the Gemini app or openai.com/verify."
)
_OPENAI_CAVEAT = (
    "OpenAI began pairing SynthID with C2PA around 2026-05; OpenAI images from "
    "before the rollout carry C2PA without SynthID, so the SynthID verdict is 'likely'."
)
_IPTC_ONLY_CAVEAT = "The IPTC 'Made with AI' tag flags AI provenance but does not identify the specific platform."
_INVISIBLE_WM_CAVEAT = (
    "The open invisible watermark is fragile: it does not survive JPEG re-encoding "
    "or resizing, so it confirms origin only on a pristine (un-re-encoded) file."
)
@dataclass
class Signal:
    """A single provenance signal that was found (or affirmatively absent)."""

    name: str  # short machine-readable id, e.g. "c2pa", "iptc", "visible_sparkle"
    detail: str  # human-readable description of what was detected
    confidence: str  # "high" | "medium"
@dataclass
class ProvenanceReport:
    """Aggregated provenance verdict for one image."""

    path: Path  # the image that was inspected
    is_ai_generated: bool | None  # True = an AI signal was found; None = unknown; False is never asserted
    platform: str | None  # human-readable origin platform, or None when none was identified
    confidence: str  # "high" | "medium" | "none"
    # Human-readable inventory of the watermarks / provenance marks detected.
    watermarks: list[str] = field(default_factory=list[str])
    # The individual signals that produced the verdict.
    signals: list[Signal] = field(default_factory=list["Signal"])
    # User-facing limitations that apply to this verdict.
    caveats: list[str] = field(default_factory=list[str])
    # Contradictions between independent provenance signals (e.g. two different
    # AI vendors both claiming the image, or camera-capture credentials next to
    # AI-generation markers). Non-empty means the provenance is internally
    # inconsistent -- a strong tell of spoofed, transplanted, or laundered metadata.
    integrity_clashes: list[str] = field(default_factory=list[str])
def _issuers_in(data: bytes) -> list[str]:
    """Return the sorted, de-duplicated C2PA issuer names found in ``data``.

    An issuer counts as present when its signature bytes (a key of
    ``C2PA_ISSUERS``) occur anywhere in ``data``; this is a plain binary scan,
    not a manifest parse.
    """
    matched: set[str] = set()
    for signature, issuer_name in C2PA_ISSUERS.items():
        if signature in data:
            matched.add(issuer_name)
    return sorted(matched)
def _ai_tools_in(data: bytes) -> list[str]:
    """Return the sorted, de-duplicated C2PA AI-tool / generator names found in ``data``.

    PNG exposes a structured claim_generator, but for JPEG/WebP/AVIF/HEIF/JXL
    the generator sits in a JUMBF/EXIF/XMP blob the PNG parser cannot reach.
    Scanning the raw bytes for the ``C2PA_AI_TOOLS`` signatures recovers the
    same attribution there (e.g. "Imagen", "DALL-E").
    """
    matched: set[str] = set()
    for signature, tool_name in C2PA_AI_TOOLS.items():
        if signature in data:
            matched.add(tool_name)
    return sorted(matched)
# Distinctive C2PA device/camera tokens (cert CN, cert org, or claim-generator
# substrings) scanned in the manifest bytes -> platform. This is more reliable
# than mapping an issuer name (which also matches incidental mentions: a
# timestamp authority like "Truepic" in a Leica chain, an XMP-toolkit "Adobe"
# string in a Nikon file, or "Google" in a Pixel camera's cert -- all verified
# on real samples), and more robust than parsing the claim generator (which
# lives under varying CBOR keys, e.g. `claim_generator` vs `claim_generator_info`,
# and is absent on the Pixel sample where only the cert CN "Pixel Camera"
# identifies it). Camera C2PA marks CAPTURE authenticity, not AI, so these never
# assert is_ai on their own (the verdict still comes from the digital-source-type:
# the Pixel sample carries `computationalCapture`, not `trainedAlgorithmicMedia`).
# Only tokens verified against a real signed file are listed (Leica, Nikon,
# Google Pixel, Sony, Truepic Lens); add Canon/Samsung/Bria as real samples are
# captured. Entries are checked in order and the first token present wins.
_DEVICE_C2PA_PLATFORM: tuple[tuple[bytes, str], ...] = (
    (b"lc_c2pa", "Leica (camera, C2PA capture)"),
    (b"Leica Camera", "Leica (camera, C2PA capture)"),
    (b"NIKON", "Nikon (camera, C2PA capture)"),
    (b"Pixel Camera", "Google Pixel (camera, C2PA capture)"),
    # Sony uses its own ``sony.*`` C2PA assertion namespace (sony.sig / sony.cert);
    # match that, NOT bare "Sony" (which is an EXIF Make on countless photos).
    # Verified on a real Sony-signed file (Sony PXW-Z300, signer "Sony Corporation").
    (b"sony.sig", "Sony (camera, C2PA capture)"),
    (b"sony.cert", "Sony (camera, C2PA capture)"),
    # "Truepic_Lens" (from the Lens SDK claim generator), NOT bare "Truepic" --
    # Truepic is a C2PA signing authority whose name appears in the trust chain
    # of unrelated manifests (e.g. OpenAI), so the bare token mis-attributes.
    (b"Truepic_Lens", "Truepic Lens (verified capture)"),
)
def _device_platform(head: bytes) -> str | None:
    """Return the platform for the first device/camera token found in ``head``.

    Tokens are tried in ``_DEVICE_C2PA_PLATFORM`` order; None is returned when
    none of them occurs in the manifest bytes.
    """
    hits = (label for marker, label in _DEVICE_C2PA_PLATFORM if marker in head)
    return next(hits, None)
def _attribute_platform(issuers: list[str], *, is_ai: bool = True) -> str | None:
    """Translate C2PA issuer names into a human-readable generating platform.

    A concrete AI-generator platform (Adobe Firefly, OpenAI, ...) is only named
    when the content really is AI (``is_ai``, i.e. digital-source-type
    ``trainedAlgorithmicMedia``). Without that, an issuer-name byte match is
    probably incidental -- an "Adobe XMP" toolkit string in a Canon/Sony camera
    capture, or a "Google" cert org -- so a neutral signer label is returned
    instead of mislabelling a camera photo as "Adobe Firefly". Genuine
    Firefly/OpenAI/Google AI output carries the AI source-type and is therefore
    unaffected.

    Args:
        issuers: Issuer names taken from the manifest (may be empty).
        is_ai: Whether the content is AI-generated. Defaults to True so the
            issuer->platform mapping can be unit-tested in isolation;
            ``identify`` passes the file's actual ``c2pa_is_ai``.

    Returns:
        The first matching ``_ISSUER_PLATFORM`` label when ``is_ai``; otherwise
        (or when nothing matches) a neutral "C2PA signer: ..." label if any
        issuer is known, else None.
    """
    haystack = " ".join(issuers)
    if is_ai:
        known = next((label for needle, label in _ISSUER_PLATFORM if needle in haystack), None)
        if known is not None:
            return known
    if not issuers:
        return None
    # e.g. Truepic alone -- a signing authority, not a generator
    return f"C2PA signer: {', '.join(issuers)} (no known AI generator named)"
# Coarse origin-vendor normalization for integrity-clash detection. Two signals
# that resolve to the SAME key are consistent (a C2PA "Google (Gemini)" issuer
# and a SynthID-Google proxy, or Adobe Firefly + its Adobe TrustMark soft
# binding); two DIFFERENT keys from independent generator stamps are a
# contradiction (a C2PA OpenAI manifest on an image whose EXIF says "Ideogram
# AI"). Substring match on the lowercased platform/detail string; first hit wins,
# so order specific tokens before brand umbrellas where they overlap.
# Tokens must themselves be lowercase: _vendor_of lowercases the text it
# searches, so a token containing an uppercase letter could never match.
_AI_VENDOR_TOKENS: tuple[tuple[str, str], ...] = (
    ("gpt-image", "OpenAI"),
    ("dall", "OpenAI"),
    ("sora", "OpenAI"),
    ("openai", "OpenAI"),
    ("gemini", "Google"),
    ("imagen", "Google"),
    ("nano banana", "Google"),
    ("google", "Google"),
    ("firefly", "Adobe"),
    ("adobe", "Adobe"),
    ("bing", "Microsoft"),
    ("designer", "Microsoft"),
    ("microsoft", "Microsoft"),
    ("stability", "Stability AI"),
    ("stable diffusion", "Stability AI"),
    ("sdxl", "Stability AI"),
    ("ideogram", "Ideogram"),
    ("grok", "xAI"),
    ("aurora", "xAI"),
    ("xai", "xAI"),
)
def _vendor_of(text: str | None) -> str | None:
    """Reduce a platform/generator string to its coarse origin-vendor key.

    The text is lowercased and checked against ``_AI_VENDOR_TOKENS`` in order;
    the vendor of the first token contained in it is returned. Empty or None
    input, or text matching no token, yields None.
    """
    if not text:
        return None
    lowered = text.lower()
    return next((vendor_key for needle, vendor_key in _AI_VENDOR_TOKENS if needle in lowered), None)
def _integrity_clashes(
ai_vendors: dict[str, str], camera_label: str | None, *, camera_has_ai_marker: bool
) -> list[str]:
"""Surface contradictions between independent provenance signals.
Args:
ai_vendors: family name -> normalized AI-origin vendor, one entry per
generator-stamped signal (C2PA issuer when the source is AI, SynthID
proxy, EXIF/XMP generator tag, IPTC AISystemUsed, xAI, AIGC label).
camera_label: a camera/verified-capture C2PA device platform, if one was
identified (Pixel, Leica, Sony, Nikon, Truepic), else None.
camera_has_ai_marker: True when an AI-generation stamp coexists with the
camera credentials.
Returns:
Human-readable clash descriptions; empty when the signals agree.
"""
clashes: list[str] = []
by_vendor: dict[str, list[str]] = {}
for family, vendor in ai_vendors.items():
by_vendor.setdefault(vendor, []).append(family)
if len(by_vendor) >= 2:
parts = [f"{vendor} (via {', '.join(sorted(fams))})" for vendor, fams in sorted(by_vendor.items())]
clashes.append(
"Conflicting AI-origin attributions from independent signals: "
+ " vs ".join(parts)
+ " -- one provenance set was likely spoofed, transplanted, or laundered."
)
if camera_label and camera_has_ai_marker:
vendors = ", ".join(sorted(set(ai_vendors.values()))) or "present"
clashes.append(
f"Camera-capture C2PA credentials ({camera_label}) coexist with AI-generation markers "
f"({vendors}) -- a genuine camera capture is not AI-generated, so the provenance is inconsistent."
)
return clashes
def _visible_sparkle(image_path: Path) -> float | None:
    """Return the visible Gemini-sparkle confidence for ``image_path``, or None.

    The detector is optional (cv2/numpy, no GPU) and lives in gemini_engine so
    this module stays dependency-light. If importing it fails for any reason
    (cv2 or the engine assets missing) the failure is logged at debug level and
    None is returned; otherwise the detector's own result is passed through
    unchanged.
    """
    try:
        from remove_ai_watermarks.gemini_engine import detect_sparkle_confidence
    except Exception as import_error:  # cv2/engine assets missing
        log.debug("visible-sparkle detector unavailable: %s", import_error)
        return None
    else:
        return detect_sparkle_confidence(image_path)
def _invisible_watermark(image_path: Path) -> str | None:
    """Return the open invisible-watermark scheme name (SD/SDXL/FLUX), or None.

    Thin wrapper over ``detect_invisible_watermark``, imported lazily because
    the imwatermark decoder is optional (extra ``detect``). The detector is
    expected to return None when the decoder is not installed or no known
    watermark decodes.
    """
    from remove_ai_watermarks.invisible_watermark import detect_invisible_watermark

    scheme_name = detect_invisible_watermark(image_path)
    return scheme_name
def _trustmark(image_path: Path) -> str | None:
    """Return the Adobe TrustMark scheme name, or None.

    Thin wrapper over ``detect_trustmark``, imported lazily because the
    ``trustmark`` decoder is optional (extra ``trustmark``). The detector is
    expected to return None when the decoder is not installed or no TrustMark
    watermark decodes.
    """
    from remove_ai_watermarks.trustmark_detector import detect_trustmark

    scheme_name = detect_trustmark(image_path)
    return scheme_name
def identify(image_path: Path, *, check_visible: bool = True, check_invisible: bool = True) -> ProvenanceReport:
    """Identify an image's origin platform and watermark inventory.

    Signals are evaluated in a fixed order (C2PA, SynthID proxy, soft binding,
    IPTC, AIGC label, local generation parameters, EXIF/XMP generator tag, xAI
    signature, open invisible watermark, TrustMark, visible sparkle).
    ``platform`` is first-wins: the C2PA section sets it when a manifest is
    present, and every later signal only fills it in while it is still None.

    Args:
        image_path: Path to the image (PNG, JPEG, WebP, or ISOBMFF container).
        check_visible: Also run the visible Gemini-sparkle detector (cv2). Set
            False for a pure-metadata, dependency-light scan.
        check_invisible: Also decode open invisible watermarks (SD/SDXL/FLUX) via
            the optional imwatermark library, and the Adobe TrustMark watermark.
            No-op when the decoders are not installed.

    Returns:
        A :class:`ProvenanceReport`. ``is_ai_generated`` is True when an
        AI-asserting signal is found and None (unknown) otherwise -- it is never
        asserted False, because stripped metadata leaves no local proof of a
        clean origin. A C2PA manifest without an AI source type or SynthID
        proxy, a soft binding, and a TrustMark watermark are inventoried but do
        not assert AI origin on their own. ``confidence`` is "high" for
        metadata / embedded-watermark evidence, "medium" when the visible
        sparkle is the only evidence, and "none" otherwise.
    """
    info = extract_c2pa_info(image_path)  # PNG-structured; {} for other formats
    meta = get_ai_metadata(image_path)  # PNG text + EXIF + C2PA fields + synthid
    # First MB covers C2PA (PNG caBX, JPEG APP11, AVIF/HEIF/JXL uuid box) and
    # IPTC markers for the non-PNG path where extract_c2pa_info returns {}.
    # scan_head also seeks out late ISOBMFF provenance boxes (manifest after a
    # large mdat in a streaming MP4) that a fixed first-MB read would miss.
    head = scan_head(image_path, _SCAN_BYTES)
    signals: list[Signal] = []
    watermarks: list[str] = []
    caveats: list[str] = []
    # One normalized origin vendor per generator-stamped signal, for integrity-
    # clash detection (see _integrity_clashes). Visible sparkle and the open
    # invisible watermark are deliberately excluded: the former is a fuzzy visual
    # score, the latter can be a by-product of our own SDXL removal pass, so
    # neither is a trustworthy "the generator stamped its identity" claim.
    ai_vendor_claims: dict[str, str] = {}
    camera_label = _device_platform(head)
    # ── C2PA Content Credentials ────────────────────────────────────
    has_c2pa = bool(info) or b"c2pa" in head.lower() or C2PA_UUID in head
    # Prefer the structured issuer; fall back to the byte scan of the head.
    issuers = [info["issuer"]] if info.get("issuer") else _issuers_in(head)
    # AI source type: from the structured field, or either marker in the raw bytes.
    c2pa_is_ai = "trainedAlgorithmicMedia" in info.get("source_type", "") or any(
        m in head for m in (b"trainedAlgorithmicMedia", b"compositeWithTrainedAlgorithmicMedia")
    )
    # Generator string (for the signal detail): structured for PNG, CBOR-scanned
    # for other containers. Best-effort -- some manifests key it as
    # `claim_generator_info` (Pixel), so this can be None even when a device is
    # identified by `_device_platform`.
    generator = (
        info.get("claim_generator")
        or cbor_text_after(head, b"claim_generator")
        or (", ".join(tools) if (tools := _ai_tools_in(head)) else None)
    )
    # Platform: a distinctive device/camera token in the manifest wins (it is the
    # signer/producer), with the issuer byte-scan only as fallback. The issuer
    # scan alone mis-attributed real samples (Leica->Truepic timestamp authority,
    # Nikon->Adobe namespace, Pixel->Google Gemini) -- the device scan fixes that.
    platform = (camera_label or _attribute_platform(issuers, is_ai=c2pa_is_ai)) if has_c2pa else None
    if has_c2pa:
        # filter(None, ...) drops the empty / missing parts of the detail string.
        detail = ", ".join(filter(None, [", ".join(issuers), generator, info.get("source_type")]))
        signals.append(Signal("c2pa", detail or "C2PA manifest present", "high"))
        watermarks.append(f"C2PA Content Credentials ({', '.join(issuers) or 'unknown signer'})")
        # Record the AI-origin vendor for clash detection only when the source is
        # actually AI -- classify the issuer attribution / generator, NOT the
        # resolved `platform` (which may be a camera device token whose label,
        # e.g. "Google Pixel", would mis-normalize to an AI vendor).
        if c2pa_is_ai and (v := (_vendor_of(_attribute_platform(issuers, is_ai=True)) or _vendor_of(generator))):
            ai_vendor_claims["c2pa"] = v
    # ── SynthID metadata proxy ──────────────────────────────────────
    # get_ai_metadata already sets synthid_watermark for both PNG (caBX parser)
    # and non-PNG (its own synthid_source fallback), so no extra scan is needed.
    synthid = meta.get("synthid_watermark")
    if synthid:
        watermarks.append(f"SynthID pixel watermark ({synthid})")
        caveats.append(_SYNTHID_CAVEAT)
        # The OpenAI rollout caveat applies if either the issuer list or the
        # SynthID source string names OpenAI.
        if "OpenAI" in (" ".join(issuers) + synthid):
            caveats.append(_OPENAI_CAVEAT)
        if v := _vendor_of(synthid):
            ai_vendor_claims["synthid"] = v
    # ── C2PA soft-binding: a named forensic/third-party watermark vendor ─
    # (Adobe TrustMark, Digimarc, Imatag, ...). Present in the manifest even when
    # the watermark itself can't be decoded; names whose watermark stamped the pixels.
    soft_binding = meta.get("soft_binding") or (", ".join(v) if (v := soft_binding_vendors_in(head)) else None)
    if soft_binding:
        signals.append(Signal("soft_binding", f"C2PA soft binding: {soft_binding}", "high"))
        watermarks.append(f"Forensic watermark soft binding ({soft_binding})")
    # ── IPTC "Made with AI" (Meta etc.), only meaningful without C2PA ─
    # The signal is reported only without C2PA, but `iptc` itself still feeds
    # the AI verdict below regardless of has_c2pa.
    iptc = any(m in head for m in IPTC_AI_MARKERS)
    if iptc and not has_c2pa:
        signals.append(Signal("iptc", "digitalSourceType (Made with AI)", "high"))
        watermarks.append("IPTC digitalSourceType (Made with AI)")
        caveats.append(_IPTC_ONLY_CAVEAT)
        if platform is None:
            platform = "Made-with-AI tag (e.g. Meta AI); platform not specified"
    # ── IPTC 2025.1 AI-disclosure fields (Iptc4xmpExt:AISystemUsed etc.) ─
    iptc_ai = any(m in head for m in IPTC_AI_FIELD_MARKERS)
    if iptc_ai:
        system = iptc_ai_system(image_path)
        # "fields present" is treated as a placeholder, not a named system.
        named = bool(system) and system != "fields present"
        signals.append(
            Signal("iptc_ai_system", f"IPTC AI disclosure ({system})" if named else "IPTC AI disclosure fields", "high")
        )
        watermarks.append(f"IPTC 2025.1 AI disclosure ({system})" if named else "IPTC 2025.1 AI disclosure fields")
        if platform is None and named:
            platform = f"{system} (IPTC AISystemUsed)"
        if named and (v := _vendor_of(system)):
            ai_vendor_claims["iptc_ai_system"] = v
    # ── China TC260 AIGC label (Doubao and other China-served gens) ──
    aigc = any(m in head for m in AIGC_MARKERS)
    if aigc:
        producer = (aigc_label(image_path) or {}).get("ContentProducer", "")
        signals.append(Signal("aigc", f"TC260 AIGC label{f' (producer {producer})' if producer else ''}", "high"))
        watermarks.append("China AIGC label (TC260 standard)")
        if platform is None:
            platform = "China AIGC-labeled generator (TC260; e.g. Doubao)"
        # Fixed vendor key: the label is its own origin family for clash detection.
        ai_vendor_claims["aigc"] = "China AIGC (TC260)"
    # ── Local diffusion parameters (Stable Diffusion / ComfyUI) ──────
    local_keys = sorted(k for k in meta if k.lower() in _LOCAL_GEN_KEYS)
    if local_keys:
        signals.append(Signal("gen_params", f"embedded keys: {', '.join(local_keys)}", "high"))
        watermarks.append("Embedded generation parameters (Stable Diffusion / ComfyUI)")
        if platform is None:
            platform = "Stable Diffusion / local pipeline (Automatic1111, ComfyUI, InvokeAI)"
    # ── EXIF Software / XMP CreatorTool generator (cross-format) ─────
    # Catches a generator tag (incl. inside AVIF/HEIF/JXL) when there is no C2PA.
    if generator_tag := exif_generator(image_path):
        signals.append(Signal("exif_generator", f"EXIF/XMP generator: {generator_tag}", "high"))
        watermarks.append(f"Embedded generator tag: {generator_tag}")
        if platform is None:
            platform = f"{generator_tag} (EXIF/XMP generator tag)"
        if v := _vendor_of(generator_tag):
            ai_vendor_claims["exif_generator"] = v
    # ── xAI / Grok EXIF signature scheme (no C2PA/SynthID/IPTC) ──────
    # Grok's only provenance signal: EXIF ImageDescription "Signature: <base64>"
    # + a UUID Artist. Distinct from exif_generator (which matches generator
    # tokens); verified stable across 3 generations. See CLAUDE.md.
    if xai_signature(image_path):
        signals.append(Signal("xai_signature", "EXIF Signature blob + UUID Artist", "high"))
        watermarks.append("xAI/Grok EXIF signature")
        if platform is None:
            platform = "xAI (Grok / Aurora)"
        ai_vendor_claims["xai"] = "xAI"
    # ── Open invisible watermark (SD / SDXL / FLUX, dwtDct) ──────────
    # Public decoder, no key -- a definitive embedded signal on pristine files.
    if check_invisible and (scheme := _invisible_watermark(image_path)) is not None:
        signals.append(Signal("invisible_watermark", scheme, "high"))
        watermarks.append(f"Open invisible watermark: {scheme}")
        caveats.append(_INVISIBLE_WM_CAVEAT)
        if platform is None:
            platform = f"{scheme} (open DWT-DCT watermark)"
    # ── Adobe TrustMark invisible watermark (open decoder, no key) ───
    # The watermark behind Adobe Durable Content Credentials. Decoded locally,
    # but it binds provenance for human-authored content too, so it enriches the
    # watermark inventory without by itself asserting AI origin.
    if check_invisible and (tm_scheme := _trustmark(image_path)) is not None:
        signals.append(Signal("trustmark", tm_scheme, "high"))
        watermarks.append(f"Adobe TrustMark invisible watermark ({tm_scheme})")
        if platform is None:
            platform = "Adobe (TrustMark / Content Credentials)"
    # ── Verdict so far (metadata + embedded watermark) ──────────────
    invisible_wm = any(s.name == "invisible_watermark" for s in signals)
    exif_gen = any(s.name == "exif_generator" for s in signals)
    xai_sig = any(s.name == "xai_signature" for s in signals)
    # C2PA counts only with an AI source type or a SynthID proxy; soft binding
    # and TrustMark are intentionally absent from this list.
    ai_from_metadata = bool(
        (has_c2pa and (c2pa_is_ai or synthid))
        or iptc
        or iptc_ai
        or aigc
        or local_keys
        or invisible_wm
        or exif_gen
        or xai_sig
    )
    # ── Visible Gemini sparkle (fallback for stripped-metadata case) ─
    if check_visible and (conf := _visible_sparkle(image_path)) is not None and conf >= _SPARKLE_THRESHOLD:
        signals.append(Signal("visible_sparkle", f"NCC confidence {conf:.2f}", "medium"))
        watermarks.append(f"Visible Gemini sparkle (confidence {conf:.2f})")
        if platform is None:
            platform = "Google Gemini family (visible sparkle detected)"
    visible_only = any(s.name == "visible_sparkle" for s in signals) and not ai_from_metadata
    # Metadata / embedded evidence -> high; sparkle alone -> medium; nothing -> unknown.
    if ai_from_metadata:
        is_ai: bool | None = True
        confidence = "high"
    elif visible_only:
        is_ai = True
        confidence = "medium"
    else:
        is_ai = None
        confidence = "none"
    # ── Integrity clashes: contradictions between independent signals ─
    clashes = _integrity_clashes(ai_vendor_claims, camera_label, camera_has_ai_marker=bool(ai_vendor_claims))
    # The strip caveat applies to every report, whatever was found.
    caveats.append(_STRIP_CAVEAT)
    # De-duplicate while preserving order.
    caveats = list(dict.fromkeys(caveats))
    return ProvenanceReport(
        path=image_path,
        is_ai_generated=is_ai,
        platform=platform,
        confidence=confidence,
        watermarks=watermarks,
        signals=signals,
        caveats=caveats,
        integrity_clashes=clashes,
    )