Files
remove-ai-watermarks/src/remove_ai_watermarks/metadata.py
T
test-user ba94de8275 feat: strip AI-provenance EXIF tags on removal (v0.5.6)
remove_ai_metadata now scrubs AI tags from the JPEG EXIF instead of passing
the block through wholesale. Closes the v0.5.5 follow-up: the xAI/Grok
Signature + UUID-Artist pair was detected but not removed.

- metadata._scrub_ai_exif(): deletes the xAI signature pair and any
  Software/Make/Artist/ImageDescription tag carrying an AI_GENERATOR_TOKENS
  token (so Ideogram's Make="Ideogram AI" is scrubbed too), keeping genuine
  camera/editor EXIF intact.
- Shared _is_xai_signature_pair / _exif_text helpers (module-level compiled
  regexes) are now the single source of truth, used by both xai_signature
  and _scrub_ai_exif.
- Tests: Grok signature stripped on JPEG output, Ideogram Make stripped,
  real-camera Make ("Apple") preserved. 325 passing.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-26 14:26:20 -07:00

521 lines
19 KiB
Python

"""AI metadata detection and removal.
Wraps the noai-watermark metadata handling for stripping AI-generation
metadata (EXIF, PNG text chunks, C2PA provenance) from images.
For metadata-only operations, the heavy ML dependencies are NOT required.
"""
from __future__ import annotations
import contextlib
import logging
import re
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from pathlib import Path
logger = logging.getLogger(__name__)
# ── Known AI metadata keys ──────────────────────────────────────────
# Exact-match metadata key names treated as AI-related. ``_is_ai_key``
# lower-cases the candidate key before the lookup, so every entry is stored
# lower-case here.
AI_METADATA_KEYS: frozenset[str] = frozenset(
    {
        "parameters",
        "prompt",
        "negative_prompt",
        "workflow",
        "comfyui",
        "sd-metadata",
        "invokeai_metadata",
        "generation_data",
        "ai_metadata",
        "dream",
        # ``sd:``-prefixed generation settings
        "sd:prompt",
        "sd:negative_prompt",
        "sd:seed",
        "sd:steps",
        "sd:sampler",
        "sd:cfg_scale",
        "sd:model_hash",
        # C2PA provenance entries
        "c2pa",
        "c2pa_chunk",
        # Any ``Software`` entry (any casing) counts as AI-related.
        "software",
    }
)
# Substrings that mark a metadata key as AI-related when they occur anywhere
# inside the lower-cased key (see ``_is_ai_key``).
AI_KEYWORDS: tuple[str, ...] = (
    "stable_diffusion", "comfyui", "automatic1111", "invokeai",
    "midjourney", "dall-e", "dalle",
    "imagen", "synthid", "google_ai",
    "openai",
    "c2pa",
)
# 16-byte C2PA identifier carried by ``uuid`` boxes in ISOBMFF containers
# (AVIF, HEIF, MP4). The hex is grouped in the usual 8-4-4-4-12 UUID layout;
# ``bytes.fromhex`` skips the spaces.
# Reference: https://spec.c2pa.org/specifications/specifications/2.1/specs/C2PA_Specification.html
C2PA_UUID: bytes = bytes.fromhex("d8fec3d6 1b0e 483c 9297 5828877ec481")
# IPTC ``digitalSourceType`` values (IPTC 2025.1) that flag AI provenance;
# Instagram, Facebook and X (Twitter) use them to show "Made with AI" labels.
# Matching is a case-sensitive raw-byte substring test, so each spelling is
# listed explicitly.
IPTC_AI_MARKERS: tuple[bytes, ...] = (
    b"trainedAlgorithmicMedia", b"compositeSynthetic",
    b"algorithmicMedia", b"compositeWithTrainedAlgorithmicMedia",
)
# China's mandatory AI-content labeling (TC260, the national cybersecurity
# standards committee). Generators serving China embed an XMP block in the
# TC260 namespace -- ``<TC260:AIGC>{"Label":"1",...}``. Doubao (ByteDance)
# uses it, and the same standard is mandatory for Jimeng, Kling, Qwen, Ernie,
# etc., so these markers cover the whole China-AIGC-labeled ecosystem. XMP is
# plain text, which makes a raw-byte scan container-agnostic (PNG/JPEG/etc.).
AIGC_MARKERS: tuple[bytes, ...] = (
    b"tc260.org.cn/ns/AIGC",  # namespace URI
    b"TC260:AIGC",  # element name
)
# Metadata keys that ``remove_ai_metadata`` keeps when ``keep_standard`` is
# true. Membership is tested case-sensitively against the raw key.
STANDARD_METADATA_KEYS: frozenset[str] = frozenset(
    {
        "Author",
        "Title",
        "Description",
        "Copyright",
        "Creation Time",
        "Software",
        "Comment",
        "Disclaimer",
        "Source",
        "Warning",
    }
)
def _is_ai_key(key: str) -> bool:
    """Return True when a metadata key name looks AI-related.

    The key is lower-cased, then accepted if it is an exact member of
    ``AI_METADATA_KEYS`` or contains any ``AI_KEYWORDS`` substring.
    """
    lowered = key.lower()
    return lowered in AI_METADATA_KEYS or any(keyword in lowered for keyword in AI_KEYWORDS)
def has_ai_metadata(image_path: Path) -> bool:
    """Report whether an image carries recognisable AI-generation metadata.

    Checks run in order and the first hit wins: AI-related keys in PIL's
    ``img.info``, the optional ``c2pa`` helper, a raw scan of the first
    512 KiB for C2PA / TC260 AIGC / IPTC markers, and finally the xAI/Grok
    EXIF signature pair.

    Args:
        image_path: Path to the image.

    Returns:
        True if AI metadata is detected.
    """
    from PIL import Image

    # PIL may not handle AVIF/HEIF/JPEG-XL without the optional plugins
    # (ultralytics also monkey-patches Image.open in a way that can raise
    # ModuleNotFoundError when pi_heif autoload fails), so any failure here
    # simply falls through to the later checks.
    try:
        with Image.open(image_path) as img:
            if any(_is_ai_key(key) for key in img.info):
                return True
    except Exception as exc:
        logger.debug("PIL could not open %s for metadata scan: %s", image_path, exc)

    # Official ``c2pa`` helper when importable; otherwise rely on the binary
    # scan below, which also reaches AVIF/HEIF/JPEG-XL containers.
    try:
        from c2pa import has_c2pa_metadata

        if has_c2pa_metadata(image_path):
            return True
    except ImportError:
        pass

    # Binary scan covers C2PA (PNG caBX, JPEG APP11, AVIF/HEIF/JXL uuid boxes)
    # and the AIGC / IPTC markers in XMP. Only the first 512KB is read to
    # bound memory.
    with open(image_path, "rb") as f:
        head = f.read(512 * 1024)

    # Case-insensitive match; this already covers the upper-case "C2PA" form.
    if b"c2pa" in head.lower():
        return True
    byte_markers = (C2PA_UUID, *AIGC_MARKERS, *IPTC_AI_MARKERS)
    if any(marker in head for marker in byte_markers):
        return True

    # xAI / Grok: no C2PA/IPTC/XMP -- only the EXIF Signature + UUID-Artist pair.
    return xai_signature(image_path)
def aigc_label(image_path: Path) -> dict[str, str] | None:
    """Decode a China TC260 ``<TC260:AIGC>`` AI-labeling block, if present.

    The first 1 MiB of the file is scanned as raw bytes for the element, so
    the lookup does not depend on the container format. The element body is
    HTML-entity decoded and parsed as JSON.

    Args:
        image_path: Path to the image.

    Returns:
        The JSON object with keys and values converted to ``str`` (e.g.
        ``{"Label": "1", "ContentProducer": ...}``), or None when the element
        is missing, is not valid JSON, or is not a JSON object.
    """
    import html
    import json

    with open(image_path, "rb") as f:
        head = f.read(1024 * 1024)

    found = re.search(rb"<TC260:AIGC>(.*?)</TC260:AIGC>", head, re.DOTALL)
    if found is None:
        return None

    payload = html.unescape(found.group(1).decode("utf-8", "replace"))
    try:
        decoded = json.loads(payload)
    except ValueError:
        return None
    if not isinstance(decoded, dict):
        return None
    return {str(key): str(value) for key, value in decoded.items()}
def synthid_source(image_path: Path) -> str | None:
    """Name the vendor(s) whose C2PA manifest implies a SynthID pixel watermark.

    This is a *metadata-based* proxy: Google (Imagen/Gemini) and OpenAI
    (ChatGPT/DALL-E/gpt-image) embed an invisible SynthID watermark alongside
    a C2PA manifest, so a manifest signed by one of them on AI-generated
    content implies SynthID in the pixels. Adobe Firefly / Microsoft Designer
    sign C2PA but do not use SynthID, so they yield None.

    The verdict is reliable only while the C2PA manifest is intact -- absence
    is not proof, because C2PA can be stripped while the pixel watermark
    survives, and the pixel watermark itself is not locally detectable
    (proprietary decoder).

    Args:
        image_path: Path to the image (PNG, JPEG, WebP, or ISOBMFF container).

    Returns:
        Comma-joined vendor name(s) (e.g. ``"OpenAI"``) or None.
    """
    from remove_ai_watermarks.noai.c2pa import extract_c2pa_info, synthid_vendors_in

    # PNG: the caBX chunk parser yields a clean, structured issuer.
    png_vendors = extract_c2pa_info(image_path).get("synthid_vendors")
    if png_vendors:
        return ", ".join(png_vendors)

    # Non-PNG containers (JPEG APP11, WebP, AVIF/HEIF/JXL uuid box) keep the
    # manifest where the PNG parser can't reach it, so scan the first 1 MiB
    # for the same signal instead.
    with open(image_path, "rb") as f:
        head = f.read(1024 * 1024)

    # A C2PA manifest must be present ...
    if b"c2pa" not in head.lower() and C2PA_UUID not in head:
        return None
    # ... and it must declare an AI source type. The two spellings cover both
    # "trainedAlgorithmicMedia" and "compositeWithTrainedAlgorithmicMedia".
    if b"trainedAlgorithmicMedia" not in head and b"TrainedAlgorithmicMedia" not in head:
        return None

    vendors = synthid_vendors_in(head)
    if not vendors:
        return None
    return ", ".join(vendors)
def exif_generator(image_path: Path) -> str | None:
    """Find an AI-generator name in EXIF text tags or an XMP ``CreatorTool``.

    Candidate strings come from the EXIF 0th-IFD ``Software`` / ``Make`` /
    ``Artist`` / ``ImageDescription`` tags (read via PIL + piexif for any
    container PIL can open) and from a raw-byte ``CreatorTool`` scan of the
    first 1 MiB, which additionally covers HEIF/JPEG-XL files PIL can't open
    without plugins. Only values containing an ``AI_GENERATOR_TOKENS`` token
    match, so ordinary editors (plain "Adobe Photoshop", "GIMP") are not
    flagged.

    Args:
        image_path: Path to the image.

    Returns:
        The first matching candidate, stripped of surrounding whitespace, or
        None when nothing matches.
    """
    from remove_ai_watermarks.noai.constants import AI_GENERATOR_TOKENS

    found: list[str] = []

    # EXIF text tags from the 0th IFD, via the EXIF bytes PIL exposes.
    try:
        import piexif
        from PIL import Image

        with Image.open(image_path) as img:
            raw_exif = img.info.get("exif")
        if raw_exif:
            zeroth = piexif.load(raw_exif).get("0th", {})
            # Make catches camera-style tags AI tools reuse (Ideogram writes
            # Make="Ideogram AI"); real cameras put "Apple"/"Canon" there, which
            # carry no AI token, so this stays low-false-positive.
            wanted = (
                piexif.ImageIFD.Software,
                piexif.ImageIFD.Make,
                piexif.ImageIFD.Artist,
                piexif.ImageIFD.ImageDescription,
            )
            raw_values = (zeroth.get(tag) for tag in wanted)
            found.extend(raw.decode("latin1", "replace") for raw in raw_values if isinstance(raw, bytes))
    except Exception as exc:  # unopenable format / malformed EXIF
        logger.debug("EXIF generator read failed for %s: %s", image_path, exc)

    # XMP CreatorTool: text, container-agnostic (covers HEIF/JXL via raw scan).
    try:
        with open(image_path, "rb") as f:
            head = f.read(1024 * 1024)
        creator_tool = re.compile(rb"CreatorTool[>\"'=\s]{1,4}([^<\"']{1,80})")
        found.extend(hit.group(1).decode("latin1", "replace") for hit in creator_tool.finditer(head))
    except Exception as exc:
        logger.debug("XMP CreatorTool scan failed for %s: %s", image_path, exc)

    # First candidate carrying a known generator token wins.
    for text in found:
        lowered = text.lower()
        if any(token in lowered for token in AI_GENERATOR_TOKENS):
            return text.strip()
    return None
# xAI / Grok EXIF signature scheme. A base64 blob of 64+ characters after
# "Signature:" is far longer than any incidental description text, and a bare
# UUID as Artist makes the pair xAI-specific -- requiring both keeps the
# false-positive rate near zero.
_XAI_SIGNATURE_RE = re.compile(r"Signature:\s*[A-Za-z0-9+/=]{64,}")
_UUID_RE = re.compile(
    r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}",
    flags=re.IGNORECASE,
)


def _is_xai_signature_pair(description: str, artist: str) -> bool:
    """Tell whether an EXIF (ImageDescription, Artist) pair is xAI/Grok's scheme.

    ``description`` must start with the ``Signature: <base64>`` shape and
    ``artist`` must be a UUID and nothing else.
    """
    if _XAI_SIGNATURE_RE.match(description) is None:
        return False
    return _UUID_RE.fullmatch(artist) is not None
def _exif_text(ifd: dict, tag: int) -> str:
    """Read a piexif IFD byte tag as text.

    Returns the value decoded as Latin-1 with surrounding whitespace removed,
    or ``''`` when the tag is absent or its value is not ``bytes``.
    """
    raw = ifd.get(tag)
    if not isinstance(raw, bytes):
        return ""
    return raw.decode("latin1", "replace").strip()
def xai_signature(image_path: Path) -> bool:
    """Detect xAI / Grok's EXIF provenance signature scheme.

    Grok image downloads (Aurora model) carry no C2PA, XMP, SynthID, or IPTC --
    their only provenance signal is a private EXIF pair: ``ImageDescription`` =
    ``"Signature: <base64>"`` together with ``Artist`` = the image UUID. Verified
    stable across three independent generations (2026-05-26; see CLAUDE.md). The
    signature is xAI's and is not locally verifiable (no public key); detection
    keys on this distinctive, low-false-positive shape, not on the signature's
    validity. It survives only on the *original* JPEG download -- the web-UI
    image is a re-encoded WebP that drops EXIF.

    Args:
        image_path: Path to the image.

    Returns:
        True if the 0th-IFD pair matches; False otherwise, including when the
        file cannot be opened, has no EXIF, or the EXIF cannot be parsed.
    """
    try:
        import piexif
        from PIL import Image

        with Image.open(image_path) as img:
            raw_exif = img.info.get("exif")
        if not raw_exif:
            return False
        zeroth = piexif.load(raw_exif).get("0th", {})
    except Exception as exc:  # unopenable format / malformed EXIF
        logger.debug("xAI-signature EXIF read failed for %s: %s", image_path, exc)
        return False

    description = _exif_text(zeroth, piexif.ImageIFD.ImageDescription)
    artist = _exif_text(zeroth, piexif.ImageIFD.Artist)
    return _is_xai_signature_pair(description, artist)
def _scrub_ai_exif(exif_dict: dict) -> list[str]:
    """Strip AI-provenance tags from the ``0th`` IFD of a piexif dict, in place.

    Two kinds of tags are removed:

    * the xAI/Grok signature pair (``ImageDescription`` "Signature: ..." plus a
      UUID ``Artist``), and
    * any ``Software`` / ``Make`` / ``Artist`` / ``ImageDescription`` tag whose
      text contains an ``AI_GENERATOR_TOKENS`` token (Ideogram's ``Make``,
      Firefly's ``Software``, etc.).

    This mirrors the detection in ``xai_signature`` / ``exif_generator`` so
    removal scrubs what detection flags while other EXIF tags stay untouched.

    Args:
        exif_dict: Dict as produced by ``piexif.load``; its ``"0th"`` entry is
            mutated.

    Returns:
        Names of the removed tags (for logging); empty when there is no
        ``0th`` IFD or nothing matched.
    """
    import piexif

    from remove_ai_watermarks.noai.constants import AI_GENERATOR_TOKENS

    zeroth = exif_dict.get("0th")
    if not zeroth:
        return []

    tags = piexif.ImageIFD
    doomed: dict[int, str] = {}

    # (a) xAI / Grok: the Signature blob and the UUID Artist go together.
    description = _exif_text(zeroth, tags.ImageDescription)
    artist = _exif_text(zeroth, tags.Artist)
    if _is_xai_signature_pair(description, artist):
        doomed[tags.ImageDescription] = "ImageDescription"
        doomed[tags.Artist] = "Artist"

    # (b) Known AI generator token in any of the text tags.
    text_tags = {
        tags.Software: "Software",
        tags.Make: "Make",
        tags.Artist: "Artist",
        tags.ImageDescription: "ImageDescription",
    }
    for tag, label in text_tags.items():
        lowered = _exif_text(zeroth, tag).lower()
        if any(token in lowered for token in AI_GENERATOR_TOKENS):
            doomed[tag] = label

    for tag in doomed:
        zeroth.pop(tag, None)
    return list(doomed.values())
def get_ai_metadata(image_path: Path) -> dict[str, str]:
    """Extract AI-related metadata from an image as display strings.

    Sources, merged in this order (earlier entries win where ``setdefault`` is
    used): AI-related keys from PIL's ``img.info``, the C2PA fields returned by
    ``extract_c2pa_info``, a SynthID verdict derived from ``synthid_source``,
    the China TC260 AIGC label, and the xAI/Grok EXIF signature.

    Args:
        image_path: Path to the image.

    Returns:
        Dictionary of AI metadata key-value pairs. Binary values are summarised
        as ``"<binary N bytes>"``; strings longer than 200 characters are cut
        to 200 characters and suffixed with ``"…"`` to show they are truncated.
    """
    from PIL import Image

    from remove_ai_watermarks.noai.c2pa import extract_c2pa_info, synthid_verdict

    result: dict[str, str] = {}

    # PIL may not open AVIF/HEIF/JPEG-XL without optional plugins (and
    # ultralytics' Image.open patch can raise ModuleNotFoundError); fall through
    # to the C2PA/binary path on any open failure. See CLAUDE.md.
    try:
        with Image.open(image_path) as img:
            for key, value in img.info.items():
                if not _is_ai_key(key):
                    continue
                if isinstance(value, bytes):
                    result[key] = f"<binary {len(value)} bytes>"
                elif isinstance(value, str) and len(value) > 200:
                    # Mark the cut so a truncated value is distinguishable
                    # from one that is exactly 200 characters long.
                    result[key] = value[:200] + "…"
                else:
                    result[key] = str(value)
    except Exception as exc:
        logger.debug("PIL could not open %s for AI-metadata scan: %s", image_path, exc)

    # C2PA manifest fields from the single canonical parser (noai/c2pa.py).
    c2pa = extract_c2pa_info(image_path)
    for key in (
        "c2pa_manifest",
        "claim_generator",
        "c2pa_spec",
        "issuer",
        "source_type",
        "actions",
        "synthid_watermark",
    ):
        if key in c2pa:
            result.setdefault(key, str(c2pa[key]))

    # Non-PNG containers (JPEG/WebP/AVIF): extract_c2pa_info is PNG-only, so
    # fall back to the format-agnostic source check for the SynthID verdict.
    if "synthid_watermark" not in result and (vendor := synthid_source(image_path)):
        result.setdefault("synthid_watermark", synthid_verdict(vendor))

    # China TC260 AI-content label (Doubao and other China-served generators).
    if aigc := aigc_label(image_path):
        producer = aigc.get("ContentProducer", "")
        result["aigc_label"] = f"China AIGC label (TC260){f'; producer {producer}' if producer else ''}"

    # xAI / Grok EXIF signature scheme (its only provenance signal).
    if xai_signature(image_path):
        result.setdefault("xai_signature", "xAI/Grok EXIF signature (Artist UUID + Signature blob)")

    return result
def remove_ai_metadata(
    source_path: Path,
    output_path: Path | None = None,
    keep_standard: bool = True,
) -> Path:
    """Remove AI-generation metadata from an image.

    Strips EXIF AI tags, PNG text chunks, and C2PA provenance manifests
    while optionally preserving standard metadata (Author, Title, etc.).

    AVIF/HEIF/HEIC/JPEG-XL sources are cleaned at the container level without
    re-encoding. Every other source is re-saved through PIL: as JPEG when the
    output suffix is ``.jpg``/``.jpeg`` (EXIF kept minus AI tags), otherwise
    as PNG (EXIF dropped; kept text chunks re-added).

    Args:
        source_path: Path to the source image.
        output_path: Output path (None = overwrite source).
        keep_standard: If True, preserve standard metadata fields.

    Returns:
        Path to the cleaned image.
    """
    import piexif
    from PIL import Image
    from PIL.PngImagePlugin import PngInfo

    if output_path is None:
        output_path = source_path

    # AVIF/HEIF/JPEG-XL: strip C2PA boxes at the container level without
    # re-encoding. Avoids needing PIL plugins (pillow-heif / pillow-jxl) and
    # preserves pixel data bit-for-bit.
    if source_path.suffix.lower() in (".avif", ".heif", ".heic", ".jxl"):
        from remove_ai_watermarks.noai.isobmff import strip_c2pa_boxes

        data = source_path.read_bytes()
        cleaned, stripped = strip_c2pa_boxes(data)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_bytes(cleaned)
        logger.info("Stripped %d C2PA box(es) → %s", stripped, output_path)
        return output_path

    # Read the image and detach it from the source file so the source can be
    # overwritten when output_path == source_path.
    with Image.open(source_path) as opened:
        img = opened.copy()

    # NOTE(review): any suffix other than .jpg/.jpeg (including .webp) is
    # written as PNG data -- confirm callers expect that.
    fmt = output_path.suffix.lower()
    save_kwargs: dict = {}
    if fmt in (".jpg", ".jpeg"):
        save_kwargs["format"] = "JPEG"
        # Pillow's JPEG encoder only accepts these modes; everything else
        # (RGBA, P, LA, PA, I, F, I;16, ...) must be converted or save() fails.
        jpeg_native_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
        if img.mode not in jpeg_native_modes:
            img = img.convert("RGB")
    else:
        save_kwargs["format"] = "PNG"

    # Collect non-AI metadata
    kept_meta: dict[str, str] = {}
    exif_data = None
    for key, value in img.info.items():
        if _is_ai_key(key):
            continue
        if key == "exif":
            # Best effort: malformed EXIF is dropped rather than failing the
            # whole removal, but the reason is logged instead of swallowed.
            try:
                exif_data = piexif.load(value)
            except Exception as exc:
                logger.debug("Could not parse EXIF from %s; dropping it: %s", source_path, exc)
            continue
        if key in ("dpi", "gamma"):
            save_kwargs[key] = value
            continue
        if keep_standard and key in STANDARD_METADATA_KEYS:
            kept_meta[key] = value if isinstance(value, str) else str(value)

    # Apply cleaned metadata
    if save_kwargs["format"] == "PNG" and kept_meta:
        pnginfo = PngInfo()
        for k, v in kept_meta.items():
            pnginfo.add_text(k, v)
        save_kwargs["pnginfo"] = pnginfo

    if exif_data and save_kwargs["format"] == "JPEG":
        # Scrub AI-provenance EXIF tags (xAI/Grok signature, generator tokens)
        # while keeping genuine camera/editor EXIF; PNG output drops EXIF entirely.
        if removed := _scrub_ai_exif(exif_data):
            logger.info("Scrubbed AI EXIF tag(s): %s", ", ".join(removed))
        # Best effort: if the remaining EXIF cannot be serialised, the output
        # is written without EXIF; warn because retained metadata is lost.
        try:
            save_kwargs["exif"] = piexif.dump(exif_data)
        except Exception as exc:
            logger.warning("Could not re-encode EXIF for %s; writing without EXIF: %s", output_path, exc)

    output_path.parent.mkdir(parents=True, exist_ok=True)
    img.save(output_path, **save_kwargs)
    logger.info("Stripped AI metadata → %s", output_path)
    return output_path