mirror of
https://github.com/wiltodelta/remove-ai-watermarks.git
synced 2026-06-05 18:46:34 +02:00
ba94de8275
remove_ai_metadata now scrubs AI tags from the JPEG EXIF instead of passing
the block through wholesale. Closes the v0.5.5 follow-up: the xAI/Grok
Signature + UUID-Artist pair was detected but not removed.
- metadata._scrub_ai_exif(): deletes the xAI signature pair and any
Software/Make/Artist/ImageDescription tag carrying an AI_GENERATOR_TOKENS
token (so Ideogram's Make="Ideogram AI" is scrubbed too), keeping genuine
camera/editor EXIF intact.
- Shared _is_xai_signature_pair / _exif_text helpers (module-level compiled
regexes) are now the single source of truth, used by both xai_signature
and _scrub_ai_exif.
- Tests: Grok signature stripped on JPEG output, Ideogram Make stripped,
real-camera Make ("Apple") preserved. 325 passing.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
521 lines
19 KiB
Python
521 lines
19 KiB
Python
"""AI metadata detection and removal.
|
|
|
|
Wraps the noai-watermark metadata handling for stripping AI-generation
|
|
metadata (EXIF, PNG text chunks, C2PA provenance) from images.
|
|
|
|
For metadata-only operations, the heavy ML dependencies are NOT required.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import contextlib
|
|
import logging
|
|
import re
|
|
from typing import TYPE_CHECKING
|
|
|
|
if TYPE_CHECKING:
|
|
from pathlib import Path
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# ── Known AI metadata keys ──────────────────────────────────────────

# Exact key names treated as AI metadata. Every entry is stored lower-cased
# because ``_is_ai_key`` lower-cases the key before the membership test.
# NOTE: "Software" is listed here *and* in ``STANDARD_METADATA_KEYS``; since
# ``remove_ai_metadata`` tests ``_is_ai_key`` first, a "Software" entry in
# ``img.info`` is always treated as AI metadata and never preserved.
AI_METADATA_KEYS: frozenset[str] = frozenset(
    name.lower()
    for name in (
        "parameters",
        "prompt",
        "negative_prompt",
        "workflow",
        "comfyui",
        "sd-metadata",
        "invokeai_metadata",
        "generation_data",
        "ai_metadata",
        "dream",
        "sd:prompt",
        "sd:negative_prompt",
        "sd:seed",
        "sd:steps",
        "sd:sampler",
        "sd:cfg_scale",
        "sd:model_hash",
        "c2pa",
        "c2pa_chunk",
        "Software",
    )
)

# Substrings that mark a (lower-cased) metadata key as AI-related even when
# the full key is not listed in ``AI_METADATA_KEYS``.
AI_KEYWORDS: tuple[str, ...] = (
    "stable_diffusion",
    "comfyui",
    "automatic1111",
    "invokeai",
    "midjourney",
    "dall-e",
    "dalle",
    "imagen",
    "synthid",
    "google_ai",
    "openai",
    "c2pa",
)

# C2PA UUID used in ISOBMFF (AVIF, HEIF, MP4) ``uuid`` boxes.
# Reference: https://spec.c2pa.org/specifications/specifications/2.1/specs/C2PA_Specification.html
C2PA_UUID: bytes = bytes.fromhex("d8fec3d61b0e483c92975828877ec481")

# IPTC ``digitalSourceType`` values (IPTC 2025.1) that flag AI provenance.
# Used by Instagram, Facebook, X (Twitter) to show "Made with AI" labels.
IPTC_AI_MARKERS: tuple[bytes, ...] = (
    b"trainedAlgorithmicMedia",
    b"compositeSynthetic",
    b"algorithmicMedia",
    b"compositeWithTrainedAlgorithmicMedia",
)

# China's mandatory AI-content labeling (TC260, the national cybersecurity
# standards committee). AI generators serving China embed an XMP block in the
# TC260 namespace -- ``<TC260:AIGC>{"Label":"1",...}``. Doubao (ByteDance) uses
# this; the same standard is mandatory for Jimeng, Kling, Qwen, Ernie, etc.,
# so the marker covers the whole China-AIGC-labeled ecosystem. Container-
# agnostic (XMP is text), so a raw-byte scan catches it in PNG/JPEG/etc.
AIGC_MARKERS: tuple[bytes, ...] = (
    b"tc260.org.cn/ns/AIGC",
    b"TC260:AIGC",
)

# Non-AI text keys that ``remove_ai_metadata`` may carry over to PNG output
# when ``keep_standard`` is true. Matched case-sensitively against
# ``img.info`` keys.
STANDARD_METADATA_KEYS: frozenset[str] = frozenset(
    (
        "Author",
        "Title",
        "Description",
        "Copyright",
        "Creation Time",
        "Software",
        "Comment",
        "Disclaimer",
        "Source",
        "Warning",
    )
)
|
|
|
|
|
|
def _is_ai_key(key: str) -> bool:
    """Return True when ``key`` names AI-generation metadata.

    The comparison is case-insensitive: a key matches when its lower-cased
    form is an entry of ``AI_METADATA_KEYS`` or contains any ``AI_KEYWORDS``
    substring.
    """
    lowered = key.lower()
    if lowered in AI_METADATA_KEYS:
        return True
    for keyword in AI_KEYWORDS:
        if keyword in lowered:
            return True
    return False
|
|
|
|
|
|
def has_ai_metadata(image_path: Path) -> bool:
    """Report whether an image carries any known AI-generation marker.

    Checks run in this order and stop at the first hit: PIL ``img.info`` keys
    (via ``_is_ai_key``), the optional ``c2pa.has_c2pa_metadata`` helper, a
    raw-byte scan of the first 512 KiB for C2PA / TC260 / IPTC markers, and
    finally the xAI/Grok EXIF signature pair.

    Args:
        image_path: Path to the image.

    Returns:
        True if AI metadata is detected.

    Raises:
        OSError: If the file cannot be opened for the raw-byte scan (that
            read is not guarded).
    """
    from PIL import Image

    # PIL may not handle AVIF/HEIF/JPEG-XL without the optional plugins
    # (ultralytics also monkey-patches Image.open in a way that can raise
    # ModuleNotFoundError when pi_heif autoload fails), so any open failure
    # falls through to the binary scan.
    try:
        with Image.open(image_path) as img:
            if any(_is_ai_key(info_key) for info_key in img.info):
                return True
    except Exception as exc:
        logger.debug("PIL could not open %s for metadata scan: %s", image_path, exc)

    # Check C2PA via the ``c2pa`` helper when it is importable; otherwise the
    # binary scan below covers it (including AVIF/HEIF/JPEG-XL containers,
    # whose metadata PIL doesn't expose uniformly).
    # NOTE(review): other functions in this module import their C2PA helpers
    # from ``remove_ai_watermarks.noai.c2pa``; confirm that a top-level
    # ``c2pa`` module exposing ``has_c2pa_metadata`` is what is intended here,
    # because an ImportError silently skips this check.
    try:
        from c2pa import has_c2pa_metadata

        if has_c2pa_metadata(image_path):
            return True
    except ImportError:
        pass

    # Binary scan covers C2PA (PNG caBX, JPEG APP11, AVIF/HEIF/JXL uuid boxes)
    # and IPTC AI markers in XMP. Read only the first 512KB to bound memory.
    with open(image_path, "rb") as stream:
        head = stream.read(512 * 1024)

    # The lower-cased comparison matches "c2pa" in any letter case.
    if b"c2pa" in head.lower():
        return True

    byte_markers = (C2PA_UUID, *AIGC_MARKERS, *IPTC_AI_MARKERS)
    if any(marker in head for marker in byte_markers):
        return True

    # xAI / Grok: no C2PA/IPTC/XMP -- only the EXIF Signature + UUID-Artist pair.
    return xai_signature(image_path)
|
|
|
|
|
|
def aigc_label(image_path: Path) -> dict[str, str] | None:
    """Extract the China TC260 ``<TC260:AIGC>`` labeling block from a file.

    The first 1 MiB of the file is scanned as raw bytes for the first
    ``<TC260:AIGC>...</TC260:AIGC>`` element. Its content is decoded as UTF-8
    (undecodable bytes replaced), HTML-unescaped and parsed as JSON. Because
    the scan does not depend on the container, it works for PNG/JPEG/WebP
    alike.

    Args:
        image_path: Path to the file to scan.

    Returns:
        The JSON object with keys and values coerced to ``str`` (e.g.
        ``{"Label": "1", "ContentProducer": ...}``), or None when no block is
        found, the content is not valid JSON, or the JSON is not an object.
    """
    import html
    import json
    import re

    with open(image_path, "rb") as stream:
        head = stream.read(1024 * 1024)

    found = re.search(rb"<TC260:AIGC>(.*?)</TC260:AIGC>", head, re.DOTALL)
    if found is None:
        return None

    # XMP stores the JSON HTML-entity encoded (&quot; etc.), so unescape first.
    text = html.unescape(found.group(1).decode("utf-8", "replace"))
    try:
        payload = json.loads(text)
    except ValueError:
        return None

    if not isinstance(payload, dict):
        return None
    return {str(name): str(val) for name, val in payload.items()}
|
|
|
|
|
|
def synthid_source(image_path: Path) -> str | None:
    """Name the vendor(s) whose C2PA manifest implies a SynthID pixel watermark.

    This is a *metadata-based* proxy: Google (Imagen/Gemini) and OpenAI
    (ChatGPT/DALL-E/gpt-image) embed an invisible SynthID watermark alongside
    a C2PA manifest, so a C2PA manifest signed by one of them on AI-generated
    content implies SynthID in the pixels. Adobe Firefly / Microsoft Designer
    sign C2PA but do not use SynthID, so they yield None.

    The verdict is reliable only while the C2PA manifest is intact -- absence
    is not proof, because C2PA can be stripped while the pixel watermark
    survives, and the pixel watermark itself is not locally detectable
    (proprietary decoder).

    Args:
        image_path: Path to the image (PNG, JPEG, WebP, or ISOBMFF container).

    Returns:
        Comma-joined vendor name(s) (e.g. ``"OpenAI"``) or None.
    """
    from remove_ai_watermarks.noai.c2pa import extract_c2pa_info, synthid_vendors_in

    # PNG: the caBX chunk parser gives a clean, structured issuer.
    parsed_vendors = extract_c2pa_info(image_path).get("synthid_vendors")
    if parsed_vendors:
        return ", ".join(parsed_vendors)

    # Non-PNG containers (JPEG APP11, WebP, AVIF/HEIF/JXL uuid box) keep the
    # C2PA manifest where the PNG parser can't reach it. Binary-scan the first
    # 1 MiB for the same signal: a C2PA manifest from a SynthID-using issuer
    # on AI content.
    with open(image_path, "rb") as stream:
        head = stream.read(1024 * 1024)

    # No C2PA trace at all -> nothing to attribute.
    if b"c2pa" not in head.lower() and C2PA_UUID not in head:
        return None

    # Both spellings are checked so that "trainedAlgorithmicMedia" and
    # "compositeWithTrainedAlgorithmicMedia" each count as an AI source.
    if b"trainedAlgorithmicMedia" not in head and b"TrainedAlgorithmicMedia" not in head:
        return None

    scanned_vendors = synthid_vendors_in(head)
    if not scanned_vendors:
        return None
    return ", ".join(scanned_vendors)
|
|
|
|
|
|
def exif_generator(image_path: Path) -> str | None:
    """Return the first metadata value that names a known AI generator.

    Candidate values are gathered from the EXIF 0th IFD (``Software``,
    ``Make``, ``Artist``, ``ImageDescription``) and then from XMP
    ``CreatorTool`` occurrences in the first 1 MiB of the file. The first
    candidate containing an ``AI_GENERATOR_TOKENS`` token (case-insensitive)
    is returned stripped of surrounding whitespace.

    Cross-format: EXIF is read via PIL + piexif for any container PIL can open
    (JPEG/WebP/AVIF/PNG); the raw-byte ``CreatorTool`` scan additionally covers
    HEIF/JPEG-XL that PIL can't open without plugins. Only AI tokens match, so
    ordinary editors (plain "Adobe Photoshop", "GIMP") are not flagged.

    Args:
        image_path: Path to the image.

    Returns:
        The matching field value, or None when nothing matches. Read failures
        in either source are logged at debug level and otherwise ignored.
    """
    import re

    from remove_ai_watermarks.noai.constants import AI_GENERATOR_TOKENS

    found: list[str] = []

    # EXIF text tags (0th IFD) via PIL exif bytes.
    try:
        import piexif
        from PIL import Image

        with Image.open(image_path) as img:
            raw_exif = img.info.get("exif")
        if raw_exif:
            zeroth = piexif.load(raw_exif).get("0th", {})
            # Make catches camera-style tags AI tools reuse (Ideogram writes
            # Make="Ideogram AI"); real cameras put "Apple"/"Canon" there, which
            # carry no AI token, so this stays low-false-positive.
            exif_tags = (
                piexif.ImageIFD.Software,
                piexif.ImageIFD.Make,
                piexif.ImageIFD.Artist,
                piexif.ImageIFD.ImageDescription,
            )
            for tag_id in exif_tags:
                raw = zeroth.get(tag_id)
                if isinstance(raw, bytes):
                    found.append(raw.decode("latin1", "replace"))
    except Exception as exc:  # unopenable format / malformed EXIF
        logger.debug("EXIF generator read failed for %s: %s", image_path, exc)

    # XMP CreatorTool: text, container-agnostic (covers HEIF/JXL via raw scan).
    try:
        with open(image_path, "rb") as stream:
            head = stream.read(1024 * 1024)
        for hit in re.finditer(rb"CreatorTool[>\"'=\s]{1,4}([^<\"']{1,80})", head):
            found.append(hit.group(1).decode("latin1", "replace"))
    except Exception as exc:
        logger.debug("XMP CreatorTool scan failed for %s: %s", image_path, exc)

    for text in found:
        lowered = text.lower()
        if any(token in lowered for token in AI_GENERATOR_TOKENS):
            return text.strip()
    return None
|
|
|
|
|
|
# xAI / Grok EXIF signature scheme. A 64+ char base64 blob after "Signature:"
|
|
# is far beyond any incidental description text, and the UUID Artist makes the
|
|
# pair xAI-specific -- both required keeps the false-positive rate near zero.
|
|
_XAI_SIGNATURE_RE = re.compile(r"Signature:\s*[A-Za-z0-9+/=]{64,}")
|
|
_UUID_RE = re.compile(r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", re.IGNORECASE)
|
|
|
|
|
|
def _is_xai_signature_pair(description: str, artist: str) -> bool:
|
|
"""True if an EXIF (ImageDescription, Artist) pair is xAI/Grok's scheme."""
|
|
return _XAI_SIGNATURE_RE.match(description) is not None and _UUID_RE.fullmatch(artist) is not None
|
|
|
|
|
|
def _exif_text(ifd: dict, tag: int) -> str:
|
|
"""Decode a piexif 0th-IFD byte tag to a stripped string ('' if absent)."""
|
|
value = ifd.get(tag)
|
|
return value.decode("latin1", "replace").strip() if isinstance(value, bytes) else ""
|
|
|
|
|
|
def xai_signature(image_path: Path) -> bool:
    """Detect xAI / Grok's EXIF provenance signature scheme.

    Grok image downloads (Aurora model) carry no C2PA, XMP, SynthID, or IPTC --
    their only provenance signal is a private EXIF pair: ``ImageDescription`` =
    ``"Signature: <base64>"`` together with ``Artist`` = the image UUID. Verified
    stable across three independent generations (2026-05-26; see CLAUDE.md). The
    signature is xAI's and is not locally verifiable (no public key); detection
    keys on this distinctive, low-false-positive shape, not on the signature's
    validity. It survives only on the *original* JPEG download -- the web-UI
    image is a re-encoded WebP that drops EXIF.

    Args:
        image_path: Path to the image.

    Returns:
        True when the 0th-IFD ``ImageDescription`` / ``Artist`` pair matches
        the scheme. False when the image has no EXIF, or when it cannot be
        opened or parsed (the failure is logged at debug level).
    """
    try:
        import piexif
        from PIL import Image

        with Image.open(image_path) as img:
            raw_exif = img.info.get("exif")
        if not raw_exif:
            return False
        zeroth = piexif.load(raw_exif).get("0th", {})
    except Exception as exc:  # unopenable format / malformed EXIF
        logger.debug("xAI-signature EXIF read failed for %s: %s", image_path, exc)
        return False

    description = _exif_text(zeroth, piexif.ImageIFD.ImageDescription)
    artist = _exif_text(zeroth, piexif.ImageIFD.Artist)
    return _is_xai_signature_pair(description, artist)
|
|
|
|
|
|
def _scrub_ai_exif(exif_dict: dict) -> list[str]:
    """Delete AI-provenance tags from a piexif dict's ``0th`` IFD, in place.

    Removes (a) the xAI/Grok signature pair (``ImageDescription`` "Signature: ..."
    + UUID ``Artist``) and (b) any ``Software`` / ``Make`` / ``Artist`` /
    ``ImageDescription`` tag whose value carries an ``AI_GENERATOR_TOKENS`` token
    (Ideogram's ``Make``, Firefly's ``Software``, etc.). Mirrors the detection in
    ``xai_signature`` / ``exif_generator`` so removal scrubs exactly what
    ``identify`` flags, while leaving genuine camera/editor EXIF intact.

    Args:
        exif_dict: Dict as returned by ``piexif.load``; only its ``"0th"``
            entry is read and mutated.

    Returns:
        Names of the removed tags (for logging); empty when the dict has no
        ``0th`` IFD or nothing matched.
    """
    import piexif

    from remove_ai_watermarks.noai.constants import AI_GENERATOR_TOKENS

    zeroth = exif_dict.get("0th")
    if not zeroth:
        return []

    description_tag = piexif.ImageIFD.ImageDescription
    artist_tag = piexif.ImageIFD.Artist

    # tag id -> display name; a dict so a tag matched by both rules is listed once.
    doomed: dict[int, str] = {}

    # (a) xAI / Grok: the Signature blob and the UUID Artist go together.
    description = _exif_text(zeroth, description_tag)
    artist = _exif_text(zeroth, artist_tag)
    if _is_xai_signature_pair(description, artist):
        doomed[description_tag] = "ImageDescription"
        doomed[artist_tag] = "Artist"

    # (b) Known AI generator token in any of the text tags.
    text_tags = (
        (piexif.ImageIFD.Software, "Software"),
        (piexif.ImageIFD.Make, "Make"),
        (artist_tag, "Artist"),
        (description_tag, "ImageDescription"),
    )
    for tag_id, label in text_tags:
        lowered = _exif_text(zeroth, tag_id).lower()
        if any(token in lowered for token in AI_GENERATOR_TOKENS):
            doomed[tag_id] = label

    for tag_id in doomed:
        zeroth.pop(tag_id, None)
    return list(doomed.values())
|
|
|
|
|
|
def get_ai_metadata(image_path: Path) -> dict[str, str]:
    """Collect a printable summary of the AI-related metadata in an image.

    Sources, in order: AI-keyed entries of PIL ``img.info`` (binary values are
    summarized by length, strings longer than 200 characters are truncated),
    C2PA manifest fields from ``extract_c2pa_info``, a SynthID verdict derived
    from ``synthid_source``, the China TC260 label, and the xAI/Grok EXIF
    signature. Earlier sources win when two of them produce the same key,
    except ``aigc_label``, which is always overwritten.

    Args:
        image_path: Path to the image.

    Returns:
        Dictionary of AI metadata key-value pairs.
    """
    from PIL import Image

    from remove_ai_watermarks.noai.c2pa import extract_c2pa_info, synthid_verdict

    found: dict[str, str] = {}

    # PIL may not open AVIF/HEIF/JPEG-XL without optional plugins (and
    # ultralytics' Image.open patch can raise ModuleNotFoundError); fall through
    # to the C2PA/binary path on any open failure. See CLAUDE.md.
    try:
        with Image.open(image_path) as img:
            for name, raw in img.info.items():
                if not _is_ai_key(name):
                    continue
                if isinstance(raw, bytes):
                    found[name] = f"<binary {len(raw)} bytes>"
                elif isinstance(raw, str) and len(raw) > 200:
                    found[name] = raw[:200] + "…"
                else:
                    found[name] = str(raw)
    except Exception as exc:
        logger.debug("PIL could not open %s for AI-metadata scan: %s", image_path, exc)

    # C2PA manifest fields from the single canonical parser (noai/c2pa.py).
    manifest = extract_c2pa_info(image_path)
    manifest_fields = (
        "c2pa_manifest",
        "claim_generator",
        "c2pa_spec",
        "issuer",
        "source_type",
        "actions",
        "synthid_watermark",
    )
    for field in manifest_fields:
        if field in manifest:
            found.setdefault(field, str(manifest[field]))

    # Non-PNG containers (JPEG/WebP/AVIF): extract_c2pa_info is PNG-only, so
    # fall back to the format-agnostic source check for the SynthID verdict.
    if "synthid_watermark" not in found:
        vendor = synthid_source(image_path)
        if vendor:
            found["synthid_watermark"] = synthid_verdict(vendor)

    # China TC260 AI-content label (Doubao and other China-served generators).
    label = aigc_label(image_path)
    if label:
        producer = label.get("ContentProducer", "")
        suffix = f"; producer {producer}" if producer else ""
        found["aigc_label"] = f"China AIGC label (TC260){suffix}"

    # xAI / Grok EXIF signature scheme (its only provenance signal).
    if xai_signature(image_path):
        found.setdefault("xai_signature", "xAI/Grok EXIF signature (Artist UUID + Signature blob)")
    return found
|
|
|
|
|
|
def remove_ai_metadata(
    source_path: Path,
    output_path: Path | None = None,
    keep_standard: bool = True,
) -> Path:
    """Remove AI-generation metadata from an image.

    Two paths exist:

    * Sources with an ``.avif`` / ``.heif`` / ``.heic`` / ``.jxl`` suffix have
      their C2PA boxes stripped at the container level by
      ``strip_c2pa_boxes``; the rest of the file is written back as returned
      by that helper, with no PIL decode.
    * Every other source is decoded with PIL and re-saved. The output format
      follows ``output_path``'s suffix: ``.jpg`` / ``.jpeg`` produce JPEG,
      anything else produces PNG. Only ``dpi`` / ``gamma`` and the metadata
      described below are carried over; everything else in ``img.info`` is
      dropped.

    Metadata carried over on the PIL path:

    * PNG output: ``STANDARD_METADATA_KEYS`` text entries, when
      ``keep_standard`` is true and the key is not AI-related. EXIF is dropped.
    * JPEG output: the source EXIF after ``_scrub_ai_exif`` removed the
      AI-provenance tags. If the EXIF cannot be parsed or re-serialized the
      image is written without EXIF and the failure is logged.

    Args:
        source_path: Path to the source image.
        output_path: Output path (None = overwrite source).
        keep_standard: If True, preserve standard metadata fields.

    Returns:
        Path to the cleaned image.

    Raises:
        OSError: If the source cannot be read or the output cannot be written
            (propagated from ``Path.read_bytes`` / ``Image.open`` /
            ``Image.save``; these calls are not guarded).
    """
    import piexif
    from PIL import Image
    from PIL.PngImagePlugin import PngInfo

    if output_path is None:
        output_path = source_path

    # AVIF/HEIF/JPEG-XL: strip C2PA boxes at the container level without
    # re-encoding. Avoids needing PIL plugins (pillow-heif / pillow-jxl) and
    # preserves pixel data bit-for-bit.
    if source_path.suffix.lower() in (".avif", ".heif", ".heic", ".jxl"):
        from remove_ai_watermarks.noai.isobmff import strip_c2pa_boxes

        data = source_path.read_bytes()
        cleaned, stripped = strip_c2pa_boxes(data)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_bytes(cleaned)
        logger.info("Stripped %d C2PA box(es) → %s", stripped, output_path)
        return output_path

    # Read image and filter metadata. ``copy()`` detaches the pixels and the
    # ``info`` dict from the file handle, which is closed when the block exits.
    with Image.open(source_path) as img:
        img = img.copy()

    fmt = output_path.suffix.lower()

    save_kwargs: dict = {}
    if fmt in (".jpg", ".jpeg"):
        save_kwargs["format"] = "JPEG"
        # JPEG has no alpha channel or palette; besides RGBA/P this also covers
        # grayscale+alpha (LA) and palette+alpha (PA), which would otherwise
        # make ``img.save`` fail.
        if img.mode in ("RGBA", "LA", "P", "PA"):
            img = img.convert("RGB")
    else:
        save_kwargs["format"] = "PNG"

    # Collect non-AI metadata
    kept_meta: dict[str, str] = {}
    exif_data = None

    for key, value in img.info.items():
        if _is_ai_key(key):
            continue
        if key == "exif":
            # Best effort: malformed EXIF must not abort the cleanup, but say
            # so instead of dropping it silently.
            try:
                exif_data = piexif.load(value)
            except Exception as exc:
                logger.debug("Could not parse EXIF of %s; dropping it: %s", source_path, exc)
            continue
        if key in ("dpi", "gamma"):
            save_kwargs[key] = value
            continue
        if keep_standard and key in STANDARD_METADATA_KEYS:
            kept_meta[key] = str(value) if not isinstance(value, str) else value

    # Apply cleaned metadata
    if save_kwargs["format"] == "PNG" and kept_meta:
        pnginfo = PngInfo()
        for k, v in kept_meta.items():
            pnginfo.add_text(k, v)
        save_kwargs["pnginfo"] = pnginfo

    if exif_data and save_kwargs["format"] == "JPEG":
        # Scrub AI-provenance EXIF tags (xAI/Grok signature, generator tokens)
        # while keeping genuine camera/editor EXIF; PNG output drops EXIF entirely.
        if removed := _scrub_ai_exif(exif_data):
            logger.info("Scrubbed AI EXIF tag(s): %s", ", ".join(removed))
        # If the scrubbed dict cannot be serialized, fall back to writing no
        # EXIF at all (never the unscrubbed original) and make the loss visible.
        try:
            save_kwargs["exif"] = piexif.dump(exif_data)
        except Exception as exc:
            logger.warning("Could not re-serialize EXIF for %s; writing without EXIF: %s", output_path, exc)

    output_path.parent.mkdir(parents=True, exist_ok=True)
    img.save(output_path, **save_kwargs)

    logger.info("Stripped AI metadata → %s", output_path)
    return output_path
|