Files
remove-ai-watermarks/tests/test_noai.py
T
test-user 03fb460f77 Track the labeled SynthID corpus; complete metadata-source test coverage
Corpus images were gitignored (local-only). The negatives were reviewed and
cleared for publishing, so the labeled set is now committed (regular git, 65 MB
across 25 files) -- making the removal regression set reproducible and CI-able.

Corpus:
- Track data/synthid_corpus/images/ (pos 9, neg 15, cleaned 1); keep only the
  synthetic refs/ calibration fills gitignored.
- Reconcile manifest.csv to the on-disk files: 117 -> 25 rows (92 dangling rows
  for removed images pruned; dedup left one cleaned output, f6dd47a5).
- Rewrite the corpus README layout/policy (images committed; review every image
  for private content before adding -- public repo, permanent history).

Test fixtures:
- Remove data/samples/not-ai-1/2/3 (personal iPhone photos, incl. GPS EXIF).
- Add the clean_photo conftest fixture serving a verified-negative image from
  the corpus neg/ set; repoint the three "non-AI / clean photo" tests onto it
  (skips if the corpus is absent).

Metadata-source coverage (close the last sub-variant gaps):
- c2pa digitalSourceType: algorithmicMedia (procedural, not flagged AI) and
  compositeWithTrainedAlgorithmicMedia (AI + SynthID proxy).
- exif_generator: EXIF Artist and ImageDescription fields (Software/Make/XMP
  CreatorTool were already covered).

All 8 metadata-source kinds are now tested at both the unit and identify()
level. 313 tests pass. CLAUDE.md updated (corpus tracked, clean_photo fixture).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-25 14:46:47 -07:00

348 lines
13 KiB
Python

"""Tests for vendored noai submodules: constants, extractor, cleaner, c2pa."""
from __future__ import annotations
import struct
from pathlib import Path
import pytest
from remove_ai_watermarks.noai.c2pa import (
_cbor_text_after,
_parse_c2pa_chunk,
extract_c2pa_chunk,
extract_c2pa_info,
has_c2pa_metadata,
inject_c2pa_chunk,
synthid_verdict,
)
from remove_ai_watermarks.noai.cleaner import (
has_ai_content,
)
from remove_ai_watermarks.noai.cleaner import (
remove_ai_metadata as noai_remove_ai_metadata,
)
from remove_ai_watermarks.noai.constants import (
AI_KEYWORDS,
AI_METADATA_KEYS,
C2PA_CHUNK_TYPE,
PNG_SIGNATURE,
SUPPORTED_FORMATS,
)
from remove_ai_watermarks.noai.extractor import (
extract_ai_metadata,
extract_metadata,
get_ai_metadata_summary,
has_ai_metadata,
)
from remove_ai_watermarks.noai.isobmff import (
is_isobmff,
strip_c2pa_boxes,
)
# ── Constants ───────────────────────────────────────────────────────
class TestConstants:
    """Sanity checks on the values exported by ``noai.constants``."""

    def test_supported_formats_include_png(self):
        """``.png`` is listed among the supported formats."""
        assert ".png" in SUPPORTED_FORMATS

    def test_supported_formats_include_jpg(self):
        """``.jpg`` is listed among the supported formats."""
        assert ".jpg" in SUPPORTED_FORMATS

    def test_ai_metadata_keys_not_empty(self):
        """The AI metadata key collection holds at least one entry."""
        key_count = len(AI_METADATA_KEYS)
        assert key_count > 0

    def test_ai_keywords_not_empty(self):
        """The AI keyword collection holds at least one entry."""
        keyword_count = len(AI_KEYWORDS)
        assert keyword_count > 0

    def test_png_signature_bytes(self):
        """``PNG_SIGNATURE`` equals the expected 8-byte literal."""
        expected_signature = b"\x89PNG\r\n\x1a\n"
        assert PNG_SIGNATURE == expected_signature

    def test_c2pa_chunk_type(self):
        """``C2PA_CHUNK_TYPE`` equals the ``caBX`` chunk tag."""
        expected_chunk_type = b"caBX"
        assert C2PA_CHUNK_TYPE == expected_chunk_type
# ── Extractor ───────────────────────────────────────────────────────
class TestExtractor:
    """Exercise the metadata readers exposed by ``noai.extractor``."""

    def test_extract_metadata_returns_dict(self, tmp_clean_png):
        """``extract_metadata`` hands back a ``dict`` instance."""
        assert isinstance(extract_metadata(tmp_clean_png), dict)

    def test_extract_metadata_gets_standard_keys(self, tmp_clean_png):
        """An ``Author`` key is expected in the clean PNG fixture's metadata."""
        all_metadata = extract_metadata(tmp_clean_png)
        assert "Author" in all_metadata

    def test_extract_ai_metadata_from_ai_image(self, tmp_png_with_ai_metadata):
        """The AI-tagged fixture reports a ``parameters`` entry."""
        ai_metadata = extract_ai_metadata(tmp_png_with_ai_metadata)
        assert "parameters" in ai_metadata

    def test_extract_ai_metadata_from_clean_image(self, tmp_clean_png):
        """No AI metadata entries are reported for the clean fixture."""
        assert len(extract_ai_metadata(tmp_clean_png)) == 0

    def test_has_ai_metadata_detects(self, tmp_png_with_ai_metadata):
        """``has_ai_metadata`` is truthy for the AI-tagged fixture."""
        detected = has_ai_metadata(tmp_png_with_ai_metadata)
        assert detected

    def test_has_ai_metadata_clean(self, tmp_clean_png):
        """``has_ai_metadata`` is falsy for the clean fixture."""
        detected = has_ai_metadata(tmp_clean_png)
        assert not detected

    def test_summary_with_ai(self, tmp_png_with_ai_metadata):
        """The summary for the AI-tagged fixture carries the report heading."""
        report = get_ai_metadata_summary(tmp_png_with_ai_metadata)
        assert "AI Image Metadata" in report

    def test_summary_clean(self, tmp_clean_png):
        """The summary for the clean fixture states that nothing was found."""
        report = get_ai_metadata_summary(tmp_clean_png)
        assert "No AI metadata" in report
# ── Cleaner ─────────────────────────────────────────────────────────
class TestCleaner:
    """Exercise the metadata-stripping helpers exposed by ``noai.cleaner``."""

    def test_remove_ai_metadata(self, tmp_png_with_ai_metadata, tmp_path):
        """Cleaning writes the output file and drops the ``parameters`` entry."""
        cleaned_path = tmp_path / "cleaned.png"
        noai_remove_ai_metadata(tmp_png_with_ai_metadata, cleaned_path)
        assert cleaned_path.exists()

        # Re-read the written file: "parameters" must no longer be reported
        # as AI metadata.
        remaining = extract_ai_metadata(cleaned_path)
        assert "parameters" not in remaining

    def test_has_ai_content(self, tmp_png_with_ai_metadata):
        """``has_ai_content`` is truthy for the AI-tagged fixture."""
        flagged = has_ai_content(tmp_png_with_ai_metadata)
        assert flagged
# ── C2PA ────────────────────────────────────────────────────────────
class TestC2PA:
    """C2PA detection must stay negative on ordinary (non-C2PA) images."""

    def test_no_c2pa_on_regular_png(self, tmp_clean_png):
        """The clean PNG fixture is not reported as carrying C2PA."""
        found = has_c2pa_metadata(tmp_clean_png)
        assert not found

    def test_no_c2pa_on_jpeg(self, tmp_jpeg_path):
        """The JPEG fixture is not reported as carrying C2PA."""
        found = has_c2pa_metadata(tmp_jpeg_path)
        assert not found

    def test_extract_c2pa_none_on_regular(self, tmp_clean_png):
        """Chunk extraction yields ``None`` for the clean PNG fixture."""
        chunk = extract_c2pa_chunk(tmp_clean_png)
        assert chunk is None

    def test_extract_c2pa_info_empty(self, tmp_clean_png):
        """Info extraction yields an empty dict for the clean PNG fixture."""
        assert extract_c2pa_info(tmp_clean_png) == {}

    def test_c2pa_returns_false_for_non_png(self, tmp_jpeg_path):
        """Same fixture and expectation as ``test_no_c2pa_on_jpeg``."""
        found = has_c2pa_metadata(tmp_jpeg_path)
        assert not found
# Sample images live two levels up from this file, under data/samples.
SAMPLES_DIR = Path(__file__).resolve().parent.parent.joinpath("data", "samples")
@pytest.mark.skipif(not SAMPLES_DIR.exists(), reason="data/samples not present")
class TestC2PARealSamples:
    """Parser behavior on real committed C2PA images.

    The whole class is skipped when the ``data/samples`` directory is absent.
    """

    def test_detects_c2pa_in_openai_png(self):
        """The OpenAI sample is reported as carrying C2PA metadata."""
        assert has_c2pa_metadata(SAMPLES_DIR / "chatgpt-1.png")

    def test_extract_info_openai_fields(self):
        """The OpenAI sample yields issuer, manifest, source type and SynthID keys."""
        info = extract_c2pa_info(SAMPLES_DIR / "chatgpt-1.png")
        assert info["has_c2pa"] is True
        assert "OpenAI" in info["issuer"]
        assert "c2pa_manifest" in info  # "C2PA manifest (N bytes)"
        assert "trainedAlgorithmicMedia" in info["source_type"]
        # CBOR-clean claim generator, no regex artifacts (e.g. "fGPT-4o").
        assert info["claim_generator"]
        assert not info["claim_generator"].startswith("f")
        assert "synthid_watermark" in info

    def test_extract_info_adobe_has_no_synthid(self):
        """The Adobe sample names Adobe as issuer and carries no SynthID key."""
        info = extract_c2pa_info(SAMPLES_DIR / "firefly-1.png")
        assert "Adobe" in info["issuer"]
        assert "synthid_watermark" not in info

    def test_extract_chunk_returns_bytes(self):
        """Chunk extraction returns data whose bytes 4..8 are the ``caBX`` tag."""
        chunk = extract_c2pa_chunk(SAMPLES_DIR / "chatgpt-1.png")
        assert chunk is not None
        assert chunk[4:8] == b"caBX"  # chunk type in the 8-byte header

    def test_inject_round_trip(self, tmp_clean_png, tmp_path):
        """Extract a real C2PA chunk, inject into a clean PNG, re-detect."""
        chunk = extract_c2pa_chunk(SAMPLES_DIR / "chatgpt-1.png")
        # extract_c2pa_chunk yields None when it finds no chunk; fail here with
        # a clear message instead of handing None to inject_c2pa_chunk.
        assert chunk is not None, "no C2PA chunk extracted from chatgpt-1.png"
        out = tmp_path / "injected.png"
        inject_c2pa_chunk(tmp_clean_png, out, chunk)
        assert has_c2pa_metadata(out)
        assert "OpenAI" in extract_c2pa_info(out)["issuer"]
class TestC2PAInjectValidation:
    """Input validation performed by ``inject_c2pa_chunk``."""

    def test_inject_rejects_non_png(self, tmp_path):
        """A ``.jpg`` source path is expected to raise ``ValueError``."""
        jpeg_source = tmp_path / "in.jpg"
        png_target = tmp_path / "out.png"
        with pytest.raises(ValueError, match="only supported for PNG"):
            inject_c2pa_chunk(jpeg_source, png_target, b"")
# ── CBOR text extraction (parser internals) ─────────────────────────
class TestCborTextAfter:
    """``_cbor_text_after`` decodes the text string that follows a key.

    Covers the three CBOR text-string length prefixes (direct, one-byte and
    two-byte) plus the cases where ``None`` is expected.
    """

    def test_direct_length(self):
        """Length packed into the head byte itself."""
        # Major type 3 with a direct length: 0x60 + 3 == 0x63 for "abc".
        head = bytes([0x60 + 3])
        decoded = _cbor_text_after(b"name" + head + b"abc", b"name")
        assert decoded == "abc"

    def test_one_byte_length(self):
        """Head 0x78 followed by a single length byte."""
        text = "x" * 30
        head = bytes([0x78, len(text)])
        decoded = _cbor_text_after(b"name" + head + text.encode("ascii"), b"name")
        assert decoded == text

    def test_two_byte_length(self):
        """Head 0x79 followed by a big-endian 16-bit length."""
        text = "y" * 300
        head = struct.pack(">BH", 0x79, len(text))
        decoded = _cbor_text_after(b"name" + head + text.encode("ascii"), b"name")
        assert decoded == text

    def test_key_not_found_returns_none(self):
        """A payload without the key yields ``None``."""
        decoded = _cbor_text_after(b"nothing here", b"name")
        assert decoded is None

    def test_key_at_end_returns_none(self):
        """A key with nothing after it yields ``None``."""
        decoded = _cbor_text_after(b"prefixname", b"name")
        assert decoded is None

    def test_invalid_head_returns_none(self):
        """A head byte that is not a text-string head yields ``None``."""
        # 0x00 is not a text-string head.
        payload = b"name" + b"\x00" + b"abc"
        assert _cbor_text_after(payload, b"name") is None

    def test_latin1_fallback_on_invalid_utf8(self):
        """A one-byte string that is invalid UTF-8 still yields a value."""
        # Direct length 1 (0x61) followed by a byte that is not valid UTF-8.
        payload = b"name" + bytes([0x60 + 1]) + b"\xff"
        decoded = _cbor_text_after(payload, b"name")
        assert decoded is not None
class TestSynthIDVerdict:
    """Wording of the string produced by ``synthid_verdict``."""

    def test_format(self):
        """A single vendor is embedded in the fixed verdict sentence."""
        verdict = synthid_verdict("OpenAI")
        assert verdict == "likely present (OpenAI embeds SynthID with C2PA)"

    def test_multiple_vendors(self):
        """A comma-separated vendor list appears verbatim in the verdict."""
        vendors = "Google LLC, OpenAI"
        assert vendors in synthid_verdict(vendors)
class TestParseChunkGuards:
    """``_parse_c2pa_chunk`` discards non-printable claim_generator garbage.

    On some manifests (observed: Microsoft Designer) the first ``name`` key
    sits in front of a binary hash field rather than the generator string.
    The clean issuer and the SynthID verdict must still come through.
    """

    def test_clean_generator_kept(self):
        """A printable generator string is stored alongside issuer and SynthID."""
        generator = b"gpt-image"
        # CBOR text-string head with direct length: 0x60 + 9 == 0x69.
        head = bytes([0x60 + len(generator)])
        chunk = b"...name" + head + generator + b"OpenAI trainedAlgorithmicMedia"
        parsed: dict = {}
        _parse_c2pa_chunk(chunk, parsed)
        assert parsed["claim_generator"] == "gpt-image"
        assert "OpenAI" in parsed["issuer"]
        assert "synthid_watermark" in parsed  # OpenAI + trainedAlgorithmicMedia

    def test_nonprintable_generator_dropped(self):
        """A generator candidate containing a control byte is not stored."""
        # Head 0x64 announces 4 bytes; the first of them makes the string garbage.
        garbage = b"\x81abc"
        head = bytes([0x60 + len(garbage)])
        chunk = b"...name" + head + garbage + b"OpenAI trainedAlgorithmicMedia"
        parsed: dict = {}
        _parse_c2pa_chunk(chunk, parsed)
        assert "claim_generator" not in parsed  # control-char garbage rejected
        assert "OpenAI" in parsed["issuer"]  # issuer byte-search still robust
class TestC2PADigitalSourceType:
    """The three IPTC digitalSourceType variants drive the AI verdict.

    Only the *trained* and *composite-with-trained* variants mean AI-generated
    (and so imply a SynthID proxy for a SynthID vendor). Plain
    ``algorithmicMedia`` is procedural (not trained) and must NOT be flagged
    as AI.
    """

    def test_plain_algorithmic_media_not_flagged_ai(self):
        """Plain ``algorithmicMedia`` is recorded but adds no SynthID key."""
        tool = b"some-tool"
        head = bytes([0x60 + len(tool)])  # 0x69: direct-length text-string head
        chunk = b"...name" + head + tool + b" OpenAI algorithmicMedia"
        parsed: dict = {}
        _parse_c2pa_chunk(chunk, parsed)
        assert parsed["source_type"] == "algorithmicMedia"
        assert "synthid_watermark" not in parsed  # procedural, not AI-generated

    def test_composite_with_trained_is_ai_and_synthid(self):
        """The composite-with-trained variant is recorded and adds a SynthID key."""
        tool = b"some-tool"
        head = bytes([0x60 + len(tool)])  # 0x69: direct-length text-string head
        chunk = (
            b"...name" + head + tool + b" OpenAI compositeWithTrainedAlgorithmicMedia"
        )
        parsed: dict = {}
        _parse_c2pa_chunk(chunk, parsed)
        assert "compositeWithTrainedAlgorithmicMedia" in parsed["source_type"]
        assert "synthid_watermark" in parsed  # AI-enhanced + OpenAI issuer
# ── ISOBMFF (AVIF / HEIF / JPEG-XL container stripping) ──────────────
# Minimal 24-byte ftyp box used as the leading box of every ISOBMFF test input.
FTYP = (
    b"\x00\x00\x00\x18"  # 32-bit box size: 0x18 == 24
    b"ftyp"  # box type
    b"avif"  # major brand
    b"\x00\x00\x00\x00"  # minor version
    b"avifmif1"  # compatible brands: "avif", "mif1"
)
class TestISOBMFF:
    """Container sniffing and C2PA box stripping for ISOBMFF byte streams."""

    def test_is_isobmff_true(self):
        """A stream that is just the ftyp box is recognized."""
        recognized = is_isobmff(FTYP)
        assert recognized

    def test_is_isobmff_false_for_png(self):
        """Bytes starting with the PNG signature are not recognized."""
        recognized = is_isobmff(b"\x89PNG\r\n\x1a\n\x00\x00")
        assert not recognized

    def test_is_isobmff_false_for_short(self):
        """A three-byte input is not recognized."""
        recognized = is_isobmff(b"abc")
        assert not recognized

    def test_strips_jpegxl_jumb_box(self):
        """JPEG-XL stores JUMBF in a ``jumb`` box, always stripped."""
        body = b"hello"
        # 8-byte header (32-bit size + type) followed by the body.
        jumb_box = struct.pack(">I4s", 8 + len(body), b"jumb") + body
        cleaned, stripped = strip_c2pa_boxes(FTYP + jumb_box)
        assert stripped == 1
        assert cleaned == FTYP

    def test_keeps_non_c2pa_box_with_64bit_size(self):
        """size==1 means a 64-bit largesize follows; non-C2PA box is kept."""
        payload = bytes(8)
        # size32 == 1, type "free", then the 64-bit largesize covering the
        # 16-byte extended header plus the payload.
        header = struct.pack(">I4sQ", 1, b"free", 16 + len(payload))
        stream = FTYP + header + payload
        cleaned, stripped = strip_c2pa_boxes(stream)
        assert stripped == 0
        assert cleaned == stream

    def test_malformed_box_does_not_crash(self):
        """A box claiming size 4 (< 8-byte header) must end iteration safely."""
        undersized_box = struct.pack(">I4s", 4, b"XXXX")
        cleaned, stripped = strip_c2pa_boxes(FTYP + undersized_box)
        assert stripped == 0
        assert cleaned.startswith(FTYP)

    def test_size_zero_box_runs_to_eof(self):
        """size32==0 means the box extends to EOF; a non-C2PA box round-trips."""
        open_ended_box = struct.pack(">I4sI", 0, b"free", 0)
        stream = FTYP + open_ended_box
        cleaned, stripped = strip_c2pa_boxes(stream)
        assert stripped == 0
        assert cleaned == stream

    def test_truncated_largesize_terminates_safely(self):
        """A promised 64-bit largesize that is missing must not be read."""
        # size32==1 promises a 64-bit largesize, but the box ends after its
        # 8-byte header; iteration must stop rather than read past EOF.
        truncated_box = struct.pack(">I4s", 1, b"uuid")
        cleaned, stripped = strip_c2pa_boxes(FTYP + truncated_box)
        assert stripped == 0
        assert cleaned == FTYP
class TestC2PAInvalidSignature:
    """A ``.png`` file without a PNG signature must read as clean, not crash."""

    @staticmethod
    def _write_fake_png(directory):
        """Write non-PNG bytes to ``fake.png`` in *directory* and return its path."""
        target = directory / "fake.png"
        target.write_bytes(b"\xff\xd8\xff\xe0 not a png at all, just garbage bytes")
        return target

    def test_has_c2pa_false_for_non_png_bytes(self, tmp_path: Path):
        """Detection returns exactly ``False`` for the mislabeled file."""
        mislabeled = self._write_fake_png(tmp_path)
        assert has_c2pa_metadata(mislabeled) is False

    def test_extract_chunk_none_for_non_png_bytes(self, tmp_path: Path):
        """Chunk extraction returns ``None`` for the mislabeled file."""
        mislabeled = self._write_fake_png(tmp_path)
        assert extract_c2pa_chunk(mislabeled) is None