mirror of
https://github.com/wiltodelta/remove-ai-watermarks.git
synced 2026-05-26 14:17:47 +02:00
03fb460f77
Corpus images were gitignored (local-only). The negatives were reviewed and cleared for publishing, so the labeled set is now committed (regular git, 65 MB across 25 files) -- making the removal regression set reproducible and CI-able.

Corpus:
- Track data/synthid_corpus/images/ (pos 9, neg 15, cleaned 1); keep only the synthetic refs/ calibration fills gitignored.
- Reconcile manifest.csv to the on-disk files: 117 -> 25 rows (92 dangling rows for removed images pruned; dedup left one cleaned output, f6dd47a5).
- Rewrite the corpus README layout/policy (images committed; review every image for private content before adding -- public repo, permanent history).

Test fixtures:
- Remove data/samples/not-ai-1/2/3 (personal iPhone photos, incl. GPS EXIF).
- Add the clean_photo conftest fixture serving a verified-negative image from the corpus neg/ set; repoint the three "non-AI / clean photo" tests onto it (skips if the corpus is absent).

Metadata-source coverage (close the last sub-variant gaps):
- c2pa digitalSourceType: algorithmicMedia (procedural, not flagged AI) and compositeWithTrainedAlgorithmicMedia (AI + SynthID proxy).
- exif_generator: EXIF Artist and ImageDescription fields (Software/Make/XMP CreatorTool were already covered).

All 8 metadata-source kinds are now tested at both the unit and identify() level. 313 tests pass. CLAUDE.md updated (corpus tracked, clean_photo fixture).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
348 lines
13 KiB
Python
348 lines
13 KiB
Python
"""Tests for vendored noai submodules: constants, extractor, cleaner, c2pa."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import struct
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from remove_ai_watermarks.noai.c2pa import (
|
|
_cbor_text_after,
|
|
_parse_c2pa_chunk,
|
|
extract_c2pa_chunk,
|
|
extract_c2pa_info,
|
|
has_c2pa_metadata,
|
|
inject_c2pa_chunk,
|
|
synthid_verdict,
|
|
)
|
|
from remove_ai_watermarks.noai.cleaner import (
|
|
has_ai_content,
|
|
)
|
|
from remove_ai_watermarks.noai.cleaner import (
|
|
remove_ai_metadata as noai_remove_ai_metadata,
|
|
)
|
|
from remove_ai_watermarks.noai.constants import (
|
|
AI_KEYWORDS,
|
|
AI_METADATA_KEYS,
|
|
C2PA_CHUNK_TYPE,
|
|
PNG_SIGNATURE,
|
|
SUPPORTED_FORMATS,
|
|
)
|
|
from remove_ai_watermarks.noai.extractor import (
|
|
extract_ai_metadata,
|
|
extract_metadata,
|
|
get_ai_metadata_summary,
|
|
has_ai_metadata,
|
|
)
|
|
from remove_ai_watermarks.noai.isobmff import (
|
|
is_isobmff,
|
|
strip_c2pa_boxes,
|
|
)
|
|
|
|
# ── Constants ───────────────────────────────────────────────────────
|
|
|
|
|
|
class TestConstants:
    """Sanity checks on the values exported by ``noai.constants``."""

    def test_supported_formats_include_png(self):
        """``.png`` is one of the supported formats."""
        assert ".png" in SUPPORTED_FORMATS

    def test_supported_formats_include_jpg(self):
        """``.jpg`` is one of the supported formats."""
        assert ".jpg" in SUPPORTED_FORMATS

    def test_ai_metadata_keys_not_empty(self):
        """The AI metadata key collection holds at least one entry."""
        key_count = len(AI_METADATA_KEYS)
        assert key_count > 0

    def test_ai_keywords_not_empty(self):
        """The AI keyword collection holds at least one entry."""
        keyword_count = len(AI_KEYWORDS)
        assert keyword_count > 0

    def test_png_signature_bytes(self):
        """The PNG signature constant matches the expected 8 bytes."""
        expected_signature = b"\x89PNG\r\n\x1a\n"
        assert PNG_SIGNATURE == expected_signature

    def test_c2pa_chunk_type(self):
        """The C2PA chunk type constant is the 4-byte tag ``caBX``."""
        expected_type = b"caBX"
        assert C2PA_CHUNK_TYPE == expected_type
|
|
|
|
|
|
# ── Extractor ───────────────────────────────────────────────────────
|
|
|
|
|
|
class TestExtractor:
    """Exercise the metadata readers exposed by ``noai.extractor``."""

    def test_extract_metadata_returns_dict(self, tmp_clean_png):
        """``extract_metadata`` hands back a plain ``dict``."""
        extracted = extract_metadata(tmp_clean_png)
        assert isinstance(extracted, dict)

    def test_extract_metadata_gets_standard_keys(self, tmp_clean_png):
        """Ordinary (non-AI) metadata keys are reported."""
        # presumably the tmp_clean_png fixture writes an "Author" field -- confirm in conftest
        extracted = extract_metadata(tmp_clean_png)
        assert "Author" in extracted

    def test_extract_ai_metadata_from_ai_image(self, tmp_png_with_ai_metadata):
        """The ``parameters`` key is surfaced for the AI-tagged fixture."""
        ai_fields = extract_ai_metadata(tmp_png_with_ai_metadata)
        assert "parameters" in ai_fields

    def test_extract_ai_metadata_from_clean_image(self, tmp_clean_png):
        """A clean image yields no AI metadata entries at all."""
        ai_fields = extract_ai_metadata(tmp_clean_png)
        assert len(ai_fields) == 0

    def test_has_ai_metadata_detects(self, tmp_png_with_ai_metadata):
        """``has_ai_metadata`` is truthy for the AI-tagged fixture."""
        assert has_ai_metadata(tmp_png_with_ai_metadata)

    def test_has_ai_metadata_clean(self, tmp_clean_png):
        """``has_ai_metadata`` is falsy for the clean fixture."""
        assert not has_ai_metadata(tmp_clean_png)

    def test_summary_with_ai(self, tmp_png_with_ai_metadata):
        """The summary for an AI-tagged image carries the report heading."""
        report = get_ai_metadata_summary(tmp_png_with_ai_metadata)
        assert "AI Image Metadata" in report

    def test_summary_clean(self, tmp_clean_png):
        """The summary for a clean image states that nothing was found."""
        report = get_ai_metadata_summary(tmp_clean_png)
        assert "No AI metadata" in report
|
|
|
|
|
|
# ── Cleaner ─────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestCleaner:
    """Exercise the metadata-stripping helpers in ``noai.cleaner``."""

    def test_remove_ai_metadata(self, tmp_png_with_ai_metadata, tmp_path):
        """Cleaning writes the output file and drops the ``parameters`` key."""
        destination = tmp_path / "cleaned.png"
        noai_remove_ai_metadata(tmp_png_with_ai_metadata, destination)
        assert destination.exists()

        # Re-read the written file to confirm the AI metadata is gone.
        remaining = extract_ai_metadata(destination)
        assert "parameters" not in remaining

    def test_has_ai_content(self, tmp_png_with_ai_metadata):
        """``has_ai_content`` is truthy for the AI-tagged fixture."""
        assert has_ai_content(tmp_png_with_ai_metadata)
|
|
|
|
|
|
# ── C2PA ────────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestC2PA:
    """C2PA detection must stay negative on ordinary images without a manifest."""

    def test_no_c2pa_on_regular_png(self, tmp_clean_png):
        """A clean PNG is not reported as carrying C2PA metadata."""
        assert not has_c2pa_metadata(tmp_clean_png)

    def test_no_c2pa_on_jpeg(self, tmp_jpeg_path):
        """A JPEG fixture is not reported as carrying C2PA metadata."""
        assert not has_c2pa_metadata(tmp_jpeg_path)

    def test_extract_c2pa_none_on_regular(self, tmp_clean_png):
        """No chunk can be extracted from a clean PNG."""
        extracted = extract_c2pa_chunk(tmp_clean_png)
        assert extracted is None

    def test_extract_c2pa_info_empty(self, tmp_clean_png):
        """The info mapping for a clean PNG is an empty dict."""
        details = extract_c2pa_info(tmp_clean_png)
        assert details == {}

    def test_c2pa_returns_false_for_non_png(self, tmp_jpeg_path):
        """A non-PNG input is not reported as carrying C2PA metadata."""
        # NOTE(review): this makes the same assertion as test_no_c2pa_on_jpeg.
        assert not has_c2pa_metadata(tmp_jpeg_path)
|
|
|
|
|
|
# ``data/samples`` resolved relative to the parent of this file's directory.
SAMPLES_DIR = Path(__file__).resolve().parent.parent.joinpath("data", "samples")
|
|
|
|
|
|
@pytest.mark.skipif(not SAMPLES_DIR.exists(), reason="data/samples not present")
class TestC2PARealSamples:
    """Parser behavior on real committed C2PA images.

    The whole class is skipped when the ``data/samples`` directory does not
    exist; the individual sample files are then read straight from it.
    """

    def test_detects_c2pa_in_openai_png(self):
        """The ChatGPT sample is reported as carrying C2PA metadata."""
        assert has_c2pa_metadata(SAMPLES_DIR / "chatgpt-1.png")

    def test_extract_info_openai_fields(self):
        """All expected fields are populated for the ChatGPT sample."""
        info = extract_c2pa_info(SAMPLES_DIR / "chatgpt-1.png")
        assert info["has_c2pa"] is True
        assert "OpenAI" in info["issuer"]
        assert "c2pa_manifest" in info  # "C2PA manifest (N bytes)"
        assert "trainedAlgorithmicMedia" in info["source_type"]
        # CBOR-clean claim generator, no regex artifacts (e.g. "fGPT-4o").
        assert info["claim_generator"]
        assert not info["claim_generator"].startswith("f")
        assert "synthid_watermark" in info

    def test_extract_info_adobe_has_no_synthid(self):
        """The Firefly sample names Adobe as issuer and gets no SynthID entry."""
        info = extract_c2pa_info(SAMPLES_DIR / "firefly-1.png")
        assert "Adobe" in info["issuer"]
        assert "synthid_watermark" not in info

    def test_extract_chunk_returns_bytes(self):
        """The extracted chunk is present and tagged ``caBX``."""
        chunk = extract_c2pa_chunk(SAMPLES_DIR / "chatgpt-1.png")
        assert chunk is not None
        assert chunk[4:8] == b"caBX"  # chunk type in the 8-byte header

    def test_inject_round_trip(self, tmp_clean_png, tmp_path):
        """Extract a real C2PA chunk, inject into a clean PNG, re-detect."""
        chunk = extract_c2pa_chunk(SAMPLES_DIR / "chatgpt-1.png")
        # extract_c2pa_chunk can return None; fail here with a clear message
        # rather than handing None to inject_c2pa_chunk.
        assert chunk is not None, "chatgpt-1.png sample yielded no C2PA chunk"

        out = tmp_path / "injected.png"
        inject_c2pa_chunk(tmp_clean_png, out, chunk)
        assert has_c2pa_metadata(out)
        assert "OpenAI" in extract_c2pa_info(out)["issuer"]
|
|
|
|
|
|
class TestC2PAInjectValidation:
    """Input validation performed by ``inject_c2pa_chunk``."""

    def test_inject_rejects_non_png(self, tmp_path):
        """A ``.jpg`` source path is refused with a ``ValueError``."""
        source = tmp_path / "in.jpg"
        target = tmp_path / "out.png"
        with pytest.raises(ValueError, match="only supported for PNG"):
            inject_c2pa_chunk(source, target, b"")
|
|
|
|
|
|
# ── CBOR text extraction (parser internals) ─────────────────────────
|
|
|
|
|
|
class TestCborTextAfter:
    """_cbor_text_after handles the three CBOR text-string length prefixes."""

    def test_direct_length(self):
        """Length carried in the head byte itself."""
        # Major type 3 with a direct length: 0x60 + 3 == 0x63 for "abc".
        blob = b"name" + b"\x63" + b"abc"
        assert _cbor_text_after(blob, b"name") == "abc"

    def test_one_byte_length(self):
        """Head 0x78 followed by a single length byte."""
        text = "x" * 30
        blob = b"name" + bytes((0x78, len(text))) + text.encode("ascii")
        assert _cbor_text_after(blob, b"name") == text

    def test_two_byte_length(self):
        """Head 0x79 followed by a big-endian 16-bit length."""
        text = "y" * 300
        blob = b"name" + b"\x79" + struct.pack(">H", len(text)) + text.encode("ascii")
        assert _cbor_text_after(blob, b"name") == text

    def test_key_not_found_returns_none(self):
        """A payload without the key gives ``None``."""
        result = _cbor_text_after(b"nothing here", b"name")
        assert result is None

    def test_key_at_end_returns_none(self):
        """A key with nothing after it gives ``None``."""
        result = _cbor_text_after(b"prefixname", b"name")
        assert result is None

    def test_invalid_head_returns_none(self):
        """A byte that is not a text-string head gives ``None``."""
        # 0x00 is not a text-string head.
        blob = b"name" + b"\x00" + b"abc"
        assert _cbor_text_after(blob, b"name") is None

    def test_latin1_fallback_on_invalid_utf8(self):
        """Invalid UTF-8 content still produces a value rather than ``None``."""
        # Head 0x61 announces one byte; 0xff alone is not valid UTF-8.
        blob = b"name" + b"\x61" + b"\xff"
        assert _cbor_text_after(blob, b"name") is not None
|
|
|
|
|
|
class TestSynthIDVerdict:
    """Wording of the string produced by ``synthid_verdict``."""

    def test_format(self):
        """A single vendor name is embedded in the fixed sentence."""
        verdict = synthid_verdict("OpenAI")
        assert verdict == "likely present (OpenAI embeds SynthID with C2PA)"

    def test_multiple_vendors(self):
        """A comma-separated vendor list appears verbatim in the verdict."""
        vendors = "Google LLC, OpenAI"
        assert vendors in synthid_verdict(vendors)
|
|
|
|
|
|
class TestParseChunkGuards:
    """_parse_c2pa_chunk rejects non-printable claim_generator garbage.

    On some manifests (observed: Microsoft Designer) the first ``name`` key
    is followed by a binary hash field rather than the generator string. The
    clean issuer and the SynthID verdict must still come through in that case.
    """

    def test_clean_generator_kept(self):
        """A printable generator string is stored alongside issuer and verdict."""
        # "name" + CBOR text-string head 0x69 (0x60 + 9) + the 9-byte "gpt-image".
        raw = b"...name" + b"\x69" + b"gpt-image" + b"OpenAI trainedAlgorithmicMedia"
        parsed: dict = {}
        _parse_c2pa_chunk(raw, parsed)

        assert parsed["claim_generator"] == "gpt-image"
        assert "OpenAI" in parsed["issuer"]
        assert "synthid_watermark" in parsed  # OpenAI + trainedAlgorithmicMedia

    def test_nonprintable_generator_dropped(self):
        """A generator containing a control byte is discarded; issuer survives."""
        # "name" + CBOR head 0x64 (length 4) + a string with a control byte -> garbage.
        raw = b"...name" + b"\x64" + b"\x81abc" + b"OpenAI trainedAlgorithmicMedia"
        parsed: dict = {}
        _parse_c2pa_chunk(raw, parsed)

        assert "claim_generator" not in parsed  # control-char garbage rejected
        assert "OpenAI" in parsed["issuer"]  # issuer byte-search still robust
|
|
|
|
|
|
class TestC2PADigitalSourceType:
    """The three IPTC digitalSourceType variants drive the AI verdict.

    Only *trained* and *composite-with-trained* count as AI-generated (and so
    imply a SynthID proxy for a SynthID vendor). Plain ``algorithmicMedia`` is
    procedural (not trained) and must NOT be flagged as AI.
    """

    def test_plain_algorithmic_media_not_flagged_ai(self):
        """``algorithmicMedia`` is recorded but adds no SynthID entry."""
        # Head 0x69 announces the 9-byte generator string "some-tool".
        raw = b"...name" + b"\x69" + b"some-tool" + b" OpenAI algorithmicMedia"
        parsed: dict = {}
        _parse_c2pa_chunk(raw, parsed)

        assert parsed["source_type"] == "algorithmicMedia"
        assert "synthid_watermark" not in parsed  # procedural, not AI-generated

    def test_composite_with_trained_is_ai_and_synthid(self):
        """``compositeWithTrainedAlgorithmicMedia`` yields a SynthID entry."""
        raw = b"...name" + b"\x69" + b"some-tool" + b" OpenAI compositeWithTrainedAlgorithmicMedia"
        parsed: dict = {}
        _parse_c2pa_chunk(raw, parsed)

        assert "compositeWithTrainedAlgorithmicMedia" in parsed["source_type"]
        assert "synthid_watermark" in parsed  # AI-enhanced + OpenAI issuer
|
|
|
|
|
|
# ── ISOBMFF (AVIF / HEIF / JPEG-XL container stripping) ──────────────
|
|
|
|
# Minimal 24-byte ``ftyp`` box: 32-bit size, box type, major brand "avif",
# a zero minor version, then the compatible brands "avif" and "mif1".
FTYP = struct.pack(">I", 24) + b"ftyp" + b"avif" + struct.pack(">I", 0) + b"avif" + b"mif1"
|
|
|
|
|
|
class TestISOBMFF:
    """Container sniffing and C2PA box stripping for ISOBMFF byte strings."""

    def test_is_isobmff_true(self):
        """A buffer starting with an ``ftyp`` box is recognised."""
        assert is_isobmff(FTYP)

    def test_is_isobmff_false_for_png(self):
        """PNG-signed bytes are not mistaken for ISOBMFF."""
        png_prefix = b"\x89PNG\r\n\x1a\n\x00\x00"
        assert not is_isobmff(png_prefix)

    def test_is_isobmff_false_for_short(self):
        """A three-byte buffer is rejected."""
        assert not is_isobmff(b"abc")

    def test_strips_jpegxl_jumb_box(self):
        """JPEG-XL stores JUMBF in a ``jumb`` box, always stripped."""
        body = b"hello"
        jumb_box = struct.pack(">I", 8 + len(body)) + b"jumb" + body
        remaining, removed = strip_c2pa_boxes(FTYP + jumb_box)
        assert removed == 1
        assert remaining == FTYP

    def test_keeps_non_c2pa_box_with_64bit_size(self):
        """size==1 means a 64-bit largesize follows; non-C2PA box is kept."""
        filler = bytes(8)
        # 32-bit size of 1, the box type, then the 64-bit largesize (16-byte header + payload).
        free_box = struct.pack(">I4sQ", 1, b"free", 16 + len(filler)) + filler
        remaining, removed = strip_c2pa_boxes(FTYP + free_box)
        assert removed == 0
        assert remaining == FTYP + free_box

    def test_malformed_box_does_not_crash(self):
        """An impossible box size ends the walk without raising."""
        # A box claiming size 4 (< 8-byte header) must terminate iteration safely.
        bogus_box = struct.pack(">I", 4) + b"XXXX"
        remaining, removed = strip_c2pa_boxes(FTYP + bogus_box)
        assert removed == 0
        assert remaining.startswith(FTYP)

    def test_size_zero_box_runs_to_eof(self):
        """A zero-size box is kept intact through to the end of the data."""
        # size32==0 means the box extends to EOF; a non-C2PA box round-trips.
        open_ended_box = struct.pack(">I4s", 0, b"free") + bytes(4)
        remaining, removed = strip_c2pa_boxes(FTYP + open_ended_box)
        assert removed == 0
        assert remaining == FTYP + open_ended_box

    def test_truncated_largesize_terminates_safely(self):
        """A header that promises a largesize it does not contain is dropped."""
        # size32==1 promises a 64-bit largesize, but the box ends after 8 bytes;
        # iteration must stop rather than read the missing largesize past EOF.
        truncated_box = struct.pack(">I4s", 1, b"uuid")
        remaining, removed = strip_c2pa_boxes(FTYP + truncated_box)
        assert removed == 0
        assert remaining == FTYP
|
|
|
|
|
|
class TestC2PAInvalidSignature:
    """A .png file that is not actually PNG-signed must read as clean, not crash."""

    # Content that does not begin with the PNG signature, shared by both tests.
    _NOT_PNG = b"\xff\xd8\xff\xe0 not a png at all, just garbage bytes"

    def test_has_c2pa_false_for_non_png_bytes(self, tmp_path: Path):
        """Detection returns exactly ``False`` for a mis-named file."""
        impostor = tmp_path / "fake.png"
        impostor.write_bytes(self._NOT_PNG)
        assert has_c2pa_metadata(impostor) is False

    def test_extract_chunk_none_for_non_png_bytes(self, tmp_path: Path):
        """Chunk extraction returns ``None`` for a mis-named file."""
        impostor = tmp_path / "fake.png"
        impostor.write_bytes(self._NOT_PNG)
        assert extract_c2pa_chunk(impostor) is None
|