mirror of
https://github.com/wiltodelta/remove-ai-watermarks.git
synced 2026-05-26 22:22:24 +02:00
f07ce10c72
Detect SynthID-bearing images via their C2PA companion: a manifest signed by a
SynthID-using vendor (Google/OpenAI) on AI-generated content implies an
invisible SynthID pixel watermark. Verified end-to-end against the vendor
oracles (openai.com/verify, Gemini "Verify with SynthID").
- metadata: synthid_source() + synthid_watermark verdict in get_ai_metadata,
surfaced as a `metadata --check` callout. Format-agnostic (PNG caBX parser +
JPEG/WebP/AVIF/HEIF/JXL binary scan).
- constants: SYNTHID_C2PA_ISSUERS {Google, OpenAI}; +opened/placed actions.
- c2pa: single CBOR-aware parser (_cbor_text_after) replaces glitchy regex
(fixes fGPT-4o claim_generator); removed duplicate _scan_png_c2pa_chunk from
metadata; shared synthid_verdict / synthid_vendors_in helpers.
- corpus: scripts/synthid_corpus.py ingest tool + data/synthid_corpus/
(manifest tracked, images gitignored) for a labeled reference set.
- tests: +38 across C2PA parser internals, extract/inject round-trip, ISOBMFF
container stripping, all IPTC AI markers, and invisible watermark strength
tiers (SynthID/StableSignature/TreeRing/StegaStamp/RingID/RivaGAN/...).
Pixel-level SynthID detection remains out of reach locally (Google's decoder is
proprietary); a from-scratch spectral pilot confirmed it does not separate real
content. See CLAUDE.md for the full evaluation.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
67 lines
2.4 KiB
Python
67 lines
2.4 KiB
Python
"""Tests for the SynthID corpus ingestion script."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import csv
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
from click.testing import CliRunner
|
|
|
|
# scripts/ is not an installed package; add it to the path for import.
|
|
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "scripts"))
|
|
|
|
import synthid_corpus
|
|
|
|
# Sample images live in <repo>/data/samples, two directory levels above this test file.
SAMPLES_DIR = Path(__file__).resolve().parent.parent.joinpath("data", "samples")
|
|
|
|
|
|
def _manifest_rows(root: Path) -> list[dict[str, str]]:
    """Return every row of ``root / "manifest.csv"`` as a header-keyed dict.

    Args:
        root: Directory that contains ``manifest.csv``.

    Returns:
        One dict per data row, in file order, as produced by ``csv.DictReader``.

    Raises:
        FileNotFoundError: If ``manifest.csv`` does not exist under ``root``.
    """
    manifest_path = root / "manifest.csv"
    # newline="" lets the csv module handle line endings itself.
    with manifest_path.open(newline="") as handle:
        reader = csv.DictReader(handle)
        return [row for row in reader]
|
|
|
|
|
|
@pytest.mark.skipif(not SAMPLES_DIR.exists(), reason="data/samples not present")
class TestIngest:
    """Exercise the ``ingest`` and ``status`` commands of ``synthid_corpus.cli``.

    Every test points ``--root`` at its own ``tmp_path``. The whole class is
    skipped when the ``SAMPLES_DIR`` directory does not exist.
    """

    def test_ingest_openai_flags_synthid_metadata(self, tmp_path: Path) -> None:
        """Ingesting ``chatgpt-1.png`` as ``pos`` writes one SynthID-flagged row."""
        runner = CliRunner()
        result = runner.invoke(
            synthid_corpus.cli,
            ["ingest", str(SAMPLES_DIR / "chatgpt-1.png"), "--label", "pos", "--root", str(tmp_path)],
        )
        assert result.exit_code == 0, result.output

        rows = _manifest_rows(tmp_path)
        assert len(rows) == 1
        row = rows[0]
        assert row["label"] == "pos"
        assert row["synthid_metadata"] == "yes"
        assert int(row["width"]) > 0
        assert int(row["height"]) > 0
        # The copied file lands under images/pos/ with a sha-prefixed name.
        assert (tmp_path / "images" / "pos" / row["filename"]).exists()

    def test_ingest_firefly_not_flagged(self, tmp_path: Path) -> None:
        """Ingesting ``firefly-1.png`` as ``neg`` leaves ``synthid_metadata`` empty."""
        runner = CliRunner()
        result = runner.invoke(
            synthid_corpus.cli,
            ["ingest", str(SAMPLES_DIR / "firefly-1.png"), "--label", "neg", "--root", str(tmp_path)],
        )
        # Fail here with the CLI output instead of a missing-manifest error below.
        assert result.exit_code == 0, result.output

        rows = _manifest_rows(tmp_path)
        assert len(rows) == 1
        assert rows[0]["synthid_metadata"] == ""  # Adobe signs C2PA but not SynthID

    def test_ingest_dedupes_by_sha256(self, tmp_path: Path) -> None:
        """Ingesting the same file twice leaves a single manifest row."""
        runner = CliRunner()
        args = ["ingest", str(SAMPLES_DIR / "chatgpt-1.png"), "--label", "pos", "--root", str(tmp_path)]

        first = runner.invoke(synthid_corpus.cli, args)
        # The first ingest must succeed, otherwise the row count below is meaningless.
        assert first.exit_code == 0, first.output

        # Second time: duplicate. Its exit code is deliberately not asserted;
        # NOTE(review): how the CLI reports a duplicate is not visible from this file.
        runner.invoke(synthid_corpus.cli, args)

        assert len(_manifest_rows(tmp_path)) == 1

    def test_status_on_empty_corpus(self, tmp_path: Path) -> None:
        """``status`` on an empty root exits 0 and mentions "empty"."""
        runner = CliRunner()
        result = runner.invoke(synthid_corpus.cli, ["status", "--root", str(tmp_path)])
        assert result.exit_code == 0, result.output
        assert "empty" in result.output.lower(), result.output
|