"""Tests for vendored noai submodules: constants, extractor, cleaner, c2pa.""" from __future__ import annotations import struct from pathlib import Path import pytest from remove_ai_watermarks.noai.c2pa import ( _cbor_text_after, _parse_c2pa_chunk, extract_c2pa_chunk, extract_c2pa_info, has_c2pa_metadata, inject_c2pa_chunk, synthid_verdict, ) from remove_ai_watermarks.noai.cleaner import ( has_ai_content, ) from remove_ai_watermarks.noai.cleaner import ( remove_ai_metadata as noai_remove_ai_metadata, ) from remove_ai_watermarks.noai.constants import ( AI_KEYWORDS, AI_METADATA_KEYS, C2PA_CHUNK_TYPE, PNG_SIGNATURE, SUPPORTED_FORMATS, ) from remove_ai_watermarks.noai.extractor import ( extract_ai_metadata, extract_metadata, get_ai_metadata_summary, has_ai_metadata, ) from remove_ai_watermarks.noai.isobmff import ( is_isobmff, strip_c2pa_boxes, ) # ── Constants ─────────────────────────────────────────────────────── class TestConstants: """Verify constant integrity.""" def test_supported_formats_include_png(self): assert ".png" in SUPPORTED_FORMATS def test_supported_formats_include_jpg(self): assert ".jpg" in SUPPORTED_FORMATS def test_ai_metadata_keys_not_empty(self): assert len(AI_METADATA_KEYS) > 0 def test_ai_keywords_not_empty(self): assert len(AI_KEYWORDS) > 0 def test_png_signature_bytes(self): assert PNG_SIGNATURE == b"\x89PNG\r\n\x1a\n" def test_c2pa_chunk_type(self): assert C2PA_CHUNK_TYPE == b"caBX" # ── Extractor ─────────────────────────────────────────────────────── class TestExtractor: """Tests for noai.extractor functions.""" def test_extract_metadata_returns_dict(self, tmp_clean_png): meta = extract_metadata(tmp_clean_png) assert isinstance(meta, dict) def test_extract_metadata_gets_standard_keys(self, tmp_clean_png): meta = extract_metadata(tmp_clean_png) assert "Author" in meta def test_extract_ai_metadata_from_ai_image(self, tmp_png_with_ai_metadata): meta = extract_ai_metadata(tmp_png_with_ai_metadata) assert "parameters" in meta def test_extract_ai_metadata_from_clean_image(self, tmp_clean_png): meta = extract_ai_metadata(tmp_clean_png) assert len(meta) == 0 def test_has_ai_metadata_detects(self, tmp_png_with_ai_metadata): assert has_ai_metadata(tmp_png_with_ai_metadata) def test_has_ai_metadata_clean(self, tmp_clean_png): assert not has_ai_metadata(tmp_clean_png) def test_summary_with_ai(self, tmp_png_with_ai_metadata): summary = get_ai_metadata_summary(tmp_png_with_ai_metadata) assert "AI Image Metadata" in summary def test_summary_clean(self, tmp_clean_png): summary = get_ai_metadata_summary(tmp_clean_png) assert "No AI metadata" in summary # ── Cleaner ───────────────────────────────────────────────────────── class TestCleaner: """Tests for noai.cleaner functions.""" def test_remove_ai_metadata(self, tmp_png_with_ai_metadata, tmp_path): output = tmp_path / "cleaned.png" noai_remove_ai_metadata(tmp_png_with_ai_metadata, output) assert output.exists() # Verify AI metadata removed meta = extract_ai_metadata(output) assert "parameters" not in meta def test_has_ai_content(self, tmp_png_with_ai_metadata): assert has_ai_content(tmp_png_with_ai_metadata) # ── C2PA ──────────────────────────────────────────────────────────── class TestC2PA: """Tests for C2PA detection on regular (non-C2PA) images.""" def test_no_c2pa_on_regular_png(self, tmp_clean_png): assert not has_c2pa_metadata(tmp_clean_png) def test_no_c2pa_on_jpeg(self, tmp_jpeg_path): assert not has_c2pa_metadata(tmp_jpeg_path) def test_extract_c2pa_none_on_regular(self, tmp_clean_png): assert extract_c2pa_chunk(tmp_clean_png) is None def test_extract_c2pa_info_empty(self, tmp_clean_png): info = extract_c2pa_info(tmp_clean_png) assert info == {} def test_c2pa_returns_false_for_non_png(self, tmp_jpeg_path): assert not has_c2pa_metadata(tmp_jpeg_path) SAMPLES_DIR = Path(__file__).resolve().parent.parent / "data" / "samples" @pytest.mark.skipif(not SAMPLES_DIR.exists(), reason="data/samples not present") class TestC2PARealSamples: """Parser behavior on real committed C2PA images.""" def test_detects_c2pa_in_openai_png(self): assert has_c2pa_metadata(SAMPLES_DIR / "chatgpt-1.png") def test_extract_info_openai_fields(self): info = extract_c2pa_info(SAMPLES_DIR / "chatgpt-1.png") assert info["has_c2pa"] is True assert "OpenAI" in info["issuer"] assert "c2pa_manifest" in info # "C2PA manifest (N bytes)" assert "trainedAlgorithmicMedia" in info["source_type"] # CBOR-clean claim generator, no regex artifacts (e.g. "fGPT-4o"). assert info["claim_generator"] assert not info["claim_generator"].startswith("f") assert "synthid_watermark" in info def test_extract_info_adobe_has_no_synthid(self): info = extract_c2pa_info(SAMPLES_DIR / "firefly-1.png") assert "Adobe" in info["issuer"] assert "synthid_watermark" not in info def test_extract_chunk_returns_bytes(self): chunk = extract_c2pa_chunk(SAMPLES_DIR / "chatgpt-1.png") assert chunk is not None assert chunk[4:8] == b"caBX" # chunk type in the 8-byte header def test_inject_round_trip(self, tmp_clean_png, tmp_path): """Extract a real C2PA chunk, inject into a clean PNG, re-detect.""" chunk = extract_c2pa_chunk(SAMPLES_DIR / "chatgpt-1.png") out = tmp_path / "injected.png" inject_c2pa_chunk(tmp_clean_png, out, chunk) assert has_c2pa_metadata(out) assert "OpenAI" in extract_c2pa_info(out)["issuer"] class TestC2PAInjectValidation: def test_inject_rejects_non_png(self, tmp_path): with pytest.raises(ValueError, match="only supported for PNG"): inject_c2pa_chunk(tmp_path / "in.jpg", tmp_path / "out.png", b"") # ── CBOR text extraction (parser internals) ───────────────────────── class TestCborTextAfter: """_cbor_text_after handles the three CBOR text-string length prefixes.""" def test_direct_length(self): # major-type 3, direct length (0x60 + len). "abc" -> 0x63. payload = b"name" + bytes([0x63]) + b"abc" assert _cbor_text_after(payload, b"name") == "abc" def test_one_byte_length(self): s = b"x" * 30 payload = b"name" + bytes([0x78, 30]) + s assert _cbor_text_after(payload, b"name") == "x" * 30 def test_two_byte_length(self): s = b"y" * 300 payload = b"name" + bytes([0x79]) + struct.pack(">H", 300) + s assert _cbor_text_after(payload, b"name") == "y" * 300 def test_key_not_found_returns_none(self): assert _cbor_text_after(b"nothing here", b"name") is None def test_key_at_end_returns_none(self): assert _cbor_text_after(b"prefixname", b"name") is None def test_invalid_head_returns_none(self): # 0x00 is not a text-string head. assert _cbor_text_after(b"name" + bytes([0x00]) + b"abc", b"name") is None def test_latin1_fallback_on_invalid_utf8(self): payload = b"name" + bytes([0x61]) + b"\xff" # len 1, invalid utf-8 assert _cbor_text_after(payload, b"name") is not None class TestSynthIDVerdict: def test_format(self): assert synthid_verdict("OpenAI") == "likely present (OpenAI embeds SynthID with C2PA)" def test_multiple_vendors(self): assert "Google LLC, OpenAI" in synthid_verdict("Google LLC, OpenAI") class TestParseChunkGuards: """_parse_c2pa_chunk rejects non-printable claim_generator garbage. On some manifests (observed: Microsoft Designer) the first ``name`` key precedes a binary hash field, not the generator string. The clean issuer + SynthID verdict must still come through. """ def test_clean_generator_kept(self): # "name" + CBOR text-string (head 0x69 = 0x60+9) "gpt-image" chunk = b"...name" + bytes([0x69]) + b"gpt-image" + b"OpenAI trainedAlgorithmicMedia" info: dict = {} _parse_c2pa_chunk(chunk, info) assert info["claim_generator"] == "gpt-image" assert "OpenAI" in info["issuer"] assert "synthid_watermark" in info # OpenAI + trainedAlgorithmicMedia def test_nonprintable_generator_dropped(self): # "name" + CBOR string (head 0x64 = len 4) with a control byte -> garbage chunk = b"...name" + bytes([0x64]) + b"\x81abc" + b"OpenAI trainedAlgorithmicMedia" info: dict = {} _parse_c2pa_chunk(chunk, info) assert "claim_generator" not in info # control-char garbage rejected assert "OpenAI" in info["issuer"] # issuer byte-search still robust # ── ISOBMFF (AVIF / HEIF / JPEG-XL container stripping) ────────────── FTYP = b"\x00\x00\x00\x18ftypavif\x00\x00\x00\x00avifmif1" # 24-byte ftyp box class TestISOBMFF: def test_is_isobmff_true(self): assert is_isobmff(FTYP) def test_is_isobmff_false_for_png(self): assert not is_isobmff(b"\x89PNG\r\n\x1a\n\x00\x00") def test_is_isobmff_false_for_short(self): assert not is_isobmff(b"abc") def test_strips_jpegxl_jumb_box(self): """JPEG-XL stores JUMBF in a ``jumb`` box, always stripped.""" jumb = struct.pack(">I", 8 + 5) + b"jumb" + b"hello" cleaned, stripped = strip_c2pa_boxes(FTYP + jumb) assert stripped == 1 assert cleaned == FTYP def test_keeps_non_c2pa_box_with_64bit_size(self): """size==1 means a 64-bit largesize follows; non-C2PA box is kept.""" payload = b"\x00" * 8 box = b"\x00\x00\x00\x01" + b"free" + struct.pack(">Q", 16 + len(payload)) + payload cleaned, stripped = strip_c2pa_boxes(FTYP + box) assert stripped == 0 assert cleaned == FTYP + box def test_malformed_box_does_not_crash(self): # A box claiming size 4 (< 8-byte header) must terminate iteration safely. cleaned, stripped = strip_c2pa_boxes(FTYP + b"\x00\x00\x00\x04XXXX") assert stripped == 0 assert cleaned.startswith(FTYP)