mirror of
https://github.com/wiltodelta/remove-ai-watermarks.git
synced 2026-05-26 14:17:47 +02:00
fix(c2pa): drop non-printable claim_generator garbage
On some manifests (observed: Microsoft Designer) the first CBOR "name" key precedes a binary hash field rather than the generator string, so _cbor_text_after returns control-character garbage. Guard claim_generator with isprintable() so that garbage is dropped instead of being stored; the same guard is applied to the specVersion value. Issuer detection (a byte search) and the SynthID verdict are unaffected. Adds TestParseChunkGuards covering the kept and dropped cases. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -196,9 +196,12 @@ def _parse_c2pa_chunk(chunk_data: bytes, c2pa_info: dict[str, Any]) -> None:
|
||||
|
||||
# Claim generator and spec version: read the CBOR text-string values
|
||||
# directly (regex byte-grabbing produced artifacts like ``fGPT-4o``).
|
||||
if generator := _cbor_text_after(chunk_data, b"name"):
|
||||
# Guard with isprintable(): on some manifests (e.g. Microsoft Designer) the
|
||||
# first ``name`` key precedes a binary field (a hash), not the generator
|
||||
# string, which would otherwise surface as control-char garbage.
|
||||
if (generator := _cbor_text_after(chunk_data, b"name")) and generator.isprintable():
|
||||
c2pa_info["claim_generator"] = generator
|
||||
if spec := _cbor_text_after(chunk_data, b"specVersion"):
|
||||
if (spec := _cbor_text_after(chunk_data, b"specVersion")) and spec.isprintable():
|
||||
c2pa_info["c2pa_spec"] = spec
|
||||
|
||||
# Find actions
|
||||
|
||||
@@ -9,6 +9,7 @@ import pytest
|
||||
|
||||
from remove_ai_watermarks.noai.c2pa import (
|
||||
_cbor_text_after,
|
||||
_parse_c2pa_chunk,
|
||||
extract_c2pa_chunk,
|
||||
extract_c2pa_info,
|
||||
has_c2pa_metadata,
|
||||
@@ -232,6 +233,32 @@ class TestSynthIDVerdict:
|
||||
assert "Google LLC, OpenAI" in synthid_verdict("Google LLC, OpenAI")
|
||||
|
||||
|
||||
class TestParseChunkGuards:
|
||||
"""_parse_c2pa_chunk rejects non-printable claim_generator garbage.
|
||||
|
||||
On some manifests (observed: Microsoft Designer) the first ``name`` key
|
||||
precedes a binary hash field, not the generator string. The clean issuer +
|
||||
SynthID verdict must still come through.
|
||||
"""
|
||||
|
||||
def test_clean_generator_kept(self):
|
||||
# "name" + CBOR text-string (head 0x69 = 0x60+9) "gpt-image"
|
||||
chunk = b"...name" + bytes([0x69]) + b"gpt-image" + b"OpenAI trainedAlgorithmicMedia"
|
||||
info: dict = {}
|
||||
_parse_c2pa_chunk(chunk, info)
|
||||
assert info["claim_generator"] == "gpt-image"
|
||||
assert "OpenAI" in info["issuer"]
|
||||
assert "synthid_watermark" in info # OpenAI + trainedAlgorithmicMedia
|
||||
|
||||
def test_nonprintable_generator_dropped(self):
|
||||
# "name" + CBOR string (head 0x64 = len 4) with a control byte -> garbage
|
||||
chunk = b"...name" + bytes([0x64]) + b"\x81abc" + b"OpenAI trainedAlgorithmicMedia"
|
||||
info: dict = {}
|
||||
_parse_c2pa_chunk(chunk, info)
|
||||
assert "claim_generator" not in info # control-char garbage rejected
|
||||
assert "OpenAI" in info["issuer"] # issuer byte-search still robust
|
||||
|
||||
|
||||
# ── ISOBMFF (AVIF / HEIF / JPEG-XL container stripping) ──────────────
|
||||
|
||||
FTYP = b"\x00\x00\x00\x18ftypavif\x00\x00\x00\x00avifmif1" # 24-byte ftyp box
|
||||
|
||||
Reference in New Issue
Block a user