fix(c2pa): drop non-printable claim_generator garbage

On some manifests (observed: Microsoft Designer) the first CBOR "name"
key precedes a binary hash field, not the generator string, so
_cbor_text_after returns control-char garbage. Guard with isprintable()
to drop it; issuer detection (byte-search) and the SynthID verdict are
unaffected. Adds TestParseChunkGuards covering kept-vs-dropped cases.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
test-user
2026-05-24 15:55:07 -07:00
parent da0edcbddc
commit 6cef1d59f0
2 changed files with 32 additions and 2 deletions
+5 -2
View File
@@ -196,9 +196,12 @@ def _parse_c2pa_chunk(chunk_data: bytes, c2pa_info: dict[str, Any]) -> None:
# Claim generator and spec version: read the CBOR text-string values
# directly (regex byte-grabbing produced artifacts like ``fGPT-4o``).
if generator := _cbor_text_after(chunk_data, b"name"):
# Guard with isprintable(): on some manifests (e.g. Microsoft Designer) the
# first ``name`` key precedes a binary field (a hash), not the generator
# string, which would otherwise surface as control-char garbage.
if (generator := _cbor_text_after(chunk_data, b"name")) and generator.isprintable():
c2pa_info["claim_generator"] = generator
if spec := _cbor_text_after(chunk_data, b"specVersion"):
if (spec := _cbor_text_after(chunk_data, b"specVersion")) and spec.isprintable():
c2pa_info["c2pa_spec"] = spec
# Find actions
+27
View File
@@ -9,6 +9,7 @@ import pytest
from remove_ai_watermarks.noai.c2pa import (
_cbor_text_after,
_parse_c2pa_chunk,
extract_c2pa_chunk,
extract_c2pa_info,
has_c2pa_metadata,
@@ -232,6 +233,32 @@ class TestSynthIDVerdict:
assert "Google LLC, OpenAI" in synthid_verdict("Google LLC, OpenAI")
class TestParseChunkGuards:
    """_parse_c2pa_chunk rejects non-printable claim_generator garbage.

    On some manifests (observed: Microsoft Designer) the first ``name`` key
    precedes a binary hash field, not the generator string. The clean issuer +
    SynthID verdict must still come through, whether or not the generator
    string itself is kept.
    """

    def test_clean_generator_kept(self):
        """A printable generator string is stored alongside issuer and verdict."""
        # "name" + CBOR text-string (head 0x69 = 0x60+9) "gpt-image"
        chunk = b"...name" + bytes([0x69]) + b"gpt-image" + b"OpenAI trainedAlgorithmicMedia"
        info: dict = {}
        _parse_c2pa_chunk(chunk, info)
        assert info["claim_generator"] == "gpt-image"
        assert "OpenAI" in info["issuer"]
        assert "synthid_watermark" in info  # OpenAI + trainedAlgorithmicMedia

    def test_nonprintable_generator_dropped(self):
        """A garbage generator is dropped without disturbing issuer or verdict."""
        # "name" + CBOR string (head 0x64 = len 4) whose payload starts with a
        # non-text byte (0x81) -> garbage.
        # NOTE(review): 0x81 is not valid UTF-8 on its own, so whether this
        # case is rejected by the isprintable() guard or earlier depends on how
        # _cbor_text_after decodes the bytes -- confirm against that helper.
        chunk = b"...name" + bytes([0x64]) + b"\x81abc" + b"OpenAI trainedAlgorithmicMedia"
        info: dict = {}
        _parse_c2pa_chunk(chunk, info)
        assert "claim_generator" not in info  # control-char garbage rejected
        assert "OpenAI" in info["issuer"]  # issuer byte-search still robust
        # Same issuer + source-type bytes as the clean case, so the verdict
        # must be unaffected by the dropped generator.
        assert "synthid_watermark" in info
# ── ISOBMFF (AVIF / HEIF / JPEG-XL container stripping) ──────────────
FTYP = b"\x00\x00\x00\x18ftypavif\x00\x00\x00\x00avifmif1" # 24-byte ftyp box