From 6cef1d59f0ea06874db65ecc2e74d6f513aeb0f7 Mon Sep 17 00:00:00 2001 From: test-user Date: Sun, 24 May 2026 15:55:07 -0700 Subject: [PATCH] fix(c2pa): drop non-printable claim_generator garbage On some manifests (observed: Microsoft Designer) the first CBOR "name" key precedes a binary hash field, not the generator string, so _cbor_text_after returns control-char garbage. Guard with isprintable() to drop it; issuer detection (byte-search) and the SynthID verdict are unaffected. Adds TestParseChunkGuards covering kept-vs-dropped cases. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/remove_ai_watermarks/noai/c2pa.py | 7 +++++-- tests/test_noai.py | 27 +++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/src/remove_ai_watermarks/noai/c2pa.py b/src/remove_ai_watermarks/noai/c2pa.py index 08d61bc..f3ef4a7 100644 --- a/src/remove_ai_watermarks/noai/c2pa.py +++ b/src/remove_ai_watermarks/noai/c2pa.py @@ -196,9 +196,12 @@ def _parse_c2pa_chunk(chunk_data: bytes, c2pa_info: dict[str, Any]) -> None: # Claim generator and spec version: read the CBOR text-string values # directly (regex byte-grabbing produced artifacts like ``fGPT-4o``). - if generator := _cbor_text_after(chunk_data, b"name"): + # Guard with isprintable(): on some manifests (e.g. Microsoft Designer) the + # first ``name`` key precedes a binary field (a hash), not the generator + # string, which would otherwise surface as control-char garbage. + if (generator := _cbor_text_after(chunk_data, b"name")) and generator.isprintable(): c2pa_info["claim_generator"] = generator - if spec := _cbor_text_after(chunk_data, b"specVersion"): + if (spec := _cbor_text_after(chunk_data, b"specVersion")) and spec.isprintable(): c2pa_info["c2pa_spec"] = spec # Find actions diff --git a/tests/test_noai.py b/tests/test_noai.py index 1665040..ad59699 100644 --- a/tests/test_noai.py +++ b/tests/test_noai.py @@ -9,6 +9,7 @@ import pytest from remove_ai_watermarks.noai.c2pa import ( _cbor_text_after, + _parse_c2pa_chunk, extract_c2pa_chunk, extract_c2pa_info, has_c2pa_metadata, @@ -232,6 +233,32 @@ class TestSynthIDVerdict: assert "Google LLC, OpenAI" in synthid_verdict("Google LLC, OpenAI") +class TestParseChunkGuards: + """_parse_c2pa_chunk rejects non-printable claim_generator garbage. + + On some manifests (observed: Microsoft Designer) the first ``name`` key + precedes a binary hash field, not the generator string. The clean issuer + + SynthID verdict must still come through. + """ + + def test_clean_generator_kept(self): + # "name" + CBOR text-string (head 0x69 = 0x60+9) "gpt-image" + chunk = b"...name" + bytes([0x69]) + b"gpt-image" + b"OpenAI trainedAlgorithmicMedia" + info: dict = {} + _parse_c2pa_chunk(chunk, info) + assert info["claim_generator"] == "gpt-image" + assert "OpenAI" in info["issuer"] + assert "synthid_watermark" in info # OpenAI + trainedAlgorithmicMedia + + def test_nonprintable_generator_dropped(self): + # "name" + CBOR string (head 0x64 = len 4) with a control byte -> garbage + chunk = b"...name" + bytes([0x64]) + b"\x81abc" + b"OpenAI trainedAlgorithmicMedia" + info: dict = {} + _parse_c2pa_chunk(chunk, info) + assert "claim_generator" not in info # control-char garbage rejected + assert "OpenAI" in info["issuer"] # issuer byte-search still robust + + # ── ISOBMFF (AVIF / HEIF / JPEG-XL container stripping) ────────────── FTYP = b"\x00\x00\x00\x18ftypavif\x00\x00\x00\x00avifmif1" # 24-byte ftyp box