fix(identify): attribute C2PA by claim_generator, not incidental issuer tokens (v0.6.1)

Verified on real signed files that the issuer byte-scan mis-attributes
multi-entity manifests: Leica read as "Truepic" (timestamp authority in the
chain), Nikon as "Adobe Firefly" (XMP-toolkit "Adobe" + the sample's
"Adobe_MAX" name), Truepic as "Google". Platform attribution now prefers the
claim generator (what produced the asset) and falls back to the issuer scan.

- New _CLAIM_GENERATOR_PLATFORM map + _platform_from_generator; claim generator
  read for non-PNG via the now-public c2pa.cbor_text_after.
- Device tokens listed only where verified against a real C2PA file (Leica
  lc_c2pa, Nikon, Truepic Lens); Pixel/Samsung/Sony/Canon/Bria deferred until a
  real sample confirms the in-manifest string. Camera C2PA marks capture
  authenticity, so these never set is_ai.
- cbor_text_after made public (was _cbor_text_after); call sites + tests updated.
- Regression test: claim_generator beats incidental Adobe/Google/Truepic tokens.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
test-user
2026-05-26 20:10:07 -07:00
parent 2676325184
commit dda2ee7fbb
8 changed files with 76 additions and 20 deletions
+1 -1
View File
@@ -1,3 +1,3 @@
"""Remove-AI-Watermarks: Unified tool for removing visible and invisible AI watermarks."""
__version__ = "0.6.0"
__version__ = "0.6.1"
+41 -4
View File
@@ -34,7 +34,7 @@ from remove_ai_watermarks.metadata import (
iptc_ai_system,
xai_signature,
)
from remove_ai_watermarks.noai.c2pa import extract_c2pa_info, soft_binding_vendors_in
from remove_ai_watermarks.noai.c2pa import cbor_text_after, extract_c2pa_info, soft_binding_vendors_in
from remove_ai_watermarks.noai.constants import C2PA_AI_TOOLS, C2PA_ISSUERS
if TYPE_CHECKING:
@@ -127,6 +127,35 @@ def _ai_tools_in(data: bytes) -> list[str]:
return sorted({name for sig, name in C2PA_AI_TOOLS.items() if sig in data})
# Ordered (token, platform) pairs for attributing an asset from its C2PA claim
# generator. Tokens are lower-case substrings.
#
# Why the claim generator: it names what PRODUCED the asset, so it is far more
# reliable than byte-scanning the manifest for an issuer name. That scan also
# matches incidental mentions -- a timestamp authority like "Truepic" in a
# Leica chain, an XMP-toolkit "Adobe" string in a Nikon file, or "Google" in a
# URL (all verified on real samples).
#
# Why the order matters: camera tokens come first so a device wins over an
# incidental tool name (Nikon's real sample claim generator also contains
# "Adobe_MAX").
#
# Camera C2PA marks CAPTURE authenticity, not AI, so these entries never assert
# is_ai on their own (the verdict still comes from the digital-source-type).
#
# Only tokens verified against a real signed file are listed; add more as
# samples are captured.
_CLAIM_GENERATOR_PLATFORM: tuple[tuple[str, str], ...] = (
    # Leica: the "lc_c2pa" token is tried before the plain brand name.
    ("lc_c2pa", "Leica (camera, C2PA capture)"),
    ("leica", "Leica (camera, C2PA capture)"),
    # Nikon.
    ("nikon", "Nikon (camera, C2PA capture)"),
    # Truepic Lens is listed after the camera brands.
    ("truepic", "Truepic Lens (verified capture)"),
)
def _platform_from_generator(generator: str | None) -> str | None:
    """Resolve a C2PA claim-generator string to a device/platform label.

    The generator text is lower-cased and compared against each token of
    ``_CLAIM_GENERATOR_PLATFORM`` in table order; the first token found as a
    substring decides the result.

    Args:
        generator: Claim-generator text, or ``None``/empty when unavailable.

    Returns:
        The platform label paired with the first matching token, or ``None``
        when ``generator`` is falsy or no token matches.
    """
    if generator:
        haystack = generator.lower()
        # Table order is significant: stop at the first token that occurs.
        return next(
            (label for needle, label in _CLAIM_GENERATOR_PLATFORM if needle in haystack),
            None,
        )
    return None
def _attribute_platform(issuers: list[str]) -> str | None:
"""Map a set of C2PA issuer names to a human-readable generating platform."""
joined = " ".join(issuers)
@@ -205,12 +234,20 @@ def identify(image_path: Path, *, check_visible: bool = True, check_invisible: b
# ── C2PA Content Credentials ────────────────────────────────────
has_c2pa = bool(info) or b"c2pa" in head.lower() or C2PA_UUID in head
issuers = [info["issuer"]] if info.get("issuer") else _issuers_in(head)
platform = _attribute_platform(issuers) if has_c2pa else None
c2pa_is_ai = "trainedAlgorithmicMedia" in info.get("source_type", "") or any(
m in head for m in (b"trainedAlgorithmicMedia", b"compositeWithTrainedAlgorithmicMedia")
)
# Generator: structured for PNG, binary-scanned for other containers.
generator = info.get("claim_generator") or (", ".join(tools) if (tools := _ai_tools_in(head)) else None)
# Generator: structured for PNG, CBOR-scanned for other containers. The claim
# generator is the authoritative "what produced this", so it drives platform
# attribution; the issuer byte-scan is only the fallback (it matches
# incidental chain/namespace mentions -- on real samples Leica mis-read as
# Truepic, Nikon as Adobe, Truepic as Google until claim-generator took over).
generator = (
info.get("claim_generator")
or cbor_text_after(head, b"claim_generator")
or (", ".join(tools) if (tools := _ai_tools_in(head)) else None)
)
platform = (_platform_from_generator(generator) or _attribute_platform(issuers)) if has_c2pa else None
if has_c2pa:
detail = ", ".join(filter(None, [", ".join(issuers), generator, info.get("source_type")]))
signals.append(Signal("c2pa", detail or "C2PA manifest present", "high"))
+3 -3
View File
@@ -131,7 +131,7 @@ def extract_c2pa_info(image_path: Path) -> dict[str, Any]:
return c2pa_info
def _cbor_text_after(payload: bytes, key: bytes) -> str | None:
def cbor_text_after(payload: bytes, key: bytes) -> str | None:
"""Return the CBOR text-string immediately following ``key`` in ``payload``.
Handles CBOR major-type 3 length prefixes: direct (0x60-0x77), 1-byte
@@ -212,9 +212,9 @@ def _parse_c2pa_chunk(chunk_data: bytes, c2pa_info: dict[str, Any]) -> None:
# Guard with isprintable(): on some manifests (e.g. Microsoft Designer) the
# first ``name`` key precedes a binary field (a hash), not the generator
# string, which would otherwise surface as control-char garbage.
if (generator := _cbor_text_after(chunk_data, b"name")) and generator.isprintable():
if (generator := cbor_text_after(chunk_data, b"name")) and generator.isprintable():
c2pa_info["claim_generator"] = generator
if (spec := _cbor_text_after(chunk_data, b"specVersion")) and spec.isprintable():
if (spec := cbor_text_after(chunk_data, b"specVersion")) and spec.isprintable():
c2pa_info["c2pa_spec"] = spec
# Find actions