From 223cbcf1716d891144cb3b60360536266125be14 Mon Sep 17 00:00:00 2001
From: Victor Kuznetsov <kuznetsov.va@gmail.com>
Date: Thu, 28 May 2026 12:40:17 -0700
Subject: [PATCH] feat(metadata): detect China TC260 AIGC PNG chunk and
 HuggingFace hf-job-id

aigc_label now reads the TC260 label from a raw-JSON `AIGC` PNG tEXt chunk
(as Doubao/ByteDance write it, with no namespaced XMP marker) in addition to
the `<TC260:AIGC>` XMP block, via a shared _parse helper; the PNG-chunk path is
gated on a TC260 field so a generic AIGC key cannot false-positive. New
huggingface_job() reads the
hf-job-id PNG chunk; identify surfaces it as a medium-confidence hf_job signal
(parallel to the visible sparkle, never overriding a hard metadata verdict).
Both wired into has_ai_metadata/get_ai_metadata; the PNG save whitelist already
strips them on removal. Found by auditing 646 corpus originals: 28 AIGC and 3
hf-job files the library previously reported as Unknown.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 CLAUDE.md                            |   6 +-
 README.md                            |   6 +-
 src/remove_ai_watermarks/identify.py |  30 +++++++-
 src/remove_ai_watermarks/metadata.py | 100 ++++++++++++++++++++++++---
 tests/test_identify.py               |  72 +++++++++++++++++++
 tests/test_metadata.py               |  82 ++++++++++++++++++++++
 6 files changed, 280 insertions(+), 16 deletions(-)
diff --git a/CLAUDE.md b/CLAUDE.md
index 00421d2..094caed 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -16,6 +16,7 @@ You are a **principal Python engineer** maintaining a CLI tool and library for r
 - `bash maintain.sh` — uv-outdated, uv-secure, ruff check/fix, ruff format, pyright, pytest -n auto
 - `maintain.sh` may not finish fully green (pre-existing, not per-change): strict pyright carries debt in `remove_ai_metadata` / `cli.py` (untyped piexif/PIL/click/rich). (`uv-secure` is clean since idna was bumped 3.11 -> 3.16, fixing GHSA-65pc-fj4g-8rjx.) To gate a change, run `uv run ruff check`, `uv run pyright <changed files>`, `uv run pytest` directly.
 - Run `uv run` from the repo root — from another cwd it falls back to a bare env without numpy/cv2/torch.
+- To add a dev tool (pytest/ruff/pyright) into the env, use `uv sync --frozen --extra dev --extra gpu`, **never `uv pip install`** — `uv pip install` re-resolves and rewrites `uv.lock`, which silently bumped `transformers` to a build incompatible with the pinned `diffusers` (`cannot import name 'Qwen3VLForConditionalGeneration'`) and broke every `identify`/metadata import. Recovery: `git checkout uv.lock && uv sync --frozen --extra gpu --extra dev`. The `gpu` extra holds `diffusers`/`transformers`/`torch`, so a bare `uv sync` (no extras) removes them and `noai/__init__` (eager pipeline import) then fails. `maintain.sh`'s `uv sync --all-extras` also pulls the heavy `trustmark`/`lama` wheels (pytorch-lightning, onnxruntime) — fine on a good connection, but on flaky DNS, sync only `--extra gpu --extra dev` and run the lint/test steps by hand.
 - Metadata/C2PA tests assert against real committed fixtures in `data/samples/` (`chatgpt-*.png` = OpenAI C2PA, `firefly-1.png` = Adobe, `mj-*` = Midjourney IPTC, `doubao-1.png` = ByteDance Doubao with the China TC260 `<TC260:AIGC>` XMP label **and** a visible "豆包AI生成" text mark bottom-right; `grok-1.jpg` = xAI Grok with its EXIF-only `Signature:` blob + UUID `Artist` and no C2PA/SynthID/IPTC); synthetic byte blobs cover the JPEG/ISOBMFF format paths. The "non-AI / clean photo" control is no longer in `data/samples/` -- the `clean_photo` conftest fixture serves a verified-negative image from the corpus `neg/` set (skips if the corpus is absent).
 - SynthID reference corpus: `scripts/synthid_corpus.py` ingests labeled images into `data/synthid_corpus/`. The labeled `images/` (`pos/` `neg/` `cleaned/`) are **committed** (public repo -- review every image for private content before adding; `manifest.csv` is kept in sync with the files on disk, one row per tracked image); only the synthetic `refs/` calibration fills are gitignored. See its README for the collection protocol and verification oracles.
 
@@ -30,7 +31,7 @@ You are a **principal Python engineer** maintaining a CLI tool and library for r
 - `noai/c2pa.py` — PNG chunk parser; use `extract_c2pa_chunk(path)` to get raw caBX payload, `has_c2pa_metadata(path)` to detect. Do not reimplement chunk parsing. `extract_c2pa_info(path)` sets `synthid_watermark`/`synthid_vendors` when the manifest is signed by a SynthID-using vendor, and `soft_binding`/`soft_binding_vendors` when a `c2pa.soft-binding` `alg` names a forensic-watermark vendor (`soft_binding_vendors_in(buffer)` is the shared byte-scan, used by both the PNG parser and the non-PNG binary path).
 - `noai/constants.py` — PNG_SIGNATURE, C2PA_CHUNK_TYPE, C2PA_SIGNATURES, C2PA_ISSUERS, `SYNTHID_C2PA_ISSUERS` (issuers that pair SynthID with C2PA: Google, OpenAI), and `C2PA_SOFT_BINDINGS` (soft-binding `alg` prefix → forensic-watermark vendor: Adobe TrustMark, Digimarc, Imatag, Steg.AI, Microsoft, ...). Add a new issuer/binding here, not inline.
 - `metadata.py` — `scan_head(path, size=1MB)` is the shared input for every C2PA/AIGC/IPTC byte scan: first `size` bytes plus, for ISOBMFF, the late provenance-box payloads from `isobmff.scan_c2pa_region` (catches a manifest after a large `mdat`); behavior-neutral (`f.read(size)`) for non-ISOBMFF. Use it instead of `open().read(1MB)` for any new marker scan. `synthid_source(path)` returns the vendor name(s) if the C2PA manifest implies a SynthID pixel watermark, else None. Format-agnostic: PNG via the caBX parser, JPEG/WebP/AVIF/HEIF/JXL via a binary scan (C2PA marker + SynthID issuer + AI-source marker). `get_ai_metadata` surfaces the verdict, and `metadata --check` prints it as a callout. Both `get_ai_metadata` and `has_ai_metadata` guard the PIL open with `except Exception` (HEIC/unknown formats raise non-OSError) and fall through to the binary scan. `xai_signature(path)` detects xAI/Grok's EXIF-only scheme (`ImageDescription` = `Signature: <base64>` + UUID `Artist`); it feeds `has_ai_metadata`, `get_ai_metadata` (key `xai_signature`), and `identify`. `iptc_ai_system(path)` detects the IPTC Photo Metadata 2025.1 AI-disclosure XMP properties (`IPTC_AI_FIELD_MARKERS` = `AISystemUsed`/`AISystemVersionUsed`/`AIPromptInformation`/`AIPromptWriterName`) and returns the `AISystemUsed` generator name (or `"fields present"`). `remove_ai_metadata` routes **ISOBMFF video** (`.mp4`/`.mov`/`.m4v`) through the same `isobmff.strip_c2pa_boxes` as AVIF/HEIF (MP4 is ISOBMFF), and `_scrub_ai_exif` removes the xAI signature + AI-generator EXIF tags on JPEG output.
-- `identify.py` — `identify(path)` aggregates every locally-readable signal (C2PA issuer→platform, C2PA soft-binding forensic-watermark vendor, IPTC "Made with AI" + IPTC 2025.1 `AISystemUsed`, embedded SD/ComfyUI params, SynthID proxy, xAI/Grok EXIF signature via `metadata.xai_signature`, visible Gemini sparkle, open invisible watermark, Adobe TrustMark via `trustmark_detector`) into one `ProvenanceReport`. `is_ai_generated` is True or None (never asserted False — stripped metadata is not proof of clean origin). Visible-sparkle is promoted only at confidence ≥ `_SPARKLE_THRESHOLD` (0.5; corpus-tuned to separate Gemini sparkles ≥0.56 from non-sparkle ≤0.49). The cv2 dependency lives in `gemini_engine.detect_sparkle_confidence`, not here. **C2PA platform attribution is device-token-first, issuer-scan fallback** (`_device_platform` scans manifest bytes for `_DEVICE_C2PA_PLATFORM` tokens, then `_attribute_platform`/`_ISSUER_PLATFORM`). **Why, verified on real signed files 2026-05-26:** the old issuer-only byte-scan matched ANY issuer substring anywhere, so multi-entity manifests mis-attributed -- Leica→"Truepic" (a signing authority in the trust chain), Nikon→"Adobe Firefly" (XMP-toolkit "Adobe" + the sample's "Adobe_MAX" name), Pixel→"Google (Gemini)" ("Google LLC" cert org), Truepic→"Google". A distinctive device token wins instead. **Token distinctiveness is load-bearing:** bare `b"Truepic"` mis-fires (it appears in unrelated trust chains -- it mis-attributed the OpenAI `chatgpt-1.png` fixture), so the token is the specific `b"Truepic_Lens"` from the Lens SDK claim generator; likewise `b"Pixel Camera"` (cert CN) not bare `b"Pixel"`. 
`_DEVICE_C2PA_PLATFORM` lists ONLY tokens **verified against a real C2PA file**: Leica (`lc_c2pa`/`Leica Camera`), Nikon (`NIKON`), Pixel (`Pixel Camera` -- from a real Pixel 10 Pro file attached to c2pa-rs issue #1609/#1554), Sony (`sony.sig`/`sony.cert` -- Sony's own C2PA assertion namespace, verified on a real Sony PXW-Z300 file; NOT bare "Sony" which is a common EXIF Make), Truepic (`Truepic_Lens`). Canon/Samsung/Bria have **no public direct-download C2PA sample** (checked exhaustively: GitHub issue/PR attachments, contentcredentials gallery, HF datasets -- all upload-to-verify or token-gated; Canon's only public file was a self-signed hobbyist CR3, not factory), so they stay unmapped until a real file is captured (same fixture discipline as Grok/Doubao). The Sony sample is video (MP4) -- our ISOBMFF C2PA path detects it; Sony Alpha stills likely share the `sony.*` namespace but are not separately verified. Camera C2PA marks capture authenticity, not AI (Pixel carries `computationalCapture`, not `trainedAlgorithmicMedia`), so these never set `is_ai` -- that stays driven by digital-source-type. `c2pa.cbor_text_after` (now public) is best-effort for the `generator` detail string only and can be None when the manifest keys it `claim_generator_info` (Pixel). **Issuer→generator mapping is `is_ai`-gated** (`_attribute_platform(issuers, is_ai=c2pa_is_ai)`): a specific AI-generator platform is named only when the digital-source-type is `trainedAlgorithmicMedia`; on a non-AI source an issuer substring is treated as incidental (an "Adobe XMP" toolkit string in an *unmapped* Canon/Sony capture would otherwise mislabel it "Adobe Firefly"), so it degrades to the neutral "C2PA signer: X" label. Real Firefly/OpenAI/Google output carries the AI source-type, so it is unaffected (verified: chatgpt-1.png→OpenAI, firefly-1.png→Adobe Firefly still attribute). `_attribute_platform` defaults `is_ai=True` so the mapping stays unit-testable in isolation. 
Add device tokens to `_DEVICE_C2PA_PLATFORM`, generator/issuer platforms to `_ISSUER_PLATFORM`, not inline. For non-PNG containers (JPEG/WebP/AVIF/HEIF/JXL) the caBX parser returns nothing, so issuer (`_issuers_in`) and generator (`_ai_tools_in`, reusing `C2PA_AI_TOOLS`) are recovered by binary-scanning the first MB. EXIF `Software` / `Make` / `Artist` / `ImageDescription` and XMP `CreatorTool` generator tags are read by `metadata.exif_generator` (PIL+piexif for any format PIL opens incl. AVIF, plus a container-agnostic XMP raw-byte scan that also covers HEIF/JXL), matched against `AI_GENERATOR_TOKENS` so ordinary editors (plain "Adobe Photoshop") and real-camera `Make` ("Apple"/"Canon") are not flagged. **Ideogram tags its output with EXIF `Make="Ideogram AI"`** (verified on a real download 2026-05-24) — that's why `Make` is read. **Integrity-clash detection** (`_integrity_clashes`, surfaced as `ProvenanceReport.integrity_clashes`, printed in red by `identify` and serialized to `--json`): contradictions between independent generator stamps are a laundering/spoofing tell. Two rules: (1) two or more distinct AI-origin vendors named by independent signals (e.g. C2PA OpenAI + EXIF `Make="Ideogram AI"`), and (2) a camera-capture C2PA device (`_DEVICE_C2PA_PLATFORM`) coexisting with any AI-generation marker. Vendor normalization is `_vendor_of` over `_AI_VENDOR_TOKENS` (so a C2PA "Google (Gemini)" issuer and a SynthID-Google proxy agree, while different vendors clash). **High-precision by design:** only hard generator stamps feed it (C2PA-issuer when source is AI, SynthID, EXIF/XMP generator, IPTC `AISystemUsed`, xAI, AIGC); the fuzzy visible sparkle and the open invisible watermark are **excluded** (the latter can be a by-product of our own SDXL removal pass). The c2pa vendor is classified from the issuer attribution / generator, NOT the resolved `platform` (a camera label like "Google Pixel" would mis-normalize to "Google"). 
All real single-origin fixtures (chatgpt/firefly/doubao/grok/mj) verified to produce **zero** clashes (false-positive guard in `test_identify.py::TestRealSamplesHaveNoClash`).
+- `identify.py` — `identify(path)` aggregates every locally-readable signal (C2PA issuer→platform, C2PA soft-binding forensic-watermark vendor, IPTC "Made with AI" + IPTC 2025.1 `AISystemUsed`, embedded SD/ComfyUI params, SynthID proxy, xAI/Grok EXIF signature via `metadata.xai_signature`, the China TC260 AIGC label via `metadata.aigc_label`, the HuggingFace `hf-job-id` job marker via `metadata.huggingface_job`, visible Gemini sparkle, open invisible watermark, Adobe TrustMark via `trustmark_detector`) into one `ProvenanceReport`. `is_ai_generated` is True or None (never asserted False — stripped metadata is not proof of clean origin). The `hf_job` and visible-sparkle signals are **medium** confidence: each lifts an otherwise-Unknown verdict to a tentative AI (`hf_only` / `visible_only`, parallel branches) but is excluded from the high-confidence `ai_from_metadata` set, so neither overrides a hard metadata signal. Visible-sparkle is promoted only at confidence ≥ `_SPARKLE_THRESHOLD` (0.5; corpus-tuned to separate Gemini sparkles ≥0.56 from non-sparkle ≤0.49). The cv2 dependency lives in `gemini_engine.detect_sparkle_confidence`, not here. **C2PA platform attribution is device-token-first, issuer-scan fallback** (`_device_platform` scans manifest bytes for `_DEVICE_C2PA_PLATFORM` tokens, then `_attribute_platform`/`_ISSUER_PLATFORM`). **Why, verified on real signed files 2026-05-26:** the old issuer-only byte-scan matched ANY issuer substring anywhere, so multi-entity manifests mis-attributed -- Leica→"Truepic" (a signing authority in the trust chain), Nikon→"Adobe Firefly" (XMP-toolkit "Adobe" + the sample's "Adobe_MAX" name), Pixel→"Google (Gemini)" ("Google LLC" cert org), Truepic→"Google". A distinctive device token wins instead. 
**Token distinctiveness is load-bearing:** bare `b"Truepic"` mis-fires (it appears in unrelated trust chains -- it mis-attributed the OpenAI `chatgpt-1.png` fixture), so the token is the specific `b"Truepic_Lens"` from the Lens SDK claim generator; likewise `b"Pixel Camera"` (cert CN) not bare `b"Pixel"`. `_DEVICE_C2PA_PLATFORM` lists ONLY tokens **verified against a real C2PA file**: Leica (`lc_c2pa`/`Leica Camera`), Nikon (`NIKON`), Pixel (`Pixel Camera` -- from a real Pixel 10 Pro file attached to c2pa-rs issue #1609/#1554), Sony (`sony.sig`/`sony.cert` -- Sony's own C2PA assertion namespace, verified on a real Sony PXW-Z300 file; NOT bare "Sony" which is a common EXIF Make), Truepic (`Truepic_Lens`). Canon/Samsung/Bria have **no public direct-download C2PA sample** (checked exhaustively: GitHub issue/PR attachments, contentcredentials gallery, HF datasets -- all upload-to-verify or token-gated; Canon's only public file was a self-signed hobbyist CR3, not factory), so they stay unmapped until a real file is captured (same fixture discipline as Grok/Doubao). The Sony sample is video (MP4) -- our ISOBMFF C2PA path detects it; Sony Alpha stills likely share the `sony.*` namespace but are not separately verified. Camera C2PA marks capture authenticity, not AI (Pixel carries `computationalCapture`, not `trainedAlgorithmicMedia`), so these never set `is_ai` -- that stays driven by digital-source-type. `c2pa.cbor_text_after` (now public) is best-effort for the `generator` detail string only and can be None when the manifest keys it `claim_generator_info` (Pixel). 
**Issuer→generator mapping is `is_ai`-gated** (`_attribute_platform(issuers, is_ai=c2pa_is_ai)`): a specific AI-generator platform is named only when the digital-source-type is `trainedAlgorithmicMedia`; on a non-AI source an issuer substring is treated as incidental (an "Adobe XMP" toolkit string in an *unmapped* Canon/Sony capture would otherwise mislabel it "Adobe Firefly"), so it degrades to the neutral "C2PA signer: X" label. Real Firefly/OpenAI/Google output carries the AI source-type, so it is unaffected (verified: chatgpt-1.png→OpenAI, firefly-1.png→Adobe Firefly still attribute). `_attribute_platform` defaults `is_ai=True` so the mapping stays unit-testable in isolation. Add device tokens to `_DEVICE_C2PA_PLATFORM`, generator/issuer platforms to `_ISSUER_PLATFORM`, not inline. For non-PNG containers (JPEG/WebP/AVIF/HEIF/JXL) the caBX parser returns nothing, so issuer (`_issuers_in`) and generator (`_ai_tools_in`, reusing `C2PA_AI_TOOLS`) are recovered by binary-scanning the first MB. EXIF `Software` / `Make` / `Artist` / `ImageDescription` and XMP `CreatorTool` generator tags are read by `metadata.exif_generator` (PIL+piexif for any format PIL opens incl. AVIF, plus a container-agnostic XMP raw-byte scan that also covers HEIF/JXL), matched against `AI_GENERATOR_TOKENS` so ordinary editors (plain "Adobe Photoshop") and real-camera `Make` ("Apple"/"Canon") are not flagged. **Ideogram tags its output with EXIF `Make="Ideogram AI"`** (verified on a real download 2026-05-24) — that's why `Make` is read. **Integrity-clash detection** (`_integrity_clashes`, surfaced as `ProvenanceReport.integrity_clashes`, printed in red by `identify` and serialized to `--json`): contradictions between independent generator stamps are a laundering/spoofing tell. Two rules: (1) two or more distinct AI-origin vendors named by independent signals (e.g. 
C2PA OpenAI + EXIF `Make="Ideogram AI"`), and (2) a camera-capture C2PA device (`_DEVICE_C2PA_PLATFORM`) coexisting with any AI-generation marker. Vendor normalization is `_vendor_of` over `_AI_VENDOR_TOKENS` (so a C2PA "Google (Gemini)" issuer and a SynthID-Google proxy agree, while different vendors clash). **High-precision by design:** only hard generator stamps feed it (C2PA-issuer when source is AI, SynthID, EXIF/XMP generator, IPTC `AISystemUsed`, xAI, AIGC); the fuzzy visible sparkle and the open invisible watermark are **excluded** (the latter can be a by-product of our own SDXL removal pass). The c2pa vendor is classified from the issuer attribution / generator, NOT the resolved `platform` (a camera label like "Google Pixel" would mis-normalize to "Google"). All real single-origin fixtures (chatgpt/firefly/doubao/grok/mj) verified to produce **zero** clashes (false-positive guard in `test_identify.py::TestRealSamplesHaveNoClash`).
 - `gemini_engine.py` — visible Gemini-sparkle remover/detector (cv2/numpy, no GPU). `detect_sparkle_confidence(path)` is the file-level entry point used by `identify.py`.
 - `doubao_engine.py` — visible Doubao "豆包AI生成" remover/detector (cv2/numpy, no GPU). `DoubaoEngine.locate` anchors a bottom-right box by **geometry** (mark scales with image WIDTH, fractions in module constants; no bundled template), `extract_mask` pulls the light low-saturation glyphs with a **polarity-aware white top-hat** (brighter-than-blurred-local-bg, so white-paper documents are left untouched instead of smeared), `detect` thresholds glyph coverage (`DETECT_MIN_COVERAGE` 0.16 separates real marks ≥0.20 from corner noise, which stays ≤0.06 on large images but can spike to ~0.15 on tiny ones), `remove_watermark` inpaints (cv2 Telea/NS) and **bails when coverage > `MAX_INPAINT_COVERAGE` 0.50** (dense-text background → would smear). Wired into `visible --mark` via `cli._run_doubao_if_selected`. **Logo is near-white (~253), not the gray some third-party tools assume.** Best on photo/illustration backgrounds; high-contrast edges leave faint residue (cv2-inpaint limit). Clean per-pixel reverse-alpha (Gemini-style) is the future upgrade but needs a captured/distilled alpha map — see below.
 - `region_eraser.py` — universal region eraser (`erase` CLI). `erase(image, boxes=|mask=, backend=)`: `boxes_to_mask` → `cv2.inpaint` (`cv2` backend, default, no deps) or big-LaMa via onnxruntime (`lama` backend, extra `lama`, `Carve/LaMa-ONNX` Apache-2.0 model downloaded on first use, never bundled). `erase_lama` crops a padded region around the mask, runs LaMa at its fixed 512² input, pastes only masked pixels back (untouched areas stay pixel-exact). Lazy `_get_lama_session` singleton; `lama_available()` guards the optional import. **LaMa-ONNX costs ~3.5-4 GB peak RAM and ~5-6 s/call on CPU** (FFC working set, not arena — `enable_cpu_mem_arena=False` does not help), so it does NOT fit a minimal droplet; the cv2 backend (tens of MB, ~30 ms) does. LaMa quality at low RAM = serverless/GPU, mirroring how raiw.cc offloads SDXL to fal.
@@ -51,7 +52,8 @@ Who embeds what, and whether it is locally detectable (so we know which gaps are
 - **C2PA / IPTC (covered by the issuer/marker scan):** OpenAI, Google, Adobe Firefly, Microsoft (Designer + **Bing Image Creator** — collected 2026-05-24; Bing now runs Microsoft's own **MAI-Image** model, signs C2PA as "Microsoft", NOT OpenAI/DALL-E), and **Stability AI** (collected from Brand Studio / DreamStudio successor; signs C2PA as "Stability AI Ltd", no SynthID, no imwatermark on its current Stable Image model — issuer added to `C2PA_ISSUERS`). Still unsampled: Canva (its downloads are re-encoded design *exports* that strip C2PA, so a Canva "positive" is inconclusive — skipped), Getty, Shutterstock. Midjourney embeds NO C2PA and no invisible watermark (our `mj-*` sample carried only the IPTC tag).
 - **EXIF/XMP generator tag (caught by `exif_generator`):** **Ideogram** writes EXIF `Make="Ideogram AI"` (collected 2026-05-24 — no C2PA, no SynthID, no imwatermark; the Make tag is the only signal).
 - **xAI / Grok — its own EXIF signature scheme, NOT C2PA (DETECTED by `metadata.xai_signature`, built 2026-05-26).** Grok JPEG downloads (Aurora model) carry **no C2PA, no XMP, no SynthID, no IPTC** — only EXIF `Artist` = a UUID and EXIF `ImageDescription` = `Signature: <base64>` (a crypto signature, unverifiable locally without xAI's public key). This empirically kills the earlier unverified "xAI signs C2PA as xAI" lead — xAI is not even a C2PA member. `exif_generator` misses it (neither field holds an `AI_GENERATOR_TOKENS` token), so a dedicated detector `xai_signature(path)` matches the pair (`ImageDescription ~ ^Signature: [A-Za-z0-9+/=]{64,}` AND UUID `Artist`); wired into `has_ai_metadata`, `get_ai_metadata` (key `xai_signature`), and `identify` (signal `xai_signature`, platform "xAI (Grok / Aurora)"). **Format confirmed stable across n=3 genuine generations:** exactly three EXIF tags (`Artist`, `ExifOffset`, `ImageDescription`), `Signature:` prefix constant, base64 payload 300-1004 chars. Two capture facts: (a) the `Artist` UUID **equals the public image id** in the asset URL (`https://imagine-public.x.ai/imagine-public/images/<uuid>.jpg`), so it is NOT a private per-user secret — only the `Signature` blob is; (b) the Grok web-UI image is a re-encoded **WebP with no signature** — the EXIF survives only in the *original* JPEG (download button or that public tokenless URL), which is why screenshots / re-encodes are metadata-stripped. A real fixture `data/samples/grok-1.jpg` plus **synthetic** JPEG fixtures (fake UUID + fake `Signature:` blob) cover the detector; never add a real Grok image carrying private content (the repo is public). 
**Stripped on removal too:** `remove_ai_metadata` now calls `_scrub_ai_exif` on the JPEG EXIF, which deletes the xAI Signature+UUID-Artist pair **and** any `Software`/`Make`/`Artist`/`ImageDescription` tag holding an `AI_GENERATOR_TOKENS` token (so Ideogram's `Make="Ideogram AI"` is scrubbed too), while keeping genuine camera/editor EXIF. The shared `_is_xai_signature_pair` helper (module-level compiled regexes) is the single source of truth for the pattern, used by both `xai_signature` and `_scrub_ai_exif`. (AVIF/HEIF/JXL still strip only C2PA boxes via `isobmff`, not EXIF — unchanged.)
-- **China TC260 AIGC label (caught by `AIGC_MARKERS` / `metadata.aigc_label`, surfaced by `identify` as the `aigc` signal):** China-served generators embed an XMP `<TC260:AIGC>{"Label":"1","ContentProducer":...}` block — China's mandatory AI-content labeling (TC260 namespace `tc260.org.cn/ns/AIGC`). **Doubao** (ByteDance) uses it (verified on the real #13 sample 2026-05-25; `ContentProducer` `001191110102MACQD9K64010000`, no C2PA/SynthID/imwatermark — the XMP block is the only signal; GitHub attachment upload did NOT strip it). The same standard is mandatory for Jimeng/Kling/Qwen/Ernie etc., so the one marker covers the whole China-AIGC-labeled ecosystem. `aigc_label` json-decodes the (HTML-entity-encoded) block; container-agnostic raw-byte scan.
+- **China TC260 AIGC label (caught by `AIGC_MARKERS` / `metadata.aigc_label`, surfaced by `identify` as the `aigc` signal):** China-served generators embed an XMP `<TC260:AIGC>{"Label":"1","ContentProducer":...}` block — China's mandatory AI-content labeling (TC260 namespace `tc260.org.cn/ns/AIGC`). **Doubao** (ByteDance) uses it (verified on the real #13 sample 2026-05-25; `ContentProducer` `001191110102MACQD9K64010000`, no C2PA/SynthID/imwatermark — the XMP block is the only signal; GitHub attachment upload did NOT strip it). The same standard is mandatory for Jimeng/Kling/Qwen/Ernie etc., so the one marker covers the whole China-AIGC-labeled ecosystem. `aigc_label` reads **two serializations** through a shared `_parse` helper: the HTML-entity-encoded XMP `<TC260:AIGC>` block (container-agnostic raw-byte scan, any JSON object accepted) **and** a raw-JSON PNG `AIGC` tEXt chunk — Doubao also writes the label this way, with no namespaced marker at all (confirmed on the corpus 2026-05-28, `ContentProducer="doubao"`). The PNG-chunk path is gated on at least one TC260 field (`_TC260_FIELDS`) so a generic `AIGC` key cannot false-positive. In `identify`, `aigc` fires on the parsed label **or** the `AIGC_MARKERS` byte scan (the latter preserves the laundering-tell case where the JSON payload is truncated).
+- **HuggingFace-hosted job (caught by `metadata.huggingface_job`, surfaced by `identify` as the `hf_job` signal, MEDIUM confidence):** HuggingFace Jobs / Spaces stamp generated PNGs with an `hf-job-id` tEXt chunk holding the job UUID (3 files in the corpus as of 2026-05-28, each with no other signal). It marks the *hosting job*, not a model — most commonly diffusion output — so it lifts an Unknown verdict to a tentative AI verdict via `hf_only` (parallel to the visible sparkle) but never overrides a hard metadata signal; `_HF_JOB_CAVEAT` states the limit (job, not model; not proof of AI pixels). Stripped on removal (the PNG save whitelist keeps only `STANDARD_METADATA_KEYS`, so `hf-job-id` and the `AIGC` chunk are both dropped). The exact writer is not authoritatively documented (HF Jobs are generic GPU jobs), hence medium, not high, confidence.
 - **No detectable signal on download (correctly reported `unknown`):** **Recraft** (PNG export is a re-encoded design export — strips everything), **Krea hosting FLUX 2** (no imwatermark despite FLUX — the host omits the encoder, same as Stability's hosted SDXL), and Midjourney (embeds nothing). Lesson: the imwatermark detector only fires on *pristine* output from a pipeline that runs the encoder (diffusers default, official BFL), not from re-hosts (Krea/Stability) or re-encoded exports (Recraft/Canva).
 - **Invisible but NOT locally detectable (proprietary, API/oracle only — same wall as SynthID):** Amazon Titan Image Generator + Nova Canvas (Bedrock `DetectGeneratedContent` API), Kakao (new SynthID image adopter, May 2026), NVIDIA Cosmos (SynthID video). No local detector possible; treat like SynthID.
 - **C2PA 2.4 "Durable Content Credentials" (April 2026; verified against the spec) raise the bar for metadata stripping.** 2.4 defines soft bindings (an invisible watermark or a content fingerprint) plus a server-side manifest repository and a new `c2pa.repository-receipt` assertion. Per the spec: "if a C2PA manifest is removed from an asset, but a copy of that manifest remains in a provenance store elsewhere, the manifest and asset may be matched using available soft bindings." So our local `metadata --remove` deletes the *embedded* manifest, but a fingerprint/watermark soft binding can still re-link the image to its manifest in a repository server-side. Stripping the file is becoming necessary-but-not-sufficient against durable provenance. (Our parsers target the stable embedded-manifest format documented in C2PA 2.1 §11; that format is unchanged in 2.4 -- the new pieces are repository/soft-binding infra, not the on-file box layout, so no parser change is implied.) Spec: https://spec.c2pa.org/specifications/specifications/2.4/specs/C2PA_Specification.html We now READ the soft-binding `alg` (`C2PA_SOFT_BINDINGS` / `soft_binding_vendors_in`) to name the forensic-watermark vendor, and locally DECODE the one open scheme, Adobe TrustMark (`trustmark_detector`); the rest (Digimarc/Imatag/Steg.AI/...) stay name-only (proprietary decoders).
diff --git a/README.md b/README.md
index 93fcb3f..56fff98 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ If this tool saves you time, consider [sponsoring its development](https://githu
 - **Smart Face Protection** — automatic extraction and blending of human faces to prevent AI distortion
 - **Batch processing** — process entire directories
 - **Detection** — three-stage NCC watermark detection with confidence scoring
-- **Provenance detection (`identify`)** — aggregate C2PA issuer, the C2PA soft-binding forensic-watermark vendor (Adobe TrustMark, Digimarc, Imatag, ...), IPTC "Made with AI" plus the IPTC 2025.1 `AISystemUsed` field, embedded SD/ComfyUI params, EXIF/XMP generator tags, the xAI/Grok EXIF signature, the SynthID metadata proxy, the visible sparkle, the open SD/SDXL/FLUX invisible watermark, and (with the `trustmark` extra) the open Adobe TrustMark watermark into one origin-platform + watermark-inventory verdict (`--json` for machine output)
+- **Provenance detection (`identify`)** — aggregate C2PA issuer, the C2PA soft-binding forensic-watermark vendor (Adobe TrustMark, Digimarc, Imatag, ...), IPTC "Made with AI" plus the IPTC 2025.1 `AISystemUsed` field, embedded SD/ComfyUI params, EXIF/XMP generator tags, the xAI/Grok EXIF signature, the China TC260 AIGC label (XMP or PNG chunk), the HuggingFace `hf-job-id` job marker, the SynthID metadata proxy, the visible sparkle, the open SD/SDXL/FLUX invisible watermark, and (with the `trustmark` extra) the open Adobe TrustMark watermark into one origin-platform + watermark-inventory verdict (`--json` for machine output)
 
 ## Examples
 
@@ -48,13 +48,13 @@ If this tool saves you time, consider [sponsoring its development](https://githu
 | **xAI Grok (Aurora)** | — | — | ✅ EXIF signature scheme (no C2PA): `Signature:` blob + UUID `Artist` | Detected (`identify`); metadata strip |
 | **Midjourney** | — | — | ✅ EXIF + XMP (prompt, model, seed) | Metadata strip |
 | **Meta AI** | — | — | ✅ IPTC "Made with AI" (digitalSourceType) | Metadata strip (removes the label) |
-| **Doubao** (ByteDance) / China AIGC generators | ✅ "豆包AI生成" text strip (bottom-right) | — | ✅ TC260 `<TC260:AIGC>` XMP label (China's mandatory AI labeling) | Locate + mask + inpaint (cv2, CPU) + metadata strip |
+| **Doubao** (ByteDance) / China AIGC generators | ✅ "豆包AI生成" text strip (bottom-right) | — | ✅ TC260 AIGC label — `<TC260:AIGC>` XMP **or** `AIGC` PNG chunk (China's mandatory AI labeling) | Locate + mask + inpaint (cv2, CPU) + metadata strip |
 | **StableSignature** (Meta) | — | ✅ In-model watermark | — | Diffusion regeneration |
 | **TreeRing** | — | ✅ Latent space watermark | — | Diffusion regeneration |
 
 > Visible overlays are used by Google Gemini / Nano Banana (sparkle logo) and by Doubao / China AIGC generators (the mandated "...AI生成" corner text). Both are removed deterministically on CPU. Other services rely on invisible watermarks and/or metadata; our diffusion-based regeneration works against any invisible watermark in pixel or frequency domain. For a visible mark from any other source (any position, any colour), use the universal `erase --region` command.
 
-> **Detection:** `remove-ai-watermarks identify <image>` reports the origin platform and watermark inventory for all the signals above — C2PA issuer, the C2PA soft-binding forensic-watermark vendor (TrustMark / Digimarc / Imatag / ...), IPTC "Made with AI" plus the IPTC 2025.1 `AISystemUsed` field, the China TC260 AIGC label, embedded generation params, EXIF/XMP generator tags, the xAI/Grok EXIF signature, the SynthID metadata proxy, the visible sparkle, and (with the `[detect]` / `[trustmark]` extras) the open SD/SDXL/FLUX and Adobe TrustMark invisible watermarks. SynthID and the proprietary soft-binding watermarks (Digimarc etc.) have no local decoder, so they are reported by metadata proxy / vendor name only.
+> **Detection:** `remove-ai-watermarks identify <image>` reports the origin platform and watermark inventory for all the signals above — C2PA issuer, the C2PA soft-binding forensic-watermark vendor (TrustMark / Digimarc / Imatag / ...), IPTC "Made with AI" plus the IPTC 2025.1 `AISystemUsed` field, the China TC260 AIGC label (XMP or PNG chunk), the HuggingFace `hf-job-id` job marker, embedded generation params, EXIF/XMP generator tags, the xAI/Grok EXIF signature, the SynthID metadata proxy, the visible sparkle, and (with the `[detect]` / `[trustmark]` extras) the open SD/SDXL/FLUX and Adobe TrustMark invisible watermarks. SynthID and the proprietary soft-binding watermarks (Digimarc etc.) have no local decoder, so they are reported by metadata proxy / vendor name only.
 
 ## How it works
 
diff --git a/src/remove_ai_watermarks/identify.py b/src/remove_ai_watermarks/identify.py
index 2ec6291..51cc055 100644
--- a/src/remove_ai_watermarks/identify.py
+++ b/src/remove_ai_watermarks/identify.py
@@ -31,6 +31,7 @@ from remove_ai_watermarks.metadata import (
     aigc_label,
     exif_generator,
     get_ai_metadata,
+    huggingface_job,
     iptc_ai_system,
     scan_head,
     xai_signature,
@@ -89,6 +90,11 @@ _INVISIBLE_WM_CAVEAT = (
     "The open invisible watermark is fragile: it does not survive JPEG re-encoding "
     "or resizing, so it confirms origin only on a pristine (un-re-encoded) file."
 )
+_HF_JOB_CAVEAT = (
+    "The hf-job-id tag marks a HuggingFace-hosted job (commonly diffusion "
+    "generation) but names neither the model nor the content type, so it is a "
+    "medium-confidence signal, not proof the pixels are AI-generated."
+)
 
 
 @dataclass
@@ -423,9 +429,14 @@ def identify(image_path: Path, *, check_visible: bool = True, check_invisible: b
             ai_vendor_claims["iptc_ai_system"] = v
 
     # ── China TC260 AIGC label (Doubao and other China-served gens) ──
-    aigc = any(m in head for m in AIGC_MARKERS)
+    # Fire on either of two signals: (1) a namespaced byte marker (``TC260:AIGC``
+    # or the TC260 namespace URL), which appears in XMP and remains a laundering
+    # tell even when the JSON payload is truncated; or (2) the parsed label, which
+    # also catches the raw-JSON PNG ``AIGC`` tEXt chunk that has no namespaced marker.
+    aigc_data = aigc_label(image_path)
+    aigc = aigc_data is not None or any(m in head for m in AIGC_MARKERS)
     if aigc:
-        producer = (aigc_label(image_path) or {}).get("ContentProducer", "")
+        producer = (aigc_data or {}).get("ContentProducer", "")
         signals.append(Signal("aigc", f"TC260 AIGC label{f' (producer {producer})' if producer else ''}", "high"))
         watermarks.append("China AIGC label (TC260 standard)")
         if platform is None:
@@ -461,6 +472,18 @@ def identify(image_path: Path, *, check_visible: bool = True, check_invisible: b
             platform = "xAI (Grok / Aurora)"
         ai_vendor_claims["xai"] = "xAI"
 
+    # ── HuggingFace-hosted job marker (hf-job-id PNG text chunk) ─────
+    # Marks the hosting job, not a model -- medium confidence (commonly diffusion
+    # output). Like the visible sparkle, it lifts an otherwise-Unknown verdict to
+    # a tentative AI verdict, but never overrides a high-confidence metadata signal.
+    hf_job = huggingface_job(image_path)
+    if hf_job:
+        signals.append(Signal("hf_job", f"HuggingFace job {hf_job}", "medium"))
+        watermarks.append("HuggingFace-hosted job (hf-job-id)")
+        caveats.append(_HF_JOB_CAVEAT)
+        if platform is None:
+            platform = "HuggingFace-hosted job (model not identified)"
+
     # ── Open invisible watermark (SD / SDXL / FLUX, dwtDct) ──────────
     # Public decoder, no key -- a definitive embedded signal on pristine files.
     if check_invisible and (scheme := _invisible_watermark(image_path)) is not None:
@@ -503,11 +526,12 @@ def identify(image_path: Path, *, check_visible: bool = True, check_invisible: b
             platform = "Google Gemini family (visible sparkle detected)"
 
     visible_only = any(s.name == "visible_sparkle" for s in signals) and not ai_from_metadata
+    hf_only = bool(hf_job) and not ai_from_metadata
 
     if ai_from_metadata:
         is_ai: bool | None = True
         confidence = "high"
-    elif visible_only:
+    elif visible_only or hf_only:
         is_ai = True
         confidence = "medium"
     else:
diff --git a/src/remove_ai_watermarks/metadata.py b/src/remove_ai_watermarks/metadata.py
index d2f44d9..7363730 100644
--- a/src/remove_ai_watermarks/metadata.py
+++ b/src/remove_ai_watermarks/metadata.py
@@ -108,6 +108,27 @@ AIGC_MARKERS: tuple[bytes, ...] = (
     b"TC260:AIGC",
 )
 
+# TC260 AIGC-label JSON fields (the standard's labeling object). Doubao writes
+# the same object as a PNG ``tEXt`` chunk keyed ``AIGC`` (raw JSON, not XMP), so
+# a JSON object carrying at least one of these is accepted as a valid TC260
+# label even when the namespaced XMP element is absent.
+_TC260_FIELDS: frozenset[str] = frozenset(
+    {
+        "Label",
+        "ContentProducer",
+        "ProduceID",
+        "ContentPropagator",
+        "PropagateID",
+        "ReservedCode1",
+        "ReservedCode2",
+    }
+)
+
+# HuggingFace-hosted GPU jobs (Jobs / Spaces) stamp generated PNGs with this
+# ``tEXt`` chunk key holding the job UUID. It marks the hosting job, not a
+# specific model -- a medium-confidence AI signal (commonly diffusion output).
+_HF_JOB_KEY: str = "hf-job-id"
+
 STANDARD_METADATA_KEYS: frozenset[str] = frozenset(
     [
         "Author",
@@ -202,31 +223,90 @@ def has_ai_metadata(image_path: Path) -> bool:
     # IPTC 2025.1 AI-disclosure XMP properties (their presence flags AI content).
     if any(marker in data for marker in IPTC_AI_FIELD_MARKERS):
         return True
+    # China TC260 AIGC label as a PNG text chunk (the byte scan above catches
+    # only the XMP form; the raw-JSON tEXt chunk needs the PIL-based parse).
+    if aigc_label(image_path):
+        return True
+    # HuggingFace-hosted job marker (hf-job-id PNG text chunk).
+    if huggingface_job(image_path):
+        return True
     # xAI / Grok: no C2PA/IPTC/XMP -- only the EXIF Signature + UUID-Artist pair.
     return xai_signature(image_path)
 
 
 def aigc_label(image_path: Path) -> dict[str, str] | None:
-    """Parse a China TC260 ``<TC260:AIGC>`` AI-labeling block, if present.
+    """Parse a China TC260 AI-labeling block, if present.
+
+    Two serializations are recognized:
+
+    - a PNG ``tEXt``/``iTXt`` chunk keyed ``AIGC`` carrying the raw JSON object
+      (as written by Doubao / ByteDance), read via PIL; and
+    - an XMP ``<TC260:AIGC>{...}</TC260:AIGC>`` block (HTML-entity encoded text),
+      found by a container-agnostic raw-byte scan (PNG/JPEG/WebP alike).
 
     Returns the decoded JSON (e.g. ``{"Label": "1", "ContentProducer": ...}``)
-    or None. The block is XMP text (HTML-entity encoded), so it is found by a
-    container-agnostic raw-byte scan and works for PNG/JPEG/WebP alike.
+    or None. The PNG-chunk key ``AIGC`` is generic, so a JSON object there is
+    accepted only if it carries at least one known TC260 field (``_TC260_FIELDS``);
+    the namespaced XMP element is unambiguous, so any JSON object is accepted.
     """
     import html
     import json
-    import re
+    from typing import cast
 
+    def _parse(text: str, *, require_tc260_field: bool) -> dict[str, str] | None:
+        try:
+            parsed = json.loads(text)
+        except ValueError:
+            return None
+        if not isinstance(parsed, dict):
+            return None
+        fields = {str(k): str(v) for k, v in cast("dict[object, object]", parsed).items()}
+        if require_tc260_field and not (_TC260_FIELDS & fields.keys()):
+            return None
+        return fields
+
+    # PNG tEXt chunk keyed "AIGC" with raw JSON (Doubao and other China gens).
+    # The key is generic, so require a TC260 field to avoid a false positive.
+    try:
+        from PIL import Image
+
+        with Image.open(image_path) as img:
+            value = img.info.get("AIGC")
+    except Exception as exc:
+        logger.debug("PIL could not open %s for AIGC chunk scan: %s", image_path, exc)
+        value = None
+    if isinstance(value, str) and (result := _parse(value, require_tc260_field=True)):
+        return result
+
+    # XMP <TC260:AIGC>{...}</TC260:AIGC> block (namespaced element, unambiguous).
     data = scan_head(image_path)
     match = re.search(rb"<TC260:AIGC>(.*?)</TC260:AIGC>", data, re.DOTALL)
     if not match:
         return None
-    raw = html.unescape(match.group(1).decode("utf-8", "replace"))
+    return _parse(html.unescape(match.group(1).decode("utf-8", "replace")), require_tc260_field=False)
+
+
+def huggingface_job(image_path: Path) -> str | None:
+    """Return the HuggingFace job id if the image carries an ``hf-job-id`` PNG
+    text chunk, else None.
+
+    HuggingFace-hosted GPU jobs (Jobs / Spaces) stamp generated PNGs with an
+    ``hf-job-id`` ``tEXt`` chunk holding the job's UUID. It identifies the
+    *hosting job*, not a specific model, and is most commonly seen on diffusion-
+    generation output -- a medium-confidence AI signal, not proof of AI pixels
+    on its own.
+    """
     try:
-        parsed = json.loads(raw)
-    except ValueError:
+        from PIL import Image
+
+        with Image.open(image_path) as img:
+            value = img.info.get(_HF_JOB_KEY)
+    except Exception as exc:
+        logger.debug("PIL could not open %s for hf-job-id scan: %s", image_path, exc)
         return None
-    return {str(k): str(v) for k, v in parsed.items()} if isinstance(parsed, dict) else None
+    if isinstance(value, str) and value.strip():
+        return value.strip()
+    return None
 
 
 def iptc_ai_system(image_path: Path) -> str | None:
@@ -500,6 +580,10 @@ def get_ai_metadata(image_path: Path) -> dict[str, str]:
     # IPTC 2025.1 AI-disclosure XMP fields (Iptc4xmpExt:AISystemUsed etc.).
     if system := iptc_ai_system(image_path):
         result.setdefault("ai_system", f"IPTC 2025.1 AI disclosure ({system})")
+
+    # HuggingFace-hosted job marker (hf-job-id PNG text chunk).
+    if job := huggingface_job(image_path):
+        result.setdefault("huggingface_job", f"HuggingFace-hosted job ({job})")
     return result
 
 
diff --git a/tests/test_identify.py b/tests/test_identify.py
index fb9104c..019e761 100644
--- a/tests/test_identify.py
+++ b/tests/test_identify.py
@@ -201,6 +201,78 @@ class TestIdentifyLocalParams:
         assert r.signals == []
 
 
+# ── China TC260 AIGC label as a PNG text chunk (Doubao) ─────────────
+
+
+class TestIdentifyAigcPngChunk:
+    """The raw-JSON ``AIGC`` PNG chunk (no namespaced XMP marker) yields a high-
+    confidence AI verdict, same as the XMP form."""
+
+    def _aigc_chunk_png(self, tmp_path: Path) -> Path:
+        from PIL import Image
+        from PIL.PngImagePlugin import PngInfo
+
+        p = tmp_path / "doubao_chunk.png"
+        pnginfo = PngInfo()
+        pnginfo.add_text("AIGC", json.dumps({"Label": "1", "ContentProducer": "doubao"}))
+        Image.new("RGB", (32, 32)).save(p, pnginfo=pnginfo)
+        return p
+
+    def test_png_chunk_detected_high(self, tmp_path: Path):
+        r = identify(self._aigc_chunk_png(tmp_path), check_visible=False)
+        assert r.is_ai_generated is True
+        assert r.confidence == "high"
+        assert r.platform is not None
+        assert "AIGC" in r.platform
+        signal = next(s for s in r.signals if s.name == "aigc")
+        assert "doubao" in signal.detail
+
+
+# ── HuggingFace-hosted job marker (medium confidence) ───────────────
+
+
+class TestIdentifyHuggingFaceJob:
+    """The hf-job-id chunk lifts an otherwise-Unknown verdict to a tentative
+    (medium) AI verdict, never overriding a high-confidence metadata signal."""
+
+    def _hf_png(self, tmp_path: Path) -> Path:
+        from PIL import Image
+        from PIL.PngImagePlugin import PngInfo
+
+        p = tmp_path / "hfjob.png"
+        pnginfo = PngInfo()
+        pnginfo.add_text("hf-job-id", "ec8380a6-2091-423a-b835-209420f99ee1")
+        Image.new("RGB", (32, 32)).save(p, pnginfo=pnginfo)
+        return p
+
+    def test_hf_job_promotes_to_medium(self, tmp_path: Path):
+        r = identify(self._hf_png(tmp_path), check_visible=False)
+        assert r.is_ai_generated is True
+        assert r.confidence == "medium"
+        assert r.platform is not None
+        assert "HuggingFace" in r.platform
+        signal = next(s for s in r.signals if s.name == "hf_job")
+        assert signal.confidence == "medium"
+
+    def test_hf_job_caveat_present(self, tmp_path: Path):
+        r = identify(self._hf_png(tmp_path), check_visible=False)
+        assert any("hf-job-id" in c for c in r.caveats)
+
+    def test_metadata_keeps_high_even_with_hf_job(self, tmp_png_with_ai_metadata: Path):
+        # A high-confidence metadata verdict is not downgraded by an hf-job hit.
+        from PIL import Image
+        from PIL.PngImagePlugin import PngInfo
+
+        img = Image.open(tmp_png_with_ai_metadata)
+        pnginfo = PngInfo()
+        for k, v in img.text.items():
+            pnginfo.add_text(k, v)
+        pnginfo.add_text("hf-job-id", "ec8380a6-2091-423a-b835-209420f99ee1")
+        img.save(tmp_png_with_ai_metadata, pnginfo=pnginfo)
+        r = identify(tmp_png_with_ai_metadata, check_visible=False)
+        assert r.confidence == "high"
+
+
 # ── Visible-sparkle fallback (mocked detector) ──────────────────────
 
 
diff --git a/tests/test_metadata.py b/tests/test_metadata.py
index 245c515..b93fe0a 100644
--- a/tests/test_metadata.py
+++ b/tests/test_metadata.py
@@ -554,6 +554,88 @@ class TestAIGCLabel:
         assert "aigc_label" in meta
         assert "TC260" in meta["aigc_label"]
 
+    def _aigc_chunk_png(self, tmp_path: Path, producer: str = "doubao") -> Path:
+        """Doubao writes the TC260 object as a PNG ``tEXt`` chunk keyed ``AIGC``
+        with raw JSON (no XMP, no namespaced marker)."""
+        import json
+
+        p = tmp_path / "doubao_chunk.png"
+        pnginfo = PngInfo()
+        pnginfo.add_text(
+            "AIGC",
+            json.dumps({"Label": "1", "ContentProducer": producer, "ProduceID": "abc123"}),
+        )
+        Image.new("RGB", (32, 32)).save(p, pnginfo=pnginfo)
+        return p
+
+    def test_parses_png_text_chunk_form(self, tmp_path: Path):
+        from remove_ai_watermarks.metadata import aigc_label
+
+        info = aigc_label(self._aigc_chunk_png(tmp_path))
+        assert info is not None
+        assert info["Label"] == "1"
+        assert info["ContentProducer"] == "doubao"
+
+    def test_png_chunk_without_tc260_field_ignored(self, tmp_path: Path):
+        """A generic ``AIGC`` chunk with no TC260 field must not false-positive."""
+        import json
+
+        from remove_ai_watermarks.metadata import aigc_label
+
+        p = tmp_path / "unrelated.png"
+        pnginfo = PngInfo()
+        pnginfo.add_text("AIGC", json.dumps({"unrelated": "value"}))
+        Image.new("RGB", (32, 32)).save(p, pnginfo=pnginfo)
+        assert aigc_label(p) is None
+
+    def test_has_ai_metadata_detects_png_chunk_form(self, tmp_path: Path):
+        assert has_ai_metadata(self._aigc_chunk_png(tmp_path))
+
+    def test_remove_strips_png_chunk_form(self, tmp_path: Path):
+        from remove_ai_watermarks.metadata import aigc_label, remove_ai_metadata
+
+        out = tmp_path / "clean.png"
+        remove_ai_metadata(self._aigc_chunk_png(tmp_path), out)
+        assert aigc_label(out) is None
+        assert not has_ai_metadata(out)
+
+
+class TestHuggingFaceJob:
+    """HuggingFace-hosted job marker (``hf-job-id`` PNG text chunk)."""
+
+    def _hf_png(self, tmp_path: Path, job_id: str = "ec8380a6-2091-423a-b835-209420f99ee1") -> Path:
+        p = tmp_path / "hfjob.png"
+        pnginfo = PngInfo()
+        pnginfo.add_text("hf-job-id", job_id)
+        Image.new("RGB", (32, 32)).save(p, pnginfo=pnginfo)
+        return p
+
+    def test_returns_job_id(self, tmp_path: Path):
+        from remove_ai_watermarks.metadata import huggingface_job
+
+        assert huggingface_job(self._hf_png(tmp_path)) == "ec8380a6-2091-423a-b835-209420f99ee1"
+
+    def test_none_when_absent(self, tmp_clean_png):
+        from remove_ai_watermarks.metadata import huggingface_job
+
+        assert huggingface_job(tmp_clean_png) is None
+
+    def test_has_ai_metadata_detects_hf_job(self, tmp_path: Path):
+        assert has_ai_metadata(self._hf_png(tmp_path))
+
+    def test_get_ai_metadata_surfaces_hf_job(self, tmp_path: Path):
+        meta = get_ai_metadata(self._hf_png(tmp_path))
+        assert "huggingface_job" in meta
+        assert "ec8380a6" in meta["huggingface_job"]
+
+    def test_remove_strips_hf_job(self, tmp_path: Path):
+        from remove_ai_watermarks.metadata import huggingface_job, remove_ai_metadata
+
+        out = tmp_path / "clean.png"
+        remove_ai_metadata(self._hf_png(tmp_path), out)
+        assert huggingface_job(out) is None
+        assert not has_ai_metadata(out)
+
 
 @pytest.mark.skipif(not (SAMPLES_DIR / "doubao-1.png").exists(), reason="doubao sample not present")
 class TestAIGCRealSample: