From a2c33af2843b974c9ff801ea7e42c9e2ab5902ed Mon Sep 17 00:00:00 2001 From: Victor Kuznetsov Date: Sat, 20 Jun 2026 09:58:22 -0700 Subject: [PATCH] feat(scripts): fidelity_metrics.py + correct the qwen-vs-controlnet claim Add scripts/fidelity_metrics.py: an objective eval harness comparing watermark-removal outputs against the original (reference) across four groups -- OCR character error rate (EasyOCR), ArcFace identity cosine (insightface), face texture (LPIPS + Laplacian-variance ratio), and whole-image LPIPS/SSIM/ PSNR. PEP 723 inline deps so it stays out of the package / uv.lock; metrics self-gate (faces only where faces, text only where text). The metrics overturned an eyeball conclusion: at EQUAL strength Qwen beats controlnet on TEXT (OpenAI typography 0.10: OCR CER 0.25 vs 0.37) but controlnet beats Qwen on FACES (gemini_3, 18 faces, 0.15 each: Laplacian-variance retention 0.62 vs 0.41, face LPIPS 0.09 vs 0.13 -- Qwen smooths faces MORE; ArcFace identity ~tied). So Qwen is the better TEXT-preserving remover, not a universal fidelity win. Correct the earlier "qwen keeps faces faithful where controlnet plasticizes" claim in CLAUDE.md, module-internals.md, known-limitations.md, README. Co-Authored-By: Claude Opus 4.8 --- CLAUDE.md | 2 +- README.md | 2 +- docs/known-limitations.md | 4 +- docs/module-internals.md | 2 +- scripts/fidelity_metrics.py | 303 ++++++++++++++++++++++++++++++++++++ 5 files changed, 309 insertions(+), 4 deletions(-) create mode 100644 scripts/fidelity_metrics.py diff --git a/CLAUDE.md b/CLAUDE.md index 18ae668..d0a430e 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -61,7 +61,7 @@ Compact map. The full per-module detail (design decisions, tuned thresholds, cal - `region_eraser.py` — universal region eraser (`erase` CLI): cv2 backend default (no deps), optional big-LaMa via onnxruntime (~3.5-4 GB peak RAM, ~5-6 s/call CPU — does not fit a minimal droplet). 
- `invisible_watermark.py` — decodes the OPEN DWT-DCT watermarks (SD / SDXL / FLUX) via `imwatermark` (extra `detect`, pulls torch). Fragile two ways: (1) does not survive JPEG re-encode/resize; (2) **carrier-fragile on a broad class of pristine images** -- a clean encode->decode round-trip recovers 48/48 on chatgpt/firefly/random but FAILS (28-39/48, below the `_MATCH_48`=44 gate) on the FLUX fox, doubao, a flat FLUX generation, AND a clean synthetic flat fill with no watermark. The failure does NOT track texture; it goes with a degenerate **all-ones decode that is a CARRIER ARTIFACT, not a watermark** (synthetic clean image reproduces it). So `detect_invisible_watermark` is **positive-only**: trust a hit; a `None` is inconclusive unless a same-carrier positive-control embed first recovers >=44. Verified 2026-06-19; full caveat in `docs/watermarking-landscape.md`. - `trustmark_detector.py` — Adobe TrustMark open decoder (extra `trustmark`). Do NOT remove the JPEG re-encode false-positive gate — a lone TrustMark hit without it is almost always content noise. -- `noai/watermark_remover.py` — `WatermarkRemover` with three diffusion pipelines selected by the explicit `pipeline` ctor arg, never inferred from `model_id`: `sdxl` (plain SDXL img2img), `controlnet` (SDXL + canny ControlNet, **the DEFAULT since 2026-06-09**), and `qwen` (Qwen-Image 20B MMDiT img2img, Apache-2.0, CUDA/cloud-class — best text/structure preservation at the scrub floor; `_load_qwen_pipeline`/`_run_qwen`, bf16, no MPS fallback; call shape in the pure `_build_qwen_kwargs` using `true_cfg_scale`). Removal comes from the img2img `strength`; ControlNet only preserves text/face STRUCTURE — SynthID CAN survive controlnet on photoreal content at low strength. Qwen prototype oracle floors (single-seed, pending seed-repeat cert): OpenAI ~0.10, Gemini ~0.30 (higher than the controlnet Gemini floor — pass explicit `--strength` for Gemini on `qwen` until certified). 
No face-restore extra ships, by validated decision (every restore approach looked MORE AI-generated). +- `noai/watermark_remover.py` — `WatermarkRemover` with three diffusion pipelines selected by the explicit `pipeline` ctor arg, never inferred from `model_id`: `sdxl` (plain SDXL img2img), `controlnet` (SDXL + canny ControlNet, **the DEFAULT since 2026-06-09**), and `qwen` (Qwen-Image 20B MMDiT img2img, Apache-2.0, CUDA/cloud-class — best **text** preservation (incl. CJK); `_load_qwen_pipeline`/`_run_qwen`, bf16, no MPS fallback; call shape in the pure `_build_qwen_kwargs` using `true_cfg_scale`). Removal comes from the img2img `strength`; ControlNet only preserves text/face STRUCTURE — SynthID CAN survive controlnet on photoreal content at low strength. Qwen prototype oracle floors (single-seed, pending seed-repeat cert): OpenAI ~0.10, Gemini ~0.30 (higher than the controlnet Gemini floor — pass explicit `--strength` for Gemini on `qwen` until certified). Fidelity measured by `scripts/fidelity_metrics.py` (OCR-CER / ArcFace / LPIPS / Laplacian-var, NOT eyeball): at equal strength Qwen wins TEXT, controlnet wins FACES (Qwen smooths faces more) — Qwen is the text-preserving remover, not a universal fidelity win. No face-restore extra ships, by validated decision (every restore approach looked MORE AI-generated). - `noai/tiling.py` — sliding-window tiled diffusion for large inputs (CLI `--tile`). `WatermarkRemover.remove_watermark` branches to `run_tiled` when `tile` is set AND the long side exceeds `tile_size`, refactoring the single-pass `_generate` into a per-tile `_generate_one` (the ControlNet edge map is rebuilt per tile inside it). Pure helpers `plan_tiles` (uniform-size tiles, last one flush to the edge) and `feather_weights` (strictly-positive separable taper -> partition-of-unity blend) are unit-tested without the model. New tile-blend tuning goes in those pure helpers; do not inline blend math into the runner. 
- `auto_config.py` + the content-detection layer were REMOVED 2026-06-09; `--auto` is a deprecated no-op (controlnet is the default pipeline and the adaptive polish is ON by default and self-gates to a no-op where there is no detail deficit). - `upscaler.py` — optional Real-ESRGAN pre-diffusion super-resolution for small inputs (extra `esrgan`, spandrel only). Manual opt-in; the default `--upscaler` stays `lanczos` and the engine always falls back to Lanczos on absence/error. ESRGAN can degrade faces and thin text. diff --git a/README.md b/README.md index ea5ace5..1c5ac3f 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ It does **not** target watermarks that protect someone else's paid or copyrighte - **AI metadata stripping** — EXIF, PNG text chunks, C2PA provenance manifests (PNG / JPEG / AVIF / HEIF / JPEG-XL, **MP4 / MOV / M4V / M4A** at the container level, and **WebM / MP3 / WAV / FLAC / OGG** losslessly via ffmpeg), XMP DigitalSourceType - **"Made with AI" label removal** — removes the AI-disclosure metadata that platforms read to apply automatic labels (useful for clearing a false-positive label from a human-edited photograph) - **Analog Humanizer** — optional film grain and chromatic aberration post-processing -- **Text and face preservation (default)** — the default pipeline is a canny ControlNet that keeps text and face structure sharp through the removal pass (without copying original pixels, so SynthID is still removed). Use `--pipeline sdxl` for plain SDXL img2img (lighter, no extra model download) on inputs without text or faces. An experimental `--pipeline qwen` runs Qwen-Image (20B, Apache-2.0) img2img, which preserves text (including CJK) and structure better still at the scrub floor; it is CUDA/cloud-class (does not fit MPS), and its strength floors are not yet certified (pass an explicit `--strength`, especially for Gemini content). Canny preserves face *structure*, not *identity* (the regenerated face drifts in likeness). 
The library does not ship a face-restore extra: every approach evaluated (GFPGAN-on-cleaned, PhotoMaker-V2, InstantID txt2img, InstantID img2img-on-cleaned) regenerated the face via SDXL and made the output look more AI-generated than the cleaned image. The cleaned controlnet output is the least-AI face state achievable without re-introducing SynthID. +- **Text and face preservation (default)** — the default pipeline is a canny ControlNet that keeps text and face structure sharp through the removal pass (without copying original pixels, so SynthID is still removed). Use `--pipeline sdxl` for plain SDXL img2img (lighter, no extra model download) on inputs without text or faces. An experimental `--pipeline qwen` runs Qwen-Image (20B, Apache-2.0) img2img, which preserves **text** (including CJK and small text) better than SDXL at equal strength; it is CUDA/cloud-class (does not fit MPS), and its strength floors are not yet certified (pass an explicit `--strength`, especially for Gemini content). Note: measured fidelity (`scripts/fidelity_metrics.py`) shows Qwen wins on text but controlnet preserves **faces** better (Qwen smooths skin more), so Qwen is not a universal upgrade. Canny preserves face *structure*, not *identity* (the regenerated face drifts in likeness). The library does not ship a face-restore extra: every approach evaluated (GFPGAN-on-cleaned, PhotoMaker-V2, InstantID txt2img, InstantID img2img-on-cleaned) regenerated the face via SDXL and made the output look more AI-generated than the cleaned image. The cleaned controlnet output is the least-AI face state achievable without re-introducing SynthID. 
- **Batch processing** — process entire directories - **Detection** — three-stage NCC watermark detection with confidence scoring - **Provenance detection (`identify`)** — aggregate C2PA issuer, the C2PA soft-binding forensic-watermark vendor (Adobe TrustMark, Digimarc, Imatag, ...), IPTC "Made with AI" plus the IPTC 2025.1 `AISystemUsed` field, embedded SD/ComfyUI params, EXIF/XMP generator tags, the xAI/Grok EXIF signature, the China TC260 AIGC label (XMP, PNG chunk, EXIF, or JPEG segment), the HuggingFace `hf-job-id` job marker, the SynthID metadata proxy, the C2PA cloud-manifest reference (Adobe Durable Content Credentials, when the embedded manifest is stripped), the visible marks (Gemini sparkle plus the Doubao "豆包AI生成" / Jimeng "即梦AI" / Samsung Galaxy AI "Contenuti generati dall'AI" text marks), the open SD/SDXL/FLUX invisible watermark, and (with the `trustmark` extra) the open Adobe TrustMark watermark into one origin-platform + watermark-inventory verdict (`--json` for machine output) diff --git a/docs/known-limitations.md b/docs/known-limitations.md index ec237e5..d5753f8 100644 --- a/docs/known-limitations.md +++ b/docs/known-limitations.md @@ -138,4 +138,6 @@ See `docs/synthid.md` §5.5 + `docs/controlnet-removal-pipeline-research.md` (ce The scrub still comes from the img2img `strength` (same lever as SDXL); the call shape lives in the pure `_build_qwen_kwargs` (uses Qwen's `true_cfg_scale`, not SDXL's `guidance_scale` — the CLI `--guidance-scale` maps onto it, and ~4.0 is typical vs the SDXL default 7.5). bf16 on CUDA. It is **CUDA/cloud-class — the 20B does not fit MPS — so `_run_qwen` has NO MPS→CPU fallback** (unlike the SDXL paths). Cost on Modal A100-80GB is ~$0.05-0.10/image vs SDXL. -**Prototype oracle floors (Modal A100-80GB, single seed, 2026-06-19 — PENDING seed-repeat cert):** on native-resolution OpenAI and Gemini cert inputs (both controls SynthID-POSITIVE), OpenAI cleared at strength **0.10** and Gemini at **0.30** (0.20 still detected). 
At those floors CJK text and faces stayed faithful (the zoom comparison showed controlnet-style plastication absent). Two caveats before relying on it: (1) near-floor scrub is SEED-NON-DETERMINISTIC (the general known-limitation above), so these single-seed floors are NOT certified — run a seed-repeat sweep before trusting them; (2) `resolve_strength` is shared and pipeline-independent, so the Gemini default (0.15, the certified controlnet floor) UNDER-scrubs Gemini on `qwen` (whose floor is ~0.30) — **pass an explicit `--strength` for Gemini content on `qwen`** until a Qwen-specific ladder is certified. Flat-graphic content was not in the prototype sample. +**Prototype oracle floors (Modal A100-80GB, single seed, 2026-06-19 — PENDING seed-repeat cert):** on native-resolution OpenAI and Gemini cert inputs (both controls SynthID-POSITIVE), OpenAI cleared at strength **0.10** and Gemini at **0.30** (0.20 still detected). + +**Fidelity vs controlnet was MEASURED, not eyeballed (`scripts/fidelity_metrics.py`; an initial eyeball read was wrong and overturned by the metrics):** at EQUAL strength, Qwen beats controlnet on **text** (OpenAI typography 0.10: OCR CER 0.25 vs 0.37; higher SSIM/PSNR) but controlnet beats Qwen on **faces** (gemini_3, 18 faces, 0.15 each: Laplacian-variance retention 0.62 vs 0.41, face LPIPS 0.09 vs 0.13 — Qwen smooths faces MORE; ArcFace identity ~tied at 0.546/0.543). At each pipeline's OWN scrub floor the face gap widens (Qwen's Gemini floor 0.30 is 2x controlnet's 0.15). 
**Conclusion: Qwen is the better TEXT-preserving remover, NOT a universal fidelity win — controlnet's canny edge map holds face skin detail better.** Two caveats before relying on the prototype floors above: (1) near-floor scrub is SEED-NON-DETERMINISTIC (the general known-limitation above), so these single-seed floors are NOT certified — run a seed-repeat sweep before trusting them; (2) `resolve_strength` is shared and pipeline-independent, so the Gemini default (0.15, the certified controlnet floor) UNDER-scrubs Gemini on `qwen` (whose floor is ~0.30) — **pass an explicit `--strength` for Gemini content on `qwen`** until a Qwen-specific ladder is certified. Flat-graphic content was not in the prototype sample. diff --git a/docs/module-internals.md b/docs/module-internals.md index e68b2cc..18b9745 100644 --- a/docs/module-internals.md +++ b/docs/module-internals.md @@ -181,7 +181,7 @@ Root cause: bad alpha (under-estimated, max ~0.65) + fixed-no-inpaint + tight bo **`sdxl`** (renamed from `default` 2026-06-09; `default` kept as a back-compat alias via `normalize_profile`) runs plain SDXL img2img (`_run_img2img`); it is the lighter opt-down alternative (no ControlNet weights). -**`qwen`** (`_run_qwen`, `_load_qwen_pipeline`) runs `QwenImageImg2ImgPipeline` on `Qwen/Qwen-Image` (20B MMDiT, Apache-2.0 code AND weights). The scrub still comes from the img2img `strength`; Qwen's value is that it preserves text (incl. CJK) and structure markedly better than SDXL at the scrub floor, so it over-regenerates real photos far less (directly targets the controlnet over-regeneration problem). 
Specifics: bf16 on CUDA (fp16 risks overflow on the 20B MMDiT — see the dtype branch in `__init__`); loads `QWEN_MODEL_ID` unless `--model` is overridden; the call shape lives in the pure module helper `_build_qwen_kwargs` (unit-tested without torch in `tests/test_platform.py::TestQwenKwargs`), which uses Qwen's `true_cfg_scale` (NOT SDXL's `guidance_scale` — the CLI `--guidance-scale` maps onto it; ~4.0 is typical, the SDXL default 7.5 is high for Qwen) and an explicit `negative_prompt` (`_QWEN_PROMPT`/`_QWEN_NEGATIVE`). It is CUDA/cloud-class (the 20B does not fit MPS), so `_run_qwen` has NO MPS->CPU fallback — an error propagates. `_load_qwen_pipeline` raises a clear ImportError if the installed diffusers lacks `QwenImageImg2ImgPipeline`. **Prototype oracle floors (Modal A100-80GB, single seed, 2026-06-19, PENDING seed-repeat cert): OpenAI clears at strength ~0.10, Gemini at ~0.30 (0.20 still detected) — both controls were SynthID-positive; at those floors CJK text + faces stay faithful where controlnet plasticizes. The Gemini floor (0.30) is HIGHER than the certified controlnet Gemini floor (0.15), and `resolve_strength` is shared/pipeline-independent, so pass an explicit `--strength` for Gemini content on `qwen` until a Qwen-specific ladder is certified.** +**`qwen`** (`_run_qwen`, `_load_qwen_pipeline`) runs `QwenImageImg2ImgPipeline` on `Qwen/Qwen-Image` (20B MMDiT, Apache-2.0 code AND weights). The scrub still comes from the img2img `strength`; Qwen's value is **text preservation** (incl. CJK and small text). **Metric-measured nuance (2026-06-19, `scripts/fidelity_metrics.py`, do NOT trust the eyeball here — it misled): at EQUAL strength Qwen beats controlnet on TEXT (lower OCR CER, higher SSIM/PSNR) but controlnet beats Qwen on FACES (higher Laplacian-variance retention and lower LPIPS — Qwen actually smooths faces MORE; ArcFace identity is ~tied). 
At each pipeline's own scrub floor the face gap widens further because Qwen's Gemini floor (0.30) is 2x controlnet's (0.15).** So Qwen is the better text-preserving remover, NOT a universal fidelity win — controlnet's canny edge map holds face skin detail better. Specifics: bf16 on CUDA (fp16 risks overflow on the 20B MMDiT — see the dtype branch in `__init__`); loads `QWEN_MODEL_ID` unless `--model` is overridden; the call shape lives in the pure module helper `_build_qwen_kwargs` (unit-tested without torch in `tests/test_platform.py::TestQwenKwargs`), which uses Qwen's `true_cfg_scale` (NOT SDXL's `guidance_scale` — the CLI `--guidance-scale` maps onto it; ~4.0 is typical, the SDXL default 7.5 is high for Qwen) and an explicit `negative_prompt` (`_QWEN_PROMPT`/`_QWEN_NEGATIVE`). It is CUDA/cloud-class (the 20B does not fit MPS), so `_run_qwen` has NO MPS->CPU fallback — an error propagates. `_load_qwen_pipeline` raises a clear ImportError if the installed diffusers lacks `QwenImageImg2ImgPipeline`. **Prototype oracle floors (Modal A100-80GB, single seed, 2026-06-19, PENDING seed-repeat cert): OpenAI clears at strength ~0.10, Gemini at ~0.30 (0.20 still detected) — both controls were SynthID-positive. The Gemini floor (0.30) is HIGHER than the certified controlnet Gemini floor (0.15), and `resolve_strength` is shared/pipeline-independent, so pass an explicit `--strength` for Gemini content on `qwen` until a Qwen-specific ladder is certified.** Fidelity vs controlnet was measured (`scripts/fidelity_metrics.py`), NOT eyeballed — see the metric nuance above: Qwen wins text, controlnet wins faces. 
**`controlnet`** (**the DEFAULT pipeline since 2026-06-09** for `invisible`/`all`/`batch` and both engine ctors; `_run_controlnet`, `_load_controlnet_pipeline`) runs `StableDiffusionXLControlNetImg2ImgPipeline` with the SDXL-native canny ControlNet `xinsir/controlnet-canny-sdxl-1.0` (`watermark_profiles.CONTROLNET_CANNY_MODEL`): the control image is `cv2.Canny(gray, 100, 200)` stacked to 3 channels (`_CANNY_LOW`/`_CANNY_HIGH`, prompt `_CONTROLNET_PROMPT` / `_CONTROLNET_NEGATIVE`).
diff --git a/scripts/fidelity_metrics.py b/scripts/fidelity_metrics.py
new file mode 100644
index 0000000..6e057bc
--- /dev/null
+++ b/scripts/fidelity_metrics.py
@@ -0,0 +1,350 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "click",
+#     "numpy",
+#     "opencv-python-headless",
+#     "pillow",
+#     "scikit-image",
+#     "rapidfuzz",
+#     "torch",
+#     "lpips",
+#     "easyocr",
+#     "insightface",
+#     "onnxruntime",
+# ]
+# ///
+"""Objective fidelity metrics for comparing watermark-removal outputs.
+
+Given an ORIGINAL (the reference) and one or more cleaned VARIANTS that have all
+ALREADY passed the scrub oracle, this scores how much real detail each variant
+preserved -- so "closer to the original" is the right axis here (between two
+equally-scrubbed outputs, the one that deviates less from the original wins).
+
+It is a standalone eval tool, NOT part of the package: PEP 723 inline deps let
+``uv run`` build a throwaway env so the heavy models (EasyOCR, insightface,
+LPIPS) never touch uv.lock or the shipped library. Metrics self-gate: face
+metrics run only where faces are detected, text metrics only where text is.
+
+Four metric groups (all reference = original):
+  1. Text -- EasyOCR character error rate (CER) of each variant vs the original's
+     OCR string. Lower = text better preserved. OCR is noisy, so treat it
+     as a RELATIVE comparison (every variant scored against the same ref).
+  2. Face identity -- insightface (buffalo_l) ArcFace cosine similarity, original
+     face vs the geometrically-matched variant face. Higher = identity kept.
+  3. Face texture -- LPIPS + Laplacian-variance ratio (variant/original) on face
+     crops. Catches "plastication" (lost high-frequency skin detail):
+     lapvar ratio < 1 = smoother than the original.
+  4. Whole image -- LPIPS / SSIM / PSNR vs the original (context: background too).
+
+Usage:
+    uv run scripts/fidelity_metrics.py --original O.png \
+        --variant controlnet=C.png --variant qwen=Q.png --ocr-langs en,ru,ch_sim
+"""
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+import click
+import cv2
+import numpy as np
+from _plain_console import Console, Table
+
+logging.basicConfig(level=logging.WARNING, format="%(message)s")
+log = logging.getLogger(__name__)
+console = Console()
+
+
+# ── helpers ──────────────────────────────────────────────────────────
+
+
+def _load_bgr(path: str) -> np.ndarray:
+    """Read ``path`` as a 3-channel BGR array.
+
+    Raises:
+        click.ClickException: if OpenCV cannot read or decode the file.
+    """
+    img = cv2.imread(path, cv2.IMREAD_COLOR)
+    if img is None:
+        raise click.ClickException(f"cannot read image: {path}")
+    return img
+
+
+def _match_size(variant: np.ndarray, ref: np.ndarray) -> np.ndarray:
+    """Resize a variant to the reference size (outputs differ by a grid-round)."""
+    if variant.shape[:2] != ref.shape[:2]:
+        # cv2.resize takes (width, height), i.e. the reverse of shape[:2].
+        variant = cv2.resize(variant, (ref.shape[1], ref.shape[0]), interpolation=cv2.INTER_LANCZOS4)
+    return variant
+
+
+# ── text: OCR CER ────────────────────────────────────────────────────
+
+# EasyOCR rejects some language combos in one Reader, so group into compatible
+# readers and union the detections. Cyrillic and Chinese cannot share a reader.
+_OCR_GROUPS = {
+    "en": ["en"],
+    "ru": ["ru", "en"],
+    "ch_sim": ["ch_sim", "en"],
+}
+
+
+def _ocr_string(readers: list, bgr: np.ndarray) -> str:
+    """Union all readers' detections into one position-sorted, whitespace-free string.
+
+    Detections below 0.3 confidence or with blank text are dropped. Only spaces
+    are removed from the joined text.
+    """
+    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
+    dets: list[tuple[float, float, str]] = []
+    for reader in readers:
+        for box, text, conf in reader.readtext(rgb):
+            if conf < 0.3 or not text.strip():
+                continue
+            ys = [p[1] for p in box]
+            xs = [p[0] for p in box]
+            dets.append((min(ys), min(xs), text.strip()))
+    # Sort top-to-bottom, then left-to-right (coarse reading order).
+    dets.sort(key=lambda d: (round(d[0] / 20.0), d[1]))
+    return "".join(t for _, _, t in dets).replace(" ", "")
+
+
+def _build_ocr_readers(langs: list[str]) -> list:
+    """Create one CPU EasyOCR reader per distinct language group in ``langs``.
+
+    Languages missing from ``_OCR_GROUPS`` get a reader of their own.
+    """
+    import easyocr
+
+    seen: set[tuple[str, ...]] = set()
+    readers = []
+    for lang in langs:
+        group = tuple(_OCR_GROUPS.get(lang, [lang]))
+        if group in seen:
+            continue
+        seen.add(group)
+        readers.append(easyocr.Reader(list(group), gpu=False, verbose=False))
+    return readers
+
+
+# ── face: detection + ArcFace + texture ──────────────────────────────
+
+
+@dataclass
+class FaceStats:
+    """Per-variant face measurements; the lists hold one value per scored face."""
+
+    n_faces: int = 0  # original faces that found a matching variant face
+    identity: list[float] = field(default_factory=list)  # embedding cosine similarity
+    lpips: list[float] = field(default_factory=list)  # LPIPS distance on the face crop
+    lapvar_ratio: list[float] = field(default_factory=list)  # variant/original Laplacian variance
+
+
+def _lap_var(bgr: np.ndarray) -> float:
+    """Variance of the Laplacian of the grayscale image."""
+    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
+    return float(cv2.Laplacian(gray, cv2.CV_64F).var())
+
+
+def _match_face(orig_face: Any, variant_faces: list[Any]) -> Any:
+    """Nearest variant face to an original face by bbox-center distance (geometry kept).
+
+    Only variant faces whose center lies inside the original face's bbox are
+    candidates. Without that gate, an original face that the variant lost would
+    be paired with some unrelated face elsewhere in the frame and scored as if
+    it were the same person.
+
+    Returns:
+        The closest candidate, or None when no variant face qualifies.
+    """
+    x1, y1, x2, y2 = (float(orig_face.bbox[i]) for i in range(4))
+    ox, oy = (x1 + x2) / 2, (y1 + y2) / 2
+    best, best_d = None, float("inf")
+    for vf in variant_faces:
+        vx, vy = (vf.bbox[0] + vf.bbox[2]) / 2, (vf.bbox[1] + vf.bbox[3]) / 2
+        if not (x1 <= vx <= x2 and y1 <= vy <= y2):
+            continue
+        d = (ox - vx) ** 2 + (oy - vy) ** 2
+        if d < best_d:
+            best, best_d = vf, d
+    return best
+
+
+def _cosine(a: np.ndarray, b: np.ndarray) -> float:
+    """Cosine similarity of two vectors; the 1e-9 term guards a zero norm."""
+    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9))
+
+
+def _crop(bgr: np.ndarray, bbox: Any) -> np.ndarray:
+    """Slice ``bbox`` (x1, y1, x2, y2) out of ``bgr``, clamped to the image bounds."""
+    h, w = bgr.shape[:2]
+    x1, y1, x2, y2 = (int(max(0, bbox[0])), int(max(0, bbox[1])), int(min(w, bbox[2])), int(min(h, bbox[3])))
+    return bgr[y1:y2, x1:x2]
+
+
+# ── whole image: LPIPS / SSIM / PSNR ─────────────────────────────────
+
+
+def _lpips_model() -> tuple[Any, Any]:
+    """Build the AlexNet LPIPS model in eval mode; returns ``(model, torch)``."""
+    import lpips
+    import torch
+
+    model = lpips.LPIPS(net="alex", verbose=False)
+    model.eval()
+    return model, torch
+
+
+def _lpips_distance(model_torch: tuple[Any, Any], a_bgr: np.ndarray, b_bgr: np.ndarray) -> float:
+    """LPIPS distance between two BGR images (callers pass equal-sized arrays)."""
+    model, torch = model_torch
+
+    def _t(img: np.ndarray) -> Any:
+        # BGR HxWx3 -> RGB float scaled by /127.5 - 1, shaped 1x3xHxW.
+        rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32) / 127.5 - 1.0
+        return torch.from_numpy(rgb).permute(2, 0, 1).unsqueeze(0)
+
+    with torch.no_grad():
+        return float(model(_t(a_bgr), _t(b_bgr)).item())
+
+
+def _ssim_psnr(a_bgr: np.ndarray, b_bgr: np.ndarray) -> tuple[float, float]:
+    """Return ``(SSIM, PSNR)`` of the two images, both computed on grayscale."""
+    from skimage.metrics import peak_signal_noise_ratio, structural_similarity
+
+    a = cv2.cvtColor(a_bgr, cv2.COLOR_BGR2GRAY)
+    b = cv2.cvtColor(b_bgr, cv2.COLOR_BGR2GRAY)
+    ssim = float(structural_similarity(a, b))
+    psnr = float(peak_signal_noise_ratio(a, b))
+    return ssim, psnr
+
+
+# ── main ─────────────────────────────────────────────────────────────
+
+
+def _mean(xs: list[float]) -> float | None:
+    """Arithmetic mean, or None for an empty list."""
+    return sum(xs) / len(xs) if xs else None
+
+
+def _fmt(v: float | None, nd: int = 3) -> str:
+    """Format ``v`` with ``nd`` decimals; None renders as "-"."""
+    return "-" if v is None else f"{v:.{nd}f}"
+
+
+@click.command()
+@click.option("--original", required=True, type=click.Path(exists=True), help="Reference (unprocessed) image.")
+@click.option(
+    "--variant",
+    "variants",
+    multiple=True,
+    required=True,
+    help="LABEL=PATH of a cleaned output (repeatable).",
+)
+@click.option("--ocr-langs", default="en", help="Comma list of EasyOCR langs (en,ru,ch_sim). Empty = skip text.")
+@click.option("--no-faces", is_flag=True, help="Skip face metrics.")
+def main(original: str, variants: tuple[str, ...], ocr_langs: str, no_faces: bool) -> None:
+    """Score each VARIANT against ORIGINAL across the four fidelity groups."""
+    ref = _load_bgr(original)
+    parsed: list[tuple[str, np.ndarray]] = []
+    labels: set[str] = set()
+    for spec in variants:
+        label, sep, path = spec.partition("=")
+        if not (sep and label and path):
+            raise click.ClickException(f"--variant must be LABEL=PATH, got {spec!r}")
+        # Results are keyed by label, so a repeated label would pool two images
+        # into one FaceStats and overwrite the other metrics.
+        if label in labels:
+            raise click.ClickException(f"duplicate --variant label: {label!r}")
+        labels.add(label)
+        parsed.append((label, _match_size(_load_bgr(path), ref)))
+
+    langs = [x.strip() for x in ocr_langs.split(",") if x.strip()]
+    lp = _lpips_model()  # AlexNet LPIPS, loaded once and reused for face crops + whole image
+
+    # ── text ──
+    ocr_cer: dict[str, float | None] = {label: None for label, _ in parsed}
+    if langs:
+        console.print(f" OCR ({','.join(langs)})...")
+        from rapidfuzz.distance import Levenshtein
+
+        readers = _build_ocr_readers(langs)
+        ref_text = _ocr_string(readers, ref)
+        if ref_text:
+            for label, img in parsed:
+                hyp = _ocr_string(readers, img)
+                ocr_cer[label] = Levenshtein.normalized_distance(ref_text, hyp)
+        else:
+            console.print(" (no text detected in the original; skipping text metric)")
+
+    # ── faces ──
+    face_stats: dict[str, FaceStats] = {label: FaceStats() for label, _ in parsed}
+    if not no_faces:
+        console.print(" Faces (insightface buffalo_l)...")
+        from insightface.app import FaceAnalysis
+
+        app = FaceAnalysis(name="buffalo_l", providers=["CPUExecutionProvider"])
+        app.prepare(ctx_id=-1, det_size=(640, 640))
+        ref_faces = app.get(ref)
+        if ref_faces:
+            for label, img in parsed:
+                vfaces = app.get(img)
+                st = face_stats[label]
+                for of in ref_faces:
+                    vf = _match_face(of, vfaces)
+                    if vf is None:
+                        continue
+                    st.n_faces += 1
+                    st.identity.append(_cosine(of.normed_embedding, vf.normed_embedding))
+                    # Both crops use the ORIGINAL bbox so the same pixels are compared.
+                    oc, vc = _crop(ref, of.bbox), _crop(img, of.bbox)
+                    if oc.size == 0 or vc.size == 0:
+                        continue
+                    vc_r = cv2.resize(vc, (oc.shape[1], oc.shape[0]), interpolation=cv2.INTER_LANCZOS4)
+                    st.lpips.append(_lpips_distance(lp, oc, vc_r))
+                    ov = _lap_var(oc)
+                    # A flat original crop has no texture to retain, so the ratio is
+                    # undefined; skip it rather than record a misleading 0.0.
+                    if ov > 1e-6:
+                        st.lapvar_ratio.append(_lap_var(vc_r) / ov)
+        else:
+            console.print(" (no faces detected in the original; skipping face metrics)")
+
+    # ── whole image ──
+    console.print(" Whole-image LPIPS/SSIM/PSNR...")
+    whole: dict[str, tuple[float, float, float]] = {}
+    for label, img in parsed:
+        ssim, psnr = _ssim_psnr(ref, img)
+        whole[label] = (_lpips_distance(lp, ref, img), ssim, psnr)
+
+    # ── report ──
+    table = Table(title=f"Fidelity vs {Path(original).name} (reference)")
+    for col in ("variant", "text CER↓", "faces", "ID cos↑", "face LPIPS↓", "lapvar↑", "img LPIPS↓", "SSIM↑", "PSNR↑"):
+        table.add_column(col)
+    for label, _ in parsed:
+        st = face_stats[label]
+        wl, ws, wp = whole[label]
+        table.add_row(
+            label,
+            _fmt(ocr_cer[label]),
+            str(st.n_faces),
+            _fmt(_mean(st.identity)),
+            _fmt(_mean(st.lpips)),
+            _fmt(_mean(st.lapvar_ratio)),
+            _fmt(wl),
+            _fmt(ws),
+            _fmt(wp, 1),
+        )
+    console.print(table)
+    console.print(
+        " Legend: CER lower=better; ID cos higher=better; face LPIPS lower=better; "
+        "lapvar ratio ~1=detail kept, <1=smoothed/plastic; img LPIPS lower=better; SSIM/PSNR higher=closer."
+    )
+
+
+if __name__ == "__main__":
+    main()