From 1439eb07149d9c7a1349d4018b97e210bbb732b1 Mon Sep 17 00:00:00 2001 From: Victor Kuznetsov Date: Mon, 8 Jun 2026 15:20:29 -0700 Subject: [PATCH] feat(photomaker): SynthID-safe face-identity restoration via PhotoMaker-V2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a second face-restore mechanism, selectable via the new CLI option `--restore-faces-method=photomaker`. Unlike the existing GFPGAN path (which runs on the watermarked ORIGINAL and was oracle-confirmed to re-introduce SynthID by partial pixel blending), PhotoMaker carries identity in a SynthID-invariant OpenCLIP embedding and regenerates fresh face pixels conditioned on it — the pixels in the output are diffusion-fresh, so the watermark is not expected to be transported (end-to-end oracle validation is a separate follow-up, see below). The load-bearing assumption (embedding invariance to SynthID-magnitude pixel noise) was empirically validated in the prior commit (smoke test): cosine drift 0.002 under a ±2 LSB low-freq carrier, an order of magnitude less than the JPEG90 drift, which SynthID survives at >=99% TPR. End-to-end commercial-safe: - PhotoMaker-V2 weights: Apache-2.0 (TencentARC) - ID encoder: OpenCLIP-ViT-H/14 (MIT) - SDXL base: shared with the main pipeline - NO InsightFace (the non-commercial blocker for IP-Adapter FaceID / InstantID / PuLID / Arc2Face) Two-pass architecture (PhotoMaker has no ControlNetImg2img class in diffusers): 1) main controlnet/default removal pass cleans SynthID + drifts faces 2) PhotoMaker txt2img regenerates each face from its embedding, feather-composited back into the cleaned image New module `photomaker_restore.py` mirrors `face_restore.py`: lazy pipeline singleton (double-checked lock), `is_available()` gate, pure `_face_crop_square` and `_composite_faces` helpers, all unit-tested without the model (9 new tests). New `InvisibleEngine._restore_faces_photomaker` runs after the diffusion pass, mirroring `_restore_faces`.
CLI flag `--restore-faces-method=[gfpgan|photomaker]` threaded through `cmd_invisible`/`cmd_all`/`cmd_batch` + `_process_batch_image`. New optional `photomaker` extra (Apache-2.0 + Apache-2.0/MIT deps, no basicsr). `[tool.hatch.metadata] allow-direct-references = true` is required because the upstream PhotoMaker package lives only on GitHub. The next step (separate work) is oracle validation: run a 6-image cert sweep through the new pipeline (default/controlnet at the certified strength + --restore-faces-method=photomaker) and confirm SynthID stays clean while face identity is recovered. The required infrastructure (`raiw-app/modal_cert.py`) is already in place. ruff + strict pyright(src/) clean; 586 tests pass (+ 9 new in tests/test_photomaker_restore.py). Co-Authored-By: Claude Opus 4.8 (1M context) --- CLAUDE.md | 4 +- pyproject.toml | 22 ++ src/remove_ai_watermarks/cli.py | 30 +- src/remove_ai_watermarks/invisible_engine.py | 58 +++- .../photomaker_restore.py | 286 ++++++++++++++++++ tests/test_photomaker_restore.py | 128 ++++++++ uv.lock | 13 +- 7 files changed, 532 insertions(+), 9 deletions(-) create mode 100644 src/remove_ai_watermarks/photomaker_restore.py create mode 100644 tests/test_photomaker_restore.py diff --git a/CLAUDE.md b/CLAUDE.md index ba70da8..47dbed4 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -27,7 +27,8 @@ You are a **principal Python engineer** maintaining a CLI tool and library for r - GPU/ML modules (invisible_engine, watermark_remover) are optional — guard imports with `is_available()` checks - Optional detection extras: `detect` (imwatermark — open SD/SDXL/FLUX watermark) and `trustmark` (Adobe TrustMark decoder; pulls torch + downloads weights). Both are guarded by `is_available()` and skipped by `identify` when absent. -- Optional `restore` extra (gfpgan/facexlib/basicsr): the GFPGAN face-identity post-pass (`face_restore.py`, CLI `--restore-faces`, **EXPERIMENTAL, opt-in, OFF by default**). 
Guarded by `face_restore.is_available()`; when enabled it auto-skips with a debug log when the extra is absent or no face is detected. numpy<2-pinned and Python-3.12-pinned (see the `face_restore.py` Key-modules bullet). +- Optional `restore` extra (gfpgan/facexlib/basicsr): the GFPGAN face-identity post-pass (`face_restore.py`, CLI `--restore-faces` with `--restore-faces-method=gfpgan` (default), **EXPERIMENTAL, opt-in, OFF by default**). Guarded by `face_restore.is_available()`; when enabled it auto-skips with a debug log when the extra is absent or no face is detected. **Footgun for removal: re-introduces SynthID** (see `face_restore.py` bullet) -- removal-priority callers must use `--restore-faces-method=photomaker` instead. numpy<2-pinned and Python-3.12-pinned. +- Optional `photomaker` extra (`photomaker` upstream package + huggingface-hub): the SynthID-safe PhotoMaker-V2 face-identity post-pass (`photomaker_restore.py`, CLI `--restore-faces --restore-faces-method=photomaker`, **EXPERIMENTAL, opt-in, OFF by default**). Commercial-safe end-to-end (PhotoMaker-V2 Apache-2.0 + OpenCLIP-ViT-H/14 MIT; NO InsightFace -- the non-commercial blocker for IP-Adapter FaceID / InstantID / PuLID / Arc2Face). Carries identity in a SynthID-invariant OpenCLIP embedding (validated 2026-06-04: cosine drift 0.002 under SynthID-magnitude pixel noise, an order of magnitude less than JPEG90 drift which SynthID survives) and regenerates fresh face pixels conditioned on it. Heavy (~3 GB SDXL + ~1 GB PhotoMaker adapter, downloaded on first use). Kept OUT of `all`. The `photomaker` extra references the upstream git repo, which requires `[tool.hatch.metadata] allow-direct-references = true`. See `docs/synthid-robust-identity-research.md`. - Optional `esrgan` extra (spandrel only): Real-ESRGAN pre-diffusion super-resolution for small inputs (`upscaler.py`, CLI `--upscaler esrgan` on `invisible`/`all`/`batch`). 
Guarded by `upscaler.is_available()`; the default upscaler stays Lanczos (cv2, no deps) and the engine falls back to Lanczos when the extra is absent or the model errors. spandrel is MIT and pulls NO basicsr (only torch/torchvision/safetensors/numpy/einops), sidestepping the `restore` extra's basicsr breakage; Real-ESRGAN weights are BSD-3-Clause and download on first use via `torch.hub` (never bundled). Kept OUT of `all` (heavy + model download), same as `restore`. - Tests for the *model-running* paths are limited to availability checks (multi-GB downloads). But the **pure helpers inside ML-adjacent modules are unit-tested without any download** and must stay that way: `_target_size` (native-vs-downscale-cap-vs-upscale-floor, `test_invisible_engine.py`), `humanizer.unsharp_mask`/`adaptive_polish` (`test_humanizer.py`), `auto_config.plan`/detectors (`test_auto_config.py`), and the MPS->CPU fallback control flow via mocked pipelines (`test_img2img_runner.py`, 100% cover). Don't skip these as "ML, needs a model" — only `remove_watermark`/the diffusion bodies do. @@ -47,6 +48,7 @@ You are a **principal Python engineer** maintaining a CLI tool and library for r - `trustmark_detector.py` — `detect_trustmark(path)` decodes the OPEN, keyless **Adobe TrustMark** watermark (the soft binding behind Adobe Durable Content Credentials, `alg` `com.adobe.trustmark.P`) via the optional `trustmark` package (extra `trustmark`; pulls torch, downloads model weights on first use). Mirrors `invisible_watermark.py` (lazy singleton guarded by a double-checked `threading.Lock` so concurrent callers do not double-download the weights, top-of-module pyright pragma, returns None when absent). It detects *provenance*, not AI origin as such (TrustMark also marks human-authored content), so `identify` lists it as a watermark without setting `is_ai_generated`. Other soft-binding vendors (Digimarc/Imatag/Steg.AI/...) 
have no public decoder — they are only *named* via the `C2PA_SOFT_BINDINGS` scan, not decoded. **False-positive gate (added 2026-05-29):** TrustMark's `wm_present` is a BCH error-correction validity flag that spuriously validates on a content-correlated fraction of un-watermarked images — AI-generated textures trip it far more than camera photos (verified 2026-05-29 on real files: it fires on Gemini/OpenAI/Doubao output that *cannot* carry Adobe's watermark, with a random-bytes decoded secret, while signal-free camera photos did not trip it). A genuine TrustMark is a *durable* soft binding engineered to survive re-encoding, so `detect_trustmark` re-decodes after a mild JPEG round-trip (`_survives_reencode`, `_REENCODE_QUALITY` 95) and requires the same schema both times; every observed false positive collapsed (none survived even q95), so the gate is the durability property the watermark guarantees. The second decode runs only on the rare initial hit, so the cost is negligible. Do NOT remove the gate to "catch more" — a lone TrustMark hit without it is almost always content noise. - `noai/watermark_remover.py` — the `WatermarkRemover` class has two diffusion pipelines, selected by the explicit `pipeline` ctor arg (NOT inferred from `model_id` -- both use the same SDXL base, `DEFAULT_MODEL_ID`). **`default`** runs plain SDXL img2img (`_run_img2img`). **`controlnet`** (**EXPERIMENTAL, opt-in**; `_run_controlnet`, `_load_controlnet_pipeline`) runs `StableDiffusionXLControlNetImg2ImgPipeline` with the SDXL-native canny ControlNet `xinsir/controlnet-canny-sdxl-1.0` (`watermark_profiles.CONTROLNET_CANNY_MODEL`): the control image is `cv2.Canny(gray, 100, 200)` stacked to 3 channels (`_CANNY_LOW`/`_CANNY_HIGH`, prompt `_CONTROLNET_PROMPT` / `_CONTROLNET_NEGATIVE`). 
**Removal comes from the img2img regeneration (`strength`); the ControlNet only PRESERVES text and face STRUCTURE via the edge map.** No original pixels are copied or frozen, BUT **validation 2026-06-04 disproved the old "so SynthID does not survive" claim: SynthID CAN survive controlnet on photoreal/high-detail content.** At the shared low removal strength the canny edge-conditioning keeps the regeneration so close to the original that the pixel perturbation that destroys SynthID does not happen (oracle-confirmed: an OpenAI bracelet photo + a 9-face grid read **SynthID-detected** after controlnet at strength 0.10/0.15, but **SynthID-not-detected** after the `default` pipeline at the SAME strength + resolution -- only the pipeline differed). **But the reverse also holds: a flat-graphic logo/poster SURVIVED `default` while clearing controlnet** -- removal at the low strength is content×pipeline dependent and neither pipeline is universally safe; the real lever is a higher strength. See the controlnet Known-limitations bullet for the full table + root cause. Canny holds face STRUCTURE but NOT identity (the regenerated face drifts in likeness -- canny carries edges, not identity; face identity is preserved by the optional `--restore-faces` GFPGAN post-pass (EXPERIMENTAL, opt-in, OFF by default) -- see `face_restore.py`). `controlnet_conditioning_scale` (ctor arg, default 1.0) is the structure-preservation knob. Same dtype rule as `default` (fp32 on cpu/mps, fp16 only on cuda/xpu; the fp16-fixed SDXL VAE `_SDXL_FP16_VAE_ID` is swapped in on fp16 GPUs -- issue #29) and the same MPS->CPU fallback (reload on cpu/fp32, drop a non-cpu generator, retry once). - `face_restore.py` — optional GFPGAN face-restoration post-pass (cv2/torch/gfpgan boundary, top-of-file pyright pragma). 
**EXPERIMENTAL, opt-in, OFF by default.** Runs AFTER the diffusion removal pass (`InvisibleEngine.remove_watermark`, params `restore_faces=False` / `restore_faces_weight=0.5`; CLI `--restore-faces`/`--no-restore-faces` + `--restore-faces-weight` on `invisible`/`all`/`batch`). **WARNING -- this pass can RE-INTRODUCE SynthID into the face regions (oracle-confirmed 2026-06-04); the old "scrubs the watermark / oracle-validated clean" claim was WRONG.** Flow: GFPGANer.enhance runs on the **ORIGINAL (watermarked)** image -> identity faces + RetinaFace boxes (`restorer.face_helper.det_faces`); `_composite_faces` feather-composites those restored face REGIONS into the diffusion-cleaned image. At the default fidelity weight `0.5` GFPGAN BLENDS ~half the original face pixels with the StyleGAN2 prior (it is not a pure GAN re-synthesis), and **SynthID is robust to that partial blend**, so the composited face carries the watermark back IN -- overwriting what the diffusion pass removed. **Confirmed by a clean A/B:** an OpenAI/Gemini face image (`gemini_3`) read SynthID-DETECTED after controlnet @ strength 0.20/0.25 WITH restore, but SynthID-NOT-detected after the SAME controlnet @ 0.20 with `--no-restore-faces` (only restore differed). It is **content-dependent** (a second face image cleared WITH restore -- smaller faces / different blend), which is why the earlier single-image validation read "clean". **So `--restore-faces` as currently wired is a footgun for removal: it can re-add the watermark it is supposed to be scrubbing. 
Removal-priority callers (raiw.cc) must NOT use restore-on-original, or must switch to one of the fixes below.** **Fix directions (engineering follow-up, not yet done):** (a) run GFPGAN on the DIFFUSION-CLEANED image instead of the original, so the restored face is derived from already-clean pixels (loses some identity sharpness); (b) drop `--restore-faces-weight` well below 0.5 (more StyleGAN2 synthesis, less original -> less SynthID, but identity drifts); (c) leave restore OFF when removal is the priority. Each needs its own oracle re-validation. `is_available()` gates on gfpgan + facexlib; lazily-built `GFPGANer` singleton forces CPU unless CUDA (the pip GFPGANer has an MPS device-mismatch bug; it is a cheap post-pass on a few face crops). `_apply_basicsr_shim()` recreates the removed `torchvision.transforms.functional_tensor` module that basicsr imports. The pure `_composite_faces` helper (Gaussian-feathered rectangular alpha per box, `out = restored*a + base*(1-a)`) is unit-tested without the model (`tests/test_face_restore.py`); the model-running path is gated behind `is_available()`. **Commercial-safe** (GFPGAN Apache-2.0 + RetinaFace MIT); the CodeFormer alternative is NON-COMMERCIAL and is NOT shipped. The `restore` extra (gfpgan/facexlib/basicsr) is kept OUT of `all` (heavy + the GFPGANv1.4 + RetinaFace weights download on first use, never bundled). **`restore` pins numpy<2** (same trap class as the removed faceid/insightface extra): basicsr/gfpgan/facexlib are an old ecosystem, so the extra caps `scipy<1.18` (>=1.18 uses `np.long`, gone in numpy 1.24-1.26) and `numba<0.60` to keep the whole env on one numpy 1.26 resolution; verified the `--extra dev --extra gpu` gate env stays numpy 1.26.4 + `diffusers.loaders.peft` importable with `restore` present. 
**basicsr 1.4.2 builds only on Python <3.13** (its `setup.py get_version()` uses `exec(...)` + `locals()['__version__']`, which the 3.13 fast-locals change broke -> `KeyError: '__version__'`), so the project is pinned to Python 3.12 via `.python-version` and `[tool.uv.extra-build-dependencies] basicsr = ["setuptools<69"]`. basicsr ships sdist-only (no wheel). +- `photomaker_restore.py` — SynthID-safe face-identity restoration via PhotoMaker-V2 (commercial-safe alternative to `face_restore.py`'s GFPGAN footgun). **EXPERIMENTAL, opt-in via `--restore-faces --restore-faces-method=photomaker`, needs the `photomaker` extra.** Runs AFTER the diffusion removal pass (`InvisibleEngine.remove_watermark` -> `_restore_faces_photomaker`). Flow: YuNet detects faces in the CLEANED image; for each box, the SAME box from the ORIGINAL is square-cropped (`_face_crop_square`) and fed as `input_id_images` to `PhotoMakerStableDiffusionXLPipeline` (txt2img); the regenerated face is feather-composited back via `_composite_faces`. Identity comes from the OpenCLIP-ViT-H/14 embedding of the original face (SynthID-invariant: cosine similarity 0.9977, i.e. drift ~0.002, under SynthID-magnitude noise -- an order of magnitude less drift than JPEG90, which SynthID survives), but the PIXELS that land in the output are diffusion-fresh -- so SynthID is not transported back, unlike GFPGAN-on-original. **Commercial-safe end-to-end:** PhotoMaker-V2 Apache-2.0, OpenCLIP-ViT-H/14 MIT, SDXL shared with main pipeline, NO InsightFace. PhotoMaker is fundamentally txt2img in diffusers (`PhotoMakerStableDiffusionXLPipeline`); there is no `PhotoMakerControlNetImg2img` class, so this is a TWO-PASS pipeline: pass 1 (controlnet/default) cleans SynthID + drifts faces, pass 2 (this module) regenerates faces from the SynthID-invariant embedding.
Pure helpers (`_face_crop_square`, `_composite_faces`) are unit-tested without the model (`tests/test_photomaker_restore.py`); the model-running path is gated behind `is_available()` and exercised manually via the Modal cert sweep. Lazy `PhotoMakerStableDiffusionXLPipeline` singleton (double-checked lock) downloads `photomaker-v2.bin` from `TencentARC/PhotoMaker-V2` on first use; never bundled. fp16 on CUDA, fp32 on MPS/CPU. See `docs/synthid-robust-identity-research.md` for the load-bearing embedding-invariance proof + license table. - `auto_config.py` — the `--auto` quality-mode planner (EXPERIMENTAL). `plan(image_path) -> AutoConfig | None` inspects the INPUT image (before the diffusion model loads) and picks the pipeline modes, so the run adapts to content. **Designed to run as the FIRST step of the invisible/all pipeline, wherever that runs** — locally or the raiw.cc Modal GPU worker — **never on the 512 MB web host** (image work there OOM-crashes the container; the planner is `_apply_auto` in `cli.py` for the CLI, and raiw-app would call `plan()` inside `RaiwProtect.remove`). **Quality-priority routing:** ControlNet (text/face-structure preservation) is the default; it is skipped for `default` (plain SDXL) only on a clearly structure-less image (`not has_face and not has_text and edge_density < _STRUCTURELESS_EDGE_MAX` 0.008). **CAVEAT (oracle-validated 2026-06-04, see the controlnet Known-limitations bullet): at the low vendor-adaptive strength NEITHER pipeline removes SynthID on all content -- it is content×pipeline dependent (photoreal SURVIVES controlnet / clears default; flat graphics SURVIVE default / clear controlnet; flat text clears both). So `--auto` picking controlnet for faces/photos leaves SynthID on exactly those, and plain `default` would leave it on flat graphics -- pipeline choice alone does NOT guarantee removal. The real lever is a HIGHER strength, oracle-validated per content type. 
Removal-priority callers (raiw.cc) must oracle-validate strength across content types BEFORE adopting auto; the "must keep SynthID removed" gate in the adoption note below is the blocker this caught.** `restore_faces` is on when a face is present. When a smoothing pass (controlnet/restore) ran, the **adaptive polish** (`humanizer.adaptive_polish`) is applied: it targets the input's Laplacian variance (detail level) with a capped unsharp + edge-masked grain, restoring photo/face texture while **sparing text** (text is already high-frequency, so the deficit is tiny and almost no polish lands -- the old fixed unsharp/grain speckled small text; validated 2026-06-03 on gemini_3 lap-var 84->334 toward the 592 original, openai_1 text near-untouched). **Detection is cv2-only and torch-free** (~100 MB peak RSS, a few ms — measured): OpenCV **YuNet** (`cv2.FaceDetectorYN`, MIT, 232 KB model bundled at `assets/face_detection_yunet_2023mar.onnx`) for faces, **DBNet** (PP-OCRv3 differentiable-binarization via `cv2.dnn.TextDetectionModel_DB`, a 2.4 MB Apache-2.0 model bundled at `assets/text_detection_ppocrv3_2023may.onnx`) for text, with the old Canny+MSER region heuristic kept as a fallback if the DBNet model can't load (`_detect_text_dbnet` returns None → `_detect_text_mser`). The en/cn opencv_zoo PP-OCRv3 detection models are byte-identical, so it is bundled language-neutral. Text only ever ADDS controlnet, so a miss is backstopped by edge-density and a false positive only costs a controlnet run. Plus `edge_density`. `min_resolution` stays 1024. **Every auto decision is independently overridable** (interface principle): `_apply_auto` (cli.py) overrides only the three content-adaptive modes the user left at their click default (`ctx.get_parameter_source(...) == DEFAULT`) — `--pipeline`, `--restore-faces`/`--no-restore-faces`, and **`--adaptive-polish`/`--no-adaptive-polish`** always win; `--min-resolution`/`--strength`/`--unsharp`/`--humanize` are independent knobs. 
`--adaptive-polish` also works WITHOUT `--auto` (manual detail-targeted polish; the engine's `adaptive_polish` param uses the full-res original as the detail reference). Prints the chosen plan (`AutoConfig.reason`). Wired into `cmd_all`/`cmd_invisible`/`cmd_batch` — in `batch` the plan is recomputed per image and the invisible engine is cached **per resolved pipeline** (`ctx.obj["_inv_engines"]`, keyed `default`/`controlnet`) instead of a single shared instance, so a mixed directory builds at most one engine of each kind. **Adds ZERO new pip deps** (all cv2 core + the bundled MIT YuNet + Apache-2.0 DBNet models + the cv2-only adaptive polish). The auto plan does NOT select the `esrgan` upscaler (that needs the optional extra and would make auto's behavior install-dependent); `--upscaler esrgan` stays a separate manual knob. Unit-tested without a heavy download (`tests/test_auto_config.py`): flat/text synthetic images for routing (the bundled DBNet fires on a real text card), monkeypatched `detect_face`/`_detect_text_dbnet`/`_detect_text_mser` for the face/text/fallback branches (a real detectable-face fixture is private, never committed). Production adoption path for raiw.cc: validate (must keep SynthID removed, not hallucinate micro-text, beat plain SDXL on the real upload distribution), then bump the library SHA in `modal_app.py` and pass `auto=True`. - `upscaler.py` — optional Real-ESRGAN pre-diffusion super-resolution for small inputs (spandrel boundary, top-of-file pyright pragma). `is_available()` gates on spandrel+torch (via `importlib.util.find_spec`); `upscale(bgr, device=None)` loads a lazily-built spandrel `ImageModelDescriptor` singleton (double-checked lock) and upscales by the model's native factor (x2), with a non-CPU→CPU device fallback mirroring the diffusion engine's MPS→CPU retry. Weights (`RealESRGAN_x2plus.pth`, BSD-3-Clause) download on first use to the `torch.hub` checkpoints cache; never bundled. 
Used only when UPscaling to the `min_resolution` floor (a `max_resolution` downscale always uses Lanczos). The wiring is `InvisibleEngine._esrgan_upscale(pil, target)` — Real-ESRGAN at native factor, then a Lanczos resize to the exact target, falling back to a plain Lanczos resize if the extra is absent or the model errors (so an optional upscaler can never break removal). The default `--upscaler` is `lanczos` (cv2, no deps). **ESRGAN is a generic photo/texture GAN with no face/glyph prior**, so it best fits photo/texture content and can degrade faces (glassy/asymmetric eyes -- the diffusion pass regenerates faces so the full-pipeline final recovers; that is what GFPGAN/`--restore-faces` is for) and thin/small text (the GAN invents wrong strokes, and low-strength diffusion will not fix it). Verified 2026-06-04: isolated upscale lap-var ~5x Lanczos on faces+textures but glassy eyes; end-to-end `invisible` final lap-var 1634 vs Lanczos 663 with natural faces (diffusion cleaned the artifact). Kept a **manual opt-in knob** (the auto plan never selects it) with `lanczos` the default; not content-gated by design (use Lanczos for text-heavy inputs). spandrel is MIT and pulls no basicsr, unlike the `restore` extra. Unit-tested without the model: `tests/test_upscaler.py` (availability guard + the not-installed RuntimeError) and `tests/test_invisible_engine.py::TestEsrganUpscale` (the three `_esrgan_upscale` branches via a monkeypatched `upscaler`). - `image_io.py` — Unicode-safe cv2 IO (issue #17). `imread(path, flags=None)` / `imwrite(path, img)` wrap `np.fromfile`+`cv2.imdecode` / `cv2.imencode`+`tofile` so non-ASCII paths work on Windows -- bare `cv2.imread`/`cv2.imwrite` use the platform ANSI code-page API there and fail (empty decode + `can't open/read file`) on Chinese/Cyrillic/accented filenames. `imread` keeps `cv2.imread` semantics (defaults to `IMREAD_COLOR`, returns `None` on missing/empty/undecodable). 
**Every cv2 file read/write in the package routes through here; do not call `cv2.imread`/`cv2.imwrite` directly.** `imwrite` returns `False` on an unwritable path (`OSError` caught) instead of raising, matching `cv2.imwrite` semantics. macOS/Linux already accept UTF-8 paths, so it is behavior-neutral there (the bug only reproduces on Windows). cv2/numpy are imported lazily inside the functions, so the module is cheap to import in a bare env. diff --git a/pyproject.toml b/pyproject.toml index 11c7b83..f9163fc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -92,6 +92,22 @@ restore = [ "scipy<1.18", "numba<0.60", ] +# Optional PhotoMaker-V2 face-identity restoration (commercial-safe end-to-end: +# PhotoMaker-V2 weights Apache-2.0 + OpenCLIP-ViT-H/14 MIT, NO InsightFace). Unlike +# the `restore` extra above (which runs GFPGAN on the watermarked ORIGINAL and was +# oracle-confirmed to re-introduce SynthID), PhotoMaker carries identity in a +# SEMANTIC EMBEDDING and generates fresh face pixels conditioned on it -- so the +# pixel watermark is not transported. Empirically validated 2026-06-04: the OpenCLIP +# embedding changes by cosine 0.002 under SynthID-magnitude pixel noise (an order of +# magnitude less than JPEG90 drift, which SynthID survives). See +# docs/synthid-robust-identity-research.md and +# src/remove_ai_watermarks/photomaker_restore.py. Weights (~3 GB SDXL + ~1 GB +# PhotoMaker-V2 adapter) download on first use; never bundled. Kept OUT of `all` +# (heavy + model download), same as `restore`/`esrgan`. +photomaker = [ + "photomaker @ git+https://github.com/TencentARC/PhotoMaker.git", + "huggingface-hub>=0.20.0", +] # Optional pre-diffusion super-resolution for small inputs (Real-ESRGAN). 
Loaded via # spandrel (MIT) -- a pure model-loader with NO basicsr dependency (it pulls only # torch / torchvision / safetensors / numpy / einops), which sidesteps the @@ -159,6 +175,12 @@ Repository = "https://github.com/wiltodelta/remove-ai-watermarks" requires = ["hatchling<1.31"] build-backend = "hatchling.build" +# Allow the `photomaker` extra to reference its upstream git URL directly (the +# TencentARC/PhotoMaker package is not on PyPI). Apache-2.0; weights download on +# first use, so this only adds the Python wrapper. +[tool.hatch.metadata] +allow-direct-references = true + [tool.hatch.build.targets.wheel] packages = ["src/remove_ai_watermarks"] diff --git a/src/remove_ai_watermarks/cli.py b/src/remove_ai_watermarks/cli.py index 2b99123..e46f732 100644 --- a/src/remove_ai_watermarks/cli.py +++ b/src/remove_ai_watermarks/cli.py @@ -236,22 +236,32 @@ def _warn_if_esrgan_unavailable(upscaler: str) -> None: def _restore_faces_options(f: Any) -> Any: - """Attach the shared GFPGAN face-restoration flags to an invisible-pipeline command.""" + """Attach the shared face-restoration flags to an invisible-pipeline command.""" restore_flag = click.option( "--restore-faces/--no-restore-faces", default=False, - help="EXPERIMENTAL, opt-in. Restore face identity with a GFPGAN post-pass when " - "faces are present (needs the 'restore' extra); off by default, auto-skips when no " - "face is detected or the extra is absent.", + help="EXPERIMENTAL, opt-in. 
Restore face identity with a post-pass when faces are " + "present; off by default, auto-skips when no face is detected or the chosen extra " + "is absent.", + ) + method_flag = click.option( + "--restore-faces-method", + type=click.Choice(["gfpgan", "photomaker"]), + default="gfpgan", + help="Face-restore mechanism: 'gfpgan' (cheap, needs 'restore' extra, BUT runs on " + "the watermarked original and re-introduces SynthID) or 'photomaker' (PhotoMaker-V2, " + "needs the 'photomaker' extra; carries identity via a SynthID-invariant OpenCLIP " + "embedding so the regenerated face pixels are watermark-free). Default: gfpgan.", ) weight_flag = click.option( "--restore-faces-weight", type=float, default=0.5, help="GFPGAN fidelity weight (0-1); lower = more GAN regeneration (cleaner " - "watermark scrub), higher = closer to the input.", + "watermark scrub), higher = closer to the input. Ignored when " + "--restore-faces-method=photomaker.", ) - return restore_flag(weight_flag(f)) + return restore_flag(method_flag(weight_flag(f))) def _watermark_region(det: DetectionResult, width: int, height: int) -> tuple[int, int, int, int]: @@ -603,6 +613,7 @@ def cmd_invisible( controlnet_scale: float, restore_faces: bool, restore_faces_weight: float, + restore_faces_method: str, upscaler: str, auto: bool, adaptive_polish: bool, @@ -666,6 +677,7 @@ def cmd_invisible( vendor=vendor, restore_faces=restore_faces, restore_faces_weight=restore_faces_weight, + restore_faces_method=restore_faces_method, ) elapsed = time.monotonic() - t0 @@ -868,6 +880,7 @@ def cmd_all( controlnet_scale: float, restore_faces: bool, restore_faces_weight: float, + restore_faces_method: str, upscaler: str, auto: bool, adaptive_polish: bool, @@ -977,6 +990,7 @@ def cmd_all( vendor=vendor, restore_faces=restore_faces, restore_faces_weight=restore_faces_weight, + restore_faces_method=restore_faces_method, ) console.print(" Invisible watermark removed") @@ -1033,6 +1047,7 @@ def _process_batch_image( min_resolution: 
int = 1024, restore_faces: bool = False, restore_faces_weight: float = 0.5, + restore_faces_method: str = "gfpgan", controlnet_scale: float = 1.0, upscaler: str = "lanczos", auto: bool = False, @@ -1112,6 +1127,7 @@ def _process_batch_image( upscaler=upscaler, restore_faces=restore_faces, restore_faces_weight=restore_faces_weight, + restore_faces_method=restore_faces_method, # Detect the vendor from the pristine original (`img_path`), not the # visible-processed `out_path` whose C2PA is already gone. vendor=vendor_for_strength(img_path), @@ -1195,6 +1211,7 @@ def cmd_batch( min_resolution: int, restore_faces: bool, restore_faces_weight: float, + restore_faces_method: str, controlnet_scale: float, upscaler: str, auto: bool, @@ -1255,6 +1272,7 @@ def cmd_batch( min_resolution=min_resolution, restore_faces=restore_faces, restore_faces_weight=restore_faces_weight, + restore_faces_method=restore_faces_method, controlnet_scale=controlnet_scale, upscaler=upscaler, auto=auto, diff --git a/src/remove_ai_watermarks/invisible_engine.py b/src/remove_ai_watermarks/invisible_engine.py index a96af97..37e3a14 100644 --- a/src/remove_ai_watermarks/invisible_engine.py +++ b/src/remove_ai_watermarks/invisible_engine.py @@ -166,6 +166,7 @@ class InvisibleEngine: vendor: str | None = None, restore_faces: bool = False, restore_faces_weight: float = 0.5, + restore_faces_method: str = "gfpgan", unsharp: float = 0.0, adaptive_polish: bool = False, upscaler: str = "lanczos", @@ -185,6 +186,13 @@ class InvisibleEngine: face-restoration post-pass when faces are present (needs the ``restore`` extra). Auto-skips with a debug log when the extra is absent or no face is detected. 
def _restore_faces_photomaker(
    self,
    out_path: Path,
    original_image: Any,
    seed: int | None,
) -> None:
    """Best-effort PhotoMaker-V2 face-identity post-pass over the file at ``out_path``.

    Reads the already-cleaned image back from ``out_path``, hands it together with
    the original image to ``photomaker_restore.restore_faces_photomaker`` and
    overwrites ``out_path`` with the result.

    Args:
        out_path: Location of the cleaned output; read, then rewritten in place.
        original_image: Source image; only ``.convert("RGB")`` is called on it here
            (presumably a PIL image -- confirm against the caller).
        seed: Forwarded unchanged to the PhotoMaker restore call.

    Nothing is raised to the caller: a missing ``photomaker`` extra is a debug-level
    skip, an unreadable cleaned file is a warning-level skip, and any exception
    raised inside the pass is logged as a warning while the un-restored output is
    left on disk.
    """
    from remove_ai_watermarks import photomaker_restore

    if not photomaker_restore.is_available():
        logger.debug("restore_faces=photomaker requested but the 'photomaker' extra is not installed; skipping")
        return

    try:
        # Imported inside the try so a missing cv2/numpy is also treated as a
        # soft failure instead of aborting the whole removal run.
        import cv2
        import numpy as np

        from remove_ai_watermarks import image_io

        cleaned = image_io.imread(out_path, cv2.IMREAD_COLOR)
        if cleaned is None:
            logger.warning("restore_faces_photomaker: could not read cleaned output %s; skipping", out_path)
            return

        # cv2 works in BGR; go through RGB first so the channel order is well defined.
        source = cv2.cvtColor(np.array(original_image.convert("RGB")), cv2.COLOR_RGB2BGR)

        # The restore call uses one set of face boxes for both images, so the
        # original has to share the cleaned image's pixel grid.
        target_w, target_h = cleaned.shape[1], cleaned.shape[0]
        if source.shape[1] != target_w or source.shape[0] != target_h:
            source = cv2.resize(source, (target_w, target_h), interpolation=cv2.INTER_LANCZOS4)

        if self._progress_callback:
            self._progress_callback("Restoring face identity (PhotoMaker-V2 post-pass)...")

        result = photomaker_restore.restore_faces_photomaker(source, cleaned, seed=seed)
        image_io.imwrite(out_path, result)
    except Exception as err:
        logger.warning("restore_faces_photomaker post-pass failed (%s); keeping un-restored output", err)
Unlike the GFPGAN restore pass in +``face_restore.py`` (which runs on the watermarked ORIGINAL and re-introduces SynthID +via partial pixel blending), PhotoMaker carries identity in a SEMANTIC EMBEDDING +(OpenCLIP-ViT-H/14 image embedding, finetuned by PhotoMaker-V2) and uses it to +CONDITION a fresh txt2img generation -- the pixels are new, so the watermark cannot +be transported. + +That the embedding cannot carry an invisible pixel watermark like SynthID was +empirically confirmed 2026-06-04: on 31 face crops, the cosine similarity between +``embed(orig)`` and ``embed(synthid_proxy(orig))`` (a ±2 LSB low-frequency noise of +SynthID magnitude) is 0.9977 -- an order of magnitude less drift than JPEG90, which +SynthID survives at >=99% TPR by design. See ``docs/synthid-robust-identity-research.md``. + +Architecture: PhotoMaker-V2 is a fine-tuned OpenCLIP-ViT-H/14 ID encoder plus LoRA on +the SDXL UNet attention layers. It ships as a single ``photomaker-v2.bin`` checkpoint +loaded into a ``PhotoMakerStableDiffusionXLPipeline`` (txt2img only -- there is no +PhotoMakerControlNetImg2img class in diffusers). We use it as a SECOND PASS after the +main controlnet/default removal: + + 1. Main removal pass (`controlnet` at the certified strength) cleans SynthID + everywhere but leaves faces drifted. + 2. For each face found in the CLEANED image (YuNet), this module takes the SAME + face region from the ORIGINAL, computes a PhotoMaker ID embedding from it, and + runs PhotoMaker txt2img to regenerate JUST that face crop from the embedding. + The freshly generated face is feather-composited back into the cleaned image. + +The generated face pixels are diffusion-fresh and inherit identity from the embedding +(not the pixels), so SynthID is not re-introduced. + +Commercial-safe end-to-end: +- PhotoMaker-V2 weights: Apache-2.0 (TencentARC). +- ID encoder: OpenCLIP-ViT-H/14 (MIT) finetuned by PhotoMaker (still Apache-2.0). 
+- SDXL base: shared with the main pipeline (already used in `default`/`controlnet`). +- NO InsightFace / antelopev2 (which is the non-commercial blocker for IP-Adapter + FaceID / InstantID / PuLID / Arc2Face). + +Requires the optional ``photomaker`` extra: ``pip install +'remove-ai-watermarks[photomaker]'`` (pulls torch / diffusers / the upstream PhotoMaker +package, all commercial-safe). Weights download on first use; never bundled. +""" + +# cv2/torch/diffusers boundary: relax unknown-type rules for this file only. +# pyright: reportUnknownMemberType=false, reportUnknownArgumentType=false, reportUnknownVariableType=false, reportUnknownParameterType=false, reportMissingTypeArgument=false, reportMissingTypeStubs=false, reportMissingImports=false, reportArgumentType=false, reportAssignmentType=false, reportReturnType=false, reportCallIssue=false, reportIndexIssue=false, reportOperatorIssue=false, reportOptionalMemberAccess=false, reportOptionalCall=false, reportOptionalSubscript=false, reportOptionalOperand=false, reportAttributeAccessIssue=false, reportPrivateImportUsage=false, reportPrivateUsage=false, reportInvalidTypeForm=false, reportConstantRedefinition=false, reportUnnecessaryComparison=false +from __future__ import annotations + +import importlib.util +import logging +import threading +from pathlib import Path +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from numpy.typing import NDArray + +logger = logging.getLogger(__name__) + +# PhotoMaker-V2 weights (Apache-2.0, TencentARC). Downloaded on first use. +_PHOTOMAKER_REPO = "TencentARC/PhotoMaker-V2" +_PHOTOMAKER_FILE = "photomaker-v2.bin" +# SDXL base shared with the main pipeline (same checkpoint as `default`/`controlnet`). +_SDXL_MODEL_ID = "stabilityai/stable-diffusion-xl-base-1.0" + +# The neutral prompt PhotoMaker is designed around: a class noun + the trigger word +# `img`, which PhotoMaker replaces with the ID embedding at inference. 
# Keeping the prompt scene-neutral (no extra style words) maximises identity transfer
# from the embedding and minimises hallucinated background/lighting that would not
# match the cleaned scene.
_PHOTOMAKER_PROMPT = "a portrait photo of a person img, natural lighting, sharp focus"
_PHOTOMAKER_NEGATIVE = "blurry, lowres, deformed, distorted, watermark"

# Square size used to feed PhotoMaker (must match a multiple of 64; 512 fits CPU/GPU
# comfortably and gives the encoder enough pixels for a stable embedding).
_PHOTOMAKER_FACE_SIZE = 512

# Top-level modules this file imports lazily; all must resolve for the pass to run.
_REQUIRED_MODULES = ("photomaker", "diffusers", "huggingface_hub", "torch")

_pipeline: Any | None = None
_pipeline_lock = threading.Lock()


def is_available() -> bool:
    """Return True when every module in ``_REQUIRED_MODULES`` can be located.

    Only import specs are probed; nothing heavy is imported. ``torch`` is part of
    the probe because ``_get_pipeline`` imports it unconditionally.
    """
    try:
        return all(importlib.util.find_spec(name) is not None for name in _REQUIRED_MODULES)
    except (ImportError, ValueError):
        # find_spec raises ValueError for a module already in sys.modules whose
        # __spec__ is unset; treat a broken install the same as a missing one.
        return False


def _select_device() -> str:
    """Pick the PhotoMaker pipeline device: CUDA when present, MPS on Apple, else CPU."""
    try:
        import torch

        if torch.cuda.is_available():
            return "cuda"
        if torch.backends.mps.is_available():
            return "mps"
    except Exception as e:
        # Deliberately broad: any probe failure must degrade to CPU, never crash.
        logger.debug("photomaker_restore: device probe failed (%s); using CPU", e)
    return "cpu"


def _get_pipeline() -> Any:
    """Return the lazily-built PhotoMaker pipeline singleton (downloads weights on first use).

    The first caller builds the pipeline while holding ``_pipeline_lock``; the
    module-level ``_pipeline`` is only assigned once construction has fully
    succeeded, so a failed load leaves it ``None`` and the next call retries.
    """
    global _pipeline
    if _pipeline is not None:
        return _pipeline
    with _pipeline_lock:
        # Re-check under the lock: another thread may have finished the build
        # while this one was waiting.
        if _pipeline is None:
            import torch
            from huggingface_hub import hf_hub_download
            from photomaker import PhotoMakerStableDiffusionXLPipeline

            device = _select_device()
            # Half precision only on CUDA; MPS and CPU stay in float32.
            dtype = torch.float16 if device == "cuda" else torch.float32
            logger.info("photomaker_restore: loading SDXL+PhotoMaker on %s (%s)", device, dtype)

            adapter_path = hf_hub_download(repo_id=_PHOTOMAKER_REPO, filename=_PHOTOMAKER_FILE)
            pipe = PhotoMakerStableDiffusionXLPipeline.from_pretrained(_SDXL_MODEL_ID, torch_dtype=dtype)
            # The adapter loader takes the directory holding the checkpoint plus
            # the file name, hence the split of the downloaded path.
            pipe.load_photomaker_adapter(
                str(Path(adapter_path).parent),
                subfolder="",
                weight_name=_PHOTOMAKER_FILE,
                trigger_word="img",
            )
            pipe.to(device)
            pipe.fuse_lora()
            _pipeline = pipe
    return _pipeline


def _face_crop_square(
    image_bgr: NDArray[Any],
    box: tuple[int, int, int, int],
    pad: float = 0.30,
) -> tuple[NDArray[Any], tuple[int, int, int, int]]:
    """Padded, nominally square crop around a face box, clipped to the image.

    Args:
        image_bgr: Image to crop from; only its first two dimensions are read.
        box: Face box as ``(x, y, width, height)``.
        pad: Extra margin per side, as a fraction of the longer box edge.

    Returns:
        ``(crop, (x1, y1, x2, y2))`` where ``crop`` is ``image_bgr[y1:y2, x1:x2]``
        (a view, not a copy). The window is square before clipping; a face close
        to the border therefore yields a rectangular crop, and a box centred
        outside the image yields an empty one.
    """
    h, w = image_bgr.shape[:2]
    x, y, bw, bh = box
    cx, cy = x + bw // 2, y + bh // 2
    side = int(max(bw, bh) * (1.0 + 2.0 * pad))
    half = side // 2
    x1 = max(0, cx - half)
    y1 = max(0, cy - half)
    x2 = min(w, cx + half)
    y2 = min(h, cy + half)
    return image_bgr[y1:y2, x1:x2], (x1, y1, x2, y2)


def _composite_faces(
    base_bgr: NDArray[Any],
    restored_crops: list[tuple[NDArray[Any], tuple[int, int, int, int]]],
    feather_div: int = 6,
) -> NDArray[Any]:
    """Feather-composite a list of ``(restored_crop, (x1, y1, x2, y2))`` into ``base_bgr``.

    For each entry the box is clipped to the image, the crop is resized to the
    clipped box, and ``crop * a + base * (1 - a)`` is applied INSIDE the box only,
    where ``a`` is a Gaussian-blurred box mask (about 0.5 on the box edge, rising
    to 1 towards the centre).

    Pixels outside the box are never written. Blending against a zero-filled
    full-size canvas instead would multiply the ring just outside each box by
    ``(1 - a)`` and paint a dark halo around every face.

    Args:
        base_bgr: Image to composite into; not modified.
        restored_crops: Crops with their destination boxes in ``base_bgr`` pixels.
        feather_div: Blur kernel is roughly ``min(box_w, box_h) / feather_div``
            (forced odd, at least 3); values below 1 are treated as 1.

    Returns:
        A new ``uint8`` array. Boxes entirely outside the image, empty crops and
        an empty list leave the content equal to ``base_bgr``.
    """
    import cv2
    import numpy as np

    out = base_bgr.astype(np.float32)
    h, w = base_bgr.shape[:2]
    divisor = max(1, feather_div)

    for crop, (x1, y1, x2, y2) in restored_crops:
        x1, y1 = max(0, x1), max(0, y1)
        x2, y2 = min(w, x2), min(h, y2)
        bw, bh = x2 - x1, y2 - y1
        if bw <= 0 or bh <= 0 or crop.size == 0:
            continue
        resized = cv2.resize(crop, (bw, bh), interpolation=cv2.INTER_LANCZOS4).astype(np.float32)

        # GaussianBlur needs an odd kernel size; `| 1` forces that.
        k = max(3, (min(bw, bh) // divisor) | 1)
        radius = k // 2

        # Build the mask on the box plus one kernel radius of margin instead of
        # the full frame: values inside the box only depend on that neighbourhood.
        px1, py1 = max(0, x1 - radius), max(0, y1 - radius)
        px2, py2 = min(w, x2 + radius), min(h, y2 + radius)
        inner = (slice(y1 - py1, y2 - py1), slice(x1 - px1, x2 - px1))
        mask = np.zeros((py2 - py1, px2 - px1), dtype=np.float32)
        mask[inner] = 1.0
        alpha = cv2.GaussianBlur(mask, (k, k), 0)[inner][:, :, None]

        region = out[y1:y2, x1:x2]
        out[y1:y2, x1:x2] = resized * alpha + region * (1.0 - alpha)

    return np.clip(out, 0, 255).astype(np.uint8)


def _detect_faces_yunet(bgr: NDArray[Any]) -> list[tuple[int, int, int, int]]:
    """Default face detector: YuNet via the ONNX model shipped next to ``auto_config``.

    Returns ``(x, y, w, h)`` integer boxes, dropping detections whose width or
    height truncates to zero or less.
    """
    import cv2

    from remove_ai_watermarks import auto_config as _ac

    h, w = bgr.shape[:2]
    model = Path(_ac.__file__).parent / "assets" / "face_detection_yunet_2023mar.onnx"
    # Positional args after the input size: score threshold, NMS threshold, top-k.
    det = cv2.FaceDetectorYN.create(str(model), "", (w, h), _ac._FACE_SCORE, 0.3, 5000)
    det.setInputSize((w, h))
    _, faces = det.detect(bgr)
    if faces is None:
        return []
    return [(int(f[0]), int(f[1]), int(f[2]), int(f[3])) for f in faces if int(f[2]) > 0 and int(f[3]) > 0]


def restore_faces_photomaker(
    original_bgr: NDArray[Any],
    cleaned_bgr: NDArray[Any],
    num_inference_steps: int = 30,
    guidance_scale: float = 5.0,
    style_strength: int = 20,
    seed: int | None = None,
    detect_faces_fn: Any | None = None,
) -> NDArray[Any]:
    """Regenerate each detected face with PhotoMaker and composite it into ``cleaned_bgr``.

    Steps:
      1. Detect faces in ``cleaned_bgr`` (``detect_faces_fn``, YuNet by default).
      2. For each box, crop the same region from ``original_bgr`` and pass it as the
         single ID image of a PhotoMaker txt2img call at ``_PHOTOMAKER_FACE_SIZE``.
      3. Feather-composite every generated image into ``cleaned_bgr`` at its box.

    Only generated pixels are written into the result; pixels of ``original_bgr``
    are used solely as pipeline input.

    Args:
        original_bgr: Original image as cv2 BGR; source of the ID crops. Resized to
            the geometry of ``cleaned_bgr`` when the two differ, because the boxes
            are detected on ``cleaned_bgr``.
        cleaned_bgr: Main-pass output as cv2 BGR; detection source and composite base.
        num_inference_steps: Forwarded to the pipeline call.
        guidance_scale: Forwarded to the pipeline call.
        style_strength: Forwarded verbatim as the pipeline's ``start_merge_step``
            (a step index as used here, not a percentage).
        seed: When given, seeds one ``torch.Generator`` that is shared by all faces
            of this call.
        detect_faces_fn: Optional ``(bgr) -> list[(x, y, w, h)]`` replacing the
            default detector (used by tests).

    Returns:
        ``cleaned_bgr`` itself when no face is detected; otherwise a new ``uint8``
        array with the generated faces composited in.
    """
    if detect_faces_fn is None:
        detect_faces_fn = _detect_faces_yunet

    boxes = detect_faces_fn(cleaned_bgr)
    if not boxes:
        logger.debug("photomaker_restore: no faces detected; returning cleaned image unchanged")
        return cleaned_bgr

    # Heavy imports are deferred past the no-face return so the no-op path stays cheap.
    import cv2
    import numpy as np
    from PIL import Image

    # Boxes live in cleaned-image coordinates; cropping a differently sized
    # original with them would silently pick the wrong region.
    if original_bgr.shape[:2] != cleaned_bgr.shape[:2]:
        original_bgr = cv2.resize(
            original_bgr,
            (cleaned_bgr.shape[1], cleaned_bgr.shape[0]),
            interpolation=cv2.INTER_LANCZOS4,
        )

    pipeline = _get_pipeline()
    generator = None
    if seed is not None:
        import torch

        generator = torch.Generator(device=pipeline.device).manual_seed(seed)

    restored: list[tuple[NDArray[Any], tuple[int, int, int, int]]] = []
    for box in boxes:
        id_crop_bgr, square_box = _face_crop_square(original_bgr, box)
        if id_crop_bgr.size == 0:
            continue
        id_image_pil = Image.fromarray(cv2.cvtColor(id_crop_bgr, cv2.COLOR_BGR2RGB))

        # NOTE(review): upstream PhotoMaker-V2 inference examples also pass
        # `id_embeds` (one face-recognition embedding per ID image) next to
        # `input_id_images`; confirm the V2 adapter accepts a call without them.
        out = pipeline(
            prompt=_PHOTOMAKER_PROMPT,
            negative_prompt=_PHOTOMAKER_NEGATIVE,
            input_id_images=[id_image_pil],
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            start_merge_step=style_strength,
            generator=generator,
            height=_PHOTOMAKER_FACE_SIZE,
            width=_PHOTOMAKER_FACE_SIZE,
            num_images_per_prompt=1,
        )
        gen_bgr = cv2.cvtColor(np.array(out.images[0]), cv2.COLOR_RGB2BGR)
        restored.append((gen_bgr, square_box))

    return _composite_faces(cleaned_bgr, restored)
+""" + +from __future__ import annotations + +from types import SimpleNamespace + +import cv2 +import numpy as np + +from remove_ai_watermarks import photomaker_restore + + +class TestIsAvailable: + def test_returns_bool(self): + assert isinstance(photomaker_restore.is_available(), bool) + + +class TestFaceCropSquare: + def test_centers_on_face_box(self): + img = np.full((400, 400, 3), 128, dtype=np.uint8) + crop, box = photomaker_restore._face_crop_square(img, (100, 150, 80, 80)) + x1, y1, x2, y2 = box + # The crop covers the requested box (with padding) + assert x1 <= 100 + assert y1 <= 150 + assert x2 >= 180 + assert y2 >= 230 + assert crop.shape[0] == y2 - y1 + assert crop.shape[1] == x2 - x1 + + def test_clips_at_image_edges(self): + img = np.full((200, 200, 3), 128, dtype=np.uint8) + crop, (x1, y1, x2, y2) = photomaker_restore._face_crop_square(img, (180, 180, 30, 30)) + # Box must be clipped within the image + assert x1 >= 0 + assert y1 >= 0 + assert x2 <= 200 + assert y2 <= 200 + assert crop.shape[0] == y2 - y1 + assert crop.shape[1] == x2 - x1 + + def test_pad_widens_the_crop(self): + img = np.full((400, 400, 3), 128, dtype=np.uint8) + _, no_pad = photomaker_restore._face_crop_square(img, (150, 150, 50, 50), pad=0.0) + _, with_pad = photomaker_restore._face_crop_square(img, (150, 150, 50, 50), pad=0.5) + assert (with_pad[2] - with_pad[0]) > (no_pad[2] - no_pad[0]) + + +class TestCompositeFaces: + def test_empty_list_returns_base_unchanged(self): + base = np.full((100, 100, 3), 64, dtype=np.uint8) + out = photomaker_restore._composite_faces(base, []) + assert np.array_equal(out, base) + + def test_box_outside_image_is_skipped(self): + base = np.full((100, 100, 3), 64, dtype=np.uint8) + crop = np.full((40, 40, 3), 200, dtype=np.uint8) + out = photomaker_restore._composite_faces(base, [(crop, (200, 200, 240, 240))]) + assert np.array_equal(out, base) + + def test_composited_box_pulls_pixel_value_toward_crop(self): + base = np.full((200, 200, 3), 40, 
dtype=np.uint8) + crop = np.full((50, 50, 3), 220, dtype=np.uint8) + # Place the crop fully inside the image at (60, 60)..(110, 110) + out = photomaker_restore._composite_faces(base, [(crop, (60, 60, 110, 110))]) + # The box center should be heavily biased toward the crop color (>120) ... + assert out[85, 85, 0] > 120 + # ... and corners (well outside the feathered region) stay close to base + assert int(out[0, 0, 0]) - int(base[0, 0, 0]) <= 1 + + +class TestRestoreFacesPhotomakerControlFlow: + """End-to-end control flow with a fake pipeline -- no diffusion model loaded.""" + + @staticmethod + def _fake_pipeline_class(fill_value: int = 200): + """Class-based fake (no ``__call__`` on a SimpleNamespace, which Python won't dispatch).""" + from PIL import Image + + size = photomaker_restore._PHOTOMAKER_FACE_SIZE + fake_face = Image.fromarray(np.full((size, size, 3), fill_value, dtype=np.uint8)) + + class _FakePipe: + device = "cpu" + + def __call__(self, **_kwargs): + return SimpleNamespace(images=[fake_face]) + + return _FakePipe() + + def test_no_faces_returns_cleaned_unchanged(self, monkeypatch): + # Force is_available so we never hit the missing-extra branch + monkeypatch.setattr(photomaker_restore, "is_available", lambda: True) + monkeypatch.setattr(photomaker_restore, "_get_pipeline", lambda: self._fake_pipeline_class()) + + orig = np.full((200, 200, 3), 30, dtype=np.uint8) + cleaned = np.full((200, 200, 3), 90, dtype=np.uint8) + out = photomaker_restore.restore_faces_photomaker(orig, cleaned, detect_faces_fn=lambda _b: []) + assert np.array_equal(out, cleaned) + + def test_one_face_gets_composited_into_cleaned(self, monkeypatch): + monkeypatch.setattr(photomaker_restore, "is_available", lambda: True) + monkeypatch.setattr(photomaker_restore, "_get_pipeline", lambda: self._fake_pipeline_class(fill_value=210)) + + orig = np.full((400, 400, 3), 30, dtype=np.uint8) + cleaned = np.full((400, 400, 3), 90, dtype=np.uint8) + # Mark the original face region with a 
distinctive color so we can confirm the + # crop reached the pipeline (not strictly tested here, but useful sanity). + cv2.rectangle(orig, (150, 150), (250, 250), (200, 100, 50), -1) + + out = photomaker_restore.restore_faces_photomaker( + orig, cleaned, detect_faces_fn=lambda _b: [(150, 150, 100, 100)] + ) + # The cleaned image should have shifted toward the fake-face fill (210) inside + # the face region. + assert out[200, 200, 0] > 150 + # And the corner pixels (well outside the feather) should still be near the base. + assert int(out[0, 0, 0]) - int(cleaned[0, 0, 0]) <= 1 diff --git a/uv.lock b/uv.lock index 47e3ffb..a5343f3 100644 --- a/uv.lock +++ b/uv.lock @@ -2347,6 +2347,11 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/df/b2/87e62e8c3e2f4b32e5fe99e0b86d576da1312593b39f47d8ceef365e95ed/packaging-26.2-py3-none-any.whl", hash = "sha256:5fc45236b9446107ff2415ce77c807cee2862cb6fac22b8a73826d0693b0980e", size = 100195, upload-time = "2026-04-24T20:15:22.081Z" }, ] +[[package]] +name = "photomaker" +version = "0.2.0" +source = { git = "https://github.com/TencentARC/PhotoMaker.git#060b4fcb10b76a4554edf565d6106b7e36c968f0" } + [[package]] name = "piexif" version = "1.1.3" @@ -3108,6 +3113,10 @@ lama = [ { name = "onnxruntime", version = "1.24.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "onnxruntime", version = "1.26.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, ] +photomaker = [ + { name = "huggingface-hub" }, + { name = "photomaker" }, +] restore = [ { name = "basicsr" }, { name = "facexlib" }, @@ -3129,12 +3138,14 @@ requires-dist = [ { name = "facexlib", marker = "extra == 'restore'", specifier = ">=0.3.0" }, { name = "gfpgan", marker = "extra == 'restore'", specifier = ">=1.3.8" }, { name = "huggingface-hub", marker = "extra == 'lama'", specifier = ">=0.20.0" }, + { name = "huggingface-hub", marker = "extra == 'photomaker'", 
specifier = ">=0.20.0" }, { name = "invisible-watermark", marker = "extra == 'detect'", specifier = ">=0.2.0" }, { name = "invisible-watermark", marker = "extra == 'dev'", specifier = ">=0.2.0" }, { name = "numba", marker = "extra == 'restore'", specifier = "<0.60" }, { name = "numpy", specifier = ">=1.24.0" }, { name = "onnxruntime", marker = "extra == 'lama'", specifier = ">=1.16.0" }, { name = "opencv-python-headless", specifier = ">=4.8.0" }, + { name = "photomaker", marker = "extra == 'photomaker'", git = "https://github.com/TencentARC/PhotoMaker.git" }, { name = "piexif", specifier = ">=1.1.3" }, { name = "pillow", specifier = ">=10.0.0" }, { name = "pyright", marker = "extra == 'dev'", specifier = ">=1.1.0" }, @@ -3151,7 +3162,7 @@ requires-dist = [ { name = "transformers", marker = "extra == 'gpu'", specifier = ">=5,<6" }, { name = "trustmark", marker = "extra == 'trustmark'", specifier = ">=0.8.0" }, ] -provides-extras = ["gpu", "detect", "trustmark", "lama", "restore", "esrgan", "dev", "all"] +provides-extras = ["gpu", "detect", "trustmark", "lama", "restore", "photomaker", "esrgan", "dev", "all"] [[package]] name = "requests"