From 6f4aa4c7b180d7aedbb0def7fb2e5fec1442e24a Mon Sep 17 00:00:00 2001 From: Victor Kuznetsov Date: Thu, 4 Jun 2026 17:43:27 -0700 Subject: [PATCH] fix(invisible): retry in fp32 on a degenerate fp16 output (#41) The fp16-fix VAE swap (#29) is gated to the default SDXL checkpoint, so a custom model_id, a stale pre-fix install, or a fal/custom loader can still decode to an all-black/NaN frame in fp16 (reporter: gpt-image 1448x1086, the `image_processor.py invalid value encountered in cast` warning). Add a model-agnostic backstop in remove_watermark: after generation, if the run was fp16 and the output is degenerate (_is_degenerate_image: near-zero mean and variance), rebuild the pipeline in fp32 on the same device and re-run once. fp32 is the verified-clean path, so a black image is never returned regardless of model_id or version. Mirrors the MPS->CPU fallback's self-mutation pattern; batch inherits it. Verified e2e on MPS by forcing fp16 with the swap disabled (first pass black, guard fired, retry clean). Co-Authored-By: Claude Opus 4.8 --- CLAUDE.md | 2 +- .../noai/watermark_remover.py | 57 +++++++++++++------ tests/test_platform.py | 28 +++++++++ 3 files changed, 70 insertions(+), 17 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index dda079d..01ffb89 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -80,7 +80,7 @@ Who embeds what, and whether it is locally detectable (so we know which gaps are ## Known limitations - `invisible` pipeline processes at **native resolution for inputs whose long side is >= 1024px**, and **auto-upscales smaller inputs UP to a 1024px floor** (`min_resolution=1024`, the default; `--min-resolution 0` disables) before diffusion -- SDXL img2img distorts badly on a tiny latent (a 381x512 portrait wrecks at native, the #36 follow-up), and the output is restored to the original input size so the floor is a transparent quality boost (it adds time/memory on small inputs). 
The floor upscale uses Lanczos by default; **`--upscaler esrgan`** (opt-in, the `esrgan` extra) runs Real-ESRGAN first for better detail before the Lanczos resize to the exact target (`upscaler.py` / `InvisibleEngine._esrgan_upscale`, falls back to Lanczos if the extra is absent). `max_resolution=0` (default) means no downscale cap, matching the hosted raiw.cc backend (fal fast-sdxl, no pre-downscale). The old forced downscale-to-1024 -> upscale-back round-trip for LARGE images was the main quality loss (issue #10) and is gone; at strength ~0.05 SDXL img2img does not need a downscale. **Final `--unsharp` post-filter (`humanizer.unsharp_mask`, opt-in, default 0):** applied LAST (after the GFPGAN face pass, else it would be smoothed over) to counter the soft/over-smoothed look diffusion + restoration leave (an AI tell); ~0.5-0.8 safe, higher risks halos. Pairs with `--humanize` (grain adds sensor-noise texture, unsharp adds crispness). `--max-resolution N` re-introduces an opt-in long-side cap purely to bound GPU/MPS memory on very large inputs (it reintroduces the lossy round-trip). For huge images that OOM at native, tile-based diffusion is still the proper long-term fix. **Concrete MPS data points (the OOM is memory-tier-dependent, NOT a hard MPS limit):** on a ~24 GB unified-memory machine (verified 2026-05-25, 1254x1254 gpt-image SDXL, fp32) native res OOMs at the *UNet* step (peak ~17 GiB), not only the VAE decode, and the auto-fallback in `img2img_runner` reloads on CPU and finishes (slow, ~13 min) -- the output is still weight-identical and defeats SynthID, so "looks hung/crashed" on Mac is usually this CPU fallback, not a pipeline error. On a **32 GB** unified-memory machine the same default SDXL pass runs entirely on MPS with **no CPU fallback** (verified 2026-05-31, 1122x1402 gpt-image, `all`/default, ~155 s end-to-end), so 32 GB clears the native-res UNet peak that 24 GB could not. 
Adding `enable_vae_tiling()` alone does NOT prevent the 24 GB OOM (the peak is the UNet, not the VAE). The fast Mac workarounds for memory-constrained machines are fp16 on MPS (roughly halves memory) or `--max-resolution` to cap the long side; neither is wired as the default. The `controlnet` pipeline adds the canny ControlNet weights on top of SDXL, so its peak is a bit higher than the plain `default` pass; the same MPS->CPU fallback covers an OOM. The native-vs-cap-vs-floor decision lives in the pure helper `invisible_engine._target_size(w, h, max_resolution, min_resolution)` (returns `None` for native, a target tuple for a downscale cap OR an upscale floor; cap takes precedence, the floor is skipped on a min>max misconfig) so it is unit-tested (`tests/test_invisible_engine.py::TestTargetSize`, the #10/#15/#36 regression guard) without loading the model -- keep that logic in the helper, don't re-inline it. -- **fp16 VAE black-output fix (issue #29, 2026-05-30):** on a **CUDA/XPU fp16** backend the stock SDXL VAE overflows to NaN and the *plain* img2img path decodes to an **all-black** image (reproduced on the raiw.cc result: a 1086x1448 input -> a uniformly black 4.6 KB PNG, mean 0). `watermark_remover._load_pipeline` / `_load_controlnet_pipeline` swap in the fp16-fixed SDXL VAE (`madebyollin/sdxl-vae-fp16-fix` = `_SDXL_FP16_VAE_ID`) when `_needs_fp16_vae_fix(model_id, DEFAULT_MODEL_ID, is_fp16)` is true -- only the default SDXL checkpoint on fp16. **cpu/mps run fp32** (the stock VAE is fine there, which is why the bug never reproduces on Mac). A custom non-SDXL `model_id` keeps its own VAE (the fp16-fix VAE is SDXL-architecture-specific). The decision is a pure helper, unit-tested without a download (`tests/test_platform.py::TestFp16VaeFix`); the actual black->clean recovery needs a CUDA GPU. 
**Confirmed on real CUDA hardware 2026-06-03:** running `all` on a 1086x1448 OpenAI gpt-image (the #29 repro size) at fp16 produced a normal (non-black) output, so the fp16-fix VAE swap resolves the all-black decode. (It was not reproducible on this MPS machine, which runs fp32, so the verification had to happen on an NVIDIA box.) +- **fp16 VAE black-output fix (issue #29, 2026-05-30):** on a **CUDA/XPU fp16** backend the stock SDXL VAE overflows to NaN and the *plain* img2img path decodes to an **all-black** image (reproduced on the raiw.cc result: a 1086x1448 input -> a uniformly black 4.6 KB PNG, mean 0). `watermark_remover._load_pipeline` / `_load_controlnet_pipeline` swap in the fp16-fixed SDXL VAE (`madebyollin/sdxl-vae-fp16-fix` = `_SDXL_FP16_VAE_ID`) when `_needs_fp16_vae_fix(model_id, DEFAULT_MODEL_ID, is_fp16)` is true -- only the default SDXL checkpoint on fp16. **cpu/mps run fp32** (the stock VAE is fine there, which is why the bug never reproduces on Mac). A custom non-SDXL `model_id` keeps its own VAE (the fp16-fix VAE is SDXL-architecture-specific). The decision is a pure helper, unit-tested without a download (`tests/test_platform.py::TestFp16VaeFix`); the actual black->clean recovery needs a CUDA GPU. **Confirmed on real CUDA hardware 2026-06-03:** running `all` on a 1086x1448 OpenAI gpt-image (the #29 repro size) at fp16 produced a normal (non-black) output, so the fp16-fix VAE swap resolves the all-black decode. (It was not reproducible on this MPS machine, which runs fp32, so the verification had to happen on an NVIDIA box.) **Follow-up safety net (issue #41, 2026-06-04):** the swap is gated to `model_id == DEFAULT_MODEL_ID`, so a custom model, a stale pre-fix install, or a fal/custom loader can still hit the black decode -- a new reporter did (gpt-image 1448x1086, the #29 size, with the exact `image_processor.py:142 invalid value encountered in cast` warning the NaN->0 cast emits). 
`remove_watermark` now adds a model-agnostic backstop: after generation, if the run was fp16 AND the output is degenerate (`_is_degenerate_image`: mean and std both below `_DEGENERATE_THRESHOLD` 1.0 -- a uniform all-black/NaN frame; the variance guard spares a legitimately dark-but-textured photo), it rebuilds the pipeline in fp32 on the SAME device and re-runs once. fp32 is the verified-clean path, so the user never gets a black image regardless of model_id/version. Mirrors the existing MPS->CPU fallback's self-mutation pattern (reset `torch_dtype` + clear `_pipeline`/`_controlnet_pipeline`); `batch` inherits it through `remove_watermark`, and once one image trips it the rest of the batch stays on the safe fp32. The detector is a pure helper, unit-tested without a model (`tests/test_platform.py::TestDegenerateOutputGuard`); the full fp16->detect->fp32-retry chain was verified e2e on this MPS machine by forcing fp16 with the swap disabled (first pass black, guard fired, retry produced a normal image). CAVEAT: the fp32 retry uses ~2x memory, so on a VRAM-constrained GPU it can OOM (a visible error, still better than a silent black frame; the MPS->CPU fallback covers that path). The reporter's "CPU also black" symptom is NOT reproducible here -- fp32 (cpu/mps) decodes clean -- so it points at an old version or a non-fp32 run, pending their version + command. - Pyright first run is slow (2-3 min) due to ML deps (torch/diffusers/transformers stubs); full-project `uv run pyright` can stall for many minutes — scope it to changed files. - A third-party PIL plugin autoload (e.g. an HEIF/AVIF plugin) can raise a non-OSError (`ModuleNotFoundError`), not `UnidentifiedImageError`, when opening a file. Code that opens user-supplied or unknown-format files should `except Exception`, not just `OSError`/`UnidentifiedImageError`. 
- **rich was dropped (CLI + scripts print plain text via `click.echo`).** `cli.py` renders through small `_Console`/`_Table`/`_Progress` shims; the analysis scripts (`scripts/synthid_corpus.py`, `synthid_pixel_probe.py`, `text_detection_benchmark.py`, `corpus_gap_scan.py`) import `Console`/`Table` from the shared `scripts/_plain_console.py` shim (markup like `[bold]`/`[/]` is stripped, tables render aligned). Consequences: (1) `rich` is NOT a dependency, so anything that imports it breaks a clean `uv sync --frozen` (CI installs core+dev only) — this exact gap red-failed CI after the refactor when those 4 scripts still imported rich; if you add a script, use the `_plain_console` shim, not rich. (2) The old `[gpu]`-bracket-eaten bug (#19) is gone — plain `click.echo` prints `pip install 'remove-ai-watermarks[gpu]'` verbatim, no escaping needed (regression-guarded by `tests/test_cli.py::TestGpuHintMarkup`). (3) No Unicode glyphs / colors / progress bars in CLI output by design. diff --git a/src/remove_ai_watermarks/noai/watermark_remover.py b/src/remove_ai_watermarks/noai/watermark_remover.py index 3634047..53bed4f 100644 --- a/src/remove_ai_watermarks/noai/watermark_remover.py +++ b/src/remove_ai_watermarks/noai/watermark_remover.py @@ -79,6 +79,28 @@ def _needs_fp16_vae_fix(model_id: str, default_model_id: str, is_fp16: bool) -> return is_fp16 and model_id == default_model_id +# An fp16 VAE/UNet overflow decodes to NaN, which diffusers' postprocess casts to 0 +# -> a uniform all-black frame (issues #29, #41). The VAE swap above prevents it for +# the default checkpoint, but a custom model_id, a stale install, or a fal/custom +# loader can still bypass it. Detecting a degenerate output and retrying in fp32 (the +# path verified clean) is the model-agnostic safety net: never hand back a black image. +# One threshold serves both guards: a NaN->0 collapse drives mean and variance to ~0. 
+_DEGENERATE_THRESHOLD = 1.0 + + +def _is_degenerate_image(image: Image.Image) -> bool: + """True if a generated image collapsed to an all-black/NaN frame (#29/#41). + + A NaN fp16 decode casts to 0, so the output is a uniform near-zero image: an + extremely low mean AND near-zero variance. The variance guard keeps a + legitimately dark-but-textured photo (low mean, real detail) from being flagged. + """ + import numpy as np + + arr = np.asarray(image.convert("RGB"), dtype=np.float32) + return float(arr.mean()) < _DEGENERATE_THRESHOLD and float(arr.std()) < _DEGENERATE_THRESHOLD + + _CUDA_FIX_ENV_KEY = "NOAI_CUDA_FIXED" @@ -513,22 +535,25 @@ class WatermarkRemover: _total_start = time.monotonic() - if self.model_profile == "controlnet": - cleaned_image = self._run_controlnet( - init_image, - strength, - num_inference_steps, - guidance_scale, - generator, - ) - else: - cleaned_image = self._run_img2img( - init_image, - strength, - num_inference_steps, - guidance_scale, - generator, - ) + def _generate() -> Image.Image: + if self.model_profile == "controlnet": + return self._run_controlnet(init_image, strength, num_inference_steps, guidance_scale, generator) + return self._run_img2img(init_image, strength, num_inference_steps, guidance_scale, generator) + + cleaned_image = _generate() + + # Safety net for the fp16 all-black/NaN decode (#29/#41): if an fp16 run + # produced a degenerate (uniform black) frame -- the VAE swap did not engage + # for this model/version -- retry once in fp32 on the same device (verified + # clean) so the user never gets a black image. Skipped when an MPS->CPU + # fallback already moved us to fp32. 
+ if self.torch_dtype == torch.float16 and _is_degenerate_image(cleaned_image): + logger.warning("fp16 output was degenerate (all-black/NaN, #29/#41); retrying in fp32 on %s.", self.device) + self._set_progress("Output was black (fp16 overflow); retrying in fp32...") + self.torch_dtype = torch.float32 + self._pipeline = None + self._controlnet_pipeline = None + cleaned_image = _generate() self._set_progress(f"Regeneration complete ยท Output: {w}x{h}px {cleaned_image.mode}") diff --git a/tests/test_platform.py b/tests/test_platform.py index ca1033e..c5d019f 100644 --- a/tests/test_platform.py +++ b/tests/test_platform.py @@ -9,7 +9,9 @@ from __future__ import annotations from pathlib import Path from unittest.mock import MagicMock, patch +import numpy as np import pytest +from PIL import Image from remove_ai_watermarks.noai.progress import is_mps_error from remove_ai_watermarks.noai.utils import get_image_format, is_supported_format @@ -298,3 +300,29 @@ class TestFp16VaeFix: from remove_ai_watermarks.noai.watermark_remover import _needs_fp16_vae_fix assert _needs_fp16_vae_fix("runwayml/stable-diffusion-v1-5", self.DEFAULT, is_fp16=True) is False + + +class TestDegenerateOutputGuard: + """The fp16 black-output safety net (#29/#41): detect an all-black/NaN frame so + ``remove_watermark`` can retry in fp32. 
Pure image statistics, no model needed.""" + + def test_all_black_is_degenerate(self): + from remove_ai_watermarks.noai.watermark_remover import _is_degenerate_image + + black = Image.fromarray(np.zeros((64, 64, 3), np.uint8)) + assert _is_degenerate_image(black) is True + + def test_normal_image_is_not_degenerate(self): + from remove_ai_watermarks.noai.watermark_remover import _is_degenerate_image + + rng = np.random.default_rng(0) + normal = Image.fromarray(rng.integers(0, 256, (64, 64, 3), dtype=np.uint8)) + assert _is_degenerate_image(normal) is False + + def test_dark_but_textured_image_is_not_degenerate(self): + """A legitimately dark photo with real detail must NOT be flagged (variance guard).""" + from remove_ai_watermarks.noai.watermark_remover import _is_degenerate_image + + rng = np.random.default_rng(1) + dark = Image.fromarray(rng.integers(0, 40, (64, 64, 3), dtype=np.uint8)) + assert _is_degenerate_image(dark) is False