Add per-region high-resolution text protection (regenerate crisp, scrub everywhere) (#31)

Replace the default text-protection path. Differential Diffusion froze text in latent space, which left SynthID intact inside text (violating remove-everywhere) and still softened sub-8px strokes (VAE latent limit). _run_region_hires instead scrubs the whole image, then re-scrubs each detected text block at high resolution and feather-composites it back: every pixel is regenerated (watermark removed everywhere) while small text stays crisp (high-res strokes span >1 latent cell). merge_text_regions + feather_paste are pure and unit-tested; each re-scrubbed patch is phase-correlated back to the original crop to null the ~1-2px round-trip offset. Synthetic 18px multilingual text: text-region SSIM 0.28 -> 0.48, visually garbled -> readable across Latin/Cyrillic/CJK. Legacy _run_differential / build_change_map remain but are no longer the default. Prod use still requires confirming via the SynthID oracle that re-scrubbed text zones read watermark-free. Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-07-30 19:21:37 +02:00 · 2026-05-30 12:59:29 -07:00
parent c928ee6e42
commit e4f558dccf
5 changed files with 237 additions and 4 deletions
@@ -481,7 +481,7 @@ class WatermarkRemover:
                generator,
            )
        elif protect_text and self._can_protect_text():
-            cleaned_image = self._run_differential(
+            cleaned_image = self._run_region_hires(
                init_image,
                strength,
                num_inference_steps,
@@ -621,6 +621,90 @@ class WatermarkRemover:
        self._diff_pipeline = None
        return self._load_differential_pipeline()

+    # Region high-res text scrub: defaults tuned so each text block is upscaled
+    # enough that strokes exceed the VAE's ~8px latent cell, capped so a single
+    # region never blows past the GPU/MPS memory budget.
+    _REGION_HIRES_SCALE = 3.0
+    _REGION_MAX_MEGAPIXELS = 1.3
+
+    def _run_region_hires(
+        self,
+        init_image: Image.Image,
+        strength: float,
+        num_inference_steps: int,
+        guidance_scale: float,
+        generator: Any,
+    ) -> Image.Image:
+        """Scrub the whole image, then RE-scrub each detected text block at high
+        resolution and composite it back.
+
+        Unlike the Differential-Diffusion path (which freezes text in latent space
+        and so leaves the watermark intact there), every pixel here is regenerated
+        -- the watermark is removed everywhere. Small text survives because each
+        text block is upscaled before its img2img pass, so strokes span more than
+        one VAE latent cell (the ~8px floor that softens text at native scale);
+        the scrubbed crop is downscaled and feather-composited back. Falls back to
+        the plain global scrub when no text is detected.
+        """
+        import math
+
+        import cv2
+        import numpy as np
+
+        from remove_ai_watermarks import text_protector
+
+        base = self._run_img2img(init_image, strength, num_inference_steps, guidance_scale, generator)
+
+        bgr = cv2.cvtColor(np.array(init_image), cv2.COLOR_RGB2BGR)
+        try:
+            boxes = text_protector.TextProtector().detect_text_boxes(bgr)
+        except Exception as exc:
+            logger.warning("Text detection failed (%s); keeping the global scrub.", exc)
+            return base
+        if not boxes:
+            self._set_progress("No text detected; global scrub only.")
+            return base
+
+        width, height = init_image.size
+        regions = text_protector.merge_text_regions(boxes, height, width)
+        orig_bgr = cv2.cvtColor(np.array(init_image), cv2.COLOR_RGB2BGR)
+        out_bgr = cv2.cvtColor(np.array(base), cv2.COLOR_RGB2BGR)
+        budget = self._REGION_MAX_MEGAPIXELS * 1_000_000
+
+        done = 0
+        for x, y, w, h in regions:
+            area = max(1, w * h)
+            # INTEGER scale so the upscale -> scrub -> downscale round-trip is an
+            # exact dimensional inverse (a fractional factor truncates and shifts
+            # the composited text ~1-2px, which is invisible but tanks alignment).
+            scale = int(min(self._REGION_HIRES_SCALE, math.sqrt(budget / area)))
+            if scale < 2:
+                # Region too large to even double within the budget: upscaling
+                # buys nothing here; the global scrub covers it (documented limit
+                # for very large text areas -- tiling is the future fix).
+                continue
+            crop = orig_bgr[y : y + h, x : x + w]
+            up = cv2.resize(crop, (w * scale, h * scale), interpolation=cv2.INTER_LANCZOS4)
+            up_pil = Image.fromarray(cv2.cvtColor(up, cv2.COLOR_BGR2RGB))
+            scrubbed = self._run_img2img(up_pil, strength, num_inference_steps, guidance_scale, generator)
+            down = cv2.resize(cv2.cvtColor(np.array(scrubbed), cv2.COLOR_RGB2BGR), (w, h), interpolation=cv2.INTER_AREA)
+            # The up -> scrub -> down round-trip can offset the re-rendered text by
+            # a pixel or two (the diffusion pipeline rounds dims to a multiple of
+            # 8, so the inverse resize is not perfectly centered). Phase-correlate
+            # the patch back to the original crop and translate it so the glyphs
+            # land exactly where they were -- otherwise a sub-pixel shift garbles
+            # the composite even though the text is crisp.
+            cg = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY).astype(np.float32)
+            dg = cv2.cvtColor(down, cv2.COLOR_BGR2GRAY).astype(np.float32)
+            (sx, sy), _resp = cv2.phaseCorrelate(cg, dg)
+            if abs(sx) > 0.1 or abs(sy) > 0.1:
+                m = np.float32([[1, 0, -sx], [0, 1, -sy]])
+                down = cv2.warpAffine(down, m, (w, h), flags=cv2.INTER_LINEAR, borderMode=cv2.BORDER_REPLICATE)
+            out_bgr = text_protector.feather_paste(out_bgr, down, x, y)
+            done += 1
+        self._set_progress(f"Re-scrubbed {done}/{len(regions)} text region(s) at high resolution.")
+        return Image.fromarray(cv2.cvtColor(out_bgr, cv2.COLOR_BGR2RGB))
+
    def _run_differential(
        self,
        init_image: Image.Image,
@@ -151,6 +151,87 @@ def build_change_map(
    return change_map


+def merge_text_regions(
+    boxes: list[NDArray[Any]],
+    height: int,
+    width: int,
+    dilate_frac: float = 0.012,
+    pad_frac: float = 0.02,
+    max_regions: int = 8,
+) -> list[tuple[int, int, int, int]]:
+    """Group detected text polygons into a few padded axis-aligned rectangles.
+
+    The DB detector returns one box per word/line; the region-high-res text scrub
+    runs a separate diffusion pass per region, so we coalesce nearby boxes into a
+    handful of *local* blocks (a light dilation merges within a paragraph but not
+    across the whole image, so each block stays small enough to upscale within a
+    memory budget). Returns ``(x, y, w, h)`` rects, largest-area first, clipped to
+    the image and capped at ``max_regions``.
+    """
+    import cv2
+    import numpy as np
+
+    mask = np.zeros((height, width), np.uint8)
+    if not boxes:
+        return []
+    cv2.fillPoly(mask, [np.asarray(b, np.int32) for b in boxes], 1)
+    k = max(1, int(min(height, width) * dilate_frac))
+    mask = cv2.dilate(mask, cv2.getStructuringElement(cv2.MORPH_RECT, (k, k)))
+    n, _labels, stats, _c = cv2.connectedComponentsWithStats(mask, 8)
+    pad = int(min(height, width) * pad_frac)
+    rects: list[tuple[int, int, int, int]] = []
+    for i in range(1, n):
+        x, y, w, h = (
+            int(stats[i, cv2.CC_STAT_LEFT]),
+            int(stats[i, cv2.CC_STAT_TOP]),
+            int(stats[i, cv2.CC_STAT_WIDTH]),
+            int(stats[i, cv2.CC_STAT_HEIGHT]),
+        )
+        x0, y0 = max(0, x - pad), max(0, y - pad)
+        x1, y1 = min(width, x + w + pad), min(height, y + h + pad)
+        rects.append((x0, y0, x1 - x0, y1 - y0))
+    rects.sort(key=lambda r: -(r[2] * r[3]))
+    return rects[:max_regions]
+
+
+def feather_paste(
+    base: NDArray[Any],
+    patch: NDArray[Any],
+    x: int,
+    y: int,
+    feather: int = 8,
+) -> NDArray[Any]:
+    """Alpha-composite ``patch`` into ``base`` at ``(x, y)`` with a feathered edge.
+
+    Used to drop a separately re-scrubbed (high-resolution) text region back into
+    the globally-scrubbed image without a visible seam. Returns a new array;
+    ``base`` is not modified. ``patch`` is clipped to ``base`` bounds.
+    """
+    import numpy as np
+
+    out = base.copy()
+    bh, bw = base.shape[:2]
+    ph, pw = patch.shape[:2]
+    x0, y0 = max(0, x), max(0, y)
+    x1, y1 = min(bw, x + pw), min(bh, y + ph)
+    if x1 <= x0 or y1 <= y0:
+        return out
+    patch_roi = patch[y0 - y : y1 - y, x0 - x : x1 - x].astype(np.float32)
+    base_roi = out[y0:y1, x0:x1].astype(np.float32)
+    rh, rw = base_roi.shape[:2]
+    alpha = np.ones((rh, rw), np.float32)
+    f = max(0, min(feather, rh // 2, rw // 2))
+    if f > 0:
+        ramp = np.linspace(0.0, 1.0, f, dtype=np.float32)
+        alpha[:f, :] *= ramp[:, None]
+        alpha[rh - f :, :] *= ramp[::-1, None]
+        alpha[:, :f] *= ramp[None, :]
+        alpha[:, rw - f :] *= ramp[None, ::-1]
+    a3 = alpha[:, :, None]
+    out[y0:y1, x0:x1] = (patch_roi * a3 + base_roi * (1.0 - a3)).astype(base.dtype)
+    return out
+
+
 class TextProtector:
    """Detect text regions with PP-OCRv3 for diffusion change-map protection."""