Add per-region high-resolution text protection (regenerate crisp, scrub everywhere) (#31)

Replace the default text-protection path. Differential Diffusion froze text in
latent space, which left SynthID intact inside text (violating remove-everywhere)
and still softened sub-8px strokes (VAE latent limit). _run_region_hires instead
scrubs the whole image, then re-scrubs each detected text block at high resolution
and feather-composites it back: every pixel is regenerated (watermark removed
everywhere) while small text stays crisp (high-res strokes span >1 latent cell).

merge_text_regions + feather_paste are pure and unit-tested; each re-scrubbed
patch is phase-correlated back to the original crop to null the ~1-2px round-trip
offset. Synthetic 18px multilingual text: text-region SSIM 0.28 -> 0.48, visually
garbled -> readable across Latin/Cyrillic/CJK. Legacy _run_differential /
build_change_map remain but are no longer the default. Prod use still requires
confirming via the SynthID oracle that re-scrubbed text zones read watermark-free.

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Victor Kuznetsov
2026-05-30 12:59:29 -07:00
committed by GitHub
parent c928ee6e42
commit e4f558dccf
5 changed files with 237 additions and 4 deletions
@@ -481,7 +481,7 @@ class WatermarkRemover:
generator,
)
elif protect_text and self._can_protect_text():
cleaned_image = self._run_differential(
cleaned_image = self._run_region_hires(
init_image,
strength,
num_inference_steps,
@@ -621,6 +621,90 @@ class WatermarkRemover:
self._diff_pipeline = None
return self._load_differential_pipeline()
# Region high-res text scrub: defaults tuned so each text block is upscaled
# enough that strokes exceed the VAE's ~8px latent cell, capped so a single
# region never blows past the GPU/MPS memory budget.
#
# Upper bound on the per-region upscale factor. _run_region_hires truncates the
# chosen factor to an integer, so the effective factor never exceeds 3.
_REGION_HIRES_SCALE = 3.0
# Pixel budget (in megapixels) for ONE upscaled text crop. _run_region_hires
# lowers the factor until region_area * factor**2 fits, and skips the region
# when that would drop the factor below 2.
_REGION_MAX_MEGAPIXELS = 1.3
def _run_region_hires(
    self,
    init_image: Image.Image,
    strength: float,
    num_inference_steps: int,
    guidance_scale: float,
    generator: Any,
) -> Image.Image:
    """Scrub the full image, then redo each detected text block at a larger scale.

    Pipeline:

    1. Run one img2img pass over ``init_image`` (the "global scrub").
    2. Detect text boxes on the *original* pixels and merge them into a few
       rectangles via ``text_protector.merge_text_regions``.
    3. For every rectangle that can be enlarged by an integer factor of at
       least 2 within the ``_REGION_MAX_MEGAPIXELS`` budget: upscale the
       original crop, run img2img on it, shrink the result back to the
       rectangle size, re-align it to the original crop with phase
       correlation, and feather-paste it over the global scrub.

    Every pixel of the result therefore comes out of an img2img pass; text
    blocks simply come from a pass in which their strokes were larger.

    Args:
        init_image: Source image. It is converted with ``COLOR_RGB2BGR``, so
            it is presumably already 3-channel RGB -- verify against caller.
        strength: Forwarded unchanged to every ``_run_img2img`` call.
        num_inference_steps: Forwarded unchanged to every ``_run_img2img`` call.
        guidance_scale: Forwarded unchanged to every ``_run_img2img`` call.
        generator: Forwarded unchanged to every ``_run_img2img`` call.

    Returns:
        The global scrub itself when text detection raises or finds nothing;
        otherwise a new image with the re-scrubbed text regions composited in.
    """
    import math
    import cv2
    import numpy as np
    from remove_ai_watermarks import text_protector

    diffusion_args = (strength, num_inference_steps, guidance_scale, generator)
    global_scrub = self._run_img2img(init_image, *diffusion_args)

    # The detector receives its own array, independent of the crop source
    # converted further down.
    # NOTE(review): if detect_text_boxes never mutates its input, one shared
    # conversion would do -- confirm before merging the two.
    detector_input = cv2.cvtColor(np.array(init_image), cv2.COLOR_RGB2BGR)
    try:
        boxes = text_protector.TextProtector().detect_text_boxes(detector_input)
    except Exception as exc:
        # Deliberate best effort: a detector failure must not lose the scrub.
        logger.warning("Text detection failed (%s); keeping the global scrub.", exc)
        return global_scrub
    if not boxes:
        self._set_progress("No text detected; global scrub only.")
        return global_scrub

    img_w, img_h = init_image.size
    regions = text_protector.merge_text_regions(boxes, img_h, img_w)

    source_bgr = cv2.cvtColor(np.array(init_image), cv2.COLOR_RGB2BGR)
    # NOTE(review): region coordinates are in init_image space; this assumes
    # _run_img2img returns an image of the same size -- confirm.
    canvas_bgr = cv2.cvtColor(np.array(global_scrub), cv2.COLOR_RGB2BGR)
    pixel_budget = self._REGION_MAX_MEGAPIXELS * 1_000_000

    done = 0
    for left, top, reg_w, reg_h in regions:
        native_area = max(1, reg_w * reg_h)
        # Truncate to a whole-number factor so that upscale and downscale are
        # exact dimensional inverses of each other.
        budget_limited = math.sqrt(pixel_budget / native_area)
        scale = int(min(self._REGION_HIRES_SCALE, budget_limited))
        if scale < 2:
            # Not even a 2x enlargement fits the budget; this region keeps the
            # pixels of the global scrub.
            continue

        native_crop = source_bgr[top : top + reg_h, left : left + reg_w]
        enlarged = cv2.resize(
            native_crop,
            (reg_w * scale, reg_h * scale),
            interpolation=cv2.INTER_LANCZOS4,
        )
        enlarged_pil = Image.fromarray(cv2.cvtColor(enlarged, cv2.COLOR_BGR2RGB))
        rescrubbed = self._run_img2img(enlarged_pil, *diffusion_args)
        rescrubbed_bgr = cv2.cvtColor(np.array(rescrubbed), cv2.COLOR_RGB2BGR)
        # Resizing to (reg_w, reg_h) also absorbs any size change made by the
        # pipeline itself.
        patch = cv2.resize(rescrubbed_bgr, (reg_w, reg_h), interpolation=cv2.INTER_AREA)

        # Estimate the translation between the original crop and the
        # re-rendered patch, then shift the patch by the opposite amount so
        # the glyphs sit where they originally were.
        # NOTE(review): the correlation response is ignored, so a weak match
        # can still move the patch -- consider gating on it.
        reference_gray = cv2.cvtColor(native_crop, cv2.COLOR_BGR2GRAY).astype(np.float32)
        patch_gray = cv2.cvtColor(patch, cv2.COLOR_BGR2GRAY).astype(np.float32)
        (sx, sy), _resp = cv2.phaseCorrelate(reference_gray, patch_gray)
        if abs(sx) > 0.1 or abs(sy) > 0.1:
            undo_shift = np.array([[1, 0, -sx], [0, 1, -sy]], dtype=np.float32)
            patch = cv2.warpAffine(
                patch,
                undo_shift,
                (reg_w, reg_h),
                flags=cv2.INTER_LINEAR,
                borderMode=cv2.BORDER_REPLICATE,
            )

        canvas_bgr = text_protector.feather_paste(canvas_bgr, patch, left, top)
        done += 1
        # The denominator counts all merged regions, including skipped ones.
        self._set_progress(f"Re-scrubbed {done}/{len(regions)} text region(s) at high resolution.")

    return Image.fromarray(cv2.cvtColor(canvas_bgr, cv2.COLOR_BGR2RGB))
def _run_differential(
self,
init_image: Image.Image,
@@ -151,6 +151,87 @@ def build_change_map(
return change_map
def merge_text_regions(
    boxes: list[NDArray[Any]],
    height: int,
    width: int,
    dilate_frac: float = 0.012,
    pad_frac: float = 0.02,
    max_regions: int = 8,
) -> list[tuple[int, int, int, int]]:
    """Group detected text polygons into a few padded axis-aligned rectangles.

    The polygons are rasterised into a binary mask, dilated with a square
    kernel so that neighbouring boxes fuse, and split into 8-connected
    components. Each component's bounding box is grown by a fixed padding and
    clipped to the image.

    Args:
        boxes: Text polygons; each is converted to ``int32`` points for
            ``cv2.fillPoly`` (presumably ``(N, 2)`` arrays of x, y -- verify
            against the detector).
        height: Image height in pixels.
        width: Image width in pixels.
        dilate_frac: Dilation kernel side as a fraction of the shorter image
            side (at least 1 px).
        pad_frac: Padding added on every side of a component's bounding box,
            as a fraction of the shorter image side.
        max_regions: Maximum number of rectangles returned.

    Returns:
        ``(x, y, w, h)`` rectangles sorted by area, largest first, clipped to
        the image and capped at ``max_regions``. Empty when there are no
        boxes, when the image has no pixels, or when ``max_regions`` is not
        positive.
    """
    # Settle the trivial cases before importing OpenCV or allocating a
    # full-resolution mask. A non-positive ``max_regions`` must not reach the
    # final slice, where a negative value would silently drop rects from the
    # tail instead of returning nothing.
    if boxes is None or len(boxes) == 0:
        return []
    if height <= 0 or width <= 0 or max_regions <= 0:
        return []

    import cv2
    import numpy as np

    short_side = min(height, width)

    mask = np.zeros((height, width), np.uint8)
    cv2.fillPoly(mask, [np.asarray(b, np.int32) for b in boxes], 1)

    # A light dilation merges boxes that sit close together without bridging
    # distant ones.
    k = max(1, int(short_side * dilate_frac))
    mask = cv2.dilate(mask, cv2.getStructuringElement(cv2.MORPH_RECT, (k, k)))
    n, _labels, stats, _centroids = cv2.connectedComponentsWithStats(mask, 8)

    pad = int(short_side * pad_frac)
    rects: list[tuple[int, int, int, int]] = []
    for i in range(1, n):  # label 0 is the background component
        x = int(stats[i, cv2.CC_STAT_LEFT])
        y = int(stats[i, cv2.CC_STAT_TOP])
        w = int(stats[i, cv2.CC_STAT_WIDTH])
        h = int(stats[i, cv2.CC_STAT_HEIGHT])
        x0, y0 = max(0, x - pad), max(0, y - pad)
        x1, y1 = min(width, x + w + pad), min(height, y + h + pad)
        rects.append((x0, y0, x1 - x0, y1 - y0))

    # ``reverse=True`` keeps the sort stable, so equal-area rects stay in
    # component order.
    rects.sort(key=lambda r: r[2] * r[3], reverse=True)
    return rects[:max_regions]
def feather_paste(
    base: NDArray[Any],
    patch: NDArray[Any],
    x: int,
    y: int,
    feather: int = 8,
) -> NDArray[Any]:
    """Alpha-composite ``patch`` into ``base`` at ``(x, y)`` with a feathered edge.

    The alpha mask is 1 in the interior of the pasted area and ramps linearly
    to 0 over ``feather`` pixels towards each edge, so the outermost row and
    column of the pasted area keep the ``base`` pixels. The ramp width is
    reduced to at most half of the pasted area's shorter side.

    Args:
        base: Destination image, ``(H, W)`` or ``(H, W, C)``.
        patch: Image to paste. Its top-left corner lands at ``(x, y)`` and the
            part falling outside ``base`` is discarded. A 2-D patch pasted
            into a multi-channel ``base`` is applied to every channel.
        x: Column in ``base`` of the patch's left edge (may be negative).
        y: Row in ``base`` of the patch's top edge (may be negative).
        feather: Ramp width in pixels; values of 0 or less paste with a hard
            edge.

    Returns:
        A new array with the dtype and shape of ``base``; ``base`` itself is
        not modified. If the patch lies entirely outside ``base``, the result
        is a plain copy.
    """
    import numpy as np

    out = base.copy()
    bh, bw = base.shape[:2]
    ph, pw = patch.shape[:2]

    # Intersection of the patch footprint with the base image.
    x0, y0 = max(0, x), max(0, y)
    x1, y1 = min(bw, x + pw), min(bh, y + ph)
    if x1 <= x0 or y1 <= y0:
        return out

    patch_roi = patch[y0 - y : y1 - y, x0 - x : x1 - x].astype(np.float32)
    base_roi = out[y0:y1, x0:x1].astype(np.float32)
    rh, rw = base_roi.shape[:2]

    alpha = np.ones((rh, rw), np.float32)
    f = max(0, min(feather, rh // 2, rw // 2))
    if f > 0:
        ramp = np.linspace(0.0, 1.0, f, dtype=np.float32)
        # Multiplying (not assigning) makes corners the product of both ramps.
        alpha[:f, :] *= ramp[:, None]
        alpha[rh - f :, :] *= ramp[::-1, None]
        alpha[:, :f] *= ramp[None, :]
        alpha[:, rw - f :] *= ramp[None, ::-1]

    # Give alpha (and, if needed, the patch) trailing singleton axes matching
    # the rank of ``base`` so the blend broadcasts correctly for both
    # grayscale (H, W) and multi-channel (H, W, C) images.
    alpha = alpha.reshape(alpha.shape + (1,) * (base_roi.ndim - 2))
    while patch_roi.ndim < base_roi.ndim:
        patch_roi = patch_roi[..., None]

    blended = patch_roi * alpha + base_roi * (1.0 - alpha)
    if np.issubdtype(base.dtype, np.integer):
        # A bare integer cast truncates, so float error such as 254.99998
        # would darken the pixel by one level; round to nearest instead.
        blended = np.rint(blended)
    out[y0:y1, x0:x1] = blended.astype(base.dtype)
    return out
class TextProtector:
"""Detect text regions with PP-OCRv3 for diffusion change-map protection."""