mirror of
https://github.com/wiltodelta/remove-ai-watermarks.git
synced 2026-06-05 02:28:00 +02:00
Add per-region high-resolution text protection (regenerate crisp, scrub everywhere) (#31)
Replace the default text-protection path. Differential Diffusion froze text in latent space, which left SynthID intact inside text (violating remove-everywhere) and still softened sub-8px strokes (VAE latent limit). _run_region_hires instead scrubs the whole image, then re-scrubs each detected text block at high resolution and feather-composites it back: every pixel is regenerated (watermark removed everywhere) while small text stays crisp (high-res strokes span >1 latent cell). merge_text_regions + feather_paste are pure and unit-tested; each re-scrubbed patch is phase-correlated back to the original crop to null the ~1-2px round-trip offset. Synthetic 18px multilingual text: text-region SSIM 0.28 -> 0.48, visually garbled -> readable across Latin/Cyrillic/CJK. Legacy _run_differential / build_change_map remain but are no longer the default. Prod use still requires confirming via the SynthID oracle that re-scrubbed text zones read watermark-free. Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -481,7 +481,7 @@ class WatermarkRemover:
|
||||
generator,
|
||||
)
|
||||
elif protect_text and self._can_protect_text():
|
||||
cleaned_image = self._run_differential(
|
||||
cleaned_image = self._run_region_hires(
|
||||
init_image,
|
||||
strength,
|
||||
num_inference_steps,
|
||||
@@ -621,6 +621,90 @@ class WatermarkRemover:
|
||||
self._diff_pipeline = None
|
||||
return self._load_differential_pipeline()
|
||||
|
||||
# Region high-res text scrub: defaults tuned so each text block is upscaled
# enough that strokes exceed the VAE's ~8px latent cell, capped so a single
# region never blows past the GPU/MPS memory budget.
#
# Upper bound on the per-region upscale factor. _run_region_hires takes
# min(this, sqrt(budget / area)) and truncates to an int, so the factor that is
# actually applied is a whole number no larger than this value.
_REGION_HIRES_SCALE = 3.0
# Pixel budget for one upscaled region, in megapixels. _run_region_hires
# multiplies it by 1_000_000 and skips the high-res pass for any region that
# cannot be at least doubled within that budget.
_REGION_MAX_MEGAPIXELS = 1.3
|
||||
|
||||
def _run_region_hires(
    self,
    init_image: Image.Image,
    strength: float,
    num_inference_steps: int,
    guidance_scale: float,
    generator: Any,
) -> Image.Image:
    """Scrub the whole image, then re-scrub each text block at high resolution.

    Unlike the Differential-Diffusion path (which freezes text in latent space
    and so leaves the watermark intact there), every pixel here is regenerated
    -- the watermark is removed everywhere. Small text survives because each
    text block is upscaled before its img2img pass, so strokes span more than
    one VAE latent cell (the ~8px floor that softens text at native scale);
    the scrubbed crop is downscaled and feather-composited back.

    Args:
        init_image: Source image. It is converted with ``COLOR_RGB2BGR``, so it
            is expected to be a 3-channel RGB image.
        strength: Forwarded unchanged to every ``_run_img2img`` call.
        num_inference_steps: Forwarded unchanged to every ``_run_img2img`` call.
        guidance_scale: Forwarded unchanged to every ``_run_img2img`` call.
        generator: Forwarded unchanged to every ``_run_img2img`` call (the same
            object is reused for the global pass and each region pass).

    Returns:
        The globally scrubbed image with each eligible text region replaced by
        its high-resolution re-scrub. The plain global scrub is returned when
        text detection raises or finds nothing.
    """
    import math

    import cv2
    import numpy as np

    from remove_ai_watermarks import text_protector

    # Largest translation (native pixels) accepted from phase correlation. The
    # round-trip offset is expected to be ~1-2px; a larger value is treated as
    # a spurious correlation peak rather than a real misalignment.
    # NOTE(review): 4px is a conservative guess -- tune against real samples.
    max_shift_px = 4.0

    base = self._run_img2img(init_image, strength, num_inference_steps, guidance_scale, generator)

    # One BGR copy of the untouched input serves both text detection and the
    # per-region crops (assumes the detector does not modify its input array).
    orig_bgr = cv2.cvtColor(np.array(init_image), cv2.COLOR_RGB2BGR)
    try:
        boxes = text_protector.TextProtector().detect_text_boxes(orig_bgr)
    except Exception as exc:
        # Deliberate best-effort: a detector failure must not lose the scrub.
        logger.warning("Text detection failed (%s); keeping the global scrub.", exc)
        return base
    if not boxes:
        self._set_progress("No text detected; global scrub only.")
        return base

    width, height = init_image.size
    regions = text_protector.merge_text_regions(boxes, height, width)
    out_bgr = cv2.cvtColor(np.array(base), cv2.COLOR_RGB2BGR)
    budget = self._REGION_MAX_MEGAPIXELS * 1_000_000

    done = 0
    for x, y, w, h in regions:
        area = max(1, w * h)
        # INTEGER scale so the upscale -> scrub -> downscale round-trip is an
        # exact dimensional inverse (a fractional factor truncates and shifts
        # the composited text ~1-2px, which is invisible but tanks alignment).
        scale = int(min(self._REGION_HIRES_SCALE, math.sqrt(budget / area)))
        if scale < 2:
            # Region too large to even double within the budget: upscaling
            # buys nothing here; the global scrub covers it (documented limit
            # for very large text areas -- tiling is the future fix).
            continue

        crop = orig_bgr[y : y + h, x : x + w]
        up = cv2.resize(crop, (w * scale, h * scale), interpolation=cv2.INTER_LANCZOS4)
        up_pil = Image.fromarray(cv2.cvtColor(up, cv2.COLOR_BGR2RGB))
        scrubbed = self._run_img2img(up_pil, strength, num_inference_steps, guidance_scale, generator)
        down = cv2.resize(
            cv2.cvtColor(np.array(scrubbed), cv2.COLOR_RGB2BGR),
            (w, h),
            interpolation=cv2.INTER_AREA,
        )

        # The up -> scrub -> down round-trip can offset the re-rendered text by
        # a pixel or two (the diffusion pipeline rounds dims to a multiple of
        # 8, so the inverse resize is not perfectly centered). Phase-correlate
        # the patch back to the original crop and translate it so the glyphs
        # land exactly where they were -- otherwise a sub-pixel shift garbles
        # the composite even though the text is crisp.
        cg = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY).astype(np.float32)
        dg = cv2.cvtColor(down, cv2.COLOR_BGR2GRAY).astype(np.float32)
        (sx, sy), _resp = cv2.phaseCorrelate(cg, dg)
        shift_ok = (
            math.isfinite(sx)
            and math.isfinite(sy)
            and max(abs(sx), abs(sy)) <= max_shift_px
        )
        if not shift_ok:
            # Applying an implausible shift would drag the patch across the
            # region and smear its border (BORDER_REPLICATE); leave it as-is.
            logger.debug("Ignoring implausible phase-correlation shift (%s, %s).", sx, sy)
        elif abs(sx) > 0.1 or abs(sy) > 0.1:
            m = np.float32([[1, 0, -sx], [0, 1, -sy]])
            down = cv2.warpAffine(down, m, (w, h), flags=cv2.INTER_LINEAR, borderMode=cv2.BORDER_REPLICATE)

        out_bgr = text_protector.feather_paste(out_bgr, down, x, y)
        done += 1
        self._set_progress(f"Re-scrubbed {done}/{len(regions)} text region(s) at high resolution.")

    return Image.fromarray(cv2.cvtColor(out_bgr, cv2.COLOR_BGR2RGB))
|
||||
|
||||
def _run_differential(
|
||||
self,
|
||||
init_image: Image.Image,
|
||||
|
||||
@@ -151,6 +151,87 @@ def build_change_map(
|
||||
return change_map
|
||||
|
||||
|
||||
def merge_text_regions(
    boxes: list[NDArray[Any]],
    height: int,
    width: int,
    dilate_frac: float = 0.012,
    pad_frac: float = 0.02,
    max_regions: int = 8,
) -> list[tuple[int, int, int, int]]:
    """Group detected text polygons into a few padded axis-aligned rectangles.

    The DB detector returns one box per word/line; the region-high-res text scrub
    runs a separate diffusion pass per region, so we coalesce nearby boxes into a
    handful of *local* blocks (a light dilation merges within a paragraph but not
    across the whole image, so each block stays small enough to upscale within a
    memory budget).

    Args:
        boxes: Text polygons, each convertible to an int32 point array.
        height: Image height in pixels.
        width: Image width in pixels.
        dilate_frac: Dilation kernel size as a fraction of the shorter image
            side (at least 1px); controls how far apart boxes may be and still
            merge.
        pad_frac: Padding added around every merged block, as a fraction of the
            shorter image side.
        max_regions: Maximum number of rectangles returned.

    Returns:
        ``(x, y, w, h)`` rects, largest padded area first, clipped to the image
        and capped at ``max_regions``. Empty when there are no boxes, the image
        has no area, or ``max_regions`` is not positive.
    """
    # Cheap exits before importing OpenCV or allocating a full-image mask.
    # ``len`` (rather than truthiness) also accepts an ndarray of boxes.
    if boxes is None or len(boxes) == 0:
        return []
    if height <= 0 or width <= 0 or max_regions <= 0:
        return []

    import cv2
    import numpy as np

    short_side = min(height, width)

    mask = np.zeros((height, width), np.uint8)
    cv2.fillPoly(mask, [np.asarray(b, np.int32) for b in boxes], 1)

    # Dilate so neighbouring word/line boxes fuse into one connected component.
    k = max(1, int(short_side * dilate_frac))
    mask = cv2.dilate(mask, cv2.getStructuringElement(cv2.MORPH_RECT, (k, k)))
    n, _labels, stats, _c = cv2.connectedComponentsWithStats(mask, 8)

    pad = int(short_side * pad_frac)
    rects: list[tuple[int, int, int, int]] = []
    # Label 0 is the background component, so start at 1.
    for i in range(1, n):
        x = int(stats[i, cv2.CC_STAT_LEFT])
        y = int(stats[i, cv2.CC_STAT_TOP])
        w = int(stats[i, cv2.CC_STAT_WIDTH])
        h = int(stats[i, cv2.CC_STAT_HEIGHT])
        x0, y0 = max(0, x - pad), max(0, y - pad)
        x1, y1 = min(width, x + w + pad), min(height, y + h + pad)
        rects.append((x0, y0, x1 - x0, y1 - y0))

    # Stable sort: equal-area rects keep their component-label order.
    rects.sort(key=lambda r: r[2] * r[3], reverse=True)
    return rects[:max_regions]
|
||||
|
||||
|
||||
def feather_paste(
    base: NDArray[Any],
    patch: NDArray[Any],
    x: int,
    y: int,
    feather: int = 8,
) -> NDArray[Any]:
    """Alpha-composite ``patch`` into ``base`` at ``(x, y)`` with a feathered edge.

    Used to drop a separately re-scrubbed (high-resolution) text region back into
    the globally-scrubbed image without a visible seam.

    Args:
        base: Destination image, ``(H, W)`` or ``(H, W, C)``.
        patch: Image to paste, with the same channel layout as ``base`` (a 2-D
            patch is also accepted for a multi-channel ``base`` and is applied
            to every channel).
        x: Column in ``base`` of the patch's left edge; may be negative.
        y: Row in ``base`` of the patch's top edge; may be negative.
        feather: Width in pixels of the linear alpha ramp on each side of the
            pasted area. It is limited to half the pasted area's height/width;
            ``0`` or less pastes with a hard edge.

    Returns:
        A new array with the dtype and shape of ``base``; ``base`` is not
        modified. ``patch`` is clipped to ``base`` bounds, and a patch that
        falls entirely outside yields an unchanged copy.
    """
    import numpy as np

    out = base.copy()
    bh, bw = base.shape[:2]
    ph, pw = patch.shape[:2]
    x0, y0 = max(0, x), max(0, y)
    x1, y1 = min(bw, x + pw), min(bh, y + ph)
    if x1 <= x0 or y1 <= y0:
        return out

    patch_roi = patch[y0 - y : y1 - y, x0 - x : x1 - x].astype(np.float32)
    base_roi = out[y0:y1, x0:x1].astype(np.float32)
    if patch_roi.ndim == 2 and base_roi.ndim > 2:
        # Grayscale patch into a multi-channel base: repeat across channels.
        patch_roi = patch_roi.reshape(patch_roi.shape + (1,) * (base_roi.ndim - 2))

    rh, rw = base_roi.shape[:2]
    alpha = np.ones((rh, rw), np.float32)
    f = max(0, min(feather, rh // 2, rw // 2))
    if f > 0:
        # The ramp runs 0 -> 1, so the outermost row/column keeps pure base;
        # corners get the product of the two ramps.
        ramp = np.linspace(0.0, 1.0, f, dtype=np.float32)
        alpha[:f, :] *= ramp[:, None]
        alpha[rh - f :, :] *= ramp[::-1, None]
        alpha[:, :f] *= ramp[None, :]
        alpha[:, rw - f :] *= ramp[None, ::-1]

    # Append one singleton axis per trailing (channel) axis so the same code
    # handles (H, W) and (H, W, C) images.
    weight = alpha.reshape(alpha.shape + (1,) * (base_roi.ndim - 2))
    blended = patch_roi * weight + base_roi * (1.0 - weight)
    if np.issubdtype(base.dtype, np.integer):
        # Round to nearest rather than truncate: truncation biases the seam
        # darker and can turn v*a + v*(1-a) into v-1 through float error.
        limits = np.iinfo(base.dtype)
        blended = np.clip(np.rint(blended), limits.min, limits.max)
    out[y0:y1, x0:x1] = blended.astype(base.dtype)
    return out
|
||||
|
||||
|
||||
class TextProtector:
|
||||
"""Detect text regions with PP-OCRv3 for diffusion change-map protection."""
|
||||
|
||||
|
||||
Reference in New Issue
Block a user