# remove-ai-watermarks/scripts/text_detection_benchmark.py

"""Multilingual recall benchmark for the text-protection detector.

Measures the core lever of text protection (`text_protector.TextProtector`): if
the PP-OCRv3 DB detector misses a text region, that text is NOT preserved during
the SDXL watermark-removal pass and gets deformed (issue #14). This renders short
text in several scripts at several font sizes on two canvas sizes, runs detection,
and reports the fraction of each known text bbox the detector covers.

Findings (2026-05-29):
  - Detection is script-agnostic: DB segments text *regions*, not characters, so
    Latin / Cyrillic / CJK / Hangul / Arabic / digits score identically. Language
    was never the lever.
  - The only lever is resolution. A fixed small detector input downscaled large
    canvases so far that small text was missed. Detecting at the native long side
    (capped, see ``text_protector._DET_MAX_LONG_SIDE``) lifts overall hit-rate
    from 0.91 to 1.00 and the worst cell (~16 px text on a 2048 canvas) from
    0.06 to 1.00.

This needs the detector model (downloaded on first use) and a font that covers
all the scripts (macOS "Arial Unicode"; on Linux install a Noto super-font).
No GPU. Run:

    uv run python scripts/text_detection_benchmark.py
"""

from __future__ import annotations

import logging
import sys
from collections import defaultdict
from pathlib import Path
from typing import Any

import cv2
import numpy as np
from _plain_console import Console
from PIL import Image, ImageDraw, ImageFont

from remove_ai_watermarks import text_protector as tp

# Module-level logger and the console object used for all benchmark output.
log = logging.getLogger(__name__)
console = Console()

# A single font covering every tested script isolates "language" from "font".
# Paths are probed in list order by ``_find_font``; the first one that exists
# is used for every rendered sample.
_FONT_CANDIDATES = [
    "/System/Library/Fonts/Supplemental/Arial Unicode.ttf",
    "/Library/Fonts/Arial Unicode.ttf",
    "/usr/share/fonts/truetype/noto/NotoSansCJK-Regular.ttc",
    "/usr/share/fonts/truetype/unifont/unifont.ttf",
]
# Script label -> sample string rendered for that script. The label is also
# the row heading in the "by script x font-size" table printed by ``main``.
SCRIPTS = {
    "Latin": "Generated by AI",
    "Cyrillic": "Сгенерировано ИИ",
    "CJK": "豆包AI生成内容",
    "Hangul": "AI로 생성됨",
    "Arabic": "أنشئ بالذكاء",
    "Digits": "0123456789",
}
# Font sizes handed to ``ImageFont.truetype``; also the table column headings.
FONT_SIZES = [16, 24, 32, 48, 64]
# Side length of the square canvases: ``_render`` creates ``(canvas, canvas)`` images.
CANVASES = [1024, 2048]
# Text origin as (x, y) fractions of the canvas side; ``_render`` scales them
# by the canvas size and truncates to int.
PLACEMENTS = [(0.08, 0.15), (0.30, 0.55), (0.10, 0.82)]
# RGB background colours, cycled by placement index in ``main``. ``_render``
# picks a light text colour when the channel sum is below 360, dark otherwise.
BG_COLORS = [(35, 40, 60), (210, 205, 200)]


def _find_font() -> str:
    """Return the first path in ``_FONT_CANDIDATES`` that exists on disk.

    Candidates are probed in list order, so earlier entries take precedence.

    Raises:
        SystemExit: If none of the candidate paths exists.
    """
    existing = next(
        (candidate for candidate in _FONT_CANDIDATES if Path(candidate).exists()),
        None,
    )
    if existing is not None:
        return existing
    raise SystemExit(
        "No multi-script font found. Install one (macOS ships 'Arial Unicode'; "
        "on Linux: a Noto CJK/super font) and add its path to _FONT_CANDIDATES."
    )


def _render(
    font_path: str,
    canvas: int,
    text: str,
    font_size: int,
    place: tuple[float, float],
    bg: tuple[int, int, int],
) -> tuple[Image.Image, tuple[int, int, int, int]]:
    """Draw ``text`` on a square, single-colour canvas.

    Args:
        font_path: Font file passed to ``ImageFont.truetype``.
        canvas: Side length of the square RGB image.
        text: String to draw.
        font_size: Size passed to ``ImageFont.truetype``.
        place: Text origin as (x, y) fractions of ``canvas``; each is scaled
            by ``canvas`` and truncated to int.
        bg: RGB fill colour of the canvas.

    Returns:
        The rendered image and the box reported by ``ImageDraw.textbbox`` for
        the same origin, text and font.
    """
    image = Image.new("RGB", (canvas, canvas), bg)
    pen = ImageDraw.Draw(image)
    face = ImageFont.truetype(font_path, font_size)
    origin = (int(place[0] * canvas), int(place[1] * canvas))

    # Keep the text readable: light ink on dark backgrounds, dark ink otherwise.
    if sum(bg) < 360:
        ink = (245, 245, 245)
    else:
        ink = (20, 20, 20)

    pen.text(origin, text, font=face, fill=ink)
    text_box = pen.textbbox(origin, text, font=face)
    return image, text_box


def _coverage(boxes: list[Any], bbox: tuple[int, int, int, int], h: int, w: int) -> float | None:
    """Return the fraction of the ground-truth rectangle covered by detected boxes.

    Both the ground-truth rectangle and the detected polygons are rasterised
    into ``h`` x ``w`` masks; the result is the overlap pixel count divided by
    the ground-truth pixel count.

    Args:
        boxes: Detected polygons, each convertible to an int32 point array.
        bbox: Ground-truth rectangle as (left, top, right, bottom).
        h: Mask height in pixels.
        w: Mask width in pixels.

    Returns:
        Coverage ratio, or ``None`` when the ground-truth rectangle rasterises
        to zero pixels (nothing to measure against).
    """
    left, top, right, bottom = bbox
    truth = np.zeros((h, w), np.uint8)
    # Thickness -1 asks OpenCV for a filled rectangle rather than an outline.
    cv2.rectangle(truth, (left, top), (right, bottom), 1, -1)
    truth_px = int(truth.sum())
    if truth_px == 0:
        return None

    found = np.zeros((h, w), np.uint8)
    if boxes:
        polygons = [np.asarray(quad, np.int32) for quad in boxes]
        cv2.fillPoly(found, polygons, 1)

    overlap = truth & found
    overlap_px = int(overlap.sum())
    return overlap_px / truth_px

def _hitrate(values: list[float], thr: float = 0.5) -> float:
    """Return the share of ``values`` that are at least ``thr``.

    Args:
        values: Coverage ratios to score.
        thr: Inclusive threshold a value must reach to count as a hit.

    Returns:
        Hits divided by the number of values, or NaN when ``values`` is empty.
    """
    if not values:
        return float("nan")
    hits = sum(1 for value in values if value >= thr)
    return hits / len(values)


def main() -> int:
    """Run the benchmark grid and print the hit-rate tables.

    Every combination of canvas size, script, font size and placement is
    rendered, passed through the detector, and scored with ``_coverage``.
    Samples for which ``_coverage`` returns ``None`` are skipped.

    Returns:
        ``0`` once all tables have been printed.

    Raises:
        SystemExit: If ``tp.is_available()`` is false, or (from ``_find_font``)
            when no candidate font exists.
    """
    logging.basicConfig(level=logging.WARNING)
    if not tp.is_available():
        raise SystemExit("text detector unavailable (need opencv with cv2.dnn.TextDetectionModel_DB)")
    font_path = _find_font()
    detector = tp.TextProtector()

    # The same coverage samples, grouped three ways for the three reports.
    script_size_cov: dict[tuple[str, int], list[float]] = defaultdict(list)
    size_canvas_cov: dict[tuple[int, int], list[float]] = defaultdict(list)
    script_cov: dict[str, list[float]] = defaultdict(list)

    bg_count = len(BG_COLORS)
    for side in CANVASES:
        for script, sample in SCRIPTS.items():
            for font_size in FONT_SIZES:
                for slot, place in enumerate(PLACEMENTS):
                    # Backgrounds alternate with the placement index.
                    background = BG_COLORS[slot % bg_count]
                    picture, truth_box = _render(font_path, side, sample, font_size, place, background)
                    # Pillow yields RGB; the detector is given a BGR array.
                    bgr = cv2.cvtColor(np.array(picture), cv2.COLOR_RGB2BGR)
                    found = detector.detect_text_boxes(bgr)
                    cov = _coverage(found, truth_box, side, side)
                    if cov is not None:
                        script_size_cov[(script, font_size)].append(cov)
                        size_canvas_cov[(font_size, side)].append(cov)
                        script_cov[script].append(cov)

    console.print("=== hit-rate (coverage>=0.5) by script x font-size ===")
    size_heading = "".join(f"{fs:>7}" for fs in FONT_SIZES)
    console.print("script".ljust(10) + size_heading)
    for script in SCRIPTS:
        cells = [f"{_hitrate(script_size_cov[(script, fs)]):>7.2f}" for fs in FONT_SIZES]
        console.print(script.ljust(10) + "".join(cells))

    console.print("\n=== hit-rate by font-size x canvas (the downscale effect) ===")
    canvas_heading = "".join(f"{c:>8}" for c in CANVASES)
    console.print("size".ljust(8) + canvas_heading)
    for fs in FONT_SIZES:
        cells = [f"{_hitrate(size_canvas_cov[(fs, c)]):>8.2f}" for c in CANVASES]
        console.print(str(fs).ljust(8) + "".join(cells))

    every_cov: list[float] = []
    for samples in script_cov.values():
        every_cov.extend(samples)
    overall = _hitrate(every_cov)
    console.print(f"\nOVERALL hit-rate: {overall:.2f}  (detector max long side = {tp._DET_MAX_LONG_SIDE})")
    return 0


if __name__ == "__main__":
    sys.exit(main())