OmniSafeBench-MM/attacks/mml/attack.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from __future__ import annotations
from dataclasses import dataclass, fields
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
from pathlib import Path
import os
import json
import random
import re
from PIL import Image, ImageFont, ImageDraw
import textwrap

from core.data_formats import TestCase
from core.base_classes import BaseAttack


from . import utils

# Import prompt templates from const
from .const import (
    wr_game_prompt,
    mirror_game_prompt,
    base64_game_prompt,
    rotate_game_prompt,
)

# Lookup table: data-format key -> game prompt template imported from const
DATAFORMAT2PROMPT = dict(
    images_wr=wr_game_prompt,
    images_mirror=mirror_game_prompt,
    images_base64=base64_game_prompt,
    images_rotate=rotate_game_prompt,
)


# ===================== Utility Functions =====================
def _get_font(font_path: Optional[str], size: int):
    """Return a Pillow font object of the requested size.

    Args:
        font_path: Path to a TrueType/OpenType font file. When ``None`` or
            empty, Pillow's built-in default font is used instead.
        size: Requested font size, forwarded to Pillow.

    Returns:
        A Pillow font object suitable for ``ImageDraw.text``.

    Raises:
        OSError: If ``font_path`` is given but the font file cannot be loaded.
    """
    if font_path:
        return ImageFont.truetype(font_path, size)

    # No font configured (the default everywhere in this module):
    # ``ImageFont.truetype`` cannot open ``None``, so use the bundled font.
    try:
        return ImageFont.load_default(size=size)
    except TypeError:
        # Older Pillow releases do not accept a ``size`` argument here and
        # only offer a fixed-size fallback font.
        return ImageFont.load_default()


def text_step_by_step(text: str, steps=3, wrap=False, wrap_width=15):
    """Append empty numbered list stubs ("1. ", "2. ", ...) below *text*.

    Args:
        text: Source text; a single trailing newline, if present, is dropped.
        steps: How many numbered stub lines to append.
        wrap: When true, re-flow the text with ``textwrap.fill`` first.
        wrap_width: Maximum line width used when ``wrap`` is true.

    Returns:
        The (optionally wrapped) text followed by one ``"\\n<k>. "`` stub per step.
    """
    body = text.removesuffix("\n")
    if wrap:
        body = textwrap.fill(body, width=wrap_width)
    numbered_stubs = "".join(f"\n{number}. " for number in range(1, steps + 1))
    return body + numbered_stubs


def text_to_image(
    text: str,
    font_path: Optional[str] = None,
    font_size: int = 80,
    image_size: Tuple[int, int] = (760, 760),
    margin_xy: Tuple[int, int] = (20, 10),
    spacing: int = 11,
    bg="#FFFFFF",
    fg="#000000",
) -> Image.Image:
    """Draw *text* onto a freshly created RGB canvas.

    Args:
        text: Text to draw (may contain newlines).
        font_path: Font file handed to ``_get_font``.
        font_size: Font size handed to ``_get_font``.
        image_size: Canvas size passed to ``Image.new``.
        margin_xy: Position passed to ``ImageDraw.text`` as the text origin.
        spacing: Line spacing passed to ``ImageDraw.text``.
        bg: Canvas background colour.
        fg: Text fill colour.

    Returns:
        The canvas with the text drawn on it.
    """
    # Resolve the font before allocating the canvas, so font errors surface first.
    typeface = _get_font(font_path, font_size)
    canvas = Image.new("RGB", image_size, bg)
    ImageDraw.Draw(canvas).text(
        margin_xy,
        text,
        fill=fg,
        font=typeface,
        spacing=spacing,
    )
    return canvas


def gen_image_from_text(
    text: str,
    *,
    font_path: Optional[str] = None,
    font_size: int = 80,
    image_size=(760, 760),
    margin_xy=(20, 10),
    wrap_width=15,
    steps=3,
) -> Image.Image:
    """Wrap *text*, append numbered step stubs, and render the result as an image.

    Args:
        text: Source text to render.
        font_path: Font file forwarded to ``text_to_image``.
        font_size: Font size forwarded to ``text_to_image``.
        image_size: Canvas size forwarded to ``text_to_image``.
        margin_xy: Text origin forwarded to ``text_to_image``.
        wrap_width: Line width used when wrapping the text.
        steps: Number of numbered stub lines appended after the text.

    Returns:
        The rendered image.
    """
    # Wrapping is always enabled on this path.
    prepared_text = text_step_by_step(
        text, steps=steps, wrap=True, wrap_width=wrap_width
    )
    return text_to_image(
        prepared_text,
        font_path=font_path,
        font_size=font_size,
        margin_xy=margin_xy,
        image_size=image_size,
    )


def scramble_str(title: str):
    """
    Split title into its word tokens and return them as a list in random order
    """
    # findall already yields a fresh list, so it can be shuffled in place.
    tokens = re.findall(r"\b\w+\b", title)
    random.shuffle(tokens)
    return tokens


# ===================== Configuration =====================
@dataclass
class MMLConfig:
    """Configuration for the MML attack (stand-alone config class of the new architecture).

    The fields are read by ``MMLAttack.generate_test_case`` through ``self.cfg``.
    """

    # Rendering parameters
    # Font file used to render the text; None means no font file is configured
    font_path: Optional[str] = None
    # Font size handed to the font loader
    font_size: int = 80
    # Size of the generated canvas, passed to Image.new
    img_size: Tuple[int, int] = (760, 760)
    # Line width used when wrapping the prompt text (textwrap.fill)
    wrap_width: int = 15
    # Number of empty numbered stubs ("1. ", "2. ", ...) appended below the text
    steps: int = 3
    # (x, y) origin of the drawn text when forwarded to the renderer
    margin_xy: Tuple[int, int] = (20, 20)
    # Attack variant to generate: "wr", "base64", "mirror" or "rotation"
    aug_type: str = "wr"


# ===================== MML Attack Implementation =====================
class MMLAttack(BaseAttack):
    """
    MML attack method - new architecture implementation.

    Renders prompt text into an image and pairs it with the matching game
    prompt. Exactly one variant is produced per call, selected by
    ``cfg.aug_type``: word-replace ("wr"), "base64", "mirror" or "rotation".
    """

    # Configuration class
    CONFIG_CLASS = MMLConfig

    def __init__(
        self,
        config: Optional[Dict[str, Any]] = None,
        output_image_dir: Optional[str] = None,
    ):
        """Initialize MML attack method

        Args:
            config: Configuration dictionary, will be loaded from config file and passed to parent class
            output_image_dir: Output image directory path
        """
        # The parent class is expected to turn ``config`` into ``self.cfg`` and
        # to store ``output_image_dir`` - both are read in generate_test_case.
        super().__init__(config, output_image_dir)

        # Mapping from data-format key to its game prompt template
        self.dataformat2prompt = DATAFORMAT2PROMPT

    def _render_text_image(self, text: str):
        """Render *text* to an image using the configured rendering parameters."""
        return gen_image_from_text(
            text,
            font_path=self.cfg.font_path,
            font_size=self.cfg.font_size,
            image_size=self.cfg.img_size,
            wrap_width=self.cfg.wrap_width,
            margin_xy=self.cfg.margin_xy,
            steps=self.cfg.steps,
        )

    def generate_test_case(
        self,
        original_prompt: str,
        image_path: str,
        case_id: str,
        **kwargs,
    ) -> TestCase:
        """
        Generate test case - MML attack implementation

        Builds the attack image for the variant named by ``cfg.aug_type``,
        saves it as ``<output_image_dir>/<variant dir>/<case_id>-<aug_type>.png``
        and pairs it with the corresponding game prompt.

        Args:
            original_prompt: Original prompt string (harmful behavior text)
            image_path: Original image path (MML generates its own image, doesn't use provided image)
            case_id: Test case ID
            **kwargs: Additional parameters (currently unused)

        Returns:
            Single test case object

        Raises:
            KeyError: If ``cfg.aug_type`` is not one of the supported variants.
        """
        aug_type = self.cfg.aug_type

        # Path() leaves an existing Path unchanged and also accepts a plain string.
        output_root = Path(self.output_image_dir)
        subdirs = {
            "wr": output_root / "word_replace",
            "base64": output_root / "base64",
            "mirror": output_root / "mirror",
            "rotation": output_root / "rotation",
        }
        if aug_type not in subdirs:
            raise KeyError(
                f"Unsupported aug_type {aug_type!r}; expected one of {sorted(subdirs)}"
            )

        target_dir = subdirs[aug_type]
        target_dir.mkdir(parents=True, exist_ok=True)
        img_path = str(target_dir / f"{case_id}-{aug_type}.png")

        if aug_type in ("wr", "base64"):
            # Both text-level variants read from the transformer result. The
            # base64 branch previously used ``res`` without ever assigning it.
            # NOTE(review): assumes utils.text_transformers also provides
            # "base64_prompt" - confirm against utils.
            res = utils.text_transformers(original_prompt)

        if aug_type == "wr":
            attack_img = self._render_text_image(
                res.get("replaced_prompt", original_prompt)
            )
            # Falls back to the key itself if the template is missing.
            adv_prompt = self.dataformat2prompt.get("images_wr", "images_wr").format(
                res.get("replaced_map", ""), res.get("replaced_prompt", "")
            )
        elif aug_type == "base64":
            attack_img = self._render_text_image(
                res.get("base64_prompt", original_prompt)
            )
            adv_prompt = self.dataformat2prompt.get(
                "images_base64", "images_base64"
            ).format(res.get("base64_prompt", ""))
        else:
            # Mirror and rotation start from the plainly rendered prompt and
            # transform the image itself.
            normal_img = self._render_text_image(original_prompt)
            # Drawn for both variants so the global RNG advances exactly as
            # before; only the rotation prompt embeds the scrambled words.
            scrambled_prompt = scramble_str(original_prompt)
            img_res = utils.image_transformer(normal_img, transform_type="all")
            if aug_type == "mirror":
                attack_img = img_res.get("mirror", None)
                adv_prompt = self.dataformat2prompt.get(
                    "images_mirror", "images_mirror"
                )
            else:  # "rotation" - the only remaining supported variant
                attack_img = img_res.get("rotation", None)
                adv_prompt = self.dataformat2prompt.get(
                    "images_rotate", "images_rotate"
                ).format(scrambled_prompt)

        attack_img.save(img_path)

        return self.create_test_case(
            case_id=case_id,
            jailbreak_prompt=adv_prompt,
            jailbreak_image_path=str(img_path),
            original_prompt=original_prompt,
            original_image_path=str(image_path),
            metadata={"aug_type": aug_type},
        )