mirror of
https://github.com/jiaxiaojunQAQ/OmniSafeBench-MM.git
synced 2026-02-13 18:22:46 +00:00
106 lines
3.2 KiB
Python
106 lines
3.2 KiB
Python
# attacks/umk/attack.py
|
|
from __future__ import annotations
|
|
from typing import Any, Dict, Iterable, Tuple, Optional, Union
|
|
import os
|
|
import json
|
|
import csv
|
|
import random
|
|
from pathlib import Path
|
|
|
|
import torch
|
|
|
|
from .utils.torchattacks.attacks.pixle import *
|
|
from .utils.torchattacks.attacks.bim import *
|
|
from .utils.torchattacks.attacks.pgd_uap_v1 import *
|
|
from .utils.torchattacks.attacks.pgdl2 import *
|
|
|
|
import numpy as np
|
|
|
|
from core.data_formats import TestCase
|
|
from core.base_classes import BaseAttack
|
|
from dataclasses import dataclass
|
|
|
|
|
|
def denorm(image, device=None):
    """Undo per-channel normalization on an image tensor.

    Computes ``image * std + mean`` using the hard-coded three-channel
    statistics below, each reshaped to ``(1, C, 1, 1)`` for broadcasting.

    Args:
        image: Tensor whose channel axis broadcasts against ``(1, 3, 1, 1)``,
            e.g. ``(N, 3, H, W)``. A ``(3, H, W)`` input also works, but the
            broadcast yields a ``(1, 3, H, W)`` result.
        device: Device on which the mean/std tensors are created. Optional;
            when omitted they are placed on ``image.device`` so callers that
            only pass the image do not fail.

    Returns:
        The de-normalized tensor.
    """
    if device is None:
        # Follow the input so the arithmetic below never mixes devices.
        device = image.device

    # NOTE(review): these look like the CLIP preprocessing statistics — confirm.
    mean = torch.tensor((0.48145466, 0.4578275, 0.40821073), device=device)
    std = torch.tensor((0.26862954, 0.26130258, 0.27577711), device=device)

    return image * std.view(1, -1, 1, 1) + mean.view(1, -1, 1, 1)
|
|
|
|
|
|
# -----------------------
|
|
# Default configuration (can be overridden in config.json)
|
|
# -----------------------
|
|
@dataclass
class ImgJPConfig:
    """Default hyper-parameters for the ImgJP attack.

    ``ImgJPAttack`` declares this class as its ``CONFIG_CLASS`` and reads every
    field below from ``self.cfg``.
    """

    # Device string used when allocating the initial image tensor.
    device: str = "cuda"

    # Perturbation budget; ImgJPAttack divides it by 255 before passing it to PGD.
    eps: float = 32

    # Per-step size; ImgJPAttack divides it by 255 before passing it to PGD.
    alpha: float = 1

    # Number of PGD iterations.
    steps: int = 3000

    # Dotted "<module path>.<class name>" of the target model, resolved by
    # ImgJPAttack relative to the ``multimodalmodels`` package.
    target_model: str = "minigpt4.minigpt4_model.MiniGPT4"
|
|
|
|
|
|
class ImgJPAttack(BaseAttack):
    """Image-jailbreak attack built around a single PGD-optimized image.

    The adversarial image is produced lazily on the first
    ``generate_test_case`` call, written to ``<output_image_dir>/imgjp_adv.jpg``
    and then reused for every later test case created by this instance.
    """

    CONFIG_CLASS = ImgJPConfig

    def __init__(
        self,
        config: Optional[Dict[str, Any]] = None,
        output_image_dir: Optional[str] = None,
    ):
        """Read the attack parameters and instantiate the target model.

        Args:
            config: Overrides for the ``ImgJPConfig`` defaults; forwarded
                unchanged to ``BaseAttack.__init__``.
            output_image_dir: Directory for the generated adversarial image;
                forwarded unchanged to ``BaseAttack.__init__``.

        Raises:
            ValueError: If ``target_model`` is not of the dotted form
                ``"<module path>.<class name>"``.
        """
        super().__init__(config, output_image_dir)
        # Parameter settings
        # NOTE(review): ``self.cfg`` is presumably populated by BaseAttack from
        # CONFIG_CLASS — confirm against the base class.
        self.eps = self.cfg.eps
        self.alpha = self.cfg.alpha
        self.steps = self.cfg.steps
        self.device = self.cfg.device

        import importlib

        self.target_model_name = str(getattr(self.cfg, "target_model", ""))
        module_path, dot, cls_name = self.target_model_name.rpartition(".")
        if not dot:
            # Same exception type the bare tuple-unpack raised, but with a
            # message that names the offending setting.
            raise ValueError(
                "target_model must look like '<module path>.<class name>', "
                f"got {self.target_model_name!r}"
            )
        mod = importlib.import_module(f"multimodalmodels.{module_path}")
        self.model = getattr(mod, cls_name)()

        # Cache for the adversarial image; filled on first generate_test_case.
        self.adv_image = None

    def generate_test_case(
        self, original_prompt: str, image_path: str, case_id: str, **kwargs
    ) -> TestCase:
        """Build a test case that pairs the prompt with the adversarial image.

        On the first call the adversarial image is optimized with PGD starting
        from an all-zero ``(1, 3, 224, 224)`` tensor and saved to disk; later
        calls reuse the cached image path.

        Args:
            original_prompt: Prompt text; used unmodified as the jailbreak prompt.
            image_path: Path of the original image; only recorded in the
                returned test case, never read here.
            case_id: Identifier stored on the returned test case.
            **kwargs: Accepted for interface compatibility and ignored.

        Returns:
            The ``TestCase`` produced by ``self.create_test_case``.
        """
        if self.adv_image is None:
            # Draw a seed, then apply it to every RNG in use. The draw itself
            # is unseeded, so runs are not reproducible across processes.
            random_number = random.randint(1, 2000)
            random.seed(random_number)
            np.random.seed(random_number)
            torch.manual_seed(random_number)

            attack = PGD(
                self.model,
                eps=self.eps / 255,
                alpha=self.alpha / 255,
                steps=self.steps,
                random_start=False,
            )

            attack.set_mode_targeted_by_label()
            mean = (0.48145466, 0.4578275, 0.40821073)
            std = (0.26862954, 0.26130258, 0.27577711)
            attack.set_normalization_used(mean, std)

            image = torch.zeros(1, 3, 224, 224).to(self.device)
            images = [image]
            adv_img = attack(images, self.model.shift_labels_1)

            # denorm() needs a device for its mean/std tensors; use the
            # tensor's own device so the arithmetic cannot mix devices.
            first_adv = adv_img[0]
            self.adv_image = denorm(first_adv, first_adv.device)

            # Path() accepts both str and Path, so this works whichever type
            # the base class stored.
            self.adv_image_path = Path(self.output_image_dir) / "imgjp_adv.jpg"
            # NOTE(review): denorm() in this file returns a torch.Tensor, which
            # has no ``save`` method, so this call is expected to raise
            # AttributeError. Writing a JPEG requires an image library this
            # module does not import; left unchanged pending a decision on
            # which one to use.
            self.adv_image.save(self.adv_image_path)

        return self.create_test_case(
            case_id=case_id,
            jailbreak_prompt=original_prompt,
            jailbreak_image_path=str(self.adv_image_path),
            original_prompt=original_prompt,
            original_image_path=str(image_path),
        )
|