readme

2026-07-24 21:40:51 +02:00 · 2025-12-09 23:46:02 +08:00
parent 78b902647c
commit 536a3b5311
6 changed files with 168 additions and 41 deletions
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="PublishConfigData" remoteFilesAllowedToDisappearOnAutoupload="false">
+    <serverData>
+      <paths name="xiaojun@sprl-server14.dynip.ntu.edu.sg:22 password">
+        <serverdata>
+          <mappings>
+            <mapping local="$PROJECT_DIR$" web="/" />
+          </mappings>
+        </serverdata>
+      </paths>
+      <paths name="xiaojun@sprl-server14.dynip.ntu.edu.sg:22 password (2)">
+        <serverdata>
+          <mappings>
+            <mapping local="$PROJECT_DIR$" web="/" />
+          </mappings>
+        </serverdata>
+      </paths>
+      <paths name="xiaojun@sprl-server14.dynip.ntu.edu.sg:22 password (3)">
+        <serverdata>
+          <mappings>
+            <mapping local="$PROJECT_DIR$" web="/" />
+          </mappings>
+        </serverdata>
+      </paths>
+      <paths name="xiaojun@sprl-server14.dynip.ntu.edu.sg:22 password (4)">
+        <serverdata>
+          <mappings>
+            <mapping local="$PROJECT_DIR$" web="/" />
+          </mappings>
+        </serverdata>
+      </paths>
+      <paths name="xiaojun@sprl-server14.dynip.ntu.edu.sg:22 password (5)">
+        <serverdata>
+          <mappings>
+            <mapping local="$PROJECT_DIR$" web="/" />
+          </mappings>
+        </serverdata>
+      </paths>
+    </serverData>
+  </component>
+</project>
@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10" project-jdk-type="Python SDK" />
+</project>
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="" vcs="Git" />
+  </component>
+</project>
@@ -0,0 +1,64 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="AutoImportSettings">
+    <option name="autoReloadType" value="SELECTIVE" />
+  </component>
+  <component name="ChangeListManager">
+    <list default="true" id="3dd065f2-c5be-4bf2-a43d-ec95bf607297" name="Changes" comment="" />
+    <option name="SHOW_DIALOG" value="false" />
+    <option name="HIGHLIGHT_CONFLICTS" value="true" />
+    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
+    <option name="LAST_RESOLUTION" value="IGNORE" />
+  </component>
+  <component name="Git.Settings">
+    <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
+  </component>
+  <component name="MarkdownSettingsMigration">
+    <option name="stateVersion" value="1" />
+  </component>
+  <component name="ProjectId" id="34vxN2JLetwk8HK2WmELSSXSehT" />
+  <component name="ProjectViewState">
+    <option name="hideEmptyMiddlePackages" value="true" />
+    <option name="showLibraryContents" value="true" />
+  </component>
+  <component name="PropertiesComponent"><![CDATA[{
+  "keyToString": {
+    "RunOnceActivity.ShowReadmeOnStart": "true",
+    "WebServerToolWindowFactoryState": "true",
+    "last_opened_file_path": "D:/Code/OmniSafeBench-MM",
+    "node.js.detected.package.eslint": "true",
+    "node.js.detected.package.tslint": "true",
+    "node.js.selected.package.eslint": "(autodetect)",
+    "node.js.selected.package.tslint": "(autodetect)",
+    "nodejs_package_manager_path": "npm",
+    "vue.rearranger.settings.migration": "true"
+  }
+}]]></component>
+  <component name="RecentsManager">
+    <key name="CopyFile.RECENT_KEYS">
+      <recent name="D:\Code\OmniSafeBench-MM\assets" />
+    </key>
+  </component>
+  <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
+  <component name="TaskManager">
+    <task active="true" id="Default" summary="Default task">
+      <changelist id="3dd065f2-c5be-4bf2-a43d-ec95bf607297" name="Changes" comment="" />
+      <created>1762107803025</created>
+      <option name="number" value="Default" />
+      <option name="presentableId" value="Default" />
+      <updated>1762107803025</updated>
+      <workItem from="1762107804032" duration="82000" />
+      <workItem from="1762157028049" duration="27000" />
+      <workItem from="1764691244359" duration="7978000" />
+      <workItem from="1764836204541" duration="1218000" />
+      <workItem from="1764866901159" duration="660000" />
+      <workItem from="1764923068188" duration="594000" />
+      <workItem from="1764958254251" duration="635000" />
+      <workItem from="1765293169298" duration="3000" />
+    </task>
+    <servers />
+  </component>
+  <component name="TypeScriptGeneratedFilesManager">
+    <option name="version" value="3" />
+  </component>
+</project>
@@ -33,47 +33,6 @@ OmniSafeBench-MM is a unified benchmark and open-source toolbox for evaluating m
 **Overview of OmniSafeBench-MM**.
 The benchmark unifies multi-modal jailbreak attack–defense evaluation, 13 attack and 15 defense methods, and a three-dimensional scoring protocol measuring harmfulness, alignment, and detail.

-## 🗡️ Integrated Attack Methods
-|            Name            |                            Title                             |   Venue    |                         Paper                          |                             Code                             |
-| :------------------------: | :----------------------------------------------------------: | :--------: | :----------------------------------------------------: | :----------------------------------------------------------: |
-|   FigStep / FigStep-Pro    | FigStep: Jailbreaking Large Vision-Language Models via Typographic Visual Prompts | AAAI 2025  | [link](https://arxiv.org/abs/2311.05608) | [link](https://github.com/ThuCCSLab/FigStep) |
-| QR-Attack (MM-SafetyBench) | MM-SafetyBench: A Benchmark for Safety Evaluation of Multimodal Large Language Models | ECCV 2024  | [link](https://arxiv.org/abs/2311.17600) | [link](https://github.com/isXinLiu/MM-SafetyBench) |
-|            MML             | Jailbreak Large Vision-Language Models Through Multi-Modal Linkage |  ACL 2025  | [link](https://aclanthology.org/2025.acl-long.74/) | [link](https://github.com/wangyu-ovo/MML) |
-|           CS-DJ            | Distraction is All You Need for Multimodal Large Language Model Jailbreaking | CVPR 2025  | [link](https://arxiv.org/abs/2502.10794) | [link](https://github.com/TeamPigeonLab/CS-DJ) |
-|         SI-Attack          | Jailbreaking Multimodal Large Language Models via Shuffle Inconsistency | ICCV 2025  | [link](https://arxiv.org/abs/2501.04931) | [link](https://github.com/zhaoshiji123/SI-Attack) |
-|            JOOD            | Playing the Fool: Jailbreaking LLMs and Multimodal LLMs with Out-of-Distribution Strategy | CVPR 2025  | [link](https://arxiv.org/abs/2503.20823) | [link](https://github.com/naver-ai/JOOD) |
-|           HIMRD            | Heuristic-Induced Multimodal Risk Distribution (HIMRD) Jailbreak Attack |  ICCV 2025 | [link](https://arxiv.org/abs/2412.05934) | [link](https://github.com/MaTengSYSU/HIMRD-jailbreak) |
-|           HADES            | Images are Achilles’ Heel of Alignment: Exploiting Visual Vulnerabilities for Jailbreaking MLLMs | ECCV 2024 | [link](https://arxiv.org/abs/2403.09792) | [link](https://github.com/AoiDragon/HADES) |
-|            BAP             | Jailbreak Vision Language Models via Bi-Modal Adversarial Prompt (BAP) | TIFS 2025 | [link](https://arxiv.org/abs/2406.04031) | [link](https://github.com/NY1024/BAP-Jailbreak-Vision-Language-Models-via-Bi-Modal-Adversarial-Prompt) |
-|         visual_adv         | Visual Adversarial Examples Jailbreak Aligned Large Language Models | AAAI 2024  | [link](https://ojs.aaai.org/index.php/AAAI/article/view/30150) | [link](https://github.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models) |
-|           VisCRA           | VisCRA: A Visual Chain Reasoning Attack for Jailbreaking Multimodal Large Language Models | EMNLP 2025 | [link](https://arxiv.org/abs/2505.19684) | [link](https://github.com/DyMessi/VisCRA) |
-|            UMK             | White-box Multimodal Jailbreaks Against Large Vision-Language Models (Universal Master Key) | ACMMM 2024 | [link](https://arxiv.org/abs/2405.17894) | [link](https://github.com/roywang021/UMK) |
-|         PBI-Attack         | Prior-Guided Bimodal Interactive Black-Box Jailbreak Attack for Toxicity Maximization | EMNLP 2025 | [link](https://aclanthology.org/2025.emnlp-main.32.pdf) | [link](https://github.com/Rosy0912/PBI-Attack) |
-|      ImgJP / DeltaJP       | Jailbreaking Attack against Multimodal Large Language Models | arXiv 2024 | [link](https://arxiv.org/abs/2402.02309) | [link](https://github.com/abc03570128/Jailbreaking-Attack-against-Multimodal-Large-Language-Model) |
-|            JPS             | JPS: Jailbreak Multimodal Large Language Models with Collaborative Visual Perturbation and Textual Steering | ACMMM 2025 | [link](https://arxiv.org/abs/2508.05087) | [link](https://github.com/thu-coai/JPS) |
-
-
-
-## 🛡️Integrated Defense Methods
-|       Name        |                                                        Title                                                         |   Venue    |                                       Paper                                       |                                                   Code                                                    | 
-|:-----------------:|:--------------------------------------------------------------------------------------------------------------------:|:----------:|:---------------------------------------------------------------------------------:|:---------------------------------------------------------------------------------------------------------:|
-|     JailGuard     |                  JailGuard: A Universal Detection Framework for Prompt-based Attacks on LLM Systems                  | TOSEM2025  |                     [link](https://arxiv.org/abs/2312.10766)                      |                             [link](https://github.com/shiningrain/JailGuard)                              |
-|  MLLM-Protector   |                          MLLM-Protector: Ensuring MLLM's Safety without Hurting Performance                          | EMNLP2024  |                     [link](https://arxiv.org/abs/2401.02906)                      |                            [link](https://github.com/pipilurj/MLLM-protector)                             |
-|       ECSO        |                 Eyes Closed, Safety On: Protecting Multimodal LLMs via Image-to-Text Transformation                  |  ECCV2024  |                     [link](https://arxiv.org/abs/2403.09572)                      |                             [link](https://gyhdog99.github.io/projects/ecso/)                             | 
-|     ShieldLM      |                 ShieldLM: Empowering LLMs as Aligned, Customizable and Explainable Safety Detectors                  | EMNLP2024  |                     [link](https://arxiv.org/abs/2402.16444)                      |                               [link](https://github.com/thu-coai/ShieldLM)                                | 
-|     AdaShield     |  AdaShield: Safeguarding Multimodal Large Language Models from Structure-based Attack via Adaptive Shield Prompting  |  ECCV2024  |                     [link](https://arxiv.org/abs/2403.09513)                      |                               [link](https://github.com/rain305f/AdaShield)                               |
-|     Uniguard      |       UNIGUARD: Towards Universal Safety Guardrails for Jailbreak Attacks on Multimodal Large Language Models        |  ECCV2024  |                     [link](https://arxiv.org/abs/2411.01703)                      |                       [link](https://anonymous.4open.science/r/UniGuard/README.md)                        | 
-|        DPS        |                    Defending LVLMs Against Vision Attacks Through Partial-Perception Supervision                     |  ICML2025  |                     [link](https://arxiv.org/abs/2412.12722)                      |                                 [link](https://github.com/tools-only/DPS)                                 | 
-|       CIDER       |           Cross-modality Information Check for Detecting Jailbreaking in Multimodal Large Language Models            | EMNLP2024  |                     [link](https://arxiv.org/abs/2407.21659)                      |                              [link](https://github.com/PandragonXIII/CIDER)                               | 
-| GuardReasoner-VL  |                             GuardReasoner-VL: Safeguarding VLMs via Reinforced Reasoning                             |  ICML2025  |                     [link](https://arxiv.org/abs/2505.11049)                      |                          [link](https://github.com/yueliu1999/GuardReasoner-VL)                           | 
-|   Llama-Guard-4   |                                                    Llama Guard 4                                                     | Model Card | [link](https://www.llama.com/docs/model-cards-and-prompt-formats/llama-guard-4/)  |                        [link](https://huggingface.co/meta-llama/Llama-Guard-4-12B)                        | 
-|      QGuard       |                          QGuard: Question-based Zero-shot Guard for Multi-modal LLM Safety                           |   ArXiv    |                     [link](https://arxiv.org/abs/2506.12299)                      | [link](https://github.com/taegyeong-lee/QGuard-Question-based-Zero-shot-Guard-for-Multi-modal-LLM-Safety) | 
-|    LlavaGuard     |                 LlavaGuard: An Open VLM-based Framework for Safeguarding Vision Datasets and Models                  |  ICML2025  |                     [link](https://arxiv.org/abs/2406.05113)                      |                             [link](https://github.com/ml-research/LlavaGuard)                             |
-|   Llama-Guard-3   |                                                    Llama Guard 3                                                     | Model Card | [link](https://www.llama.com/docs/model-cards-and-prompt-formats/llama-guard-3/)  |                        [link](https://huggingface.co/meta-llama/Llama-Guard-3-8B)                         | 
-|   HiddenDetect    |   HiddenDetect: Detecting Jailbreak Attacks against Multimodal Large Language Models via Monitoring Hidden States    |  ACL2025   |                     [link](https://arxiv.org/abs/2502.14744)                      |                            [link](https://github.com/leigest519/HiddenDetect)                             |
-|       CoCA        |         CoCA: Regaining Safety-awareness of Multimodal Large Language Models with Constitutional Calibration         |  COLM2024  |                     [link](https://arxiv.org/abs/2409.11365)                      |                                                    -                                                      | 
-|      VLGuard      |                 Safety Fine-Tuning at (Almost) No Cost: A Baseline for Vision Large Language Models                  |  ICML2024  |                     [link](https://arxiv.org/abs/2402.02207)                      |                                [link](https://github.com/ys-zong/VLGuard)                                 | 
-> More methods are coming soon!!

 ## 🚀 Quick Start

@@ -347,6 +306,52 @@ When adding new components, please:
   - Evaluator: Add in `evaluation.evaluators` in `general_config.yaml`, and provide parameters in `evaluation.evaluator_params`.
 4. **Run**: Execute the corresponding stage using the commands mentioned above.

+## 🗡️ Integrated Attack Methods
+|            Name            |                            Title                             |   Venue    |                         Paper                          |                             Code                             |
+| :------------------------: | :----------------------------------------------------------: | :--------: | :----------------------------------------------------: | :----------------------------------------------------------: |
+|   FigStep / FigStep-Pro    | FigStep: Jailbreaking Large Vision-Language Models via Typographic Visual Prompts | AAAI 2025  | [link](https://arxiv.org/abs/2311.05608) | [link](https://github.com/ThuCCSLab/FigStep) |
+| QR-Attack (MM-SafetyBench) | MM-SafetyBench: A Benchmark for Safety Evaluation of Multimodal Large Language Models | ECCV 2024  | [link](https://arxiv.org/abs/2311.17600) | [link](https://github.com/isXinLiu/MM-SafetyBench) |
+|            MML             | Jailbreak Large Vision-Language Models Through Multi-Modal Linkage |  ACL 2025  | [link](https://aclanthology.org/2025.acl-long.74/) | [link](https://github.com/wangyu-ovo/MML) |
+|           CS-DJ            | Distraction is All You Need for Multimodal Large Language Model Jailbreaking | CVPR 2025  | [link](https://arxiv.org/abs/2502.10794) | [link](https://github.com/TeamPigeonLab/CS-DJ) |
+|         SI-Attack          | Jailbreaking Multimodal Large Language Models via Shuffle Inconsistency | ICCV 2025  | [link](https://arxiv.org/abs/2501.04931) | [link](https://github.com/zhaoshiji123/SI-Attack) |
+|            JOOD            | Playing the Fool: Jailbreaking LLMs and Multimodal LLMs with Out-of-Distribution Strategy | CVPR 2025  | [link](https://arxiv.org/abs/2503.20823) | [link](https://github.com/naver-ai/JOOD) |
+|           HIMRD            | Heuristic-Induced Multimodal Risk Distribution (HIMRD) Jailbreak Attack |  ICCV 2025 | [link](https://arxiv.org/abs/2412.05934) | [link](https://github.com/MaTengSYSU/HIMRD-jailbreak) |
+|           HADES            | Images are Achilles’ Heel of Alignment: Exploiting Visual Vulnerabilities for Jailbreaking MLLMs | ECCV 2024 | [link](https://arxiv.org/abs/2403.09792) | [link](https://github.com/AoiDragon/HADES) |
+|            BAP             | Jailbreak Vision Language Models via Bi-Modal Adversarial Prompt (BAP) | TIFS 2025 | [link](https://arxiv.org/abs/2406.04031) | [link](https://github.com/NY1024/BAP-Jailbreak-Vision-Language-Models-via-Bi-Modal-Adversarial-Prompt) |
+|         visual_adv         | Visual Adversarial Examples Jailbreak Aligned Large Language Models | AAAI 2024  | [link](https://ojs.aaai.org/index.php/AAAI/article/view/30150) | [link](https://github.com/Unispac/Visual-Adversarial-Examples-Jailbreak-Large-Language-Models) |
+|           VisCRA           | VisCRA: A Visual Chain Reasoning Attack for Jailbreaking Multimodal Large Language Models | EMNLP 2025 | [link](https://arxiv.org/abs/2505.19684) | [link](https://github.com/DyMessi/VisCRA) |
+|            UMK             | White-box Multimodal Jailbreaks Against Large Vision-Language Models (Universal Master Key) | ACMMM 2024 | [link](https://arxiv.org/abs/2405.17894) | [link](https://github.com/roywang021/UMK) |
+|         PBI-Attack         | Prior-Guided Bimodal Interactive Black-Box Jailbreak Attack for Toxicity Maximization | EMNLP 2025 | [link](https://aclanthology.org/2025.emnlp-main.32.pdf) | [link](https://github.com/Rosy0912/PBI-Attack) |
+|      ImgJP / DeltaJP       | Jailbreaking Attack against Multimodal Large Language Models | arXiv 2024 | [link](https://arxiv.org/abs/2402.02309) | [link](https://github.com/abc03570128/Jailbreaking-Attack-against-Multimodal-Large-Language-Model) |
+|            JPS             | JPS: Jailbreak Multimodal Large Language Models with Collaborative Visual Perturbation and Textual Steering | ACMMM 2025 | [link](https://arxiv.org/abs/2508.05087) | [link](https://github.com/thu-coai/JPS) |
+
+
+
+## 🛡️ Integrated Defense Methods
+|       Name        |                                                        Title                                                         |   Venue    |                                       Paper                                       |                                                   Code                                                    | 
+|:-----------------:|:--------------------------------------------------------------------------------------------------------------------:|:----------:|:---------------------------------------------------------------------------------:|:---------------------------------------------------------------------------------------------------------:|
+|     JailGuard     |                  JailGuard: A Universal Detection Framework for Prompt-based Attacks on LLM Systems                  | TOSEM 2025 |                     [link](https://arxiv.org/abs/2312.10766)                      |                             [link](https://github.com/shiningrain/JailGuard)                              |
+|  MLLM-Protector   |                          MLLM-Protector: Ensuring MLLM's Safety without Hurting Performance                          | EMNLP 2024 |                     [link](https://arxiv.org/abs/2401.02906)                      |                            [link](https://github.com/pipilurj/MLLM-protector)                             |
+|       ECSO        |                 Eyes Closed, Safety On: Protecting Multimodal LLMs via Image-to-Text Transformation                  | ECCV 2024  |                     [link](https://arxiv.org/abs/2403.09572)                      |                             [link](https://gyhdog99.github.io/projects/ecso/)                             | 
+|     ShieldLM      |                 ShieldLM: Empowering LLMs as Aligned, Customizable and Explainable Safety Detectors                  | EMNLP 2024 |                     [link](https://arxiv.org/abs/2402.16444)                      |                               [link](https://github.com/thu-coai/ShieldLM)                                | 
+|     AdaShield     |  AdaShield: Safeguarding Multimodal Large Language Models from Structure-based Attack via Adaptive Shield Prompting  | ECCV 2024  |                     [link](https://arxiv.org/abs/2403.09513)                      |                               [link](https://github.com/rain305f/AdaShield)                               |
+|     UniGuard      |       UNIGUARD: Towards Universal Safety Guardrails for Jailbreak Attacks on Multimodal Large Language Models        | ECCV 2024  |                     [link](https://arxiv.org/abs/2411.01703)                      |                       [link](https://anonymous.4open.science/r/UniGuard/README.md)                        | 
+|        DPS        |                    Defending LVLMs Against Vision Attacks Through Partial-Perception Supervision                     | ICML 2025  |                     [link](https://arxiv.org/abs/2412.12722)                      |                                 [link](https://github.com/tools-only/DPS)                                 | 
+|       CIDER       |           Cross-modality Information Check for Detecting Jailbreaking in Multimodal Large Language Models            | EMNLP 2024 |                     [link](https://arxiv.org/abs/2407.21659)                      |                              [link](https://github.com/PandragonXIII/CIDER)                               | 
+| GuardReasoner-VL  |                             GuardReasoner-VL: Safeguarding VLMs via Reinforced Reasoning                             | ICML 2025  |                     [link](https://arxiv.org/abs/2505.11049)                      |                          [link](https://github.com/yueliu1999/GuardReasoner-VL)                           | 
+|   Llama-Guard-4   |                                                    Llama Guard 4                                                     | Model Card | [link](https://www.llama.com/docs/model-cards-and-prompt-formats/llama-guard-4/)  |                        [link](https://huggingface.co/meta-llama/Llama-Guard-4-12B)                        | 
+|      QGuard       |                          QGuard: Question-based Zero-shot Guard for Multi-modal LLM Safety                           | arXiv 2025 |                     [link](https://arxiv.org/abs/2506.12299)                      | [link](https://github.com/taegyeong-lee/QGuard-Question-based-Zero-shot-Guard-for-Multi-modal-LLM-Safety) | 
+|    LlavaGuard     |                 LlavaGuard: An Open VLM-based Framework for Safeguarding Vision Datasets and Models                  | ICML 2025  |                     [link](https://arxiv.org/abs/2406.05113)                      |                             [link](https://github.com/ml-research/LlavaGuard)                             |
+|   Llama-Guard-3   |                                                    Llama Guard 3                                                     | Model Card | [link](https://www.llama.com/docs/model-cards-and-prompt-formats/llama-guard-3/)  |                        [link](https://huggingface.co/meta-llama/Llama-Guard-3-8B)                         | 
+|   HiddenDetect    |   HiddenDetect: Detecting Jailbreak Attacks against Multimodal Large Language Models via Monitoring Hidden States    |  ACL 2025  |                     [link](https://arxiv.org/abs/2502.14744)                      |                            [link](https://github.com/leigest519/HiddenDetect)                             |
+|       CoCA        |         CoCA: Regaining Safety-awareness of Multimodal Large Language Models with Constitutional Calibration         | COLM 2024  |                     [link](https://arxiv.org/abs/2409.11365)                      |                                                    -                                                      | 
+|      VLGuard      |                 Safety Fine-Tuning at (Almost) No Cost: A Baseline for Vision Large Language Models                  | ICML 2024  |                     [link](https://arxiv.org/abs/2402.02207)                      |                                [link](https://github.com/ys-zong/VLGuard)                                 | 
+> More methods are coming soon!!
+
+
+
+
+
 ## ❓ FAQ

 - **How to re-run evaluation only?**