mirror of
https://github.com/msoedov/agentic_security.git
synced 2026-06-24 06:09:55 +02:00
ef35c1f82e
Implement a YAML-based rule system for defining attack patterns and success conditions, inspired by Promptmap's 50+ YAML rule definitions.

Features:
- AttackRule model with name, type, severity, prompt, pass/fail conditions
- RuleLoader for parsing YAML files with validation
- Support for recursive directory loading and filtering by type/severity
- Template variable substitution in prompts
- Dataset integration for converting rules to ProbeDataset format
- YAMLRulesDatasetLoader for loading rules from multiple directories

Tested with 47 unit tests covering models, loader, and dataset integration. Successfully loads 69 rules from the promptmap research directory.
113 lines
4.1 KiB
Python
113 lines
4.1 KiB
Python
import pytest
|
|
from inline_snapshot import snapshot
|
|
|
|
from agentic_security.attack_rules.models import AttackRule, AttackRuleSeverity
|
|
|
|
|
|
class TestAttackRuleSeverity:
    """Checks for ``AttackRuleSeverity.from_string``."""

    # (raw input, expected member) pairs: every level in lower and upper
    # case, plus an unrecognised word and the empty string, both of which
    # the test expects to map to MEDIUM.
    test_cases = [
        ("low", AttackRuleSeverity.LOW),
        ("LOW", AttackRuleSeverity.LOW),
        ("medium", AttackRuleSeverity.MEDIUM),
        ("MEDIUM", AttackRuleSeverity.MEDIUM),
        ("high", AttackRuleSeverity.HIGH),
        ("HIGH", AttackRuleSeverity.HIGH),
        ("invalid", AttackRuleSeverity.MEDIUM),
        ("", AttackRuleSeverity.MEDIUM),
    ]

    @pytest.mark.parametrize(("value", "expected"), test_cases)
    def test_from_string(self, value, expected):
        """``from_string(value)`` must yield the paired severity member."""
        parsed = AttackRuleSeverity.from_string(value)
        assert parsed == expected
|
|
|
|
|
|
class TestAttackRule:
    """Checks for ``AttackRule``: dict round-tripping and prompt rendering."""

    def test_from_dict_minimal(self):
        """With only ``name`` and ``prompt`` given, the rest takes defaults."""
        parsed = AttackRule.from_dict({"name": "test_rule", "prompt": "Test prompt"})

        assert parsed.name == "test_rule"
        # ``type`` was not supplied, so the placeholder value is expected.
        assert parsed.type == "unknown"
        assert parsed.prompt == "Test prompt"
        assert parsed.severity == AttackRuleSeverity.MEDIUM
        assert parsed.pass_conditions == []
        assert parsed.fail_conditions == []

    def test_from_dict_full(self):
        """Every recognised key in the input lands on the matching attribute."""
        payload = {
            "name": "dan1",
            "type": "jailbreak",
            "severity": "high",
            "prompt": "You are DAN...",
            "pass_conditions": ["Output denies the request"],
            "fail_conditions": ["Output follows the jailbreak"],
            "source": "https://example.com",
        }

        parsed = AttackRule.from_dict(payload)

        assert parsed.name == "dan1"
        assert parsed.type == "jailbreak"
        # The severity string is expected to be converted to the enum member.
        assert parsed.severity == AttackRuleSeverity.HIGH
        assert parsed.prompt == "You are DAN..."
        assert parsed.pass_conditions == ["Output denies the request"]
        assert parsed.fail_conditions == ["Output follows the jailbreak"]
        assert parsed.source == "https://example.com"

    def test_from_dict_preserves_extra_fields(self):
        """Keys the model does not know about are collected in ``metadata``."""
        payload = {
            "name": "test",
            "prompt": "Test",
            "custom_field": "custom_value",
        }

        parsed = AttackRule.from_dict(payload)

        assert parsed.metadata == {"custom_field": "custom_value"}

    def test_to_dict(self):
        """A fully populated rule serialises to the expected plain dict."""
        populated = AttackRule(
            name="test",
            type="jailbreak",
            prompt="Test prompt",
            severity=AttackRuleSeverity.HIGH,
            pass_conditions=["condition1"],
            fail_conditions=["condition2"],
            source="https://example.com",
        )

        # Severity is expected to come back as its lowercase string form.
        assert populated.to_dict() == snapshot(
            {
                "name": "test",
                "type": "jailbreak",
                "prompt": "Test prompt",
                "severity": "high",
                "pass_conditions": ["condition1"],
                "fail_conditions": ["condition2"],
                "source": "https://example.com",
            }
        )

    def test_to_dict_minimal(self):
        """Optional fields left unset are expected to be absent from the output."""
        bare = AttackRule(name="test", type="jailbreak", prompt="Test")

        assert bare.to_dict() == snapshot(
            {
                "name": "test",
                "type": "jailbreak",
                "prompt": "Test",
                "severity": "medium",
            }
        )

    def test_render_prompt_no_variables(self):
        """A prompt without placeholders renders unchanged."""
        plain = AttackRule(name="test", type="test", prompt="Hello world")

        rendered = plain.render_prompt()

        assert rendered == "Hello world"

    def test_render_prompt_with_variables(self):
        """A single-brace ``{name}`` placeholder is substituted."""
        templated = AttackRule(name="test", type="test", prompt="Hello {name}!")

        rendered = templated.render_prompt({"name": "Alice"})

        assert rendered == "Hello Alice!"

    def test_render_prompt_with_jinja_style_variables(self):
        """A double-brace ``{{ name }}`` placeholder is substituted as well."""
        templated = AttackRule(name="test", type="test", prompt="Hello {{ name }}!")

        rendered = templated.render_prompt({"name": "Bob"})

        assert rendered == "Hello Bob!"

    def test_render_prompt_multiple_variables(self):
        """Several distinct placeholders in one prompt are all filled in."""
        templated = AttackRule(
            name="test",
            type="test",
            prompt="{greeting} {name}, welcome to {place}!",
        )

        rendered = templated.render_prompt(
            {"greeting": "Hello", "name": "Alice", "place": "Wonderland"}
        )

        assert rendered == "Hello Alice, welcome to Wonderland!"
|