diff --git a/prd.json b/prd.json deleted file mode 100644 index a1d2296..0000000 --- a/prd.json +++ /dev/null @@ -1,72 +0,0 @@ -{ - "title": "Agentic Security Enhancements from Research", - "description": "Integrate features and approaches from promptfoo, promptmap, and FuzzyAI research directories to improve the LLM pentesting capabilities", - "branchName": "feat/research-enhancements", - "userStories": [ - { - "id": "US-001", - "title": "Dual-LLM Evaluation for Attack Success Detection", - "description": "Integrate a controller LLM to evaluate attack success instead of relying solely on marker-based refusal detection. Inspired by Promptmap's dual-LLM architecture where a separate LLM judges if the target LLM was successfully attacked.", - "acceptanceCriteria": [ - "Create LLMRefusalClassifier that uses an LLM to evaluate if a response indicates successful attack", - "Integrate with existing RefusalClassifierPlugin system", - "Support configurable evaluation prompts", - "Add unit tests for the new classifier" - ], - "priority": 1, - "passes": true - }, - { - "id": "US-002", - "title": "YAML-based Attack Rule System", - "description": "Create a YAML-based rule system for defining attack patterns and success conditions. Inspired by Promptmap's 50+ YAML rule definitions that externalize attack logic from code.", - "acceptanceCriteria": [ - "Define YAML schema for attack rules with prompt templates and success conditions", - "Create rule loader that parses YAML files into attack configurations", - "Support custom user-defined rules", - "Add unit tests for rule loading and validation" - ], - "priority": 2, - "passes": true - }, - { - "id": "US-003", - "title": "Composable Fuzzing Chain System", - "description": "Implement a composable chain system for multi-step attacks using pipe operator syntax. Inspired by FuzzyAI's FuzzNode/FuzzChain architecture that allows chaining LLM calls.", - "acceptanceCriteria": [ - "Create FuzzNode class for individual attack steps with template variable support", - "Create FuzzChain class that composes nodes using pipe operator", - "Support template variable substitution between chain steps", - "Add unit tests for chain composition and execution" - ], - "priority": 3, - "passes": true - }, - { - "id": "US-004", - "title": "Unified LLM Provider Abstraction", - "description": "Create a unified provider abstraction layer for direct LLM integrations beyond HTTP specs. Inspired by FuzzyAI's comprehensive provider system supporting OpenAI, Anthropic, Gemini, etc.", - "acceptanceCriteria": [ - "Create BaseLLMProvider abstract class with standard interface", - "Implement OpenAI and Anthropic provider classes", - "Create provider factory for instantiation by name", - "Add unit tests for provider implementations" - ], - "priority": 4, - "passes": true - }, - { - "id": "US-005", - "title": "Enhanced Refusal Detection with Hybrid Approach", - "description": "Combine marker-based detection with statistical and LLM-based detection for more accurate refusal classification. Enhance the existing refusal detection to reduce false positives/negatives.", - "acceptanceCriteria": [ - "Add confidence scoring to refusal detection", - "Implement hybrid classifier that combines multiple detection methods", - "Support configurable detection thresholds", - "Add unit tests for hybrid detection" - ], - "priority": 5, - "passes": true - } - ] -} diff --git a/progress.txt b/progress.txt deleted file mode 100644 index b9047ef..0000000 --- a/progress.txt +++ /dev/null @@ -1,106 +0,0 @@ -## Codebase Patterns -- Use async-first patterns for all I/O operations (httpx, asyncio) -- Plugin architecture with abstract base classes for extensibility -- Use table-driven tests with inline_snapshot where possible -- Tests are organized in tests/unit/, tests/integration/, tests/system/ -- Use absolute imports, avoid relative imports except within same module -- Minimize docstrings, use quick returns, avoid abstractions without proven need - ---- - -## 2026-01-28 - Initial Setup -- Created PRD with 5 user stories based on research from promptfoo, promptmap, and FuzzyAI -- Key improvements identified: - 1. Dual-LLM evaluation (from Promptmap) - 2. YAML rule system (from Promptmap) - 3. Composable fuzzing chains (from FuzzyAI) - 4. Unified provider abstraction (from FuzzyAI) - 5. Hybrid refusal detection (combining approaches) -- **Learnings for future iterations:** - - Existing refusal detection is in agentic_security/probe_actor/refusal.py with RefusalClassifierPlugin system - - Attack data modules are in agentic_security/probe_data/modules/ - - Security utilities are in agentic_security/core/security.py ---- - -## 2026-01-28 - US-001 -- Implemented LLM-based refusal classifier (Dual-LLM evaluation) -- Files created: - - agentic_security/refusal_classifier/llm_classifier.py - - tests/unit/refusal_classifier/test_llm_classifier.py -- **Learnings for future iterations:** - - RefusalClassifierPlugin requires is_refusal(response: str) -> bool method - - LLMClient Protocol pattern works well for multiple provider support - - Use lazy initialization for API clients to avoid requiring keys at import time - - Anthropic response.content[0] can be TextBlock or ToolUseBlock, need hasattr check - - Pre-existing test failure in test_sanitize_password (regex doesn't match dict syntax) ---- - -## 2026-01-28 - US-002 -- Implemented YAML-based Attack Rule System -- Files created: - - agentic_security/attack_rules/__init__.py - - agentic_security/attack_rules/models.py (AttackRule, AttackRuleSeverity) - - agentic_security/attack_rules/loader.py (RuleLoader, validation, filtering) - - agentic_security/attack_rules/dataset.py (ProbeDataset integration) - - tests/unit/attack_rules/test_models.py - - tests/unit/attack_rules/test_loader.py - - tests/unit/attack_rules/test_dataset.py -- **Learnings for future iterations:** - - Promptmap YAML schema: name, type, severity, prompt, pass_conditions, fail_conditions, source - - Rule types from promptmap: jailbreak, harmful, hate, distraction, prompt_stealing, social_bias - - ProbeDataset is the standard format for attack prompts (dataset_name, metadata, prompts, tokens, approx_cost) - - Use yaml.safe_load for parsing YAML files - - Template variables can use {var} or {{ var }} style ---- - -## 2026-01-28 - US-003 -- Implemented Composable Fuzzing Chain System -- Files created: - - agentic_security/fuzz_chain/__init__.py - - agentic_security/fuzz_chain/chain.py (FuzzNode, FuzzChain, FuzzRunnable) - - agentic_security/fuzz_chain/provider.py (LLMProvider protocol) - - tests/unit/fuzz_chain/__init__.py - - tests/unit/fuzz_chain/test_chain.py (22 tests) -- **Learnings for future iterations:** - - FuzzyAI uses reversed() execution order but we use forward order (more intuitive) - - Template variable substitution: use str.replace() not string.Formatter for partial substitution - - Pipe operator (__or__) needs isinstance() checks, not classname comparison for mypy - - Use Any type for LLM provider param to avoid complex Protocol type issues - - Chain passes output to next node via {"input": result} convention ---- - -## 2026-01-28 - US-004 -- Implemented Unified LLM Provider Abstraction -- Files created: - - agentic_security/llm_providers/__init__.py - - agentic_security/llm_providers/base.py (BaseLLMProvider, LLMMessage, LLMResponse) - - agentic_security/llm_providers/openai_provider.py (OpenAIProvider) - - agentic_security/llm_providers/anthropic_provider.py (AnthropicProvider) - - agentic_security/llm_providers/factory.py (create_provider, get_provider_class) - - tests/unit/llm_providers/__init__.py - - tests/unit/llm_providers/test_base.py - - tests/unit/llm_providers/test_openai_provider.py - - tests/unit/llm_providers/test_anthropic_provider.py - - tests/unit/llm_providers/test_factory.py (60 tests total) -- **Learnings for future iterations:** - - FuzzyAI provider pattern: ABC base class with generate/chat + sync variants - - Lazy client initialization (_get_client, _get_async_client) for deferred API key validation - - Anthropic requires max_tokens in all requests (default 1024) - - OpenAI and Anthropic have different message formats (system in messages vs separate param) - - Use factory pattern with lazy registration to avoid circular imports - - Provider interface should support both sync and async for flexibility ---- - -## 2026-01-28 - US-005 -- Implemented Enhanced Refusal Detection with Hybrid Approach -- Files created/modified: - - agentic_security/refusal_classifier/hybrid_classifier.py (HybridRefusalClassifier, HybridResult, DetectionResult) - - agentic_security/refusal_classifier/__init__.py (added lazy import comments) - - tests/unit/refusal_classifier/test_hybrid_classifier.py (32 tests) -- **Learnings for future iterations:** - - Circular imports: refusal.py imports from refusal_classifier, so can't export llm_classifier/hybrid_classifier from __init__.py - - Use Protocol for detector interface (is_refusal(str) -> bool) to allow duck typing - - Weighted voting: confidence = refusal_weight / total_weight for refusals, 1 - ratio for non-refusals - - Table-driven tests work well for testing weighted voting edge cases - - require_unanimous mode useful for high-stakes classification where all detectors must agree ----