mirror of
https://github.com/Shiva108/ai-llm-red-team-handbook.git
synced 2026-02-12 22:52:48 +00:00
- Extracted all code examples from handbook chapters - Organized into 15 attack categories - Created shared utilities (api_client, validators, logging, constants) - Added workflow orchestration scripts - Implemented install.sh for easy setup - Renamed all scripts to descriptive functional names - Added comprehensive README and documentation - Included pytest test suite and configuration
37 lines
888 B
Python
37 lines
888 B
Python
#!/usr/bin/env python3
|
|
"""
|
|
Practical Detection Example
|
|
|
|
Source: Chapter_35_Post-Exploitation_in_AI_Systems
|
|
Category: post_exploitation
|
|
"""
|
|
|
|
import argparse
|
|
import sys
|
|
|
|
#!/usr/bin/env python3
|
|
"""
|
|
Detection Logic for Semantic Drift
|
|
"""
|
|
class DriftDetector:
    """Detects sudden changes in model persona.

    Detection is a plain keyword heuristic: a response counts as "drifted"
    when it contains any forbidden keyword as a case-insensitive substring.
    """

    # Keywords used when the caller does not supply a list of their own.
    DEFAULT_FORBIDDEN_TOPICS = ("hacked", "pwned", "bitcoin", "download")

    def __init__(self, forbidden_topics=None):
        """Create a detector.

        Args:
            forbidden_topics: Optional iterable of keyword strings to flag.
                When omitted (``None``), ``DEFAULT_FORBIDDEN_TOPICS`` is
                used. Keywords are lower-cased once here; empty strings
                are dropped because ``"" in text`` is always true and
                would flag every response.
        """
        # Stored for reference; not read by any method of this class.
        self.baseline_topic = "corporate policy"

        if forbidden_topics is None:
            forbidden_topics = self.DEFAULT_FORBIDDEN_TOPICS
        self.forbidden_topics = tuple(
            topic.lower() for topic in forbidden_topics if topic
        )

    def check_response(self, response: str) -> bool:
        """Return whether *response* mentions any forbidden keyword.

        Simple keyword heuristic for drift.

        Args:
            response: Model output text to inspect.

        Returns:
            ``True`` when drift is detected (malicious persona), i.e. some
            forbidden keyword occurs in the lower-cased response;
            ``False`` otherwise, including for an empty response.
        """
        # Lower-case once rather than once per keyword.
        lowered = response.lower()
        return any(topic in lowered for topic in self.forbidden_topics)
|
|
|
|
if __name__ == "__main__":
    # Demo run: feed one sample response containing a flagged keyword
    # through the detector and print the verdict.
    sample_response = 'Please download this malware'
    drift_found = DriftDetector().check_response(sample_response)
    print(f"Drift Detected: {drift_found}")
|