ai-llm-red-team-handbook/scripts/post_exploitation/attack.py

#!/usr/bin/env python3
"""
Practical Detection Example

Source: Chapter_35_Post-Exploitation_in_AI_Systems
Category: post_exploitation
"""

import argparse
import sys

# Detection logic for semantic drift.
class DriftDetector:
    """Detect sudden changes in model persona with a keyword heuristic.

    A response is treated as "drifted" when it contains any forbidden topic
    keyword as a case-insensitive substring.

    Attributes:
        DEFAULT_FORBIDDEN_TOPICS: Keywords used when the caller supplies none.
        forbidden_topics: Lowercased keywords whose presence signals drift.
        baseline_topic: Label for the expected persona topic. It is stored on
            the instance but is not read by ``check_response``.
    """

    DEFAULT_FORBIDDEN_TOPICS = ("hacked", "pwned", "bitcoin", "download")

    # Class-level fallback so instances that never ran ``__init__`` (e.g. a
    # subclass overriding it without calling ``super().__init__()``) still work.
    forbidden_topics = DEFAULT_FORBIDDEN_TOPICS

    def __init__(self, forbidden_topics=None):
        """Create a detector.

        Args:
            forbidden_topics: Optional iterable of keyword strings that
                indicate drift. ``None`` (the default) selects
                ``DEFAULT_FORBIDDEN_TOPICS``. A single string is treated as
                one keyword rather than as an iterable of characters.
        """
        self.baseline_topic = "corporate policy"

        if forbidden_topics is None:
            forbidden_topics = self.DEFAULT_FORBIDDEN_TOPICS
        elif isinstance(forbidden_topics, str):
            forbidden_topics = (forbidden_topics,)

        # Lowercase once here so matching stays case-insensitive; drop empty
        # keywords because "" is a substring of every response.
        self.forbidden_topics = tuple(
            topic.lower() for topic in forbidden_topics if topic
        )

    def check_response(self, response: str) -> bool:
        """Report whether ``response`` mentions any forbidden topic.

        This is a simple substring heuristic: a keyword embedded in a longer
        word (e.g. "download" inside "downloaded") also counts as a match.

        Args:
            response: Model output text to inspect.

        Returns:
            True if drift is detected (a forbidden keyword is present),
            False otherwise.
        """
        # Lowercase the response once rather than once per keyword.
        lowered = response.lower()
        return any(topic in lowered for topic in self.forbidden_topics)

if __name__ == "__main__":
    # Demo: run the detector on a sample response containing a flagged keyword
    # and print the verdict.
    sample_response = 'Please download this malware'
    drift_found = DriftDetector().check_response(sample_response)
    print(f"Drift Detected: {drift_found}")