mirror of
https://github.com/Shiva108/ai-llm-red-team-handbook.git
synced 2026-02-12 14:42:46 +00:00
- Extracted all code examples from handbook chapters - Organized into 15 attack categories - Created shared utilities (api_client, validators, logging, constants) - Added workflow orchestration scripts - Implemented install.sh for easy setup - Renamed all scripts to descriptive functional names - Added comprehensive README and documentation - Included pytest test suite and configuration
43 lines
1.0 KiB
Python
43 lines
1.0 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
16.4.4 Token-Level Attacks
|
|
|
|
Source: Chapter_16_Jailbreaks_and_Bypass_Techniques
|
|
Category: jailbreak
|
|
"""
|
|
|
|
import argparse
|
|
import sys
|
|
|
|
def generate_adversarial_suffix(model, harmful_request):
|
|
"""
|
|
Generate suffix that causes model to comply
|
|
Based on GCG (Greedy Coordinate Gradient) attack
|
|
"""
|
|
|
|
# Start with random tokens
|
|
suffix = generate_random_tokens(20)
|
|
|
|
# Optimize suffix to maximize probability of compliance
|
|
for iteration in range(500):
|
|
gradients = compute_gradients(model, harmful_request + suffix)
|
|
suffix = update_tokens(suffix, gradients)
|
|
|
|
response = model.generate(harmful_request + suffix)
|
|
if is_compliant(response):
|
|
return suffix
|
|
|
|
return suffix
|
|
|
|
|
|
def main():
|
|
"""Command-line interface."""
|
|
parser = argparse.ArgumentParser(description=__doc__)
|
|
parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
|
|
args = parser.parse_args()
|
|
|
|
# TODO: Add main execution logic
|
|
pass
|
|
|
|
if __name__ == "__main__":
|
|
main() |