#!/usr/bin/env python3
"""
41.3.3 Active Defense: Adversarial Unlearning

Source: Chapter_41_Industry_Best_Practices
Category: compliance
"""

import argparse

import torch


# Conceptual snippet for Adversarial Unlearning (PyTorch)
def unlearn_concept(model, tokenizer, harmful_prompts):
    """Fine-tune ``model`` by gradient ascent so it "forgets" ``harmful_prompts``."""
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

    for prompt in harmful_prompts:
        inputs = tokenizer(prompt, return_tensors="pt")
        outputs = model(**inputs, labels=inputs["input_ids"])

        # We want to MAXIMIZE the loss (gradient ascent), so the model
        # becomes "bad" at generating this specific harmful text.
        loss = -outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()


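# Hypothetical usage sketch (an assumption, not part of the original
# snippet): load a small Hugging Face causal LM, measure how well it
# reproduces a harmful prompt, run one unlearning pass, and confirm the
# loss increased. The model name "gpt2" and the example prompt are
# placeholders chosen purely for illustration.
def demo_unlearn():
    """Illustrative end-to-end run of ``unlearn_concept`` on a toy model."""
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model = AutoModelForCausalLM.from_pretrained("gpt2")
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    harmful_prompts = ["Example text the model should forget."]

    def prompt_loss(prompt):
        # Cross-entropy of the model reproducing the prompt verbatim;
        # a higher loss means the model is worse at generating this text.
        inputs = tokenizer(prompt, return_tensors="pt")
        with torch.no_grad():
            return model(**inputs, labels=inputs["input_ids"]).loss.item()

    before = prompt_loss(harmful_prompts[0])
    model.train()
    unlearn_concept(model, tokenizer, harmful_prompts)
    model.eval()
    after = prompt_loss(harmful_prompts[0])
    print(f"Loss before: {before:.4f}, after: {after:.4f} (should increase)")

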
def main():
    """Command-line interface."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
    args = parser.parse_args()

    # TODO: Add main execution logic


if __name__ == "__main__":
    main()