#!/usr/bin/env python3 """ 41.4.1 Token-Bucket Rate Limiting (Cost Control) Source: Chapter_41_Industry_Best_Practices Category: compliance """ import time import argparse import sys class TokenBucket: """ Implements a Token Bucket algorithm to strictly limit LLM usage (cost) per user. """ def __init__(self, capacity: int, fill_rate: float): self.capacity = capacity # Max tokens user can burst self.fill_rate = fill_rate # Tokens added per second self.tokens = capacity self.last_update = time.time() def consume(self, estimated_tokens: int) -> bool: now = time.time() # Refill bucket based on time passed added = (now - self.last_update) * self.fill_rate self.tokens = min(self.capacity, self.tokens + added) self.last_update = now if self.tokens >= estimated_tokens: self.tokens -= estimated_tokens return True return False # Usage # User gets 1000 tokens max, refills at 10 tokens/sec limiter = TokenBucket(capacity=1000, fill_rate=10) # User sends a massive 8000 token prompt if limiter.consume(8000): print("Request Allowed") else: print("429 Too Many Requests (Quota Exceeded)") def main(): """Command-line interface.""" parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output") args = parser.parse_args() # TODO: Add main execution logic pass if __name__ == "__main__": main()