Add context length protection in access_runcontext_history

- Add a loop that reduces history_depth while the JSON-serialized output exceeds max_total_chars
- Prevent context overflow when accessing run context history
- Print the reduction info and return a `_depth_reduced` field in the result
2026-07-16 09:07:31 +02:00 · 2025-12-10 03:23:53 +08:00
parent a5e5185c10
commit 75eb41f50b
4 changed files with 62 additions and 34 deletions
@@ -41,10 +41,10 @@ def parse_args():
    # Model configurations (non-attack parameters)
    parser.add_argument("--attacker-model", type=str, default="deepseek-chat",
                       help="Attacker model name (default: deepseek-chat)")
-    parser.add_argument("--judge-model", type=str, default="gpt-4o-mini",
-                       help="Judge model name (default: gpt-4o-mini)")
+    parser.add_argument("--judge-model", type=str, default="gpt-4o",
+                       help="Judge model name")
    parser.add_argument("--target-models", nargs="+",
-                       default=["qwen3-max", "mistralai/mistral-large-2512"],
+                       default=["gpt-4o"],
                       help="List of target model names to test against")

    # API configuration (infrastructure parameters)
@@ -72,7 +72,7 @@ def parse_args():
                       help="Directory for async logs (default: ./async_logs)")

    # Dataset option (non-attack parameter)
-    parser.add_argument("--dataset", type=str, default="harmbench",
+    parser.add_argument("--dataset", type=str, default="harmbench_test",
                       help="Dataset name to use (default: harmbench)")

    # Experiment control options
@@ -487,7 +487,7 @@ async def main():


    # Save combined results summary
-    timestamp_summary = datetime.strftime("%Y%m%d_%H%M%S")
+    timestamp_summary = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
    results_file = f"{args.results_dir}/multi_model_summary_{timestamp_summary}.json"

    with open(results_file, "w", encoding="utf-8") as f:
@@ -11,7 +11,6 @@ import aiofiles

 from agents import Agent, function_tool, RunContextWrapper, OpenAIChatCompletionsModel, trace

-from jailbreak_toolbox.attacks.blackbox.implementations.DrAttack import attack

 from .master_coordinator_agent import MasterCoordinatorAgent
 from .reconnaissance_agent import ReconnaissanceAgent
@@ -509,32 +509,37 @@ Focus on specific defensive techniques and why this particular attack failed - p
@function_tool
 def access_runcontext_history(
    ctx: RunContextWrapper,
-    history_depth: int = 4
+    history_depth: int = 4,
+    max_total_chars: int = 450000
 ) -> dict:
    """
    Access both session data and evolution session history from the context
-    
+
    Args:
        ctx: Runtime context wrapper containing session and evolution context
-        history_depth: Maximum number of items to return for session and evolution data (default: 4) if the depth is too long, then the context will exceed its context length. This will result in poor performance.
-    
+        history_depth: Maximum number of items to return for session and evolution data (default: 4)
+        max_total_chars: Maximum total characters allowed in output (default: 450000)
+
    Returns:
        Complete history including session data and evolution session data
    """
-    try:
+    import json
+
+    def build_result(depth):
+        """Build result dict with given depth."""
        # Extract session data from context
        session_data = getattr(ctx.context, 'session_data', {})
        created_tools = getattr(ctx.context, 'created_tools', [])
        ai_generated_tools = getattr(ctx.context, 'ai_generated_tools', [])
        exploitation_conversations = getattr(ctx.context, 'exploitation_conversations', [])
-        
+
        # Get evolution context from the unified context
        evolution_context = None
        if hasattr(ctx.context, 'evolution_context'):
            evolution_context = ctx.context.evolution_context
        elif hasattr(ctx.context, 'tools_by_id'):
            evolution_context = ctx.context
-        
+
        # Get evolution session data
        evolution_session_data = {}
        if evolution_context:
@@ -546,31 +551,32 @@ def access_runcontext_history(
                "session_id": getattr(evolution_context, 'session_id', 'unknown'),
                "created_at": getattr(evolution_context, 'created_at', None)
            }
-        
-        # Apply history_depth to get last several items
-        def get_last_items(data, depth):
+
+        # Apply depth to get last several items
+        def get_last_items(data, d):
            if isinstance(data, list):
-                return data[-depth:] if data else []
-            else:
-                return data
-        
-        limited_session_data = get_last_items(session_data, history_depth)
-        limited_created_tools = get_last_items(created_tools, history_depth)
-        limited_ai_generated_tools = get_last_items(ai_generated_tools, history_depth)
-        limited_conversations = get_last_items(exploitation_conversations, history_depth)
-        
+                return data[-d:] if data else []
+            elif isinstance(data, dict):
+                items = list(data.items())[-d:]
+                return dict(items)
+            return data
+
+        limited_session_data = get_last_items(session_data, depth)
+        limited_created_tools = get_last_items(created_tools, depth)
+        limited_ai_generated_tools = get_last_items(ai_generated_tools, depth)
+        limited_conversations = get_last_items(exploitation_conversations, depth)
+
        limited_evolution_data = {}
        if evolution_session_data:
            limited_evolution_data = {
-                "tools_by_id": get_last_items(evolution_session_data["tools_by_id"], history_depth),
-                "tools_by_name": get_last_items(evolution_session_data["tools_by_name"], history_depth),
-                "performance_analyses": get_last_items(evolution_session_data["performance_analyses"], history_depth),
-                "evolution_history": get_last_items(evolution_session_data["evolution_history"], history_depth),
+                "tools_by_id": get_last_items(evolution_session_data["tools_by_id"], depth),
+                "tools_by_name": get_last_items(evolution_session_data["tools_by_name"], depth),
+                "performance_analyses": get_last_items(evolution_session_data["performance_analyses"], depth),
+                "evolution_history": get_last_items(evolution_session_data["evolution_history"], depth),
                "session_id": evolution_session_data["session_id"],
                "created_at": evolution_session_data["created_at"]
            }
-        
-        # Return combined data
+
        return {
            "history_accessed": True,
            "session_data": limited_session_data,
@@ -578,11 +584,34 @@ def access_runcontext_history(
            "ai_generated_tools": limited_ai_generated_tools,
            "exploitation_conversations": limited_conversations,
            "evolution_session": limited_evolution_data,
-            "history_depth_applied": history_depth,
+            "history_depth_applied": depth,
            "context_type": type(ctx.context).__name__,
            "accessed_at": datetime.now().isoformat()
        }
-        
+
+    try:
+        original_depth = history_depth
+        current_depth = history_depth
+
+        # Loop: reduce depth until output fits in max_total_chars
+        while current_depth >= 1:
+            result = build_result(current_depth)
+            result_size = len(json.dumps(result, default=str))
+
+            if result_size <= max_total_chars:
+                if current_depth < original_depth:
+                    result["_depth_reduced"] = f"history_depth reduced from {original_depth} to {current_depth} because output exceeded {max_total_chars} chars"
+                    print(f"[access_runcontext_history] history_depth reduced from {original_depth} to {current_depth} (size: {result_size}/{max_total_chars})")
+                return result
+
+            current_depth -= 1
+
+        # If even depth=1 exceeds, return minimal result with warning
+        result = build_result(1)
+        result["_depth_reduced"] = f"history_depth reduced to 1, but output may still exceed limit. Original depth: {original_depth}"
+        print(f"[access_runcontext_history] WARNING: Even depth=1 may exceed context limit")
+        return result
+
    except Exception as e:
        return {
            "history_accessed": False,
@@ -2,7 +2,6 @@ from ..base_model import BaseModel
 from ...core.registry import model_registry
 import openai
 import time
-import torch
 import base64
 from io import BytesIO
 from typing import Any, Optional, List, Dict, Union
@@ -265,6 +264,7 @@ class OpenAIModel(BaseModel):
            A list of floats representing the embedding vector
        """
        try:
+            import torch
            clean_text = text_input.replace("\n", " ")

            # Use specified model or default embedding model