import os
import random
import time

import requests
from datasets import load_dataset
from dotenv import load_dotenv

# Load environment variables (e.g. OPENROUTER_API_KEY) from a .env file
load_dotenv()

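# A .env file in the working directory supplies the API key used below. A
# minimal sketch (the value is a placeholder, not a real credential):
#
#   OPENROUTER_API_KEY=sk-or-v1-...
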
def failure_analysis_prompt(problem_statement: str, hints_text: str, working_trajectory: str, failed_trajectory: str, code_agent_code: str) -> str:
    """
    Generate a failure analysis prompt for comparing successful and failed trajectories.

    Creates a structured prompt that asks an AI model to analyze why one trajectory
    succeeded while another failed, and what the failed trajectory could have done
    differently to succeed.
    """
    return f"""Here's a software engineering issue: {problem_statement}.

Here are the hints: {hints_text}.

Here's the trajectory that fixed the issue: {working_trajectory}.

Here's the trajectory that failed: {failed_trajectory}.

Here's the prompts/tools code for the software engineering agent: {code_agent_code}.

Why did the first SWE-agent resolve the issue while the second failed?

What could the second agent have done differently to resolve the issue?

What improvements could be made to the prompt/tool usage?

Answer in the following format:

# Why the second agent failed
...

# What could the second agent have done differently
...

# What improvements could be made to the prompt/tool usage
..."""

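# Example of building an analysis prompt (the issue, trajectory, and agent-code
# strings here are hypothetical placeholders, not real SWE-bench data):
#
#   prompt = failure_analysis_prompt(
#       problem_statement="TypeError when calling Foo.bar() with no arguments",
#       hints_text="See the linked upstream discussion",
#       working_trajectory="<serialized successful agent run>",
#       failed_trajectory="<serialized failed agent run>",
#       code_agent_code="<agent prompt/tool source>",
#   )
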
def openrouter_gemini_client(prompt: str, max_retries: int = 10, base_delay: float = 1.0) -> str:
    """
    Send a prompt to the OpenRouter API using a Gemini model for analysis.

    Makes API calls to the OpenRouter service using Google's Gemini 2.5 Pro model
    with exponential backoff retry logic. Handles authentication, rate limiting,
    and error recovery for robust API communication.

    Args:
        prompt (str): The analysis prompt to send to the model
        max_retries (int, optional): Maximum number of retry attempts. Defaults to 10.
        base_delay (float, optional): Base delay in seconds for exponential backoff. Defaults to 1.0.

    Returns:
        str: The model's response content as a string

    Raises:
        ValueError: If the OPENROUTER_API_KEY environment variable is not set
        Exception: If the API request fails after all retry attempts or returns an invalid response

    Note:
        Requires the OPENROUTER_API_KEY environment variable to be set with a valid API key.
        Uses exponential backoff with jitter: the delay for attempt n is
        base_delay * 2**n, randomly reduced by up to half (roughly 1s, 2s, 4s,
        8s, 16s, etc. at the upper bound).
    """
    api_key = os.getenv('OPENROUTER_API_KEY')
    if not api_key:
        raise ValueError("OPENROUTER_API_KEY environment variable not found")

    url = "https://openrouter.ai/api/v1/chat/completions"

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    data = {
        "model": "google/gemini-2.5-pro",
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ],
        "max_tokens": 200000,  # Increased limit for comprehensive analysis
        "temperature": 0.7  # Moderate temperature balancing consistency and detail
    }

    for attempt in range(max_retries):
        try:
            print(f"ATTEMPT: API attempt {attempt + 1}/{max_retries}...")
            response = requests.post(url, headers=headers, json=data, timeout=30)
            response.raise_for_status()

            result = response.json()

            if 'choices' not in result or not result['choices']:
                raise Exception("Invalid API response: missing or empty choices")

            content = result['choices'][0]['message']['content']

            if not content or content.strip() == "":
                print(f"WARNING: Empty content returned from API on attempt {attempt + 1}/{max_retries}")
                if attempt < max_retries - 1:
                    # Exponential backoff with jitter: the random divisor in [1, 2]
                    # shrinks the delay by up to half to avoid synchronized retries
                    delay = base_delay * (2 ** attempt) / (2 ** random.uniform(0, 1))
                    print(f"WAIT: Waiting {delay:.2f} seconds before retry...")
                    time.sleep(delay)
                    continue
                else:
                    raise Exception(f"Empty content returned from API after {max_retries} attempts")

            print(f"SUCCESS: API request successful on attempt {attempt + 1}")
            return content.strip()

        except requests.exceptions.RequestException as e:
            print(f"FAILED: Request failed on attempt {attempt + 1}/{max_retries}: {e}")
            if attempt < max_retries - 1:
                # Exponential backoff with jitter (same schedule as above)
                delay = base_delay * (2 ** attempt) / (2 ** random.uniform(0, 1))
                print(f"WAIT: Waiting {delay:.2f} seconds before retry...")
                time.sleep(delay)
                continue
            else:
                raise Exception(f"FAILED: OpenRouter API request failed after {max_retries} attempts: {e}")

        except (KeyError, IndexError) as e:
            raise Exception(f"FAILED: Unexpected response format from OpenRouter API: {e}")

    raise Exception(f"FAILED: All {max_retries} attempts failed")

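# Backoff sketch (illustrative only, not part of the client): with the default
# base_delay of 1.0, the jittered delay for attempt n lands in
# [2**n / 2, 2**n] seconds, since the 2**random.uniform(0, 1) divisor ranges
# over [1, 2]:
#
#   for attempt in range(4):
#       upper = 1.0 * (2 ** attempt)
#       print(f"attempt {attempt}: {upper / 2:.1f}s - {upper:.1f}s")
#
# attempt 0: 0.5s - 1.0s, attempt 1: 1.0s - 2.0s, attempt 2: 2.0s - 4.0s, ...
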
def get_swe_bench_problem_and_hints_for_split(instance_id: str, split_name: str) -> tuple[str, str]:
    """
    Retrieve the problem statement and hints for a SWE-bench instance in a specific split.

    Loads the appropriate SWE-bench dataset and searches for the specified instance ID
    to extract the problem statement and hints text. This data is used for generating
    analysis prompts and understanding the context of the software engineering problem.

    Args:
        instance_id (str): The unique identifier for the SWE-bench instance
            (e.g., 'scikit-learn__scikit-learn-10908')
        split_name (str): The split name: "Verified", "Lite", "Test", or "Multimodal"
            ("bash-only" is not yet mapped to a Hugging Face dataset)

    Returns:
        tuple[str, str]: A tuple containing (problem_statement, hints_text)

    Raises:
        ValueError: If the split has no dataset mapping or the specified
            instance_id is not found in the dataset

    Note:
        This function loads the full SWE-bench dataset for the specified split, which
        may take some time on first execution due to dataset download and caching.
    """
    dataset_mapping = {
        "Verified": "princeton-nlp/SWE-bench_Verified",
        "Lite": "SWE-bench/SWE-bench_Lite",
        "Test": "SWE-bench/SWE-bench",
        "Multimodal": "SWE-bench/SWE-bench_Multimodal",
        # Note: bash-only dataset not yet available on Hugging Face
    }

    dataset_name = dataset_mapping.get(split_name)
    if not dataset_name:
        raise ValueError(f"No dataset mapping found for split '{split_name}'")

    ds = load_dataset(dataset_name, split="test")

    # Linear scan for the matching instance
    target_row = None
    for row in ds:
        if row['instance_id'] == instance_id:
            target_row = row
            break

    if target_row is None:
        raise ValueError(f"Instance ID '{instance_id}' not found in the {split_name} dataset")

    problem_statement = target_row['problem_statement']
    hints_text = target_row['hints_text']
    return problem_statement, hints_text
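

# End-to-end sketch (the trajectory strings are hypothetical; requires network
# access, the Hugging Face datasets cache, and OPENROUTER_API_KEY in the
# environment):
#
#   problem, hints = get_swe_bench_problem_and_hints_for_split(
#       "scikit-learn__scikit-learn-10908", "Verified"
#   )
#   prompt = failure_analysis_prompt(
#       problem, hints,
#       working_trajectory="<serialized successful run>",
#       failed_trajectory="<serialized failed run>",
#       code_agent_code="<agent prompt/tool source>",
#   )
#   analysis = openrouter_gemini_client(prompt)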