import os
import random
import time

import requests
from datasets import load_dataset
from dotenv import load_dotenv

# Load environment variables (e.g. OPENROUTER_API_KEY) from a .env file
load_dotenv()

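# A .env file in the working directory supplies the API key used below. A
# minimal sketch (the value is a placeholder, not a real credential):
#
#   OPENROUTER_API_KEY=sk-or-v1-...
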
def failure_analysis_prompt(problem_statement: str, hints_text: str, working_trajectory: str, failed_trajectory: str, code_agent_code: str) -> str:
    """
    Generate a failure analysis prompt for comparing successful and failed trajectories.

    Creates a structured prompt that asks an AI model to analyze why one trajectory
    succeeded while another failed, and what the failed trajectory could have done
    differently to succeed.
    """
    return f"""Here's a software engineering issue: {problem_statement}.

Here are the hints: {hints_text}.

Here's the trajectory that fixed the issue: {working_trajectory}.

Here's the trajectory that failed: {failed_trajectory}.

Here's the prompts/tools code for the software engineering agent: {code_agent_code}.

Why did the first SWE-agent resolve the issue while the second failed?

What could the second agent have done differently to resolve the issue?

What improvements could be made to the prompt/tool usage?

Answer in the following format:

# Why the second agent failed
...

# What could the second agent have done differently
...

# What improvements could be made to the prompt/tool usage
..."""

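# Example of building an analysis prompt (the issue, trajectory, and agent-code
# strings here are hypothetical placeholders, not real SWE-bench data):
#
#   prompt = failure_analysis_prompt(
#       problem_statement="TypeError when calling Foo.bar() with no arguments",
#       hints_text="See the linked upstream discussion",
#       working_trajectory="<serialized successful agent run>",
#       failed_trajectory="<serialized failed agent run>",
#       code_agent_code="<agent prompt/tool source>",
#   )
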
def openrouter_gemini_client(prompt: str, max_retries: int = 10, base_delay: float = 1.0) -> str:
    """
    Send a prompt to the OpenRouter API using a Gemini model for analysis.

    Makes API calls to the OpenRouter service using Google's Gemini 2.5 Pro model
    with exponential backoff retry logic. Handles authentication, rate limiting,
    and error recovery for robust API communication.

    Args:
        prompt (str): The analysis prompt to send to the model
        max_retries (int, optional): Maximum number of retry attempts. Defaults to 10.
        base_delay (float, optional): Base delay in seconds for exponential backoff. Defaults to 1.0.

    Returns:
        str: The model's response content as a string

    Raises:
        ValueError: If the OPENROUTER_API_KEY environment variable is not set
        Exception: If the API request fails after all retry attempts or returns an invalid response

    Note:
        Requires the OPENROUTER_API_KEY environment variable to be set with a valid API key.
        Uses exponential backoff with jitter: the delay for attempt n is
        base_delay * 2**n, randomly reduced by up to half (roughly 1s, 2s, 4s,
        8s, 16s, etc. at the upper bound).
    """
    api_key = os.getenv('OPENROUTER_API_KEY')
    if not api_key:
        raise ValueError("OPENROUTER_API_KEY environment variable not found")

    url = "https://openrouter.ai/api/v1/chat/completions"

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    data = {
        "model": "google/gemini-2.5-pro",
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ],
        "max_tokens": 200000,  # Increased limit for comprehensive analysis
        "temperature": 0.7  # Moderate temperature balancing consistency and detail
    }

    for attempt in range(max_retries):
        try:
            print(f"ATTEMPT: API attempt {attempt + 1}/{max_retries}...")
            response = requests.post(url, headers=headers, json=data, timeout=30)
            response.raise_for_status()

            result = response.json()

            if 'choices' not in result or not result['choices']:
                raise Exception("Invalid API response: missing or empty choices")

            content = result['choices'][0]['message']['content']

            if not content or content.strip() == "":
                print(f"WARNING: Empty content returned from API on attempt {attempt + 1}/{max_retries}")
                if attempt < max_retries - 1:
                    # Exponential backoff with jitter: the random divisor in [1, 2]
                    # shrinks the delay by up to half to avoid synchronized retries
                    delay = base_delay * (2 ** attempt) / (2 ** random.uniform(0, 1))
                    print(f"WAIT: Waiting {delay:.2f} seconds before retry...")
                    time.sleep(delay)
                    continue
                else:
                    raise Exception(f"Empty content returned from API after {max_retries} attempts")

            print(f"SUCCESS: API request successful on attempt {attempt + 1}")
            return content.strip()

        except requests.exceptions.RequestException as e:
            print(f"FAILED: Request failed on attempt {attempt + 1}/{max_retries}: {e}")
            if attempt < max_retries - 1:
                # Exponential backoff with jitter (same schedule as above)
                delay = base_delay * (2 ** attempt) / (2 ** random.uniform(0, 1))
                print(f"WAIT: Waiting {delay:.2f} seconds before retry...")
                time.sleep(delay)
                continue
            else:
                raise Exception(f"FAILED: OpenRouter API request failed after {max_retries} attempts: {e}")

        except (KeyError, IndexError) as e:
            raise Exception(f"FAILED: Unexpected response format from OpenRouter API: {e}")

    raise Exception(f"FAILED: All {max_retries} attempts failed")

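# Backoff sketch (illustrative only, not part of the client): with the default
# base_delay of 1.0, the jittered delay for attempt n lands in
# [2**n / 2, 2**n] seconds, since the 2**random.uniform(0, 1) divisor ranges
# over [1, 2]:
#
#   for attempt in range(4):
#       upper = 1.0 * (2 ** attempt)
#       print(f"attempt {attempt}: {upper / 2:.1f}s - {upper:.1f}s")
#
# attempt 0: 0.5s - 1.0s, attempt 1: 1.0s - 2.0s, attempt 2: 2.0s - 4.0s, ...
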
def get_swe_bench_problem_and_hints_for_split(instance_id: str, split_name: str) -> tuple[str, str]:
    """
    Retrieve the problem statement and hints for a SWE-bench instance in a specific split.

    Loads the appropriate SWE-bench dataset and searches for the specified instance ID
    to extract the problem statement and hints text. This data is used for generating
    analysis prompts and understanding the context of the software engineering problem.

    Args:
        instance_id (str): The unique identifier for the SWE-bench instance
            (e.g., 'scikit-learn__scikit-learn-10908')
        split_name (str): The split name: "Verified", "Lite", "Test", or "Multimodal"
            ("bash-only" is not yet mapped to a Hugging Face dataset)

    Returns:
        tuple[str, str]: A tuple containing (problem_statement, hints_text)

    Raises:
        ValueError: If the split has no dataset mapping or the specified
            instance_id is not found in the dataset

    Note:
        This function loads the full SWE-bench dataset for the specified split, which
        may take some time on first execution due to dataset download and caching.
    """
    dataset_mapping = {
        "Verified": "princeton-nlp/SWE-bench_Verified",
        "Lite": "SWE-bench/SWE-bench_Lite",
        "Test": "SWE-bench/SWE-bench",
        "Multimodal": "SWE-bench/SWE-bench_Multimodal",
        # Note: bash-only dataset not yet available on Hugging Face
    }

    dataset_name = dataset_mapping.get(split_name)
    if not dataset_name:
        raise ValueError(f"No dataset mapping found for split '{split_name}'")

    ds = load_dataset(dataset_name, split="test")

    # Linear scan for the matching instance
    target_row = None
    for row in ds:
        if row['instance_id'] == instance_id:
            target_row = row
            break

    if target_row is None:
        raise ValueError(f"Instance ID '{instance_id}' not found in the {split_name} dataset")

    problem_statement = target_row['problem_statement']
    hints_text = target_row['hints_text']
    return problem_statement, hints_text
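

# End-to-end sketch (the trajectory strings are hypothetical; requires network
# access, the Hugging Face datasets cache, and OPENROUTER_API_KEY in the
# environment):
#
#   problem, hints = get_swe_bench_problem_and_hints_for_split(
#       "scikit-learn__scikit-learn-10908", "Verified"
#   )
#   prompt = failure_analysis_prompt(
#       problem, hints,
#       working_trajectory="<serialized successful run>",
#       failed_trajectory="<serialized failed run>",
#       code_agent_code="<agent prompt/tool source>",
#   )
#   analysis = openrouter_gemini_client(prompt)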