gstack/test/helpers/llm-judge.ts
Garry Tan 59752fc510 feat: wire eval-cache + eval-tier into LLM judge, pin E2E model
callJudge/judge now return {result, meta} with SHA-based caching
(~$0.18/run savings when SKILL.md unchanged) and dynamic model
selection via EVAL_JUDGE_TIER env var. E2E tests pass --model from
EVAL_TIER to claude -p. outcomeJudge retains simple return type.
All 8 LLM eval test sites updated with real costs and costs[].

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-15 16:47:35 -05:00

/**
 * Shared LLM-as-judge helpers for eval and E2E tests.
 *
 * Provides callJudge (generic JSON-from-LLM with cache + tier support),
 * judge (doc quality scorer), and outcomeJudge (planted-bug detection scorer).
 *
 * Requires: ANTHROPIC_API_KEY env var (not needed when the cache hits)
 *
 * Env vars:
 *   EVAL_JUDGE_TIER — model tier for judge calls (fast/standard/full, default: standard)
 *   EVAL_CACHE=0    — bypass the cache, always re-run judge calls
 */
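
/*
 * Example usage from a test (illustrative sketch only; `skillMd` and the
 * vitest-style `expect` assertion are assumptions, not part of this module):
 *
 *   const { result, meta } = await judge('SKILL.md CLI reference', skillMd);
 *   expect(result.actionability).toBeGreaterThanOrEqual(4);
 *   console.log(meta.cached ? 'cache hit (free)' : `judged with ${meta.model}`);
 */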
import Anthropic from '@anthropic-ai/sdk';
import { computeCacheKey, cacheRead, cacheWrite } from '../../lib/eval-cache';
import { resolveJudgeTier, tierToModel } from '../../lib/eval-tier';

export interface JudgeScore {
  clarity: number; // 1-5
  completeness: number; // 1-5
  actionability: number; // 1-5
  reasoning: string;
}

export interface OutcomeJudgeResult {
  detected: string[]; // ground-truth bug IDs the report identified
  missed: string[]; // ground-truth bug IDs the report did not identify
  false_positives: number;
  detection_rate: number; // = detected.length (see prompt rules)
  evidence_quality: number; // 1-5
  reasoning: string;
}

export interface JudgeMeta {
  model: string;
  input_tokens: number;
  output_tokens: number;
  cached: boolean; // true when the result came from eval-cache (no API call made)
}
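
/*
 * Sketch of how callers might fold JudgeMeta into a per-run costs[] entry,
 * skipping cache hits (the `CostEntry` shape and `costs` array are
 * illustrative assumptions; they are not exported by this module):
 *
 *   interface CostEntry { model: string; input_tokens: number; output_tokens: number }
 *   const costs: CostEntry[] = [];
 *   const track = (meta: JudgeMeta) => {
 *     if (!meta.cached) {
 *       costs.push({ model: meta.model, input_tokens: meta.input_tokens, output_tokens: meta.output_tokens });
 *     }
 *   };
 */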

/**
 * Call the judge model with a prompt, extract JSON response.
 * Uses eval-cache for SHA-based caching and eval-tier for model selection.
 * Retries once on 429 rate limit errors.
 */
export async function callJudge<T>(prompt: string): Promise<{ result: T; meta: JudgeMeta }> {
  const model = tierToModel(resolveJudgeTier());

  // Check cache (keyed by model + prompt content)
  const cacheKey = computeCacheKey([], `${model}:${prompt}`);
  const cached = cacheRead('llm-judge', cacheKey);
  if (cached !== null) {
    return {
      result: cached as T,
      meta: { model, input_tokens: 0, output_tokens: 0, cached: true },
    };
  }

  const client = new Anthropic();
  const makeRequest = () => client.messages.create({
    model,
    max_tokens: 1024,
    messages: [{ role: 'user', content: prompt }],
  });

  let response;
  try {
    response = await makeRequest();
  } catch (err: any) {
    if (err.status === 429) {
      await new Promise(r => setTimeout(r, 1000));
      response = await makeRequest();
    } else {
      throw err;
    }
  }

  const text = response.content[0].type === 'text' ? response.content[0].text : '';
  const jsonMatch = text.match(/\{[\s\S]*\}/);
  if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`);
  const result = JSON.parse(jsonMatch[0]) as T;

  // Write to cache
  cacheWrite('llm-judge', cacheKey, result, { model });

  const meta: JudgeMeta = {
    model,
    input_tokens: (response.usage as any)?.input_tokens || 0,
    output_tokens: (response.usage as any)?.output_tokens || 0,
    cached: false,
  };
  return { result, meta };
}
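
/*
 * Example of calling callJudge directly with a custom result shape
 * (the `PassFail` interface, the prompt text, and `output` are illustrative
 * assumptions, not used elsewhere in this repo):
 *
 *   interface PassFail { pass: boolean; reasoning: string }
 *   const { result, meta } = await callJudge<PassFail>(
 *     'Respond with ONLY valid JSON {"pass": true|false, "reasoning": "..."}. ' +
 *     'Does the following output mention a timeout? ' + output,
 *   );
 *   // result.pass is parsed from the first JSON object in the model's reply;
 *   // meta.cached is true when eval-cache already had an entry for this model+prompt.
 */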

/**
 * Score documentation quality on clarity/completeness/actionability (1-5).
 */
export async function judge(section: string, content: string): Promise<{ result: JudgeScore; meta: JudgeMeta }> {
  return callJudge<JudgeScore>(`You are evaluating documentation quality for an AI coding agent's CLI tool reference.

The agent reads this documentation to learn how to use a headless browser CLI. It needs to:
1. Understand what each command does
2. Know what arguments to pass
3. Know valid values for enum-like parameters
4. Construct correct command invocations without guessing

Rate the following ${section} on three dimensions (1-5 scale):
- **clarity** (1-5): Can an agent understand what each command/flag does from the description alone?
- **completeness** (1-5): Are arguments, valid values, and important behaviors documented? Would an agent need to guess anything?
- **actionability** (1-5): Can an agent construct correct command invocations from this reference alone?

Scoring guide:
- 5: Excellent — no ambiguity, all info present
- 4: Good — minor gaps an experienced agent could infer
- 3: Adequate — some guessing required
- 2: Poor — significant info missing
- 1: Unusable — agent would fail without external help

Respond with ONLY valid JSON in this exact format:
{"clarity": N, "completeness": N, "actionability": N, "reasoning": "brief explanation"}

Here is the ${section} to evaluate:
${content}`);
}

/**
 * Evaluate a QA report against planted-bug ground truth.
 * Returns detection metrics for the planted bugs.
 *
 * Note: outcomeJudge returns just the result (not meta) for backward compat
 * with E2E test callers. Cache still works internally.
 */
export async function outcomeJudge(
  groundTruth: any,
  report: string,
): Promise<OutcomeJudgeResult> {
  const { result } = await callJudge<OutcomeJudgeResult>(`You are evaluating a QA testing report against known ground truth bugs.

GROUND TRUTH (${groundTruth.total_bugs} planted bugs):
${JSON.stringify(groundTruth.bugs, null, 2)}

QA REPORT (generated by an AI agent):
${report}

For each planted bug, determine if the report identified it. A bug counts as
"detected" if the report describes the same defect, even if the wording differs.
Use the detection_hint keywords as guidance.

Also count false positives: issues in the report that don't correspond to any
planted bug AND aren't legitimate issues with the page.

Respond with ONLY valid JSON:
{
  "detected": ["bug-id-1", "bug-id-2"],
  "missed": ["bug-id-3"],
  "false_positives": 0,
  "detection_rate": 2,
  "evidence_quality": 4,
  "reasoning": "brief explanation"
}

Rules:
- "detected" and "missed" arrays must only contain IDs from the ground truth: ${groundTruth.bugs.map((b: any) => b.id).join(', ')}
- detection_rate = length of detected array
- evidence_quality (1-5): Do detected bugs have screenshots, repro steps, or specific element references?
  5 = excellent evidence for every bug, 1 = no evidence at all`);

  return result;
}
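
/*
 * Example usage (illustrative; the ground-truth object mirrors the fields the
 * prompt reads (total_bugs, bugs[].id, bugs[].detection_hint), but the literal
 * values and `reportMarkdown` are made-up assumptions):
 *
 *   const groundTruth = {
 *     total_bugs: 2,
 *     bugs: [
 *       { id: 'missing-alt-text', detection_hint: 'alt attribute' },
 *       { id: 'broken-submit', detection_hint: 'submit button' },
 *     ],
 *   };
 *   const outcome = await outcomeJudge(groundTruth, reportMarkdown);
 *   // outcome.detection_rate counts how many planted bugs the report caught.
 */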