diff --git a/test/helpers/llm-judge.ts b/test/helpers/llm-judge.ts index 6ce4ca67..5647cb31 100644 --- a/test/helpers/llm-judge.ts +++ b/test/helpers/llm-judge.ts @@ -2,7 +2,9 @@ * Shared LLM-as-judge helpers for eval and E2E tests. * * Provides callJudge (generic JSON-from-LLM), judge (doc quality scorer), - * and outcomeJudge (planted-bug detection scorer). + * outcomeJudge (planted-bug detection scorer), judgePosture (mode-posture + * regression scorer), and judgeRecommendation (AskUserQuestion recommendation + * substance scorer). * * Requires: ANTHROPIC_API_KEY env var */ @@ -33,15 +35,32 @@ export interface PostureScore { export type PostureMode = 'expansion' | 'forcing' | 'builder'; +export interface RecommendationScore { + /** Deterministic: a "Recommendation:" / "RECOMMENDATION:" line is present. */ + present: boolean; + /** Deterministic: the recommendation names exactly one option (no hedging). */ + commits: boolean; + /** Deterministic: the literal token "because " follows the choice. */ + has_because: boolean; + /** Haiku judge, 1-5: specificity of the because-clause. See rubric in judgeRecommendation. */ + reason_substance: number; + /** Extracted because-clause text, for diagnostics in test output. */ + reason_text: string; + /** Judge's brief explanation. Empty when judge was skipped (no because-clause). */ + reasoning: string; +} + /** - * Call claude-sonnet-4-6 with a prompt, extract JSON response. - * Retries once on 429 rate limit errors. + * Call an Anthropic model with a prompt, extract JSON response. + * Retries once on 429 rate limit errors. Defaults to Sonnet 4.6 for + * existing callers; pass a model id (e.g. claude-haiku-4-5-20251001) + * for cheaper bounded judgments like judgeRecommendation. 
 */ -export async function callJudge<T>(prompt: string): Promise<T> { +export async function callJudge<T>(prompt: string, model: string = 'claude-sonnet-4-6'): Promise<T> { const client = new Anthropic(); const makeRequest = () => client.messages.create({ - model: 'claude-sonnet-4-6', + model, max_tokens: 1024, messages: [{ role: 'user', content: prompt }], }); @@ -190,3 +209,93 @@ Here is the output to evaluate: ${text}`); } + +/** + * Score the quality of an AskUserQuestion's recommendation line. + * + * Layered design: + * 1. Deterministic regex parse for present / commits / has_because. These + * don't need an LLM. + * 2. Haiku 4.5 judges only the 1-5 reason_substance axis on a tight rubric + * scoped to the because-clause itself (with the menu as context). + * + * Returns reason_substance = 1 with diagnostic reasoning when the because-clause + * is missing — no LLM call needed; substance is implicitly absent. + * + * Format spec: scripts/resolvers/preamble/generate-ask-user-format.ts + * Recommendation: <choice> because <reason> + */ +export async function judgeRecommendation(askUserText: string): Promise<RecommendationScore> { + // Deterministic checks. The format spec requires: + // "Recommendation: <choice> because <reason>" + // Match case-insensitive on the leading word, allow optional markdown + // emphasis markers (** or __) the agent sometimes adds. + const recLine = askUserText.match( + /^[*_]*\s*recommendation\s*[*_]*\s*:\s*(.+)$/im, + ); + const present = !!recLine; + const recBody = recLine?.[1]?.trim() ?? ''; + + // commits: reject obvious hedging language. The format-spec self-check + // requires the recommendation to name ONE choice; "either", "depending on", + // "if X then" all signal the model bailed on the commitment. + const commits = present && !/\b(either|depends? on|depending|if .+ then|or maybe|whichever)\b/i.test(recBody); + + // has_because: literal "because" token in the body, per the format spec. 
+ const becauseMatch = recBody.match(/\bbecause\s+(.+?)$/i); + const has_because = !!becauseMatch; + const reason_text = becauseMatch?.[1]?.trim() ?? ''; + + // If the because-clause is absent, the substance score is implicitly 1. + // Skip the LLM call — there is nothing to grade. + if (!present || !has_because || !reason_text) { + return { + present, + commits, + has_because, + reason_substance: 1, + reason_text, + reasoning: present + ? 'No "because " clause found in recommendation line — substance scored 1 by deterministic check.' + : 'No "Recommendation:" line found in captured text — substance scored 1 by deterministic check.', + }; + } + + // LLM judge: rate the because-clause specifically, 1-5. + // The full askUserText is included as context so the judge can tell whether + // the reason names a tradeoff specific to the chosen option vs an alternative, + // but the score is about the because-clause itself, not the surrounding menu. + const prompt = `You are scoring the quality of one specific line in an AskUserQuestion: the "Recommendation: <choice> because <reason>" line. Score the because-clause substance on a 1-5 scale. + +Rubric: +- 5: Reason names a SPECIFIC TRADEOFF that distinguishes the chosen option from at least one alternative (e.g. "because hybrid ships V1 in gstack-only without blocking on cross-repo gbrain coordination", "because Postgres preserves ACID guarantees the workflow already depends on"). +- 4: Reason is concrete and option-specific but does NOT explicitly compare against an alternative (e.g. "because Redis gives sub-millisecond reads under load", "because the new schema removes the JOIN we were paying for"). +- 3: Reason is real but generic — could apply to many options ("because it's faster", "because it's simpler", "because it ships sooner"). +- 2: Reason restates the option label or is near-tautological ("because it's the hybrid one", "because that's the recommended approach"). 
+- 1: Reason is boilerplate / empty ("because it's better", "because it works", "because it's the right choice"). + +You are scoring the because-clause itself, not the surrounding pros/cons or option labels. The menu is context only. + +Extracted because-clause: +${reason_text} + +Full AskUserQuestion (context only — do NOT score this): +${askUserText.slice(0, 4000)} + +Respond with ONLY valid JSON: +{"reason_substance": N, "reasoning": "one sentence explanation citing the specific words that drove the score"}`; + + const out = await callJudge<{ reason_substance: number; reasoning: string }>( + prompt, + 'claude-haiku-4-5-20251001', + ); + + return { + present, + commits, + has_because, + reason_substance: out.reason_substance, + reason_text, + reasoning: out.reasoning, + }; +}