mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-02 03:35:09 +02:00
ac032ce020
COMPLETENESS_RE updated to match the option-prefixed form
`Completeness: A=10/10, B=7/10` documented in
scripts/resolvers/preamble/generate-ask-user-format.ts. The legacy regex
required a bare digit immediately after `Completeness: `, which Opus 4.7
correctly does not produce — the spec form names each option.
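A sketch of the option-prefixed shape the updated regex must now accept
(illustrative only; the exact pattern lives in the test code):
    /Completeness:\s*[A-Z][^=]*=\d{1,2}\/10(?:,\s*[A-Z][^=]*=\d{1,2}\/10)*/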
judgeRecommendation.commits no longer scans the entire recommendation body
for hedging keywords; it scans only the choice portion (text before the
"because" token). The because-clause is the reason and routinely contains
phrases like "the plan doesn't yet depend on Redis" — legitimate technical
language that the body-wide regex was flagging as hedging. Restricting the
check to the choice portion keeps the intent ("Either A or B because..."
flagged; "A because depends on X" accepted) without false positives.
Verified by re-reading the captured AUQs from the failing periodic run:
both Coverage tests had spec-correct `Completeness: A=10/10, B=7/10`
strings; the Kind test had a substantive recommendation whose because-clause
mentioned "depend on Redis" as part of the reasoning, not the choice.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
308 lines
14 KiB
TypeScript
/**
 * Shared LLM-as-judge helpers for eval and E2E tests.
 *
 * Provides callJudge (generic JSON-from-LLM), judge (doc quality scorer),
 * outcomeJudge (planted-bug detection scorer), judgePosture (mode-posture
 * regression scorer), and judgeRecommendation (AskUserQuestion recommendation
 * substance scorer).
 *
 * Requires: ANTHROPIC_API_KEY env var
 */

import Anthropic from '@anthropic-ai/sdk';

export interface JudgeScore {
  clarity: number; // 1-5
  completeness: number; // 1-5
  actionability: number; // 1-5
  reasoning: string;
}

export interface OutcomeJudgeResult {
  detected: string[];
  missed: string[];
  false_positives: number;
  detection_rate: number;
  evidence_quality: number;
  reasoning: string;
}

export interface PostureScore {
  axis_a: number; // 1-5 — mode-specific primary rubric axis
  axis_b: number; // 1-5 — mode-specific secondary rubric axis
  reasoning: string;
}

export type PostureMode = 'expansion' | 'forcing' | 'builder';

export interface RecommendationScore {
  /** Deterministic: a "Recommendation:" / "RECOMMENDATION:" line is present. */
  present: boolean;
  /** Deterministic: the recommendation names exactly one option (no hedging). */
  commits: boolean;
  /** Deterministic: the literal token "because " follows the choice. */
  has_because: boolean;
  /** Haiku judge, 1-5: specificity of the because-clause. See rubric in judgeRecommendation. */
  reason_substance: number;
  /** Extracted because-clause text, for diagnostics in test output. */
  reason_text: string;
  /** Judge's brief explanation. Empty when judge was skipped (no because-clause). */
  reasoning: string;
}

/**
 * Call an Anthropic model with a prompt, extract JSON response.
 * Retries once on 429 rate limit errors. Defaults to Sonnet 4.6 for
 * existing callers; pass a model id (e.g. claude-haiku-4-5-20251001)
 * for cheaper bounded judgments like judgeRecommendation.
 */
export async function callJudge<T>(prompt: string, model: string = 'claude-sonnet-4-6'): Promise<T> {
  const client = new Anthropic();

  const makeRequest = () => client.messages.create({
    model,
    max_tokens: 1024,
    messages: [{ role: 'user', content: prompt }],
  });

  let response;
  try {
    response = await makeRequest();
  } catch (err: any) {
    if (err.status === 429) {
      await new Promise(r => setTimeout(r, 1000));
      response = await makeRequest();
    } else {
      throw err;
    }
  }

  const text = response.content[0].type === 'text' ? response.content[0].text : '';
  const jsonMatch = text.match(/\{[\s\S]*\}/);
  if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`);
  return JSON.parse(jsonMatch[0]) as T;
}

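// Usage sketch (illustrative, not part of the module): callJudge is generic
// over the expected JSON shape, so each caller defines its result type locally.
// The prompt and threshold below are hypothetical.
//
//   interface SmokeScore { ok: number; reasoning: string }
//   const s = await callJudge<SmokeScore>(
//     'Respond with ONLY valid JSON: {"ok": N, "reasoning": "why"}',
//   );
//   if (s.ok < 3) throw new Error(`smoke judge failed: ${s.reasoning}`);
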
/**
 * Score documentation quality on clarity/completeness/actionability (1-5).
 */
export async function judge(section: string, content: string): Promise<JudgeScore> {
  return callJudge<JudgeScore>(`You are evaluating documentation quality for an AI coding agent's CLI tool reference.

The agent reads this documentation to learn how to use a headless browser CLI. It needs to:
1. Understand what each command does
2. Know what arguments to pass
3. Know valid values for enum-like parameters
4. Construct correct command invocations without guessing

Rate the following ${section} on three dimensions (1-5 scale):

- **clarity** (1-5): Can an agent understand what each command/flag does from the description alone?
- **completeness** (1-5): Are arguments, valid values, and important behaviors documented? Would an agent need to guess anything?
- **actionability** (1-5): Can an agent construct correct command invocations from this reference alone?

Scoring guide:
- 5: Excellent — no ambiguity, all info present
- 4: Good — minor gaps an experienced agent could infer
- 3: Adequate — some guessing required
- 2: Poor — significant info missing
- 1: Unusable — agent would fail without external help

Respond with ONLY valid JSON in this exact format:
{"clarity": N, "completeness": N, "actionability": N, "reasoning": "brief explanation"}

Here is the ${section} to evaluate:

${content}`);
}

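// Usage sketch (illustrative): section is interpolated into the judge prompt,
// so pass a short noun phrase. The file path is hypothetical and the assertion
// assumes a vitest/jest-style expect.
//
//   const doc = await fs.promises.readFile('docs/cli-reference.md', 'utf8');
//   const score = await judge('CLI reference', doc);
//   expect(score.actionability).toBeGreaterThanOrEqual(4);
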
/**
 * Evaluate a QA report against planted-bug ground truth.
 * Returns detection metrics for the planted bugs.
 */
export async function outcomeJudge(
  groundTruth: any,
  report: string,
): Promise<OutcomeJudgeResult> {
  return callJudge<OutcomeJudgeResult>(`You are evaluating a QA testing report against known ground truth bugs.

GROUND TRUTH (${groundTruth.total_bugs} planted bugs):
${JSON.stringify(groundTruth.bugs, null, 2)}

QA REPORT (generated by an AI agent):
${report}

For each planted bug, determine if the report identified it. A bug counts as
"detected" if the report describes the same defect, even if the wording differs.
Use the detection_hint keywords as guidance.

Also count false positives: issues in the report that don't correspond to any
planted bug AND aren't legitimate issues with the page.

Respond with ONLY valid JSON:
{
  "detected": ["bug-id-1", "bug-id-2"],
  "missed": ["bug-id-3"],
  "false_positives": 0,
  "detection_rate": 2,
  "evidence_quality": 4,
  "reasoning": "brief explanation"
}

Rules:
- "detected" and "missed" arrays must only contain IDs from the ground truth: ${groundTruth.bugs.map((b: any) => b.id).join(', ')}
- detection_rate = length of detected array
- evidence_quality (1-5): Do detected bugs have screenshots, repro steps, or specific element references?
  5 = excellent evidence for every bug, 1 = no evidence at all`);
}

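// Shape sketch for groundTruth, inferred from the prompt above. The parameter
// is typed `any`, so this interface is an assumption for readers, not the
// module's contract:
//
//   interface GroundTruth {
//     total_bugs: number;
//     bugs: Array<{
//       id: string;             // referenced by "detected" / "missed"
//       detection_hint: string; // keywords the judge uses as guidance
//     }>;
//   }
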
/**
 * Score mode-specific prose posture on two mode-dependent axes (1-5 each).
 *
 * Used by mode-posture regression tests to detect whether V1's Writing Style
 * rules have flattened the distinctive energy of expansion / forcing / builder
 * modes. See docs/designs/PLAN_TUNING_V1.md and the V1.1 mode-posture fix.
 *
 * The generator model is whatever the skill runs with (often Opus for
 * plan-ceo-review). The judge is always Sonnet via callJudge() for cost.
 */
export async function judgePosture(mode: PostureMode, text: string): Promise<PostureScore> {
  const rubrics: Record<PostureMode, { axis_a: string; axis_b: string; context: string }> = {
    expansion: {
      context: 'This text consists of expansion proposals emitted by /plan-ceo-review in SCOPE EXPANSION or SELECTIVE EXPANSION mode. The skill is supposed to lead with felt-experience vision, then close with concrete effort and impact.',
      axis_a: 'surface_framing (1-5): Does each proposal lead with felt-experience framing ("imagine", "when the user sees", "the moment X happens", or equivalent) BEFORE closing with concrete metrics? Penalize pure feature bullets ("Add X. Improves Y by Z%").',
      axis_b: 'decision_preservation (1-5): Does each proposal contain the elements a scope-expansion decision needs — what to build (concrete shape), effort (ideally both human and CC scales), risk or integration note? Penalize pure prose with no actionable content.',
    },
    forcing: {
      context: 'This text is the Q3 Desperate Specificity question emitted by /office-hours startup mode. The skill is supposed to force the founder to name a specific person and consequence, stacking multiple pressures.',
      axis_a: 'stacking_preserved (1-5): Does the question include at least 3 distinct sub-pressures (e.g., title? promoted? fired? up at night? OR career? day? weekend?) rather than a single neutral ask? Penalize "Who is your target user?" style collapses.',
      axis_b: 'domain_matched_consequence (1-5): Does the named consequence match the domain context in the input (B2B → career impact, consumer → daily pain, hobby/open-source → weekend project)? Penalize one-size-fits-all B2B career framing for non-B2B ideas.',
    },
    builder: {
      context: 'This text is a builder-mode response from /office-hours. The skill is supposed to riff creatively — "what if you also..." adjacent unlocks, cross-domain combinations, the "whoa" moment — not emit a structured product roadmap.',
      axis_a: 'unexpected_combinations (1-5): Does the output include at least 2 cross-domain or surprising adjacent unlocks ("what if you also...", "pipe it into X", etc.)? Penalize structured feature lists with no creative leaps.',
      axis_b: 'excitement_over_optimization (1-5): Does the output read as a creative riff (enthusiastic, opinionated, evocative) or as a PRD / product roadmap (structured, metric-driven, conservative)? Penalize PRD-voice language like "improve retention", "enable virality", "consider adding".',
    },
  };

  const r = rubrics[mode];
  return callJudge<PostureScore>(`You are evaluating prose quality for a mode-specific posture regression test.

Context: ${r.context}

Rate the following output on two dimensions (1-5 scale each):

- **axis_a** — ${r.axis_a}
- **axis_b** — ${r.axis_b}

Scoring guide:
- 5: Excellent — strong, unambiguous match for the posture
- 4: Good — matches posture with minor weakness
- 3: Adequate — partial match, noticeable flatness or structure
- 2: Poor — posture mostly flattened / collapsed
- 1: Fail — posture entirely missing, reads as the opposite mode

Respond with ONLY valid JSON in this exact format:
{"axis_a": N, "axis_b": N, "reasoning": "brief explanation naming specific phrases that drove the score"}

Here is the output to evaluate:

${text}`);
}

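// Usage sketch (illustrative; the thresholds are a hypothetical regression
// gate, not values this module defines, and the assertions assume a
// vitest/jest-style expect):
//
//   const score = await judgePosture('forcing', capturedQ3Text);
//   expect(score.axis_a).toBeGreaterThanOrEqual(4); // stacking preserved
//   expect(score.axis_b).toBeGreaterThanOrEqual(4); // consequence fits domain
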
/**
 * Score the quality of an AskUserQuestion's recommendation line.
 *
 * Layered design:
 * 1. Deterministic regex parse for present / commits / has_because. These
 *    don't need an LLM.
 * 2. Haiku 4.5 judges only the 1-5 reason_substance axis on a tight rubric
 *    scoped to the because-clause itself (with the menu as context).
 *
 * Returns reason_substance = 1 with diagnostic reasoning when the because-clause
 * is missing — no LLM call needed; substance is implicitly absent.
 *
 * Format spec: scripts/resolvers/preamble/generate-ask-user-format.ts
 *   Recommendation: <choice> because <one-line reason>
 */
export async function judgeRecommendation(askUserText: string): Promise<RecommendationScore> {
  // Deterministic checks. The format spec requires:
  //   "Recommendation: <choice> because <reason>"
  // Match case-insensitive on the leading word, allow optional markdown
  // emphasis markers (** or __) the agent sometimes adds.
  const recLine = askUserText.match(
    /^[*_]*\s*recommendation\s*[*_]*\s*:\s*(.+)$/im,
  );
  const present = !!recLine;
  const recBody = recLine?.[1]?.trim() ?? '';

  // has_because: literal "because" token in the body, per the format spec.
  const becauseMatch = recBody.match(/\bbecause\s+(.+?)$/i);
  const has_because = !!becauseMatch;
  const reason_text = becauseMatch?.[1]?.trim() ?? '';

  // commits: reject hedging language only in the CHOICE portion (before the
  // "because" token). The because-clause itself is the reason and routinely
  // contains technical phrases like "the plan doesn't yet depend on Redis"
  // that aren't hedging at all. Looking only at the choice keeps the check
  // focused: "Either A or B because..." → flagged; "A because depends on X" →
  // accepted.
  const choicePortion = becauseMatch
    ? recBody.slice(0, recBody.toLowerCase().indexOf('because')).trim()
    : recBody;
  const commits = present && !/\b(either|depends? on|depending|if .+ then|or maybe|whichever)\b/i.test(choicePortion);

  // If the because-clause is absent, the substance score is implicitly 1.
  // Skip the LLM call — there is nothing to grade.
  if (!present || !has_because || !reason_text) {
    return {
      present,
      commits,
      has_because,
      reason_substance: 1,
      reason_text,
      reasoning: present
        ? 'No "because <reason>" clause found in recommendation line — substance scored 1 by deterministic check.'
        : 'No "Recommendation:" line found in captured text — substance scored 1 by deterministic check.',
    };
  }

  // LLM judge: rate the because-clause specifically, 1-5.
  // The full askUserText is included as context so the judge can tell whether
  // the reason names a tradeoff specific to the chosen option vs an alternative,
  // but the score is about the because-clause itself, not the surrounding menu.
  const prompt = `You are scoring the quality of one specific line in an AskUserQuestion: the "Recommendation: <choice> because <reason>" line. Score the because-clause substance on a 1-5 scale.

Rubric:
- 5: Reason names a SPECIFIC TRADEOFF that distinguishes the chosen option from at least one alternative (e.g. "because hybrid ships V1 in gstack-only without blocking on cross-repo gbrain coordination", "because Postgres preserves ACID guarantees the workflow already depends on").
- 4: Reason is concrete and option-specific but does NOT explicitly compare against an alternative (e.g. "because Redis gives sub-millisecond reads under load", "because the new schema removes the JOIN we were paying for").
- 3: Reason is real but generic — could apply to many options ("because it's faster", "because it's simpler", "because it ships sooner").
- 2: Reason restates the option label or is near-tautological ("because it's the hybrid one", "because that's the recommended approach").
- 1: Reason is boilerplate / empty ("because it's better", "because it works", "because it's the right choice").

You are scoring the because-clause itself, not the surrounding pros/cons or option labels. The menu is context only.

Extracted because-clause:
${reason_text}

Full AskUserQuestion (context only — do NOT score this):
${askUserText.slice(0, 4000)}

Respond with ONLY valid JSON:
{"reason_substance": N, "reasoning": "one sentence explanation citing the specific words that drove the score"}`;

  const out = await callJudge<{ reason_substance: number; reasoning: string }>(
    prompt,
    'claude-haiku-4-5-20251001',
  );

  return {
    present,
    commits,
    has_because,
    reason_substance: out.reason_substance,
    reason_text,
    reasoning: out.reasoning,
  };
}

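// Usage sketch (illustrative; the input strings are hypothetical):
//
//   const good = await judgeRecommendation(
//     'Recommendation: Hybrid because it ships V1 without cross-repo coordination',
//   );
//   // good.present === true, good.commits === true, good.has_because === true;
//   // good.reason_substance comes from the Haiku judge.
//
//   const hedged = await judgeRecommendation('Recommendation: Either A or B');
//   // hedged.commits === false (hedging in the choice portion) and
//   // hedged.reason_substance === 1 (no because-clause, so the judge is skipped).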