mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-02 03:35:09 +02:00
76803d789a
Adds comprehensive eval infrastructure: - Tier 1 (free): 13 new static tests — cross-skill path consistency, QA structure validation, greptile format, planted-bug fixture validation - Tier 2 (Agent SDK E2E): /qa quick, /review with pre-built git repo, 3 planted-bug outcome evals (static, SPA, checkout — each with 5 bugs) - Tier 3 (LLM judge): QA workflow quality, health rubric clarity, cross-skill consistency, baseline score pinning New fixtures: 3 HTML pages with 15 total planted bugs, ground truth JSON, review-eval-vuln.rb, eval-baselines.json. Shared llm-judge.ts helper (DRY). Unified EVALS=1 flag replaces SKILL_E2E + ANTHROPIC_API_KEY checks. `bun run test:evals` runs everything that costs money (~$4/run). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
131 lines
4.2 KiB
TypeScript
131 lines
4.2 KiB
TypeScript
/**
 * Shared LLM-as-judge helpers for eval and E2E tests.
 *
 * Provides callJudge (generic JSON-from-LLM), judge (doc quality scorer),
 * and outcomeJudge (planted-bug detection scorer).
 *
 * Requires: ANTHROPIC_API_KEY env var
 */
|
|
|
|
import Anthropic from '@anthropic-ai/sdk';
|
|
|
|
export interface JudgeScore {
|
|
clarity: number; // 1-5
|
|
completeness: number; // 1-5
|
|
actionability: number; // 1-5
|
|
reasoning: string;
|
|
}
|
|
|
|
export interface OutcomeJudgeResult {
|
|
detected: string[];
|
|
missed: string[];
|
|
false_positives: number;
|
|
detection_rate: number;
|
|
evidence_quality: number;
|
|
reasoning: string;
|
|
}
|
|
|
|
/**
|
|
* Call claude-sonnet-4-6 with a prompt, extract JSON response.
|
|
* Retries once on 429 rate limit errors.
|
|
*/
|
|
export async function callJudge<T>(prompt: string): Promise<T> {
|
|
const client = new Anthropic();
|
|
|
|
const makeRequest = () => client.messages.create({
|
|
model: 'claude-sonnet-4-6',
|
|
max_tokens: 1024,
|
|
messages: [{ role: 'user', content: prompt }],
|
|
});
|
|
|
|
let response;
|
|
try {
|
|
response = await makeRequest();
|
|
} catch (err: any) {
|
|
if (err.status === 429) {
|
|
await new Promise(r => setTimeout(r, 1000));
|
|
response = await makeRequest();
|
|
} else {
|
|
throw err;
|
|
}
|
|
}
|
|
|
|
const text = response.content[0].type === 'text' ? response.content[0].text : '';
|
|
const jsonMatch = text.match(/\{[\s\S]*\}/);
|
|
if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`);
|
|
return JSON.parse(jsonMatch[0]) as T;
|
|
}
|
|
|
|
/**
|
|
* Score documentation quality on clarity/completeness/actionability (1-5).
|
|
*/
|
|
export async function judge(section: string, content: string): Promise<JudgeScore> {
|
|
return callJudge<JudgeScore>(`You are evaluating documentation quality for an AI coding agent's CLI tool reference.
|
|
|
|
The agent reads this documentation to learn how to use a headless browser CLI. It needs to:
|
|
1. Understand what each command does
|
|
2. Know what arguments to pass
|
|
3. Know valid values for enum-like parameters
|
|
4. Construct correct command invocations without guessing
|
|
|
|
Rate the following ${section} on three dimensions (1-5 scale):
|
|
|
|
- **clarity** (1-5): Can an agent understand what each command/flag does from the description alone?
|
|
- **completeness** (1-5): Are arguments, valid values, and important behaviors documented? Would an agent need to guess anything?
|
|
- **actionability** (1-5): Can an agent construct correct command invocations from this reference alone?
|
|
|
|
Scoring guide:
|
|
- 5: Excellent — no ambiguity, all info present
|
|
- 4: Good — minor gaps an experienced agent could infer
|
|
- 3: Adequate — some guessing required
|
|
- 2: Poor — significant info missing
|
|
- 1: Unusable — agent would fail without external help
|
|
|
|
Respond with ONLY valid JSON in this exact format:
|
|
{"clarity": N, "completeness": N, "actionability": N, "reasoning": "brief explanation"}
|
|
|
|
Here is the ${section} to evaluate:
|
|
|
|
${content}`);
|
|
}
|
|
|
|
/**
|
|
* Evaluate a QA report against planted-bug ground truth.
|
|
* Returns detection metrics for the planted bugs.
|
|
*/
|
|
export async function outcomeJudge(
|
|
groundTruth: any,
|
|
report: string,
|
|
): Promise<OutcomeJudgeResult> {
|
|
return callJudge<OutcomeJudgeResult>(`You are evaluating a QA testing report against known ground truth bugs.
|
|
|
|
GROUND TRUTH (${groundTruth.total_bugs} planted bugs):
|
|
${JSON.stringify(groundTruth.bugs, null, 2)}
|
|
|
|
QA REPORT (generated by an AI agent):
|
|
${report}
|
|
|
|
For each planted bug, determine if the report identified it. A bug counts as
|
|
"detected" if the report describes the same defect, even if the wording differs.
|
|
Use the detection_hint keywords as guidance.
|
|
|
|
Also count false positives: issues in the report that don't correspond to any
|
|
planted bug AND aren't legitimate issues with the page.
|
|
|
|
Respond with ONLY valid JSON:
|
|
{
|
|
"detected": ["bug-id-1", "bug-id-2"],
|
|
"missed": ["bug-id-3"],
|
|
"false_positives": 0,
|
|
"detection_rate": 2,
|
|
"evidence_quality": 4,
|
|
"reasoning": "brief explanation"
|
|
}
|
|
|
|
Rules:
|
|
- "detected" and "missed" arrays must only contain IDs from the ground truth: ${groundTruth.bugs.map((b: any) => b.id).join(', ')}
|
|
- detection_rate = length of detected array
|
|
- evidence_quality (1-5): Do detected bugs have screenshots, repro steps, or specific element references?
|
|
5 = excellent evidence for every bug, 1 = no evidence at all`);
|
|
}
|