mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-01 19:25:10 +02:00
feat: 3-tier eval suite with planted-bug outcome testing (EVALS=1)
Adds comprehensive eval infrastructure: - Tier 1 (free): 13 new static tests — cross-skill path consistency, QA structure validation, greptile format, planted-bug fixture validation - Tier 2 (Agent SDK E2E): /qa quick, /review with pre-built git repo, 3 planted-bug outcome evals (static, SPA, checkout — each with 5 bugs) - Tier 3 (LLM judge): QA workflow quality, health rubric clarity, cross-skill consistency, baseline score pinning New fixtures: 3 HTML pages with 15 total planted bugs, ground truth JSON, review-eval-vuln.rb, eval-baselines.json. Shared llm-judge.ts helper (DRY). Unified EVALS=1 flag replaces SKILL_E2E + ANTHROPIC_API_KEY checks. `bun run test:evals` runs everything that costs money (~$4/run). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Vendored
+7
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"command_reference": { "clarity": 4, "completeness": 4, "actionability": 4 },
|
||||
"snapshot_flags": { "clarity": 4, "completeness": 4, "actionability": 4 },
|
||||
"browse_skill": { "clarity": 4, "completeness": 4, "actionability": 4 },
|
||||
"qa_workflow": { "clarity": 4, "completeness": 4, "actionability": 4 },
|
||||
"qa_health_rubric": { "clarity": 4, "completeness": 4, "actionability": 4 }
|
||||
}
|
||||
@@ -0,0 +1,43 @@
|
||||
{
|
||||
"fixture": "qa-eval-checkout.html",
|
||||
"bugs": [
|
||||
{
|
||||
"id": "broken-email-regex",
|
||||
"category": "functional",
|
||||
"severity": "high",
|
||||
"description": "Email validation accepts 'user@' as valid — regex pattern [^@]+@[^@] is missing domain requirement",
|
||||
"detection_hint": "email|regex|validation|accepts|invalid|user@|pattern"
|
||||
},
|
||||
{
|
||||
"id": "nan-total",
|
||||
"category": "functional",
|
||||
"severity": "high",
|
||||
"description": "Clearing the quantity field shows 'Total: $NaN' — parseInt on empty string returns NaN with no fallback",
|
||||
"detection_hint": "NaN|total|quantity|empty|price|calculation|clear"
|
||||
},
|
||||
{
|
||||
"id": "cc-field-overflow",
|
||||
"category": "visual",
|
||||
"severity": "medium",
|
||||
"description": "Credit card input has no maxlength attribute — entering >20 characters causes text to overflow the container",
|
||||
"detection_hint": "credit card|maxlength|overflow|cc|input|long|container"
|
||||
},
|
||||
{
|
||||
"id": "missing-required-zip",
|
||||
"category": "functional",
|
||||
"severity": "medium",
|
||||
"description": "Zip code field has no 'required' attribute — form can be submitted without a zip code",
|
||||
"detection_hint": "zip|required|missing|form|submit|shipping|postal"
|
||||
},
|
||||
{
|
||||
"id": "stripe-not-defined",
|
||||
"category": "console",
|
||||
"severity": "high",
|
||||
"description": "Form submit triggers 'Uncaught ReferenceError: stripe is not defined' — payment SDK not loaded",
|
||||
"detection_hint": "stripe|ReferenceError|not defined|console|error|submit|payment"
|
||||
}
|
||||
],
|
||||
"total_bugs": 5,
|
||||
"minimum_detection": 3,
|
||||
"max_false_positives": 2
|
||||
}
|
||||
+43
@@ -0,0 +1,43 @@
|
||||
{
|
||||
"fixture": "qa-eval.html",
|
||||
"bugs": [
|
||||
{
|
||||
"id": "broken-link",
|
||||
"category": "functional",
|
||||
"severity": "medium",
|
||||
"description": "Navigation link 'Resources' points to /nonexistent-404-page which returns 404",
|
||||
"detection_hint": "link|404|broken|dead|nonexistent|Resources"
|
||||
},
|
||||
{
|
||||
"id": "disabled-submit",
|
||||
"category": "functional",
|
||||
"severity": "high",
|
||||
"description": "Contact form submit button has 'disabled' attribute permanently — form can never be submitted",
|
||||
"detection_hint": "disabled|submit|button|form|cannot submit|contact"
|
||||
},
|
||||
{
|
||||
"id": "content-overflow",
|
||||
"category": "visual",
|
||||
"severity": "medium",
|
||||
"description": "Statistics text is clipped by overflow:hidden container — content wider than 400px container",
|
||||
"detection_hint": "overflow|clipped|truncated|hidden|text cut|statistics"
|
||||
},
|
||||
{
|
||||
"id": "missing-alt",
|
||||
"category": "accessibility",
|
||||
"severity": "medium",
|
||||
"description": "Logo image (<img src='/logo.png'>) has no alt attribute",
|
||||
"detection_hint": "alt|accessibility|image|a11y|missing alt|logo"
|
||||
},
|
||||
{
|
||||
"id": "console-error",
|
||||
"category": "console",
|
||||
"severity": "high",
|
||||
"description": "TypeError on page load: Cannot read properties of undefined (reading 'map')",
|
||||
"detection_hint": "console|error|TypeError|undefined|map"
|
||||
}
|
||||
],
|
||||
"total_bugs": 5,
|
||||
"minimum_detection": 3,
|
||||
"max_false_positives": 2
|
||||
}
|
||||
+43
@@ -0,0 +1,43 @@
|
||||
{
|
||||
"fixture": "qa-eval-spa.html",
|
||||
"bugs": [
|
||||
{
|
||||
"id": "broken-route",
|
||||
"category": "functional",
|
||||
"severity": "high",
|
||||
"description": "Products nav link points to #/prodcts (typo) instead of #/products — shows 'Page not found'",
|
||||
"detection_hint": "route|prodcts|typo|products|not found|broken link|navigation"
|
||||
},
|
||||
{
|
||||
"id": "stale-cart-state",
|
||||
"category": "functional",
|
||||
"severity": "medium",
|
||||
"description": "Cart count persists across route changes — never resets when navigating away from products",
|
||||
"detection_hint": "cart|count|state|persist|reset|stale|navigation"
|
||||
},
|
||||
{
|
||||
"id": "async-fetch-error",
|
||||
"category": "functional",
|
||||
"severity": "high",
|
||||
"description": "Product list briefly loads then shows 'Error: Failed to fetch products from API' after 1 second",
|
||||
"detection_hint": "error|fetch|products|API|loading|failed|async"
|
||||
},
|
||||
{
|
||||
"id": "missing-aria-current",
|
||||
"category": "accessibility",
|
||||
"severity": "medium",
|
||||
"description": "Navigation links have no aria-current attribute to indicate the active route",
|
||||
"detection_hint": "aria|current|active|navigation|accessibility|a11y"
|
||||
},
|
||||
{
|
||||
"id": "console-warn-leak",
|
||||
"category": "console",
|
||||
"severity": "medium",
|
||||
"description": "console.warn fires on every route change: 'Possible memory leak detected: 11 event listeners'",
|
||||
"detection_hint": "console|warn|memory leak|listener|event|warning"
|
||||
}
|
||||
],
|
||||
"total_bugs": 5,
|
||||
"minimum_detection": 3,
|
||||
"max_false_positives": 2
|
||||
}
|
||||
Vendored
+14
@@ -0,0 +1,14 @@
|
||||
# Review-eval fixture: the review E2E suite commits this file onto a feature
# branch so that /review has known defects to report. The flaws below are
# planted on purpose — do not "fix" them.
class UserController < ApplicationController
  # Looks a user up by the raw `id` request parameter and renders it as JSON.
  def show
    # SQL injection — interpolating user input directly into query
    @user = User.where("id = #{params[:id]}").first
    render json: @user
  end

  # Sets the user's role to 'admin' and answers with an empty 200.
  def promote
    # Bypasses ActiveRecord validations — update_column skips callbacks + validation
    @user = User.find(params[:id])
    @user.update_column(:role, 'admin')
    head :ok
  end
end
|
||||
@@ -0,0 +1,130 @@
|
||||
/**
|
||||
* Shared LLM-as-judge helpers for eval and E2E tests.
|
||||
*
|
||||
* Provides callJudge (generic JSON-from-LLM), judge (doc quality scorer),
|
||||
* and outcomeJudge (planted-bug detection scorer).
|
||||
*
|
||||
* Requires: ANTHROPIC_API_KEY env var
|
||||
*/
|
||||
|
||||
import Anthropic from '@anthropic-ai/sdk';
|
||||
|
||||
/**
 * Documentation-quality verdict returned by `judge`.
 * The three numeric fields use the 1-5 scoring guide embedded in the judge prompt
 * (5 = excellent, 1 = unusable).
 */
export interface JudgeScore {
  clarity: number; // 1-5
  completeness: number; // 1-5
  actionability: number; // 1-5
  // Brief free-text explanation supplied by the judge model.
  reasoning: string;
}
|
||||
|
||||
/**
 * Planted-bug scoring returned by `outcomeJudge`.
 */
export interface OutcomeJudgeResult {
  // Ground-truth bug ids the judge considers identified by the QA report.
  detected: string[];
  // Ground-truth bug ids the judge considers not identified.
  missed: string[];
  // Count of reported issues that match no planted bug and are not legitimate page issues.
  false_positives: number;
  // NOTE: despite the name this is a count (length of `detected`), not a ratio.
  detection_rate: number;
  // 1-5: 5 = excellent evidence for every detected bug, 1 = no evidence at all.
  evidence_quality: number;
  // Brief free-text explanation supplied by the judge model.
  reasoning: string;
}
|
||||
|
||||
/**
 * Send a single-turn prompt to claude-sonnet-4-6 and parse the JSON object in its reply.
 *
 * The request is retried exactly once, after a 1 second pause, when the first
 * attempt fails with HTTP status 429; any other failure is rethrown untouched.
 *
 * @param prompt  Full user prompt; it should instruct the model to answer with JSON.
 * @returns The parsed JSON object, cast (not validated) to `T`.
 * @throws Error when the reply holds no `{...}` span or that span is not valid JSON.
 */
export async function callJudge<T>(prompt: string): Promise<T> {
  const client = new Anthropic();

  const makeRequest = () => client.messages.create({
    model: 'claude-sonnet-4-6',
    max_tokens: 1024,
    messages: [{ role: 'user', content: prompt }],
  });

  let response;
  try {
    response = await makeRequest();
  } catch (err: any) {
    // Only rate limiting is worth a retry; everything else propagates as-is.
    if (err?.status !== 429) throw err;
    await new Promise(r => setTimeout(r, 1000));
    response = await makeRequest();
  }

  // Use the first text block wherever it sits: the content array may be empty
  // or may start with a non-text block, and indexing [0] blindly would throw a
  // bare TypeError instead of the descriptive error below.
  let text = '';
  for (const block of response.content ?? []) {
    if (block.type === 'text') {
      text = block.text;
      break;
    }
  }

  // Greedy match from the first "{" to the last "}" so surrounding chatter or
  // markdown fences around the JSON are ignored.
  const jsonMatch = text.match(/\{[\s\S]*\}/);
  if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`);

  try {
    return JSON.parse(jsonMatch[0]) as T;
  } catch (err: any) {
    // Surface what the model actually said; a raw SyntaxError gives no context.
    throw new Error(
      `Judge returned malformed JSON (${err?.message ?? err}): ${jsonMatch[0].slice(0, 200)}`,
    );
  }
}
|
||||
|
||||
/**
 * Ask the LLM judge to grade one documentation section.
 *
 * @param section  Human-readable name of the section, interpolated into the prompt.
 * @param content  The documentation text to grade.
 * @returns clarity / completeness / actionability scores (1-5 each) plus reasoning.
 */
export async function judge(section: string, content: string): Promise<JudgeScore> {
  const rubricPrompt = `You are evaluating documentation quality for an AI coding agent's CLI tool reference.

The agent reads this documentation to learn how to use a headless browser CLI. It needs to:
1. Understand what each command does
2. Know what arguments to pass
3. Know valid values for enum-like parameters
4. Construct correct command invocations without guessing

Rate the following ${section} on three dimensions (1-5 scale):

- **clarity** (1-5): Can an agent understand what each command/flag does from the description alone?
- **completeness** (1-5): Are arguments, valid values, and important behaviors documented? Would an agent need to guess anything?
- **actionability** (1-5): Can an agent construct correct command invocations from this reference alone?

Scoring guide:
- 5: Excellent — no ambiguity, all info present
- 4: Good — minor gaps an experienced agent could infer
- 3: Adequate — some guessing required
- 2: Poor — significant info missing
- 1: Unusable — agent would fail without external help

Respond with ONLY valid JSON in this exact format:
{"clarity": N, "completeness": N, "actionability": N, "reasoning": "brief explanation"}

Here is the ${section} to evaluate:

${content}`;

  return callJudge<JudgeScore>(rubricPrompt);
}
|
||||
|
||||
/**
 * Score a QA report against a planted-bug ground-truth fixture.
 *
 * The LLM decides which planted bugs the report describes; the bookkeeping is
 * then recomputed locally so the numbers the tests gate on never depend on the
 * model's arithmetic:
 *   - `detected` keeps only ids present in the ground truth, de-duplicated and
 *     in ground-truth order;
 *   - `missed` is every ground-truth id not in `detected`;
 *   - `detection_rate` is `detected.length`.
 * `false_positives`, `evidence_quality` and `reasoning` are passed through as
 * returned by the judge.
 *
 * @param groundTruth  Parsed fixture with a `bugs` array (each entry has an `id`)
 *                     and a `total_bugs` count.
 * @param report       Text of the QA report produced by the agent.
 * @returns Detection metrics for the planted bugs.
 */
export async function outcomeJudge(
  groundTruth: any,
  report: string,
): Promise<OutcomeJudgeResult> {
  const knownIds: string[] = groundTruth.bugs.map((b: any) => b.id);

  const verdict = await callJudge<OutcomeJudgeResult>(`You are evaluating a QA testing report against known ground truth bugs.

GROUND TRUTH (${groundTruth.total_bugs} planted bugs):
${JSON.stringify(groundTruth.bugs, null, 2)}

QA REPORT (generated by an AI agent):
${report}

For each planted bug, determine if the report identified it. A bug counts as
"detected" if the report describes the same defect, even if the wording differs.
Use the detection_hint keywords as guidance.

Also count false positives: issues in the report that don't correspond to any
planted bug AND aren't legitimate issues with the page.

Respond with ONLY valid JSON:
{
"detected": ["bug-id-1", "bug-id-2"],
"missed": ["bug-id-3"],
"false_positives": 0,
"detection_rate": 2,
"evidence_quality": 4,
"reasoning": "brief explanation"
}

Rules:
- "detected" and "missed" arrays must only contain IDs from the ground truth: ${knownIds.join(', ')}
- detection_rate = length of detected array
- evidence_quality (1-5): Do detected bugs have screenshots, repro steps, or specific element references?
5 = excellent evidence for every bug, 1 = no evidence at all`);

  // Normalise against the ground truth: invented ids, duplicates, or a
  // miscounted detection_rate would otherwise flow straight into assertions.
  const claimed = Array.isArray(verdict.detected) ? verdict.detected : [];
  const detected = knownIds.filter(id => claimed.includes(id));
  const missed = knownIds.filter(id => !detected.includes(id));

  return { ...verdict, detected, missed, detection_rate: detected.length };
}
|
||||
@@ -9,12 +9,21 @@ import { query } from '@anthropic-ai/claude-agent-sdk';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
|
||||
/**
 * Rough cost figures for one skill-test run, derived from message sizes
 * (chars / 4 ≈ tokens) rather than from real usage data.
 */
export interface CostEstimate {
  // Total characters across messages of type 'user'.
  inputChars: number;
  // Total characters across messages of type 'assistant'.
  outputChars: number;
  // (inputChars + outputChars) / 4, rounded.
  estimatedTokens: number;
  estimatedCost: number; // USD (approximate)
  // Number of assistant messages seen.
  turnsUsed: number;
}
|
||||
|
||||
/**
 * Everything `runSkillTest` hands back to a test.
 */
export interface SkillTestResult {
  messages: any[];
  toolCalls: Array<{ tool: string; input: any; output: string }>;
  browseErrors: string[];
  // The E2E tests compare this against 'success'.
  exitReason: string;
  // NOTE(review): presumably milliseconds — the E2E cost logger divides by 1000 to print seconds; confirm.
  duration: number;
  // Approximate size/cost figures computed from `messages`.
  costEstimate: CostEstimate;
}
|
||||
|
||||
const BROWSE_ERROR_PATTERNS = [
|
||||
@@ -36,7 +45,7 @@ export async function runSkillTest(options: {
|
||||
if (process.env.CLAUDECODE || process.env.CLAUDE_CODE_ENTRYPOINT) {
|
||||
throw new Error(
|
||||
'Cannot run E2E skill tests inside a Claude Code session. ' +
|
||||
'Run from a plain terminal: SKILL_E2E=1 bun test test/skill-e2e.test.ts'
|
||||
'Run from a plain terminal: EVALS=1 bun test test/skill-e2e.test.ts'
|
||||
);
|
||||
}
|
||||
|
||||
@@ -156,5 +165,39 @@ export async function runSkillTest(options: {
|
||||
}
|
||||
}
|
||||
|
||||
return { messages, toolCalls, browseErrors, exitReason, duration };
|
||||
// Estimate cost from message sizes (chars / 4 ≈ tokens, approximate)
|
||||
let inputChars = 0;
|
||||
let outputChars = 0;
|
||||
let turnsUsed = 0;
|
||||
|
||||
for (const msg of messages) {
|
||||
const content = msg.message?.content;
|
||||
if (!content) continue;
|
||||
const text = typeof content === 'string'
|
||||
? content
|
||||
: JSON.stringify(content);
|
||||
|
||||
if (msg.type === 'user') {
|
||||
inputChars += text.length;
|
||||
} else if (msg.type === 'assistant') {
|
||||
outputChars += text.length;
|
||||
turnsUsed++;
|
||||
}
|
||||
}
|
||||
|
||||
const estimatedTokens = Math.round((inputChars + outputChars) / 4);
|
||||
// Approximate pricing: sonnet input ~$3/M, output ~$15/M tokens
|
||||
const inputTokens = Math.round(inputChars / 4);
|
||||
const outputTokens = Math.round(outputChars / 4);
|
||||
const estimatedCost = (inputTokens * 3 + outputTokens * 15) / 1_000_000;
|
||||
|
||||
const costEstimate: CostEstimate = {
|
||||
inputChars,
|
||||
outputChars,
|
||||
estimatedTokens,
|
||||
estimatedCost: Math.round(estimatedCost * 100) / 100,
|
||||
turnsUsed,
|
||||
};
|
||||
|
||||
return { messages, toolCalls, browseErrors, exitReason, duration, costEstimate };
|
||||
}
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
import { ALL_COMMANDS } from '../../browse/src/commands';
|
||||
import { parseSnapshotArgs } from '../../browse/src/snapshot';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
|
||||
export interface BrowseCommand {
|
||||
command: string;
|
||||
@@ -131,3 +132,75 @@ export function validateSkill(skillPath: string): ValidationResult {
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
 * Collect `REMOTE_SLUG=$(...)` assignment lines from the markdown files that sit
 * directly inside each of the given subdirectories (no recursion).
 *
 * @param rootDir  Directory the subdirectory names are resolved against.
 * @param subdirs  Subdirectory names to scan; ones that do not exist are skipped.
 * @returns Map keyed by `"<subdir>/<file>"` whose value is the list of trimmed
 *          assignment lines found in that file. Files with no match are omitted.
 */
export function extractRemoteSlugPatterns(rootDir: string, subdirs: string[]): Map<string, string[]> {
  const assignmentRe = /^REMOTE_SLUG=\$\(.*\)$/;
  const found = new Map<string, string[]>();

  subdirs.forEach((subdir) => {
    const absDir = path.join(rootDir, subdir);
    if (!fs.existsSync(absDir)) return;

    fs.readdirSync(absDir)
      .filter((name) => name.endsWith('.md'))
      .forEach((name) => {
        // A line counts only when the whole trimmed line is the assignment.
        const assignments = fs
          .readFileSync(path.join(absDir, name), 'utf-8')
          .split('\n')
          .map((raw) => raw.trim())
          .filter((candidate) => assignmentRe.test(candidate));

        if (assignments.length > 0) {
          found.set(`${subdir}/${name}`, assignments);
        }
      });
  });

  return found;
}
|
||||
|
||||
/**
 * Read the percentage table that follows the first "### Weights" heading.
 *
 * Rows are expected in the form `| Category | 15% |`. Scanning starts on the
 * line after the heading and stops at the next `#`, `##` or `### ` heading.
 *
 * @param content  Markdown text to scan.
 * @returns Map from category name to its integer percentage (e.g. 15). Empty
 *          when the heading is absent or no row matches.
 */
export function extractWeightsFromTable(content: string): Map<string, number> {
  const table = new Map<string, number>();

  const headingAt = content.indexOf('### Weights');
  if (headingAt < 0) return table;

  const rowRe = /^\|\s*(\w[\w\s]*\w|\w+)\s*\|\s*(\d+)%\s*\|$/;
  // Drop the heading line itself; everything after it is a candidate row.
  const candidates = content.slice(headingAt).split('\n').slice(1);

  for (const raw of candidates) {
    const row = raw.trim();

    const endsSection =
      row.startsWith('### ') || (row.startsWith('#') && !row.startsWith('###'));
    if (endsSection) break;

    const cells = rowRe.exec(row);
    if (cells === null) continue;

    const name = cells[1].trim();
    const percent = parseInt(cells[2], 10);
    // A row literally named "Category" is treated as the table header.
    if (name === 'Category' || isNaN(percent)) continue;

    table.set(name, percent);
  }

  return table;
}
|
||||
|
||||
+286
-23
@@ -1,46 +1,106 @@
|
||||
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
|
||||
import { runSkillTest } from './helpers/session-runner';
|
||||
import { outcomeJudge } from './helpers/llm-judge';
|
||||
import { startTestServer } from '../browse/test/test-server';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
|
||||
// Skip if SKILL_E2E not set, or if running inside a Claude Code / Agent SDK session
|
||||
// (nested Agent SDK sessions hang because the parent intercepts child claude subprocesses)
|
||||
const ROOT = path.resolve(import.meta.dir, '..');
|
||||
|
||||
// Skip unless EVALS=1 (or legacy SKILL_E2E=1). Also skip inside Claude Code /
|
||||
// Agent SDK sessions — nested sessions hang because the parent intercepts child subprocesses.
|
||||
const isInsideAgentSDK = !!process.env.CLAUDECODE || !!process.env.CLAUDE_CODE_ENTRYPOINT;
|
||||
const describeE2E = (process.env.SKILL_E2E && !isInsideAgentSDK) ? describe : describe.skip;
|
||||
const evalsEnabled = !!(process.env.EVALS || process.env.SKILL_E2E);
|
||||
const describeE2E = (evalsEnabled && !isInsideAgentSDK) ? describe : describe.skip;
|
||||
|
||||
let testServer: ReturnType<typeof startTestServer>;
|
||||
let tmpDir: string;
|
||||
const browseBin = path.resolve(ROOT, 'browse', 'dist', 'browse');
|
||||
|
||||
/**
 * Recursively mirror `src` into `dest`, creating `dest` (and parents) as needed.
 * Directories are recreated; every other entry is copied with `copyFileSync`.
 */
function copyDirSync(src: string, dest: string) {
  fs.mkdirSync(dest, { recursive: true });

  const children = fs.readdirSync(src, { withFileTypes: true });
  children.forEach((child) => {
    const from = path.join(src, child.name);
    const to = path.join(dest, child.name);

    if (!child.isDirectory()) {
      fs.copyFileSync(from, to);
      return;
    }
    copyDirSync(from, to);
  });
}
|
||||
|
||||
/**
 * Lay out the `browse/` tree a skill expects inside `dir`:
 *   - browse/dist/browse     symlink to the built binary (only when it exists)
 *   - browse/bin/find-browse script that echoes the binary's path
 *   - browse/bin/remote-slug script that echoes "test-project"
 */
function setupBrowseShims(dir: string) {
  const browseRoot = path.join(dir, 'browse');

  // Binary symlink
  const distDir = path.join(browseRoot, 'dist');
  fs.mkdirSync(distDir, { recursive: true });
  if (fs.existsSync(browseBin)) {
    fs.symlinkSync(browseBin, path.join(distDir, 'browse'));
  }

  // Executable shell shims, written in the same order as before
  const shimDir = path.join(browseRoot, 'bin');
  fs.mkdirSync(shimDir, { recursive: true });

  const shims: Array<[string, string]> = [
    ['find-browse', `#!/bin/bash\necho "${browseBin}"\n`],
    ['remote-slug', `#!/bin/bash\necho "test-project"\n`],
  ];
  for (const [shimName, script] of shims) {
    fs.writeFileSync(path.join(shimDir, shimName), script, { mode: 0o755 });
  }
}
|
||||
|
||||
/**
 * Log a one-line cost summary for an E2E run:
 * `<label>: $<cost> (<turns> turns, <tokens>k tokens, <seconds>s)`.
 */
function logCost(label: string, result: { costEstimate: { turnsUsed: number; estimatedTokens: number; estimatedCost: number }; duration: number }) {
  const cost = result.costEstimate;

  const dollars = cost.estimatedCost.toFixed(2);
  const kiloTokens = (cost.estimatedTokens / 1000).toFixed(1);
  const seconds = Math.round(result.duration / 1000);

  console.log(`${label}: $${dollars} (${cost.turnsUsed} turns, ${kiloTokens}k tokens, ${seconds}s)`);
}
|
||||
|
||||
/**
 * Best-effort dump of a planted-bug outcome (decision 1C): writes
 * `<dir>/.gstack/test-transcripts/<label>-outcome-<timestamp>.json` holding the
 * label, the report text and the judge result. Any failure is swallowed so a
 * diagnostics problem never masks the real test failure.
 */
function dumpOutcomeDiagnostic(dir: string, label: string, report: string, judgeResult: any) {
  try {
    const outDir = path.join(dir, '.gstack', 'test-transcripts');
    fs.mkdirSync(outDir, { recursive: true });

    // ISO timestamp with ":" and "." replaced so it is safe in a file name.
    const stamp = new Date().toISOString().replace(/[:.]/g, '-');
    const outFile = path.join(outDir, `${label}-outcome-${stamp}.json`);
    const payload = JSON.stringify({ label, report, judgeResult }, null, 2);

    fs.writeFileSync(outFile, payload);
  } catch { /* non-fatal */ }
}
|
||||
|
||||
describeE2E('Skill E2E tests', () => {
|
||||
beforeAll(() => {
|
||||
testServer = startTestServer();
|
||||
tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-'));
|
||||
|
||||
// Symlink browse binary into tmpdir for the skill to find
|
||||
const browseBin = path.resolve(import.meta.dir, '..', 'browse', 'dist', 'browse');
|
||||
const binDir = path.join(tmpDir, 'browse', 'dist');
|
||||
fs.mkdirSync(binDir, { recursive: true });
|
||||
if (fs.existsSync(browseBin)) {
|
||||
fs.symlinkSync(browseBin, path.join(binDir, 'browse'));
|
||||
}
|
||||
|
||||
// Also create browse/bin/find-browse so the SKILL.md setup works
|
||||
const findBrowseDir = path.join(tmpDir, 'browse', 'bin');
|
||||
fs.mkdirSync(findBrowseDir, { recursive: true });
|
||||
fs.writeFileSync(path.join(findBrowseDir, 'find-browse'), `#!/bin/bash\necho "${browseBin}"\n`, { mode: 0o755 });
|
||||
setupBrowseShims(tmpDir);
|
||||
});
|
||||
|
||||
afterAll(() => {
|
||||
testServer?.server?.stop();
|
||||
// Clean up tmpdir
|
||||
try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('browse basic commands work without errors', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `You have a browse binary at ${path.resolve(import.meta.dir, '..', 'browse', 'dist', 'browse')}. Assign it to B variable and run these commands in sequence:
|
||||
prompt: `You have a browse binary at ${browseBin}. Assign it to B variable and run these commands in sequence:
|
||||
1. $B goto ${testServer.url}
|
||||
2. $B snapshot -i
|
||||
3. $B text
|
||||
@@ -51,13 +111,14 @@ Report the results of each command.`,
|
||||
timeout: 60_000,
|
||||
});
|
||||
|
||||
logCost('browse basic', result);
|
||||
expect(result.browseErrors).toHaveLength(0);
|
||||
expect(result.exitReason).toBe('success');
|
||||
}, 90_000);
|
||||
|
||||
test('browse snapshot flags all work', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `You have a browse binary at ${path.resolve(import.meta.dir, '..', 'browse', 'dist', 'browse')}. Assign it to B variable and run:
|
||||
prompt: `You have a browse binary at ${browseBin}. Assign it to B variable and run:
|
||||
1. $B goto ${testServer.url}
|
||||
2. $B snapshot -i
|
||||
3. $B snapshot -c
|
||||
@@ -69,11 +130,213 @@ Report what each command returned.`,
|
||||
timeout: 60_000,
|
||||
});
|
||||
|
||||
logCost('browse snapshot', result);
|
||||
expect(result.browseErrors).toHaveLength(0);
|
||||
expect(result.exitReason).toBe('success');
|
||||
}, 90_000);
|
||||
|
||||
test.todo('/qa quick completes without browse errors');
|
||||
test.todo('/ship completes without browse errors');
|
||||
test.todo('/review completes without browse errors');
|
||||
});
|
||||
|
||||
// --- B4: QA skill E2E ---
|
||||
|
||||
describeE2E('QA skill E2E', () => {
  let qaDir: string;
  // Suite-owned server. Reusing the module-level `testServer` via
  // `testServer || startTestServer()` would hand this suite a server that an
  // earlier suite's afterAll has already stopped (the variable stays truthy
  // after stop()), so every request to its URL would fail.
  // NOTE(review): assumes startTestServer() can be called again once the
  // previous instance is stopped — confirm it does not pin a busy port.
  let qaServer: ReturnType<typeof startTestServer>;

  beforeAll(() => {
    qaServer = startTestServer();
    qaDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-qa-'));
    setupBrowseShims(qaDir);

    // Copy qa skill files into tmpDir
    copyDirSync(path.join(ROOT, 'qa'), path.join(qaDir, 'qa'));

    // Create report directory
    fs.mkdirSync(path.join(qaDir, 'qa-reports'), { recursive: true });
  });

  afterAll(() => {
    qaServer?.server?.stop();
    // Best-effort cleanup; a leftover temp dir must not fail the suite.
    try { fs.rmSync(qaDir, { recursive: true, force: true }); } catch {}
  });

  // Runs the Quick tier of the QA skill against a basic page and checks only
  // browse mechanics (no browse errors, clean exit) — not report content.
  test('/qa quick completes without browse errors', async () => {
    const result = await runSkillTest({
      prompt: `You have a browse binary at ${browseBin}. Assign it to B variable like: B="${browseBin}"

Read the file qa/SKILL.md for the QA workflow instructions.

Run a Quick-depth QA test on ${qaServer.url}/basic.html
Do NOT use AskUserQuestion — run Quick tier directly.
Write your report to ${qaDir}/qa-reports/qa-report.md`,
      workingDirectory: qaDir,
      maxTurns: 20,
      timeout: 120_000,
    });

    logCost('/qa quick', result);
    expect(result.browseErrors).toHaveLength(0);
    expect(result.exitReason).toBe('success');
  }, 180_000);
});
|
||||
|
||||
// --- B5: Review skill E2E ---
|
||||
|
||||
describeE2E('Review skill E2E', () => {
  let reviewDir: string;

  beforeAll(() => {
    reviewDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-review-'));

    // Pre-build a git repo with a vulnerable file on a feature branch (decision 5A)
    const { spawnSync } = require('child_process');
    // Fail fast on any git error: a half-built fixture repo would otherwise
    // surface later as a confusing agent failure instead of a setup failure.
    const run = (cmd: string, args: string[]) => {
      const proc = spawnSync(cmd, args, { cwd: reviewDir, stdio: 'pipe', timeout: 5000 });
      if (proc.error || proc.status !== 0) {
        const detail = proc.error?.message ?? String(proc.stderr ?? '').trim();
        throw new Error(`Review fixture setup failed: ${cmd} ${args.join(' ')} — ${detail}`);
      }
      return proc;
    };

    run('git', ['init']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);
    // A globally enabled commit.gpgsign would make the commits below fail or prompt.
    run('git', ['config', 'commit.gpgsign', 'false']);

    // Commit a clean base on main
    fs.writeFileSync(path.join(reviewDir, 'app.rb'), '# clean base\nclass App\nend\n');
    run('git', ['add', 'app.rb']);
    run('git', ['commit', '-m', 'initial commit']);
    // `git init` names the first branch after init.defaultBranch (often
    // "master"); the prompt diffs against "main", so force that name.
    run('git', ['branch', '-M', 'main']);

    // Create feature branch with vulnerable code
    run('git', ['checkout', '-b', 'feature/add-user-controller']);
    const vulnContent = fs.readFileSync(path.join(ROOT, 'test', 'fixtures', 'review-eval-vuln.rb'), 'utf-8');
    fs.writeFileSync(path.join(reviewDir, 'user_controller.rb'), vulnContent);
    run('git', ['add', 'user_controller.rb']);
    run('git', ['commit', '-m', 'add user controller']);

    // Copy review skill files
    fs.copyFileSync(path.join(ROOT, 'review', 'SKILL.md'), path.join(reviewDir, 'review-SKILL.md'));
    fs.copyFileSync(path.join(ROOT, 'review', 'checklist.md'), path.join(reviewDir, 'review-checklist.md'));
    fs.copyFileSync(path.join(ROOT, 'review', 'greptile-triage.md'), path.join(reviewDir, 'review-greptile-triage.md'));
  });

  afterAll(() => {
    // Best-effort cleanup; a leftover temp dir must not fail the suite.
    try { fs.rmSync(reviewDir, { recursive: true, force: true }); } catch {}
  });

  // Only asserts that the agent run exits cleanly; the contents of
  // review-output.md are not inspected here.
  test('/review produces findings on SQL injection branch', async () => {
    const result = await runSkillTest({
      prompt: `You are in a git repo on a feature branch with changes against main.
Read review-SKILL.md for the review workflow instructions.
Also read review-checklist.md and apply it.
Run /review on the current diff (git diff main...HEAD).
Write your review findings to ${reviewDir}/review-output.md`,
      workingDirectory: reviewDir,
      maxTurns: 15,
      timeout: 90_000,
    });

    logCost('/review', result);
    expect(result.exitReason).toBe('success');
  }, 120_000);
});
|
||||
|
||||
// --- B6/B7/B8: Planted-bug outcome evals ---
|
||||
|
||||
// The LLM judge talks to the Anthropic API, so outcome evals additionally
// require ANTHROPIC_API_KEY on top of the usual E2E gating.
const hasApiKey = Boolean(process.env.ANTHROPIC_API_KEY);
const describeOutcome = (!evalsEnabled || isInsideAgentSDK || !hasApiKey) ? describe.skip : describe;
|
||||
|
||||
describeOutcome('Planted-bug outcome evals', () => {
|
||||
let outcomeDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
testServer = testServer || startTestServer();
|
||||
outcomeDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-outcome-'));
|
||||
setupBrowseShims(outcomeDir);
|
||||
|
||||
// Copy qa skill files
|
||||
copyDirSync(path.join(ROOT, 'qa'), path.join(outcomeDir, 'qa'));
|
||||
});
|
||||
|
||||
afterAll(() => {
|
||||
testServer?.server?.stop();
|
||||
try { fs.rmSync(outcomeDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
/**
|
||||
* Shared planted-bug eval runner.
|
||||
* Runs /qa Standard on a fixture page, then scores with outcomeJudge.
|
||||
*/
|
||||
async function runPlantedBugEval(fixture: string, groundTruthFile: string, label: string) {
|
||||
const reportDir = path.join(outcomeDir, `reports-${label}`);
|
||||
fs.mkdirSync(path.join(reportDir, 'screenshots'), { recursive: true });
|
||||
const reportPath = path.join(reportDir, 'qa-report.md');
|
||||
|
||||
// Phase 1: Agent SDK runs /qa Standard
|
||||
const result = await runSkillTest({
|
||||
prompt: `You have a browse binary at ${browseBin}. Assign it to B variable like: B="${browseBin}"
|
||||
|
||||
Read the file qa/SKILL.md for the QA workflow instructions.
|
||||
|
||||
Navigate to ${testServer.url}/${fixture} and run a Standard-depth QA test.
|
||||
Do NOT use AskUserQuestion — run Standard tier directly.
|
||||
Write your report to ${reportPath}
|
||||
Save screenshots to ${reportDir}/screenshots/
|
||||
|
||||
Be thorough: check console, check all links, check all forms, check mobile viewport, check accessibility.`,
|
||||
workingDirectory: outcomeDir,
|
||||
maxTurns: 25,
|
||||
timeout: 180_000,
|
||||
});
|
||||
|
||||
logCost(`/qa ${label}`, result);
|
||||
|
||||
// Phase 1 assertions: browse mechanics
|
||||
expect(result.browseErrors).toHaveLength(0);
|
||||
expect(result.exitReason).toBe('success');
|
||||
|
||||
// Phase 2: Outcome evaluation via LLM judge
|
||||
const groundTruth = JSON.parse(
|
||||
fs.readFileSync(path.join(ROOT, 'test', 'fixtures', groundTruthFile), 'utf-8'),
|
||||
);
|
||||
|
||||
// Read the generated report (try the expected path, then glob for any .md in reportDir)
|
||||
let report: string;
|
||||
if (fs.existsSync(reportPath)) {
|
||||
report = fs.readFileSync(reportPath, 'utf-8');
|
||||
} else {
|
||||
// Agent may have named it differently — find any .md in reportDir
|
||||
const mdFiles = fs.readdirSync(reportDir).filter(f => f.endsWith('.md'));
|
||||
if (mdFiles.length === 0) {
|
||||
dumpOutcomeDiagnostic(outcomeDir, label, '(no report file found)', { error: 'missing report' });
|
||||
throw new Error(`No report file found in ${reportDir}`);
|
||||
}
|
||||
report = fs.readFileSync(path.join(reportDir, mdFiles[0]), 'utf-8');
|
||||
}
|
||||
|
||||
const judgeResult = await outcomeJudge(groundTruth, report);
|
||||
console.log(`${label} outcome:`, JSON.stringify(judgeResult, null, 2));
|
||||
|
||||
// Diagnostic dump on failure (decision 1C)
|
||||
if (judgeResult.detection_rate < groundTruth.minimum_detection || judgeResult.false_positives > groundTruth.max_false_positives) {
|
||||
dumpOutcomeDiagnostic(outcomeDir, label, report, judgeResult);
|
||||
}
|
||||
|
||||
// Phase 2 assertions
|
||||
expect(judgeResult.detection_rate).toBeGreaterThanOrEqual(groundTruth.minimum_detection);
|
||||
expect(judgeResult.false_positives).toBeLessThanOrEqual(groundTruth.max_false_positives);
|
||||
expect(judgeResult.evidence_quality).toBeGreaterThanOrEqual(3);
|
||||
}
|
||||
|
||||
// B6: Static dashboard — broken link, disabled submit, overflow, missing alt, console error
|
||||
test(
  '/qa standard finds >= 3 of 5 planted bugs (static)',
  // The eval helper's promise is returned directly so the runner awaits it.
  () => runPlantedBugEval('qa-eval.html', 'qa-eval-ground-truth.json', 'b6-static'),
  240_000,
);
|
||||
|
||||
// B7: SPA — broken route, stale state, async race, missing aria, console warning
|
||||
test(
  '/qa standard finds >= 3 of 5 planted SPA bugs',
  // The eval helper's promise is returned directly so the runner awaits it.
  () => runPlantedBugEval('qa-eval-spa.html', 'qa-eval-spa-ground-truth.json', 'b7-spa'),
  240_000,
);
|
||||
|
||||
// B8: Checkout — email regex, NaN total, CC overflow, missing required, stripe error
|
||||
test(
  '/qa standard finds >= 3 of 5 planted checkout bugs',
  // The eval helper's promise is returned directly so the runner awaits it.
  () => runPlantedBugEval('qa-eval-checkout.html', 'qa-eval-checkout-ground-truth.json', 'b8-checkout'),
  240_000,
);
|
||||
|
||||
// Ship E2E deferred — too complex (requires full git + test suite + VERSION + CHANGELOG)
|
||||
test.todo('/ship completes without browse errors');
|
||||
});
|
||||
|
||||
+172
-56
@@ -4,8 +4,8 @@
|
||||
* Uses the Anthropic API directly (not Agent SDK) to evaluate whether
|
||||
* generated command docs are clear, complete, and actionable for an AI agent.
|
||||
*
|
||||
* Requires: ANTHROPIC_API_KEY env var
|
||||
* Run: ANTHROPIC_API_KEY=sk-... bun test test/skill-llm-eval.test.ts
|
||||
* Requires: EVALS=1 to enable the suite, plus ANTHROPIC_API_KEY set in the environment
|
||||
* Run: EVALS=1 bun run test:eval
|
||||
*
|
||||
* Cost: ~$0.05-0.15 per run (sonnet)
|
||||
*/
|
||||
@@ -14,62 +14,12 @@ import { describe, test, expect } from 'bun:test';
|
||||
import Anthropic from '@anthropic-ai/sdk';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import { callJudge, judge } from './helpers/llm-judge';
|
||||
import type { JudgeScore } from './helpers/llm-judge';
|
||||
|
||||
const ROOT = path.resolve(import.meta.dir, '..');
|
||||
const hasApiKey = !!process.env.ANTHROPIC_API_KEY;
|
||||
const describeEval = hasApiKey ? describe : describe.skip;
|
||||
|
||||
/**
 * Shape of the JSON object the judge model is asked to return.
 * The three numeric fields are ratings on a 1-5 scale; `reasoning` is the
 * judge's free-text justification.
 */
interface JudgeScore {
  clarity: number; // 1-5: can an agent understand what each command does?
  completeness: number; // 1-5: are all args, flags, valid values documented?
  actionability: number; // 1-5: can an agent use this to construct correct commands?
  reasoning: string; // why the scores were given
}
|
||||
|
||||
/**
 * Ask the judge model to rate one documentation section on clarity,
 * completeness and actionability.
 *
 * @param section Human-readable name of what is being rated; interpolated
 *                into the prompt (e.g. "command reference table").
 * @param prompt  The documentation text to evaluate; appended to the prompt.
 * @returns The scores parsed from the model's JSON reply.
 * @throws Error if the reply contains no JSON object, or if any of the three
 *         score fields is not a number.
 */
async function judge(section: string, prompt: string): Promise<JudgeScore> {
  const client = new Anthropic();

  const response = await client.messages.create({
    model: 'claude-sonnet-4-6',
    max_tokens: 1024,
    messages: [{
      role: 'user',
      content: `You are evaluating documentation quality for an AI coding agent's CLI tool reference.

The agent reads this documentation to learn how to use a headless browser CLI. It needs to:
1. Understand what each command does
2. Know what arguments to pass
3. Know valid values for enum-like parameters
4. Construct correct command invocations without guessing

Rate the following ${section} on three dimensions (1-5 scale):

- **clarity** (1-5): Can an agent understand what each command/flag does from the description alone?
- **completeness** (1-5): Are arguments, valid values, and important behaviors documented? Would an agent need to guess anything?
- **actionability** (1-5): Can an agent construct correct command invocations from this reference alone?

Scoring guide:
- 5: Excellent — no ambiguity, all info present
- 4: Good — minor gaps an experienced agent could infer
- 3: Adequate — some guessing required
- 2: Poor — significant info missing
- 1: Unusable — agent would fail without external help

Respond with ONLY valid JSON in this exact format:
{"clarity": N, "completeness": N, "actionability": N, "reasoning": "brief explanation"}

Here is the ${section} to evaluate:

${prompt}`,
    }],
  });

  // An empty content array would make `content[0].type` throw a TypeError;
  // treat it as an empty reply so the non-JSON error below reports it instead.
  const firstBlock = response.content[0];
  const text = firstBlock?.type === 'text' ? firstBlock.text : '';

  // Extract JSON from response (handle markdown code blocks)
  const jsonMatch = text.match(/\{[\s\S]*\}/);
  if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`);

  const scores = JSON.parse(jsonMatch[0]) as JudgeScore;

  // The cast above is unchecked: fail here with the raw reply rather than
  // letting callers compare `undefined` against a threshold.
  for (const dim of ['clarity', 'completeness', 'actionability'] as const) {
    if (typeof scores[dim] !== 'number') {
      throw new Error(`Judge response has no numeric "${dim}": ${jsonMatch[0].slice(0, 200)}`);
    }
  }

  return scores;
}
|
||||
// Run when EVALS=1 is set (requires ANTHROPIC_API_KEY in env)
|
||||
const describeEval = process.env.EVALS ? describe : describe.skip;
|
||||
|
||||
describeEval('LLM-as-judge quality evals', () => {
|
||||
test('command reference table scores >= 4 on all dimensions', async () => {
|
||||
@@ -192,3 +142,169 @@ Scores are 1-5 overall quality.`,
|
||||
expect(result.b_score).toBeGreaterThanOrEqual(result.a_score);
|
||||
}, 30_000);
|
||||
});
|
||||
|
||||
// --- Part 7: QA skill quality evals (C6) ---
|
||||
|
||||
describeEval('QA skill quality evals', () => {
  // Read once; both tests below slice their section out of this text.
  const qaContent = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');

  test('qa/SKILL.md workflow quality scores >= 4', async () => {
    // Extract the workflow section (Phases 1-7)
    const start = qaContent.indexOf('## Workflow');
    const end = qaContent.indexOf('## Health Score Rubric');

    // indexOf returns -1 for a missing heading, and slice() would then
    // silently hand the judge the wrong text. Fail fast before the API call.
    expect(start).toBeGreaterThanOrEqual(0);
    expect(end).toBeGreaterThan(start);

    const section = qaContent.slice(start, end);

    // Use workflow-specific prompt (not the CLI-reference judge, since this is a
    // workflow doc that references $B commands defined in a separate browse SKILL.md)
    const scores = await callJudge<JudgeScore>(`You are evaluating the quality of a QA testing workflow document for an AI coding agent.

The agent reads this document to learn how to systematically QA test a web application. The workflow references
a headless browser CLI ($B commands) that is documented separately — do NOT penalize for missing CLI definitions.
Instead, evaluate whether the workflow itself is clear, complete, and actionable.

Rate on three dimensions (1-5 scale):
- **clarity** (1-5): Can an agent follow the step-by-step phases without ambiguity?
- **completeness** (1-5): Are all phases, decision points, and outputs well-defined?
- **actionability** (1-5): Can an agent execute the workflow and produce the expected deliverables?

Respond with ONLY valid JSON:
{"clarity": N, "completeness": N, "actionability": N, "reasoning": "brief explanation"}

Here is the QA workflow to evaluate:

${section}`);
    console.log('QA workflow scores:', JSON.stringify(scores, null, 2));

    expect(scores.clarity).toBeGreaterThanOrEqual(4);
    expect(scores.completeness).toBeGreaterThanOrEqual(4);
    expect(scores.actionability).toBeGreaterThanOrEqual(4);
  }, 30_000);

  test('qa/SKILL.md health score rubric is unambiguous', async () => {
    const start = qaContent.indexOf('## Health Score Rubric');

    // slice(-1) would send only the file's last character to the judge.
    expect(start).toBeGreaterThanOrEqual(0);

    const section = qaContent.slice(start);

    // Use rubric-specific prompt
    const scores = await callJudge<JudgeScore>(`You are evaluating a health score rubric that an AI agent must follow to compute a numeric QA score.

The agent uses this rubric after QA testing a website. It needs to:
1. Understand each scoring category and what counts as a deduction
2. Apply the weights correctly to compute a final score out of 100
3. Produce a consistent, reproducible score

Rate on three dimensions (1-5 scale):
- **clarity** (1-5): Are the categories, deduction criteria, and weights unambiguous?
- **completeness** (1-5): Are all edge cases and scoring boundaries defined?
- **actionability** (1-5): Can an agent compute a correct score from this rubric alone?

Respond with ONLY valid JSON:
{"clarity": N, "completeness": N, "actionability": N, "reasoning": "brief explanation"}

Here is the rubric to evaluate:

${section}`);
    console.log('QA health rubric scores:', JSON.stringify(scores, null, 2));

    expect(scores.clarity).toBeGreaterThanOrEqual(4);
    expect(scores.completeness).toBeGreaterThanOrEqual(4);
    expect(scores.actionability).toBeGreaterThanOrEqual(4);
  }, 30_000);
});
|
||||
|
||||
// --- Part 7: Cross-skill consistency judge (C7) ---
|
||||
|
||||
describeEval('Cross-skill consistency evals', () => {
  test('greptile-history patterns are consistent across all skills', async () => {
    // Shape of the JSON verdict requested from the judge in the prompt below.
    type ConsistencyVerdict = { consistent: boolean; issues: string[]; score: number; reasoning: string };

    // [label shown to the judge, path segments under ROOT], in prompt order.
    const skillFiles: Array<[string, string[]]> = [
      ['review/SKILL.md', ['review', 'SKILL.md']],
      ['ship/SKILL.md', ['ship', 'SKILL.md']],
      ['review/greptile-triage.md', ['review', 'greptile-triage.md']],
      ['retro/SKILL.md', ['retro', 'SKILL.md']],
    ];

    // Only the greptile-related lines of each file are sent to the judge.
    const greptileLine = /greptile|history\.md|REMOTE_SLUG/i;

    const collected = skillFiles
      .map(([label, segments]) => {
        const text = fs.readFileSync(path.join(ROOT, ...segments), 'utf-8');
        const hits = text
          .split('\n')
          .filter((line) => greptileLine.test(line))
          .map((line) => line.trim());
        return `--- ${label} ---\n${hits.join('\n')}`;
      })
      .join('\n\n');

    const result = await callJudge<ConsistencyVerdict>(`You are evaluating whether multiple skill configuration files implement the same data architecture consistently.

INTENDED ARCHITECTURE:
- greptile-history has TWO paths: per-project (~/.gstack/projects/{slug}/greptile-history.md) and global (~/.gstack/greptile-history.md)
- /review and /ship WRITE to BOTH paths (per-project for suppressions, global for retro aggregation)
- /review and /ship delegate write mechanics to greptile-triage.md
- /retro READS from the GLOBAL path only (it aggregates across all projects)
- REMOTE_SLUG derivation should be consistent across files that use it

Below are greptile-related lines extracted from each skill file:

${collected}

Evaluate consistency. Respond with ONLY valid JSON:
{
"consistent": true/false,
"issues": ["issue 1", "issue 2"],
"score": N,
"reasoning": "brief explanation"
}

score (1-5): 5 = perfectly consistent, 1 = contradictory`);

    console.log('Cross-skill consistency:', JSON.stringify(result, null, 2));

    expect(result.consistent).toBe(true);
    expect(result.score).toBeGreaterThanOrEqual(4);
  }, 30_000);
});
|
||||
|
||||
// --- Part 7: Baseline score pinning (C9) ---
|
||||
|
||||
describeEval('Baseline score pinning', () => {
  const baselinesPath = path.join(ROOT, 'test', 'fixtures', 'eval-baselines.json');

  /**
   * Re-scores the command reference section of SKILL.md and fails if any
   * dimension drops below the value stored in eval-baselines.json.
   * When UPDATE_BASELINES is set, the fresh scores are also written back.
   */
  test('LLM eval scores do not regress below baselines', async () => {
    if (!fs.existsSync(baselinesPath)) {
      console.log('No baseline file found — skipping pinning check');
      return;
    }

    const baselines = JSON.parse(fs.readFileSync(baselinesPath, 'utf-8'));
    const updating = Boolean(process.env.UPDATE_BASELINES);
    const regressions: string[] = [];

    // A missing entry used to surface as a TypeError inside the comparison
    // loop, which also prevented UPDATE_BASELINES from ever creating it.
    // Checked before the judge call so a misconfigured run costs nothing.
    const pinned = baselines.command_reference;
    if (!pinned && !updating) {
      throw new Error(`No "command_reference" entry in ${baselinesPath} — rerun with UPDATE_BASELINES=1 to record one`);
    }

    // Test command reference
    const skillContent = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
    const cmdStart = skillContent.indexOf('## Command Reference');
    const cmdEnd = skillContent.indexOf('## Tips');

    // indexOf returns -1 for a missing heading; slice() would then score the
    // wrong text without any error.
    expect(cmdStart).toBeGreaterThanOrEqual(0);
    expect(cmdEnd).toBeGreaterThan(cmdStart);

    const cmdSection = skillContent.slice(cmdStart, cmdEnd);
    const cmdScores = await judge('command reference table', cmdSection);

    if (pinned) {
      for (const dim of ['clarity', 'completeness', 'actionability'] as const) {
        if (cmdScores[dim] < pinned[dim]) {
          regressions.push(`command_reference.${dim}: ${cmdScores[dim]} < baseline ${pinned[dim]}`);
        }
      }
    }

    // Update baselines if requested
    if (updating) {
      baselines.command_reference = {
        clarity: cmdScores.clarity,
        completeness: cmdScores.completeness,
        actionability: cmdScores.actionability,
      };
      fs.writeFileSync(baselinesPath, JSON.stringify(baselines, null, 2) + '\n');
      console.log('Updated eval baselines');
    }

    if (regressions.length > 0) {
      throw new Error(`Score regressions detected:\n${regressions.join('\n')}`);
    }
  }, 60_000);
});
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import { validateSkill } from './helpers/skill-parser';
|
||||
import { validateSkill, extractRemoteSlugPatterns, extractWeightsFromTable } from './helpers/skill-parser';
|
||||
import { ALL_COMMANDS, COMMAND_DESCRIPTIONS, READ_COMMANDS, WRITE_COMMANDS, META_COMMANDS } from '../browse/src/commands';
|
||||
import { SNAPSHOT_FLAGS } from '../browse/src/snapshot';
|
||||
import * as fs from 'fs';
|
||||
@@ -151,3 +151,222 @@ describe('Generated SKILL.md freshness', () => {
|
||||
expect(content).toContain('AUTO-GENERATED');
|
||||
});
|
||||
});
|
||||
|
||||
// --- Part 7: Cross-skill path consistency (A1) ---
|
||||
|
||||
describe('Cross-skill path consistency', () => {
  // Path strings the skill docs are expected to mention verbatim.
  const PROJECT_HISTORY = '$REMOTE_SLUG/greptile-history.md';
  const GLOBAL_HISTORY = '~/.gstack/greptile-history.md';

  test('REMOTE_SLUG derivation pattern is identical across files that use it', () => {
    // Flatten every per-file pattern list into one array.
    const found: string[] = [];
    for (const [, perFile] of extractRemoteSlugPatterns(ROOT, ['qa', 'review'])) {
      for (const pattern of perFile) {
        found.push(pattern);
      }
    }

    // Should find at least 2 occurrences (qa/SKILL.md + review/greptile-triage.md)
    expect(found.length).toBeGreaterThanOrEqual(2);

    // All occurrences must be character-for-character identical
    const distinct = [...new Set(found)];
    if (distinct.length <= 1) return;

    const listing = distinct.map((variant, idx) => ` ${idx + 1}: ${variant}`).join('\n');
    throw new Error(`REMOTE_SLUG pattern differs across files:\n${listing}`);
  });

  test('all greptile-history write references specify both per-project and global paths', () => {
    const candidates = ['review/SKILL.md', 'ship/SKILL.md', 'review/greptile-triage.md'];

    for (const relPath of candidates) {
      const absPath = path.join(ROOT, relPath);
      if (fs.existsSync(absPath)) {
        const text = fs.readFileSync(absPath, 'utf-8');

        // Either wording is accepted: the two scope names, or the two literal paths.
        const namesBothScopes = text.includes('per-project') && text.includes('global');
        const namesBothPaths =
          text.includes('$REMOTE_SLUG/greptile-history') && text.includes('~/.gstack/greptile-history');

        expect(namesBothScopes || namesBothPaths).toBe(true);
      }
    }
  });

  test('greptile-triage.md contains both project and global history paths', () => {
    const triage = fs.readFileSync(path.join(ROOT, 'review', 'greptile-triage.md'), 'utf-8');
    for (const needle of [PROJECT_HISTORY, GLOBAL_HISTORY]) {
      expect(triage).toContain(needle);
    }
  });

  test('retro/SKILL.md reads global greptile-history (not per-project)', () => {
    const retro = fs.readFileSync(path.join(ROOT, 'retro', 'SKILL.md'), 'utf-8');
    expect(retro).toContain(GLOBAL_HISTORY);
    // Should NOT reference per-project path for reads
    expect(retro).not.toContain(PROJECT_HISTORY);
  });
});
|
||||
|
||||
// --- Part 7: QA skill structure validation (A2) ---
|
||||
|
||||
describe('QA skill structure validation', () => {
  const qaContent = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');

  // Asserts that every needle appears somewhere in qa/SKILL.md, in list order.
  const expectAllPresent = (needles: readonly string[]) => {
    needles.forEach((needle) => {
      expect(qaContent).toContain(needle);
    });
  };

  test('qa/SKILL.md has all 7 phases', () => {
    // [phase label, phase name]; both strings must appear in the document.
    const phasePairs: Array<[string, string]> = [
      ['Phase 1', 'Initialize'],
      ['Phase 2', 'Authenticate'],
      ['Phase 3', 'Recon'],
      ['Phase 4', 'Test Plan'],
      ['Phase 5', 'Execute'],
      ['Phase 6', 'Document'],
      ['Phase 7', 'Wrap'],
    ];
    expectAllPresent(phasePairs.flat());
  });

  test('risk heuristic table has all required patterns', () => {
    expectAllPresent([
      'Form/payment/auth/checkout',
      'Controller/route with mutations',
      'Config/env/deployment',
      'API endpoint handlers',
      'View/template/component',
      'Model/service with business logic',
      'CSS/style-only',
      'Docs/readme/comments',
      'Test files only',
    ]);

    // Risk levels
    expectAllPresent(['HIGH', 'MEDIUM', 'LOW', 'SKIP']);
  });

  test('health score weights sum to 100%', () => {
    const weights = extractWeightsFromTable(qaContent);
    expect(weights.size).toBeGreaterThan(0);

    const total = [...weights.values()].reduce((acc, pct) => acc + pct, 0);
    expect(total).toBe(100);
  });

  test('health score has all 8 categories', () => {
    const weights = extractWeightsFromTable(qaContent);
    const required = [
      'Console', 'Links', 'Visual', 'Functional',
      'UX', 'Performance', 'Content', 'Accessibility',
    ];
    required.forEach((category) => {
      expect(weights.has(category)).toBe(true);
    });
    expect(weights.size).toBe(8);
  });

  test('has three tier definitions (Quick/Standard/Exhaustive)', () => {
    expectAllPresent(['Quick Depth', 'Standard Depth', 'Exhaustive Depth']);
  });

  test('output structure references report directory layout', () => {
    expectAllPresent(['index.md', 'test-plan-', 'qa-report-', 'baseline.json', 'screenshots/']);
  });
});
|
||||
|
||||
// --- Part 7: Greptile history format consistency (A3) ---
|
||||
|
||||
describe('Greptile history format consistency', () => {
  // Each test re-reads the file it inspects, so no content is shared between tests.
  const readSkillFile = (...segments: string[]) =>
    fs.readFileSync(path.join(ROOT, ...segments), 'utf-8');

  test('greptile-triage.md defines the canonical history format', () => {
    const triage = readSkillFile('review', 'greptile-triage.md');
    // Placeholder tokens of the history line format; '<type' is intentionally
    // left open-ended, as in the original assertion.
    const placeholders = ['<YYYY-MM-DD>', '<owner/repo>', '<type', '<file-pattern>', '<category>'];
    placeholders.forEach((token) => {
      expect(triage).toContain(token);
    });
  });

  test('review/SKILL.md and ship/SKILL.md both reference greptile-triage.md for write details', () => {
    const review = readSkillFile('review', 'SKILL.md');
    const ship = readSkillFile('ship', 'SKILL.md');

    // Compared in lower case so the reference matches regardless of casing.
    [review, ship].forEach((text) => {
      expect(text.toLowerCase()).toContain('greptile-triage.md');
    });
  });

  test('greptile-triage.md defines all 9 valid categories', () => {
    const triage = readSkillFile('review', 'greptile-triage.md');
    const validCategories = [
      'race-condition', 'null-check', 'error-handling', 'style',
      'type-safety', 'security', 'performance', 'correctness', 'other',
    ];
    validCategories.forEach((category) => {
      expect(triage).toContain(category);
    });
  });
});
|
||||
|
||||
// --- Part 7: Planted-bug fixture validation (A4) ---
|
||||
|
||||
describe('Planted-bug fixture validation', () => {
  // [test-title prefix, ground-truth file under test/fixtures]
  const groundTruthFixtures: Array<[string, string]> = [
    ['qa-eval', 'qa-eval-ground-truth.json'],
    ['qa-eval-spa', 'qa-eval-spa-ground-truth.json'],
    ['qa-eval-checkout', 'qa-eval-checkout-ground-truth.json'],
  ];

  // One test per fixture: the bug list and the declared total must both be 5.
  for (const [label, fileName] of groundTruthFixtures) {
    test(`${label} ground truth has exactly 5 planted bugs`, () => {
      const raw = fs.readFileSync(path.join(ROOT, 'test', 'fixtures', fileName), 'utf-8');
      const parsed = JSON.parse(raw);
      expect(parsed.bugs).toHaveLength(5);
      expect(parsed.total_bugs).toBe(5);
    });
  }

  test('qa-eval.html contains the planted bugs', () => {
    const page = fs.readFileSync(path.join(ROOT, 'browse', 'test', 'fixtures', 'qa-eval.html'), 'utf-8');

    // The logo <img> must exist, and must not carry an alt attribute.
    const logoImg = /<img[^>]*src="\/logo\.png"[^>]*>/;
    const logoImgWithAlt = /<img[^>]*src="\/logo\.png"[^>]*alt=/;

    // BUG 1: broken link
    expect(page).toContain('/nonexistent-404-page');
    // BUG 2: disabled submit
    expect(page).toContain('disabled');
    // BUG 3: overflow
    expect(page).toContain('overflow: hidden');
    // BUG 4: missing alt
    expect(page).toMatch(logoImg);
    expect(page).not.toMatch(logoImgWithAlt);
    // BUG 5: console error
    expect(page).toContain("Cannot read properties of undefined");
  });

  test('review-eval-vuln.rb contains expected vulnerability patterns', () => {
    const ruby = fs.readFileSync(path.join(ROOT, 'test', 'fixtures', 'review-eval-vuln.rb'), 'utf-8');
    for (const marker of ['params[:id]', 'update_column']) {
      expect(ruby).toContain(marker);
    }
  });
});
|
||||
|
||||
Reference in New Issue
Block a user