feat: QA restructure, browser ref staleness, eval efficiency metrics (v0.4.0) (#83)

* feat: browser ref staleness detection via async count() validation

resolveRef() now checks element count to detect stale refs after page
mutations (e.g. SPA navigation). RefEntry stores role+name metadata
for better diagnostics. 3 new snapshot tests for staleness detection.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* feat: qa-only skill, qa fix loop, plan-to-QA artifact flow

Add /qa-only (report-only, Edit tool blocked), restructure /qa with
find-fix-verify cycle, add {{QA_METHODOLOGY}} DRY placeholder for
shared methodology. /plan-eng-review now writes test-plan artifacts
to ~/.gstack/projects/<slug>/ for QA consumption.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* feat: eval efficiency metrics — turns, duration, commentary across all surfaces

Add generateCommentary() for natural-language delta interpretation,
per-test turns/duration in comparison and summary output, judgePassed
unit tests, 3 new E2E tests (qa-only, qa fix loop, plan artifact).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* chore: bump version and changelog (v0.4.0)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* docs: update ARCHITECTURE, BROWSER, CONTRIBUTING, README for v0.4.0

- ARCHITECTURE: add ref staleness detection section, update RefEntry type
- BROWSER: add ref staleness paragraph to snapshot system docs
- CONTRIBUTING: update eval tool descriptions with commentary feature
- README: fix missing qa-only in project-local uninstall command

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* docs: add user-facing benefit descriptions to v0.4.0 changelog

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-03-15 23:55:39 -05:00
committed by GitHub
parent bb46ca6b21
commit f3ee0ee28a
30 changed files with 2317 additions and 354 deletions
+57 -4
View File
@@ -40,6 +40,33 @@ const totalCost = results.reduce((s, r) => s + (r.total_cost_usd || 0), 0);
// Average spend per run for each eval tier (0 when the tier has no runs).
// A run with no recorded cost counts as $0 — the same `|| 0` guard totalCost
// uses — so a single missing value cannot turn the whole average into NaN.
const avgE2ECost = e2eRuns.length > 0 ? e2eRuns.reduce((s, r) => s + (r.total_cost_usd || 0), 0) / e2eRuns.length : 0;
const avgJudgeCost = judgeRuns.length > 0 ? judgeRuns.reduce((s, r) => s + (r.total_cost_usd || 0), 0) / judgeRuns.length : 0;
// E2E efficiency: mean wall-clock duration and mean total turns per run.
// Duration is averaged over every E2E run (a missing duration counts as 0).
const avgE2EDuration = e2eRuns.length === 0
  ? 0
  : e2eRuns.reduce((sum, run) => sum + (run.total_duration_ms || 0), 0) / e2eRuns.length;

// Total turns per run, summed over that run's tests; runs whose total is 0
// are left out so they do not pull the average down.
const e2eTurns: number[] = e2eRuns
  .map((run) => run.tests.reduce((sum, test) => sum + (test.turns_used || 0), 0))
  .filter((turnTotal) => turnTotal > 0);

const avgE2ETurns = e2eTurns.length === 0
  ? 0
  : e2eTurns.reduce((sum, turnTotal) => sum + turnTotal, 0) / e2eTurns.length;
// Per-test samples collected across all E2E runs, keyed by test name.
// Each entry holds the raw turn, duration and cost values seen for that test.
const testEfficiency = new Map<string, { turns: number[]; durations: number[]; costs: number[] }>();
for (const run of e2eRuns) {
  for (const test of run.tests) {
    // Create the bucket the first time this test name is seen.
    let bucket = testEfficiency.get(test.name);
    if (bucket === undefined) {
      bucket = { turns: [], durations: [], costs: [] };
      testEfficiency.set(test.name, bucket);
    }
    // Turns are recorded whenever the field is present (including 0);
    // durations and costs are recorded only when strictly positive.
    if (test.turns_used !== undefined) {
      bucket.turns.push(test.turns_used);
    }
    if (test.duration_ms > 0) {
      bucket.durations.push(test.duration_ms);
    }
    if (test.cost_usd > 0) {
      bucket.costs.push(test.cost_usd);
    }
  }
}
// Detection rates from outcome evals
const detectionRates: number[] = [];
for (const r of e2eRuns) {
@@ -94,22 +121,48 @@ for (const stats of branchStats.values()) {
// Print summary
console.log('');
console.log('Eval Summary');
console.log('═'.repeat(60));
console.log('═'.repeat(70));
console.log(` Total runs: ${results.length} (${e2eRuns.length} e2e, ${judgeRuns.length} llm-judge)`);
console.log(` Total spend: $${totalCost.toFixed(2)}`);
console.log(` Avg cost/e2e: $${avgE2ECost.toFixed(2)}`);
console.log(` Avg cost/judge: $${avgJudgeCost.toFixed(2)}`);
// Each efficiency line is printed only when its average is positive.
if (avgE2EDuration > 0) {
  // Milliseconds -> whole seconds for display.
  const avgSeconds = Math.round(avgE2EDuration / 1000);
  console.log(` Avg duration/e2e: ${avgSeconds}s`);
}
if (avgE2ETurns > 0) {
  const roundedTurns = Math.round(avgE2ETurns);
  console.log(` Avg turns/e2e: ${roundedTurns}`);
}
if (avgDetection !== null) {
console.log(` Avg detection: ${avgDetection.toFixed(1)} bugs`);
}
console.log('─'.repeat(60));
console.log('─'.repeat(70));
// Per-test efficiency averages (only if we have enough data).
// Prints one row per test that has at least two turn samples, most expensive
// first: average cost, average turns, average duration, and the sample count.
if (testEfficiency.size > 0 && e2eRuns.length >= 2) {
  // Arithmetic mean that yields 0 for an empty sample instead of 0/0 = NaN.
  const mean = (xs: number[]): number =>
    xs.length > 0 ? xs.reduce((a, b) => a + b, 0) / xs.length : 0;

  console.log(' Per-test efficiency (averages across runs):');
  const sorted = [...testEfficiency.entries()]
    .filter(([, s]) => s.turns.length >= 2)
    // Descending by average cost; a test with no cost samples sorts as $0.
    .sort((a, b) => mean(b[1].costs) - mean(a[1].costs));
  for (const [name, stats] of sorted) {
    // The filter above guarantees turn samples, but durations and costs are
    // collected only when positive and can be empty — show 'n/a' for those
    // rather than printing NaN.
    const avgT = Math.round(mean(stats.turns));
    const avgD = stats.durations.length > 0
      ? `${Math.round(mean(stats.durations) / 1000)}s`
      : 'n/a';
    const avgC = stats.costs.length > 0
      ? `$${mean(stats.costs).toFixed(2)}`
      : 'n/a';
    // Fixed 30-character name column: long names are truncated with an ellipsis.
    const label = name.length > 30 ? name.slice(0, 27) + '...' : name.padEnd(30);
    console.log(` ${label} ${avgC} ${avgT}t ${avgD} (${stats.turns.length} runs)`);
  }
  console.log('─'.repeat(70));
}
if (flakyTests.length > 0) {
console.log(` Flaky tests (${flakyTests.length}):`);
for (const name of flakyTests) {
console.log(` - ${name}`);
}
console.log('─'.repeat(60));
console.log('─'.repeat(70));
}
if (branchStats.size > 0) {
@@ -119,7 +172,7 @@ if (branchStats.size > 0) {
const det = stats.detections.length > 0 ? ` avg det: ${stats.avgDetection.toFixed(1)}` : '';
console.log(` ${branch.padEnd(30)} ${stats.runs} runs${det}`);
}
console.log('─'.repeat(60));
console.log('─'.repeat(70));
}
// Date range