mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-08 06:26:45 +02:00
test: apply ship review-army findings — helper extract, slice SKILL.md, defensive judge
Five categories of fixes surfaced by the /ship pre-landing reviews (testing + maintainability + security + performance + adversarial Claude), applied as one review-iteration commit.

Refactor — collapse 5x duplicated judge-assertion block:
- Add assertRecommendationQuality() + RECOMMENDATION_SUBSTANCE_THRESHOLD constant to test/helpers/e2e-helpers.ts.
- Plan-format (4 cases) and Phase 4 (1 case) collapse from ~22 lines each to a single helper call. Future rubric tweaks land in one place instead of five.

Performance — extract Phase 4 slice instead of copying full SKILL.md:
- Phase 4 test fixture now reads office-hours/SKILL.md and writes only the AskUserQuestion Format section + Phase 4 section to the tmpdir, per CLAUDE.md "extract, don't copy" rule. Verified locally: cost dropped from $0.51 → $0.36/run, turn count 8 → 4, latency 50s → 36s. Reduces Opus context bloat without weakening the regression check.
- Add `if (!workDir) return` guard to Phase 4 afterAll cleanup so a skipped describe block doesn't silently fs.rmSync(undefined) under the empty catch.

Defense — judge prompt + output:
- Wrap captured AskUserQuestion text in clearly delimited UNTRUSTED_CONTEXT block with explicit instruction to treat its content as data, not commands. Cheap defense against the (unlikely but real) injection vector where a captured AskUserQuestion contains "Ignore previous instructions" text.
- Bump captured-text budget from 4000 → 8000 chars; real plan-format menus with 4 options × ~800 chars exceed 4000 and were silently truncating Haiku context mid-option.

Cleanup — abbreviation rule + dead imports + touchfile consistency:
- AUQ → AskUserQuestion in 3 sites (office-hours/SKILL.md.tmpl Phase 4 footer, two test comments) per the always-write-in-full memory rule. Regenerated office-hours/SKILL.md.
- Drop unused `describe`/`test` imports in 2 new test files (only describeIfSelected/testConcurrentIfSelected wrappers are used).
- Add `test/skill-e2e-office-hours-phase4.test.ts` to its own touchfile entry for consistency with other entries that include their test file.
- Fix misleading comment in fixture test about LLM short-circuiting (it's has_because, not commits, that skips the API call).

Verified: build clean, free `bun test` exits 0, fixture test 30/30 expect() calls pass, Phase 4 paid eval passes substance 5 in 36s.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -22,10 +22,9 @@ import { runSkillTest } from './helpers/session-runner';
|
||||
import {
|
||||
ROOT, runId,
|
||||
describeIfSelected, testConcurrentIfSelected,
|
||||
logCost, recordE2E,
|
||||
logCost, assertRecommendationQuality,
|
||||
createEvalCollector, finalizeEvalCollector,
|
||||
} from './helpers/e2e-helpers';
|
||||
import { judgeRecommendation } from './helpers/llm-judge';
|
||||
import { spawnSync } from 'child_process';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
@@ -153,26 +152,14 @@ After writing the file, stop. Do not continue the review.`,
|
||||
expect(captured).not.toMatch(COMPLETENESS_RE);
|
||||
expect(captured).toMatch(KIND_NOTE_RE);
|
||||
|
||||
// Recommendation-quality judge: deterministic regex for present/commits/has_because,
|
||||
// Haiku 4.5 for reason_substance 1-5. Threshold >= 4 catches generic-tier reasoning.
|
||||
const recScore = await judgeRecommendation(captured);
|
||||
recordE2E(evalCollector, '/plan-ceo-review-format-mode', 'Plan Format — CEO Mode Selection', result, {
|
||||
await assertRecommendationQuality({
|
||||
captured,
|
||||
evalCollector,
|
||||
evalId: '/plan-ceo-review-format-mode',
|
||||
evalTitle: 'Plan Format — CEO Mode Selection',
|
||||
result,
|
||||
passed: ['success', 'error_max_turns'].includes(result.exitReason),
|
||||
judge_scores: {
|
||||
rec_present: recScore.present ? 1 : 0,
|
||||
rec_commits: recScore.commits ? 1 : 0,
|
||||
rec_has_because: recScore.has_because ? 1 : 0,
|
||||
rec_substance: recScore.reason_substance,
|
||||
},
|
||||
judge_reasoning: `${recScore.reasoning} | reason: "${recScore.reason_text}"`,
|
||||
});
|
||||
expect(recScore.present, recScore.reasoning).toBe(true);
|
||||
expect(recScore.commits, recScore.reasoning).toBe(true);
|
||||
expect(recScore.has_because, recScore.reasoning).toBe(true);
|
||||
expect(
|
||||
recScore.reason_substance,
|
||||
`${recScore.reasoning}\n reason: "${recScore.reason_text}"`,
|
||||
).toBeGreaterThanOrEqual(4);
|
||||
}, 300_000);
|
||||
});
|
||||
|
||||
@@ -221,24 +208,14 @@ After writing the file, stop. Do not continue the review.`,
|
||||
// presence checked by the judge.
|
||||
expect(captured).toMatch(COMPLETENESS_RE);
|
||||
|
||||
const recScore = await judgeRecommendation(captured);
|
||||
recordE2E(evalCollector, '/plan-ceo-review-format-approach', 'Plan Format — CEO Approach Menu', result, {
|
||||
await assertRecommendationQuality({
|
||||
captured,
|
||||
evalCollector,
|
||||
evalId: '/plan-ceo-review-format-approach',
|
||||
evalTitle: 'Plan Format — CEO Approach Menu',
|
||||
result,
|
||||
passed: ['success', 'error_max_turns'].includes(result.exitReason),
|
||||
judge_scores: {
|
||||
rec_present: recScore.present ? 1 : 0,
|
||||
rec_commits: recScore.commits ? 1 : 0,
|
||||
rec_has_because: recScore.has_because ? 1 : 0,
|
||||
rec_substance: recScore.reason_substance,
|
||||
},
|
||||
judge_reasoning: `${recScore.reasoning} | reason: "${recScore.reason_text}"`,
|
||||
});
|
||||
expect(recScore.present, recScore.reasoning).toBe(true);
|
||||
expect(recScore.commits, recScore.reasoning).toBe(true);
|
||||
expect(recScore.has_because, recScore.reasoning).toBe(true);
|
||||
expect(
|
||||
recScore.reason_substance,
|
||||
`${recScore.reasoning}\n reason: "${recScore.reason_text}"`,
|
||||
).toBeGreaterThanOrEqual(4);
|
||||
}, 300_000);
|
||||
});
|
||||
|
||||
@@ -290,24 +267,14 @@ After writing the file with that ONE question, stop. Do not continue the review.
|
||||
// presence checked by the judge.
|
||||
expect(captured).toMatch(COMPLETENESS_RE);
|
||||
|
||||
const recScore = await judgeRecommendation(captured);
|
||||
recordE2E(evalCollector, '/plan-eng-review-format-coverage', 'Plan Format — Eng Coverage Issue', result, {
|
||||
await assertRecommendationQuality({
|
||||
captured,
|
||||
evalCollector,
|
||||
evalId: '/plan-eng-review-format-coverage',
|
||||
evalTitle: 'Plan Format — Eng Coverage Issue',
|
||||
result,
|
||||
passed: ['success', 'error_max_turns'].includes(result.exitReason),
|
||||
judge_scores: {
|
||||
rec_present: recScore.present ? 1 : 0,
|
||||
rec_commits: recScore.commits ? 1 : 0,
|
||||
rec_has_because: recScore.has_because ? 1 : 0,
|
||||
rec_substance: recScore.reason_substance,
|
||||
},
|
||||
judge_reasoning: `${recScore.reasoning} | reason: "${recScore.reason_text}"`,
|
||||
});
|
||||
expect(recScore.present, recScore.reasoning).toBe(true);
|
||||
expect(recScore.commits, recScore.reasoning).toBe(true);
|
||||
expect(recScore.has_because, recScore.reasoning).toBe(true);
|
||||
expect(
|
||||
recScore.reason_substance,
|
||||
`${recScore.reasoning}\n reason: "${recScore.reason_text}"`,
|
||||
).toBeGreaterThanOrEqual(4);
|
||||
}, 300_000);
|
||||
});
|
||||
|
||||
@@ -357,24 +324,14 @@ After writing the file with that ONE question, stop. Do not continue the review.
|
||||
expect(captured).not.toMatch(COMPLETENESS_RE);
|
||||
expect(captured).toMatch(KIND_NOTE_RE);
|
||||
|
||||
const recScore = await judgeRecommendation(captured);
|
||||
recordE2E(evalCollector, '/plan-eng-review-format-kind', 'Plan Format — Eng Kind Issue', result, {
|
||||
await assertRecommendationQuality({
|
||||
captured,
|
||||
evalCollector,
|
||||
evalId: '/plan-eng-review-format-kind',
|
||||
evalTitle: 'Plan Format — Eng Kind Issue',
|
||||
result,
|
||||
passed: ['success', 'error_max_turns'].includes(result.exitReason),
|
||||
judge_scores: {
|
||||
rec_present: recScore.present ? 1 : 0,
|
||||
rec_commits: recScore.commits ? 1 : 0,
|
||||
rec_has_because: recScore.has_because ? 1 : 0,
|
||||
rec_substance: recScore.reason_substance,
|
||||
},
|
||||
judge_reasoning: `${recScore.reasoning} | reason: "${recScore.reason_text}"`,
|
||||
});
|
||||
expect(recScore.present, recScore.reasoning).toBe(true);
|
||||
expect(recScore.commits, recScore.reasoning).toBe(true);
|
||||
expect(recScore.has_because, recScore.reasoning).toBe(true);
|
||||
expect(
|
||||
recScore.reason_substance,
|
||||
`${recScore.reasoning}\n reason: "${recScore.reason_text}"`,
|
||||
).toBeGreaterThanOrEqual(4);
|
||||
}, 300_000);
|
||||
});
|
||||
|
||||
|
||||
Reference in New Issue
Block a user