mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-09 14:55:37 +02:00
test: add gate-tier mode-posture regression tests
Three gate-tier E2E tests detect when preamble / template changes flatten the distinctive posture of /plan-ceo-review SCOPE EXPANSION or /office-hours (startup Q3, builder mode). The V1 regression that this PR fixes shipped without anyone catching it at ship time — this is the ongoing signal so the same thing doesn't happen again. Pieces: - `judgePosture(mode, text)` in `test/helpers/llm-judge.ts`. Sonnet judge with mode-specific dual-axis rubric (expansion: surface_framing + decision_preservation; forcing: stacking_preserved + domain_matched_consequence; builder: unexpected_combinations + excitement_over_optimization). Pass threshold 4/5 on both axes. - Three fixtures in `test/fixtures/mode-posture/` — deterministic input for expansion proposal generation, Q3 forcing question, and builder adjacent-unlock riffing. - `plan-ceo-review-expansion-energy` case appended to `test/skill-e2e-plan.test.ts`. Generator: Opus (skill default). Judge: Sonnet. - New `test/skill-e2e-office-hours.test.ts` with `office-hours-forcing-energy` + `office-hours-builder-wildness` cases. Generator: Sonnet. Judge: Sonnet. - Touchfile registration in `test/helpers/touchfiles.ts` — all three as `gate` tier in `E2E_TIERS`, triggered by changes to `scripts/resolvers/preamble.ts`, the relevant skill template, the judge helper, or any mode-posture fixture. Cost: ~$0.50-$1.50 per triggered PR. Sonnet judge is cheap; Opus generator for the plan-ceo-review case dominates. Known V1.1 tradeoff: judges test prose markers more than deep behavior. V1.2 candidate is a cross-provider (Codex) adversarial judge on the same output to decouple house-style bias.
This commit is contained in:
@@ -6,6 +6,7 @@ import {
|
||||
copyDirSync, setupBrowseShims, logCost, recordE2E,
|
||||
createEvalCollector, finalizeEvalCollector,
|
||||
} from './helpers/e2e-helpers';
|
||||
import { judgePosture } from './helpers/llm-judge';
|
||||
import { spawnSync } from 'child_process';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
@@ -183,6 +184,79 @@ Focus on reviewing the plan content: architecture, error handling, security, and
|
||||
}, 420_000);
|
||||
});
|
||||
|
||||
// --- Plan CEO Review SCOPE EXPANSION energy (V1.1 mode-posture regression gate) ---
|
||||
|
||||
describeIfSelected('Plan CEO Review Expansion Energy E2E', ['plan-ceo-review-expansion-energy'], () => {
|
||||
let planDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
planDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-plan-ceo-exp-'));
|
||||
const run = (cmd: string, args: string[]) =>
|
||||
spawnSync(cmd, args, { cwd: planDir, stdio: 'pipe', timeout: 5000 });
|
||||
|
||||
run('git', ['init', '-b', 'main']);
|
||||
run('git', ['config', 'user.email', 'test@test.com']);
|
||||
run('git', ['config', 'user.name', 'Test']);
|
||||
|
||||
// Use the shared fixture so expansion-energy regressions are reproducible.
|
||||
const fixture = fs.readFileSync(
|
||||
path.join(ROOT, 'test', 'fixtures', 'mode-posture', 'expansion-plan.md'),
|
||||
'utf-8',
|
||||
);
|
||||
fs.writeFileSync(path.join(planDir, 'plan.md'), fixture);
|
||||
|
||||
run('git', ['add', '.']);
|
||||
run('git', ['commit', '-m', 'add plan']);
|
||||
|
||||
fs.mkdirSync(path.join(planDir, 'plan-ceo-review'), { recursive: true });
|
||||
fs.copyFileSync(
|
||||
path.join(ROOT, 'plan-ceo-review', 'SKILL.md'),
|
||||
path.join(planDir, 'plan-ceo-review', 'SKILL.md'),
|
||||
);
|
||||
});
|
||||
|
||||
afterAll(() => {
|
||||
try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
testConcurrentIfSelected('plan-ceo-review-expansion-energy', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read plan-ceo-review/SKILL.md for the review workflow.
|
||||
|
||||
Read plan.md — that's the plan to review. This is a standalone plan document, not a codebase — skip any codebase exploration or system audit steps.
|
||||
|
||||
Choose SCOPE EXPANSION mode. Skip any AskUserQuestion calls — this is non-interactive. Auto-approve the ideal-architecture approach in 0C-bis. For 0D, run all three analyses (10x check, platonic ideal, delight opportunities), then emit exactly 2 concrete expansion proposals in the opt-in ceremony.
|
||||
|
||||
Write your expansion proposals to ${planDir}/proposals.md with ONLY the proposal text — no conversational wrapper, no review summary, no mode analysis. Each proposal separated by "---".`,
|
||||
workingDirectory: planDir,
|
||||
maxTurns: 15,
|
||||
timeout: 360_000,
|
||||
testName: 'plan-ceo-review-expansion-energy',
|
||||
runId,
|
||||
model: 'claude-opus-4-6',
|
||||
});
|
||||
|
||||
logCost('/plan-ceo-review (EXPANSION ENERGY)', result);
|
||||
recordE2E(evalCollector, '/plan-ceo-review-expansion-energy', 'Plan CEO Review Expansion Energy E2E', result, {
|
||||
passed: ['success', 'error_max_turns'].includes(result.exitReason),
|
||||
});
|
||||
expect(['success', 'error_max_turns']).toContain(result.exitReason);
|
||||
|
||||
const proposalsPath = path.join(planDir, 'proposals.md');
|
||||
if (!fs.existsSync(proposalsPath)) {
|
||||
throw new Error('Agent did not emit proposals.md — expansion energy eval requires proposal output');
|
||||
}
|
||||
const proposalText = fs.readFileSync(proposalsPath, 'utf-8');
|
||||
expect(proposalText.length).toBeGreaterThan(200);
|
||||
|
||||
const scores = await judgePosture('expansion', proposalText);
|
||||
console.log('Expansion energy scores:', JSON.stringify(scores, null, 2));
|
||||
// Pass threshold: 4/5 on both axes (good — matches posture with minor weakness).
|
||||
expect(scores.axis_a).toBeGreaterThanOrEqual(4); // surface_framing
|
||||
expect(scores.axis_b).toBeGreaterThanOrEqual(4); // decision_preservation
|
||||
}, 600_000);
|
||||
});
|
||||
|
||||
// --- Plan Eng Review E2E ---
|
||||
|
||||
describeIfSelected('Plan Eng Review E2E', ['plan-eng-review'], () => {
|
||||
|
||||
Reference in New Issue
Block a user