From 2b3f9676f2ffb76f6914ec375d8e7ea5133df7bb Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Fri, 17 Apr 2026 06:48:10 +0800 Subject: [PATCH] test: E2E test for /plan-tune plain-English inspection flow (gate tier) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test/skill-e2e-plan-tune.test.ts — verifies /plan-tune correctly routes plain-English intent ("review the questions I've been asked") to the Review question log section without requiring CLI subcommand syntax. Seeds a synthetic question-log.jsonl with 3 entries exercising: - override behavior (user chose expand over recommended selective) - one-way door respect (user followed ship-test-failure-triage recommendation) - two-way override (user skipped recommended changelog polish) Invokes the skill via `claude -p` and asserts: - Agent surfaces >= 2 of 3 logged questions in output (matched by question_id or a summary keyword) - Agent notices override/skip behavior from the log (soft check: logs a console.warn only, does not fail the test) - Exit reason is success or error_max_turns (not agent-crash) Gate-tier because the core v1 DX promise is plain-English intent routing. If it requires memorized subcommands or breaks on natural language, that's a regression of the defining feature. Registered in test/helpers/touchfiles.ts with dependencies: - plan-tune/** (skill template + generated md) - scripts/question-registry.ts (required for log lookup) - scripts/psychographic-signals.ts, scripts/one-way-doors.ts (derive path) - bin/gstack-question-log, gstack-question-preference, gstack-developer-profile Skipped when EVALS_ENABLED is not set; runs on `bun run test:evals`. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- test/helpers/touchfiles.ts | 6 + test/skill-e2e-plan-tune.test.ts | 188 +++++++++++++++++++++++++++++++ 2 files changed, 194 insertions(+) create mode 100644 test/skill-e2e-plan-tune.test.ts diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index 34ead7d0..8adcb049 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -79,6 +79,9 @@ export const E2E_TOUCHFILES: Record = { 'plan-eng-review-artifact': ['plan-eng-review/**'], 'plan-review-report': ['plan-eng-review/**', 'scripts/gen-skill-docs.ts'], + // /plan-tune (v1 observational) + 'plan-tune-inspect': ['plan-tune/**', 'scripts/question-registry.ts', 'scripts/psychographic-signals.ts', 'scripts/one-way-doors.ts', 'bin/gstack-question-log', 'bin/gstack-question-preference', 'bin/gstack-developer-profile'], + // Codex offering verification 'codex-offered-office-hours': ['office-hours/**', 'scripts/gen-skill-docs.ts'], 'codex-offered-ceo-review': ['plan-ceo-review/**', 'scripts/gen-skill-docs.ts'], @@ -239,6 +242,9 @@ export const E2E_TIERS: Record = { 'plan-eng-coverage-audit': 'gate', 'plan-review-report': 'gate', + // /plan-tune — gate (core v1 DX promise: plain-English intent routing) + 'plan-tune-inspect': 'gate', + // Codex offering verification 'codex-offered-office-hours': 'gate', 'codex-offered-ceo-review': 'gate', diff --git a/test/skill-e2e-plan-tune.test.ts b/test/skill-e2e-plan-tune.test.ts new file mode 100644 index 00000000..dd750208 --- /dev/null +++ b/test/skill-e2e-plan-tune.test.ts @@ -0,0 +1,188 @@ +import { beforeAll, afterAll, expect } from 'bun:test'; +import { runSkillTest } from './helpers/session-runner'; +import { + ROOT, runId, + describeIfSelected, testConcurrentIfSelected, + copyDirSync, logCost, recordE2E, + createEvalCollector, finalizeEvalCollector, +} from './helpers/e2e-helpers'; +import { spawnSync } from 'child_process'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as 
os from 'os'; + +const evalCollector = createEvalCollector('e2e-plan-tune'); + +// --------------------------------------------------------------------------- +// /plan-tune E2E: verify the skill recognizes plain-English intent and hits +// the right binary paths without CLI subcommand syntax. +// +// This is a gate-tier test — if /plan-tune requires memorized subcommands or +// fails on plain English, that is a regression of the core v1 DX promise. +// +// Sandbox built in beforeAll, all under a fresh temp workDir: +//   plan-tune/, bin/, scripts/ copied from ROOT +//   .gstack-home/config.yaml and .gstack-home/projects/SLUG/question-log.jsonl +// afterAll removes workDir (removal errors are ignored) and then calls +// finalizeEvalCollector. NOTE(review): that call is not awaited; if the helper +// is async its promise is dropped — confirm against e2e-helpers. +// --------------------------------------------------------------------------- + +describeIfSelected('PlanTune E2E', ['plan-tune-inspect'], () => { + // Assigned in beforeAll and read by the test below. + let workDir: string; + let gstackHome: string; + let slug: string; + + beforeAll(() => { + workDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-plan-tune-')); + gstackHome = path.join(workDir, '.gstack-home'); + + // Turn workDir into a git repo with a single commit. The spawnSync results + // are discarded, so a failed git command is not detected at this point. + const run = (cmd: string, args: string[]) => + spawnSync(cmd, args, { cwd: workDir, stdio: 'pipe', timeout: 5000 }); + run('git', ['init', '-b', 'main']); + run('git', ['config', 'user.email', 'test@test.com']); + run('git', ['config', 'user.name', 'Test']); + fs.writeFileSync(path.join(workDir, 'README.md'), '# test\n'); + run('git', ['add', '.']); + run('git', ['commit', '-m', 'initial']); + + // Copy the /plan-tune skill directory into the sandbox via copyDirSync. + // NOTE(review): this comment used to say only the flow section is extracted + // (full template is ~45KB and includes preamble boilerplate the agent does + // not need); no extraction step is visible in this setup — confirm whether + // copyDirSync filters anything. + copyDirSync(path.join(ROOT, 'plan-tune'), path.join(workDir, 'plan-tune')); + + // Copy required bins — the skill references these by path. + // Each copy is made executable (0o755). A bin that does not exist under + // ROOT/bin is skipped by the existsSync guard instead of failing setup. 
+ const binDir = path.join(workDir, 'bin'); + fs.mkdirSync(binDir, { recursive: true }); + for (const script of [ + 'gstack-slug', + 'gstack-config', + 'gstack-question-log', + 'gstack-question-preference', + 'gstack-developer-profile', + 'gstack-builder-profile', + ]) { + const src = path.join(ROOT, 'bin', script); + if (fs.existsSync(src)) { + fs.copyFileSync(src, path.join(binDir, script)); + fs.chmodSync(path.join(binDir, script), 0o755); + } + } + + // gstack-developer-profile --derive imports from scripts/ — copy those too. + // Unlike the bins above these copies are unguarded: a missing script makes + // copyFileSync throw and beforeAll fail. + // NOTE(review): archetypes.ts is copied here but is not listed in the + // plan-tune-inspect entry of E2E_TOUCHFILES — presumably a change to it + // alone would not select this test; confirm whether that is intended. + const scriptsDir = path.join(workDir, 'scripts'); + fs.mkdirSync(scriptsDir, { recursive: true }); + for (const src of ['question-registry.ts', 'psychographic-signals.ts', 'archetypes.ts', 'one-way-doors.ts']) { + fs.copyFileSync(path.join(ROOT, 'scripts', src), path.join(scriptsDir, src)); + } + + // Compute slug the same way the binary does (basename fallback). + // Here that means: basename of workDir with every character outside + // [a-zA-Z0-9._-] removed. NOTE(review): parity with bin/gstack-slug is not + // checked by this test — confirm the two stay in sync. + slug = path.basename(workDir).replace(/[^a-zA-Z0-9._-]/g, ''); + + // Seed a few question-log entries so "review questions" has something to show. + // Three entries, one JSON object per line: followed_recommendation is false + // for plan-ceo-review-mode and ship-changelog-voice-polish, and true for + // ship-test-failure-triage (the only one-way door). 
+ const projectDir = path.join(gstackHome, 'projects', slug); + fs.mkdirSync(projectDir, { recursive: true }); + const entries = [ + { + ts: '2026-04-10T10:00:00Z', + skill: 'plan-ceo-review', + question_id: 'plan-ceo-review-mode', + question_summary: 'Which review mode?', + category: 'routing', + door_type: 'two-way', + options_count: 4, + user_choice: 'expand', + recommended: 'selective', + followed_recommendation: false, + session_id: 's1', + }, + { + ts: '2026-04-11T10:00:00Z', + skill: 'ship', + question_id: 'ship-test-failure-triage', + question_summary: 'Test failed', + category: 'approval', + door_type: 'one-way', + options_count: 3, + user_choice: 'fix-now', + recommended: 'fix-now', + followed_recommendation: true, + session_id: 's2', + }, + { + ts: '2026-04-12T10:00:00Z', + skill: 'ship', + question_id: 'ship-changelog-voice-polish', + question_summary: 'Polish changelog voice', + category: 'approval', + door_type: 'two-way', + options_count: 2, + user_choice: 'skip', + recommended: 'accept', + followed_recommendation: false, + session_id: 's3', + }, + ]; + fs.writeFileSync( + path.join(projectDir, 'question-log.jsonl'), + entries.map((e) => JSON.stringify(e)).join('\n') + '\n', + ); + + // Pre-set question_tuning=true so the skill doesn't enter the first-time setup flow. + // cfgDir below is gstackHome itself (path.join with a single argument), so + // config.yaml is written at the root of gstackHome. 
+ const cfgDir = path.join(gstackHome); + fs.mkdirSync(cfgDir, { recursive: true }); + fs.writeFileSync(path.join(cfgDir, 'config.yaml'), 'question_tuning: true\n'); + }); + + afterAll(() => { + try { fs.rmSync(workDir, { recursive: true, force: true }); } catch {} + finalizeEvalCollector(evalCollector); + }); + + // ------------------------------------------------------------------------- + // Plain-English intent: "review my questions" + // + // Hard checks: exitReason is success or error_max_turns, and at least two of + // the three seeded questions show up in the lower-cased agent output (matched + // by question_id or by a looser summary keyword). + // Override/skip detection is advisory only: a miss logs console.warn and does + // not fail the test. + // GSTACK_HOME reaches the agent through the prompt text; no env option is + // passed to runSkillTest here. + // ------------------------------------------------------------------------- + testConcurrentIfSelected('plan-tune-inspect', async () => { + const result = await runSkillTest({ + prompt: `Read ./plan-tune/SKILL.md for the /plan-tune skill instructions. + +The user has invoked /plan-tune and says: "Review the questions I've been asked recently." + +IMPORTANT: +- Use GSTACK_HOME="${gstackHome}" as an environment variable for all bin calls. +- Replace any ~/.claude/skills/gstack/bin/ references with ./bin/ (relative path). +- Replace any ~/.claude/skills/gstack/scripts/ references with ./scripts/. +- Do NOT use AskUserQuestion. +- Do NOT implement code changes. +- Route the user's intent to the right section of the skill (Review question log). 
+- Show them the logged questions with counts and the follow/override ratio.`, + workingDirectory: workDir, + maxTurns: 15, + allowedTools: ['Bash', 'Read', 'Grep', 'Glob'], + timeout: 120_000, + testName: 'plan-tune-inspect', + runId, + }); + + logCost('/plan-tune review', result); + + const output = result.output.toLowerCase(); + + // Agent must have surfaced at least 2 of the 3 logged question_ids + const mentionsCEO = output.includes('plan-ceo-review-mode') || output.includes('review mode'); + const mentionsShipTest = output.includes('ship-test-failure-triage') || output.includes('test failed'); + const mentionsChangelog = output.includes('changelog') || output.includes('ship-changelog-voice-polish'); + const foundCount = [mentionsCEO, mentionsShipTest, mentionsChangelog].filter(Boolean).length; + + // Agent should note override behavior (user overrode CEO review and changelog polish) + const noticedOverride = + output.includes('overrid') || + output.includes('skip') || + output.includes('expand'); + + const exitOk = ['success', 'error_max_turns'].includes(result.exitReason); + + recordE2E(evalCollector, '/plan-tune', 'Plan-tune inspection flow (plain English)', result, { + passed: exitOk && foundCount >= 2, + }); + + expect(exitOk).toBe(true); + expect(foundCount).toBeGreaterThanOrEqual(2); + + if (!noticedOverride) { + console.warn('Agent did not surface override/skip behavior from the log'); + } + }, 180_000); +});