test: E2E test for /plan-tune plain-English inspection flow (gate tier)

test/skill-e2e-plan-tune.test.ts — verifies /plan-tune correctly routes plain-English intent ("review the questions I've been asked") to the Review question log section without requiring CLI subcommand syntax.

Seeds a synthetic question-log.jsonl with 3 entries exercising:
- override behavior (user chose expand over recommended selective)
- one-way door respect (user followed ship-test-failure-triage recommendation)
- two-way override (user skipped recommended changelog polish)

Invokes the skill via `claude -p` and asserts:
- Agent surfaces >= 2 of 3 logged question_ids in output
- Exit reason is success or error_max_turns (not agent-crash)

Additionally logs a warning (without failing the test) if the agent does not notice the override/skip behavior from the log.

Gate-tier because the core v1 DX promise is plain-English intent routing. If it requires memorized subcommands or breaks on natural language, that's a regression of the defining feature.

Registered in test/helpers/touchfiles.ts with dependencies:
- plan-tune/** (skill template + generated md)
- scripts/question-registry.ts (required for log lookup)
- scripts/psychographic-signals.ts, scripts/one-way-doors.ts (derive path)
- bin/gstack-question-log, gstack-question-preference, gstack-developer-profile

Skipped when EVALS_ENABLED is not set; runs on `bun run test:evals`.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-02 03:35:09 +02:00 · 2026-04-17 06:48:10 +08:00
parent 0427c957f2
commit 2b3f9676f2
2 changed files with 194 additions and 0 deletions
@@ -79,6 +79,9 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
  'plan-eng-review-artifact':  ['plan-eng-review/**'],
  'plan-review-report':        ['plan-eng-review/**', 'scripts/gen-skill-docs.ts'],

+  // /plan-tune (v1 observational)
+  'plan-tune-inspect':         ['plan-tune/**', 'scripts/question-registry.ts', 'scripts/psychographic-signals.ts', 'scripts/one-way-doors.ts', 'bin/gstack-question-log', 'bin/gstack-question-preference', 'bin/gstack-developer-profile'],
+
  // Codex offering verification
  'codex-offered-office-hours':  ['office-hours/**', 'scripts/gen-skill-docs.ts'],
  'codex-offered-ceo-review':    ['plan-ceo-review/**', 'scripts/gen-skill-docs.ts'],
@@ -239,6 +242,9 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
  'plan-eng-coverage-audit': 'gate',
  'plan-review-report': 'gate',

+  // /plan-tune — gate (core v1 DX promise: plain-English intent routing)
+  'plan-tune-inspect': 'gate',
+
  // Codex offering verification
  'codex-offered-office-hours': 'gate',
  'codex-offered-ceo-review': 'gate',
@@ -0,0 +1,188 @@
+import { beforeAll, afterAll, expect } from 'bun:test';
+import { runSkillTest } from './helpers/session-runner';
+import {
+  ROOT, runId,
+  describeIfSelected, testConcurrentIfSelected,
+  copyDirSync, logCost, recordE2E,
+  createEvalCollector, finalizeEvalCollector,
+} from './helpers/e2e-helpers';
+import { spawnSync } from 'child_process';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+
+// Suite-wide collector for eval results: the test below records into it via
+// recordE2E, and afterAll hands it to finalizeEvalCollector.
+const evalCollector = createEvalCollector('e2e-plan-tune');
+
+// ---------------------------------------------------------------------------
+// /plan-tune E2E: verify the skill recognizes plain-English intent and hits
+// the right binary paths without CLI subcommand syntax.
+//
+// This is a gate-tier test — if /plan-tune requires memorized subcommands or
+// fails on plain English, that is a regression of the core v1 DX promise.
+// ---------------------------------------------------------------------------
+
+describeIfSelected('PlanTune E2E', ['plan-tune-inspect'], () => {
+  // Throwaway git repo the agent works in; created in beforeAll, removed in afterAll.
+  let workDir: string;
+  // State directory inside workDir that the prompt passes to the agent as GSTACK_HOME.
+  let gstackHome: string;
+
+  // Builds the sandbox: a one-commit git repo, the /plan-tune skill, the bins
+  // and scripts it calls, a seeded question log, and a config file.
+  beforeAll(() => {
+    workDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-plan-tune-'));
+    gstackHome = path.join(workDir, '.gstack-home');
+
+    // Setup commands stay best-effort (a failure does not abort the suite), but
+    // failures are reported so a broken sandbox is not mistaken for an agent
+    // regression.
+    const run = (cmd: string, args: string[]) => {
+      const res = spawnSync(cmd, args, { cwd: workDir, stdio: 'pipe', timeout: 5000 });
+      if (res.error || res.status !== 0) {
+        const detail = res.error?.message ?? res.stderr?.toString().trim() ?? '';
+        console.warn(
+          `plan-tune E2E setup: "${cmd} ${args.join(' ')}" failed (status ${res.status}): ${detail}`,
+        );
+      }
+      return res;
+    };
+    run('git', ['init', '-b', 'main']);
+    run('git', ['config', 'user.email', 'test@test.com']);
+    run('git', ['config', 'user.name', 'Test']);
+    fs.writeFileSync(path.join(workDir, 'README.md'), '# test\n');
+    run('git', ['add', '.']);
+    run('git', ['commit', '-m', 'initial']);
+
+    // Copy the whole plan-tune/ directory into the sandbox; the prompt tells
+    // the agent to read ./plan-tune/SKILL.md from it.
+    copyDirSync(path.join(ROOT, 'plan-tune'), path.join(workDir, 'plan-tune'));
+
+    // Copy required bins — the skill references these by path. A bin that is
+    // absent from ROOT/bin is skipped, with a warning so the gap is visible.
+    const binDir = path.join(workDir, 'bin');
+    fs.mkdirSync(binDir, { recursive: true });
+    for (const script of [
+      'gstack-slug',
+      'gstack-config',
+      'gstack-question-log',
+      'gstack-question-preference',
+      'gstack-developer-profile',
+      'gstack-builder-profile',
+    ]) {
+      const src = path.join(ROOT, 'bin', script);
+      if (!fs.existsSync(src)) {
+        console.warn(`plan-tune E2E setup: bin/${script} not found, skipping`);
+        continue;
+      }
+      const dest = path.join(binDir, script);
+      fs.copyFileSync(src, dest);
+      fs.chmodSync(dest, 0o755);
+    }
+
+    // gstack-developer-profile --derive imports from scripts/ — copy those too.
+    const scriptsDir = path.join(workDir, 'scripts');
+    fs.mkdirSync(scriptsDir, { recursive: true });
+    for (const src of ['question-registry.ts', 'psychographic-signals.ts', 'archetypes.ts', 'one-way-doors.ts']) {
+      fs.copyFileSync(path.join(ROOT, 'scripts', src), path.join(scriptsDir, src));
+    }
+
+    // Compute slug the same way the binary does (basename fallback): the
+    // directory name with everything outside [a-zA-Z0-9._-] stripped.
+    const slug = path.basename(workDir).replace(/[^a-zA-Z0-9._-]/g, '');
+
+    // Seed a few question-log entries so "review questions" has something to show.
+    // Two entries go against the recommendation and one follows it.
+    const projectDir = path.join(gstackHome, 'projects', slug);
+    fs.mkdirSync(projectDir, { recursive: true });
+    const entries = [
+      {
+        ts: '2026-04-10T10:00:00Z',
+        skill: 'plan-ceo-review',
+        question_id: 'plan-ceo-review-mode',
+        question_summary: 'Which review mode?',
+        category: 'routing',
+        door_type: 'two-way',
+        options_count: 4,
+        user_choice: 'expand',
+        recommended: 'selective',
+        followed_recommendation: false,
+        session_id: 's1',
+      },
+      {
+        ts: '2026-04-11T10:00:00Z',
+        skill: 'ship',
+        question_id: 'ship-test-failure-triage',
+        question_summary: 'Test failed',
+        category: 'approval',
+        door_type: 'one-way',
+        options_count: 3,
+        user_choice: 'fix-now',
+        recommended: 'fix-now',
+        followed_recommendation: true,
+        session_id: 's2',
+      },
+      {
+        ts: '2026-04-12T10:00:00Z',
+        skill: 'ship',
+        question_id: 'ship-changelog-voice-polish',
+        question_summary: 'Polish changelog voice',
+        category: 'approval',
+        door_type: 'two-way',
+        options_count: 2,
+        user_choice: 'skip',
+        recommended: 'accept',
+        followed_recommendation: false,
+        session_id: 's3',
+      },
+    ];
+    // One JSON object per line, newline-terminated.
+    fs.writeFileSync(
+      path.join(projectDir, 'question-log.jsonl'),
+      entries.map((e) => JSON.stringify(e)).join('\n') + '\n',
+    );
+
+    // Pre-set question_tuning=true so the skill doesn't enter the first-time setup flow.
+    // gstackHome already exists here: the recursive mkdir of projectDir created it.
+    fs.writeFileSync(path.join(gstackHome, 'config.yaml'), 'question_tuning: true\n');
+  });
+
+  // Best-effort sandbox cleanup (errors are deliberately ignored), then
+  // finalize the eval collector.
+  afterAll(() => {
+    try { fs.rmSync(workDir, { recursive: true, force: true }); } catch {}
+    finalizeEvalCollector(evalCollector);
+  });
+
+  // -------------------------------------------------------------------------
+  // Plain-English intent: "review my questions"
+  // -------------------------------------------------------------------------
+  testConcurrentIfSelected('plan-tune-inspect', async () => {
+    const result = await runSkillTest({
+      prompt: `Read ./plan-tune/SKILL.md for the /plan-tune skill instructions.
+
+The user has invoked /plan-tune and says: "Review the questions I've been asked recently."
+
+IMPORTANT:
+- Use GSTACK_HOME="${gstackHome}" as an environment variable for all bin calls.
+- Replace any ~/.claude/skills/gstack/bin/ references with ./bin/ (relative path).
+- Replace any ~/.claude/skills/gstack/scripts/ references with ./scripts/.
+- Do NOT use AskUserQuestion.
+- Do NOT implement code changes.
+- Route the user's intent to the right section of the skill (Review question log).
+- Show them the logged questions with counts and the follow/override ratio.`,
+      workingDirectory: workDir,
+      maxTurns: 15,
+      allowedTools: ['Bash', 'Read', 'Grep', 'Glob'],
+      timeout: 120_000,
+      testName: 'plan-tune-inspect',
+      runId,
+    });
+
+    logCost('/plan-tune review', result);
+
+    // All matching below is case-insensitive.
+    const output = result.output.toLowerCase();
+
+    // Agent must have surfaced at least 2 of the 3 logged questions. A question
+    // counts when its question_id appears, or a recognizable phrase for it
+    // ("review mode", "test failed", "changelog").
+    const mentionsCEO = output.includes('plan-ceo-review-mode') || output.includes('review mode');
+    const mentionsShipTest = output.includes('ship-test-failure-triage') || output.includes('test failed');
+    const mentionsChangelog = output.includes('changelog') || output.includes('ship-changelog-voice-polish');
+    const foundCount = [mentionsCEO, mentionsShipTest, mentionsChangelog].filter(Boolean).length;
+
+    // Agent should note override behavior (user overrode CEO review and changelog polish).
+    // This is a soft signal: a miss is logged below but does not fail the test.
+    const noticedOverride =
+      output.includes('overrid') ||
+      output.includes('skip') ||
+      output.includes('expand');
+
+    // Running out of turns is tolerated; any other non-success exit reason fails.
+    const exitOk = ['success', 'error_max_turns'].includes(result.exitReason);
+
+    // Record the same pass criteria that the expect() calls enforce.
+    recordE2E(evalCollector, '/plan-tune', 'Plan-tune inspection flow (plain English)', result, {
+      passed: exitOk && foundCount >= 2,
+    });
+
+    expect(exitOk).toBe(true);
+    expect(foundCount).toBeGreaterThanOrEqual(2);
+
+    if (!noticedOverride) {
+      console.warn('Agent did not surface override/skip behavior from the log');
+    }
+  }, 180_000);
+});