mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-05 13:15:24 +02:00
2b3f9676f2
test/skill-e2e-plan-tune.test.ts — verifies /plan-tune correctly routes
plain-English intent ("review the questions I've been asked") to the
Review question log section without requiring CLI subcommand syntax.
Seeds a synthetic question-log.jsonl with 3 entries exercising:
- override behavior (user chose expand over recommended selective)
- one-way door respect (user followed ship-test-failure-triage recommendation)
- two-way override (user skipped recommended changelog polish)
Invokes the skill via `claude -p` and asserts:
- Agent surfaces >= 2 of 3 logged question_ids in output
- Agent notices override/skip behavior from the log (soft check: a miss only logs a warning, it does not fail the test)
- Exit reason is success or error_max_turns (not agent-crash)
Gate-tier because the core v1 DX promise is plain-English intent routing.
If it requires memorized subcommands or breaks on natural language, that's
a regression of the defining feature.
Registered in test/helpers/touchfiles.ts with dependencies:
- plan-tune/** (skill template + generated md)
- scripts/question-registry.ts (required for log lookup)
- scripts/psychographic-signals.ts, scripts/one-way-doors.ts (derive path)
- bin/gstack-question-log, gstack-question-preference, gstack-developer-profile
Skipped when EVALS_ENABLED is not set; runs on `bun run test:evals`.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
189 lines
7.2 KiB
TypeScript
189 lines
7.2 KiB
TypeScript
import { beforeAll, afterAll, expect } from 'bun:test';
|
|
import { runSkillTest } from './helpers/session-runner';
|
|
import {
|
|
ROOT, runId,
|
|
describeIfSelected, testConcurrentIfSelected,
|
|
copyDirSync, logCost, recordE2E,
|
|
createEvalCollector, finalizeEvalCollector,
|
|
} from './helpers/e2e-helpers';
|
|
import { spawnSync } from 'child_process';
|
|
import * as fs from 'fs';
|
|
import * as path from 'path';
|
|
import * as os from 'os';
|
|
|
|
// Collector for this file's E2E results, created under the name 'e2e-plan-tune'.
// The test body passes it to recordE2E and afterAll passes it to finalizeEvalCollector.
const evalCollector = createEvalCollector('e2e-plan-tune');
|
|
|
|
// ---------------------------------------------------------------------------
// /plan-tune E2E: verify the skill recognizes plain-English intent and hits
// the right binary paths without CLI subcommand syntax.
//
// This is a gate-tier test — if /plan-tune requires memorized subcommands or
// fails on plain English, that is a regression of the core v1 DX promise.
// ---------------------------------------------------------------------------
|
|
|
|
describeIfSelected('PlanTune E2E', ['plan-tune-inspect'], () => {
|
|
let workDir: string;
|
|
let gstackHome: string;
|
|
let slug: string;
|
|
|
|
// Builds the sandbox used by this suite: a throwaway git repo holding a copy of
// the plan-tune skill directory, the bins and scripts it refers to, and a
// GSTACK_HOME directory pre-seeded with a question log and a config file.
beforeAll(() => {
  workDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-plan-tune-'));
  gstackHome = path.join(workDir, '.gstack-home');

  // Minimal repository with a single commit. The spawn results are not
  // inspected, so a failing git command does not abort the setup.
  const git = (...args: string[]) =>
    spawnSync('git', args, { cwd: workDir, stdio: 'pipe', timeout: 5000 });
  git('init', '-b', 'main');
  git('config', 'user.email', 'test@test.com');
  git('config', 'user.name', 'Test');
  fs.writeFileSync(path.join(workDir, 'README.md'), '# test\n');
  git('add', '.');
  git('commit', '-m', 'initial');

  // Copy the whole plan-tune skill directory; the test prompt tells the agent
  // to read ./plan-tune/SKILL.md from the working directory.
  copyDirSync(path.join(ROOT, 'plan-tune'), path.join(workDir, 'plan-tune'));

  // Bins the skill references by path. A bin that does not exist under
  // ROOT/bin is skipped without error.
  const binDir = path.join(workDir, 'bin');
  fs.mkdirSync(binDir, { recursive: true });
  const binNames = [
    'gstack-slug',
    'gstack-config',
    'gstack-question-log',
    'gstack-question-preference',
    'gstack-developer-profile',
    'gstack-builder-profile',
  ];
  for (const name of binNames) {
    const from = path.join(ROOT, 'bin', name);
    if (!fs.existsSync(from)) continue;
    const to = path.join(binDir, name);
    fs.copyFileSync(from, to);
    fs.chmodSync(to, 0o755);
  }

  // gstack-developer-profile --derive imports from scripts/, so those modules
  // are copied as well. Unlike the bins, a missing script makes the copy throw.
  const scriptsDir = path.join(workDir, 'scripts');
  fs.mkdirSync(scriptsDir, { recursive: true });
  ['question-registry.ts', 'psychographic-signals.ts', 'archetypes.ts', 'one-way-doors.ts'].forEach((file) => {
    fs.copyFileSync(path.join(ROOT, 'scripts', file), path.join(scriptsDir, file));
  });

  // Same slug the binary produces via its basename fallback: the directory
  // name with every character outside [a-zA-Z0-9._-] removed.
  slug = path.basename(workDir).replace(/[^a-zA-Z0-9._-]/g, '');

  // Seed the project's question log so "review questions" has data to show:
  // one override, one followed one-way-door recommendation, one skip.
  const projectDir = path.join(gstackHome, 'projects', slug);
  fs.mkdirSync(projectDir, { recursive: true });
  const seededQuestions = [
    {
      ts: '2026-04-10T10:00:00Z',
      skill: 'plan-ceo-review',
      question_id: 'plan-ceo-review-mode',
      question_summary: 'Which review mode?',
      category: 'routing',
      door_type: 'two-way',
      options_count: 4,
      user_choice: 'expand',
      recommended: 'selective',
      followed_recommendation: false,
      session_id: 's1',
    },
    {
      ts: '2026-04-11T10:00:00Z',
      skill: 'ship',
      question_id: 'ship-test-failure-triage',
      question_summary: 'Test failed',
      category: 'approval',
      door_type: 'one-way',
      options_count: 3,
      user_choice: 'fix-now',
      recommended: 'fix-now',
      followed_recommendation: true,
      session_id: 's2',
    },
    {
      ts: '2026-04-12T10:00:00Z',
      skill: 'ship',
      question_id: 'ship-changelog-voice-polish',
      question_summary: 'Polish changelog voice',
      category: 'approval',
      door_type: 'two-way',
      options_count: 2,
      user_choice: 'skip',
      recommended: 'accept',
      followed_recommendation: false,
      session_id: 's3',
    },
  ];
  // JSONL: one serialized entry per line, each terminated by a newline.
  const jsonl = seededQuestions.map((entry) => `${JSON.stringify(entry)}\n`).join('');
  fs.writeFileSync(path.join(projectDir, 'question-log.jsonl'), jsonl);

  // Pre-set question_tuning=true so the skill doesn't enter the first-time setup flow.
  fs.mkdirSync(gstackHome, { recursive: true });
  fs.writeFileSync(path.join(gstackHome, 'config.yaml'), 'question_tuning: true\n');
});
|
|
|
|
// Tears down the sandbox and hands the collected results to finalizeEvalCollector.
afterAll(() => {
  // Best-effort cleanup: any error from removing the temp directory is ignored
  // so that the collector is still finalized.
  try {
    fs.rmSync(workDir, { force: true, recursive: true });
  } catch {
    // intentionally ignored
  }

  finalizeEvalCollector(evalCollector);
});
|
|
|
|
// -------------------------------------------------------------------------
|
|
// Plain-English intent: "review my questions"
|
|
// -------------------------------------------------------------------------
|
|
// Runs the skill with a plain-English request and checks that the agent's
// output reflects the seeded question log. Hard requirements: an acceptable
// exit reason and at least 2 of the 3 seeded questions mentioned. Override
// detection is a soft check that only logs a warning.
testConcurrentIfSelected('plan-tune-inspect', async () => {
  const prompt = `Read ./plan-tune/SKILL.md for the /plan-tune skill instructions.

The user has invoked /plan-tune and says: "Review the questions I've been asked recently."

IMPORTANT:
- Use GSTACK_HOME="${gstackHome}" as an environment variable for all bin calls.
- Replace any ~/.claude/skills/gstack/bin/ references with ./bin/ (relative path).
- Replace any ~/.claude/skills/gstack/scripts/ references with ./scripts/.
- Do NOT use AskUserQuestion.
- Do NOT implement code changes.
- Route the user's intent to the right section of the skill (Review question log).
- Show them the logged questions with counts and the follow/override ratio.`;

  const result = await runSkillTest({
    prompt,
    workingDirectory: workDir,
    maxTurns: 15,
    allowedTools: ['Bash', 'Read', 'Grep', 'Glob'],
    timeout: 120_000,
    testName: 'plan-tune-inspect',
    runId,
  });

  logCost('/plan-tune review', result);

  // All matching is case-insensitive: the output is lowercased once and the
  // needles below are already lowercase.
  const transcript = result.output.toLowerCase();
  const mentionsAny = (...needles: string[]) =>
    needles.some((needle) => transcript.includes(needle));

  // One flag per seeded question; either its id or a phrase from its summary counts.
  const questionHits = [
    mentionsAny('plan-ceo-review-mode', 'review mode'),
    mentionsAny('ship-test-failure-triage', 'test failed'),
    mentionsAny('changelog', 'ship-changelog-voice-polish'),
  ];
  const foundCount = questionHits.filter(Boolean).length;

  // The seeded log has two non-followed recommendations (CEO review mode and
  // changelog polish); look for any wording that hints the agent surfaced them.
  const noticedOverride = mentionsAny('overrid', 'skip', 'expand');

  // Running out of turns is tolerated; any other non-success exit is a failure.
  const exitOk =
    result.exitReason === 'success' || result.exitReason === 'error_max_turns';

  // Record the outcome before asserting so a failing run is still captured.
  recordE2E(evalCollector, '/plan-tune', 'Plan-tune inspection flow (plain English)', result, {
    passed: exitOk && foundCount >= 2,
  });

  expect(exitOk).toBe(true);
  expect(foundCount).toBeGreaterThanOrEqual(2);

  if (!noticedOverride) {
    console.warn('Agent did not surface override/skip behavior from the log');
  }
}, 180_000);
|
|
});
|