mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-02 03:35:09 +02:00
test: rewrite 5 plan-mode E2E tests on the real-PTY harness
Replaces SDK-based assertions with the runPlanSkillObservation contract. Each test launches real claude --permission-mode plan, invokes the skill, and asserts the outcome reaches 'asked' or 'plan_ready' within a 300s budget (no silent Write/Edit, no crash, no timeout). Affected: - test/skill-e2e-plan-ceo-plan-mode.test.ts - test/skill-e2e-plan-eng-plan-mode.test.ts - test/skill-e2e-plan-design-plan-mode.test.ts - test/skill-e2e-plan-devex-plan-mode.test.ts - test/skill-e2e-plan-mode-no-op.test.ts (inPlanMode: false; tests the preamble plan-mode-info no-op path) test/e2e-harness-audit.test.ts — recognize runPlanSkillObservation as a valid coverage path alongside the legacy canUseTool / runPlanModeSkillTest. test/helpers/touchfiles.ts — point the 5 plan-mode test selections and the e2e-harness-audit selection at test/helpers/claude-pty-runner.ts instead of the deleted plan-mode-helpers.ts. Proof: `EVALS=1 EVALS_TIER=gate bun test` on these 5 files runs sequentially in 790s and passes 5/5. Same tests were 0/5 on origin/main, on v1.0.0.0, and on this branch with the SDK harness. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,8 +1,11 @@
|
||||
/**
|
||||
* E2E harness audit — every skill with `interactive: true` in its frontmatter
|
||||
* must have at least one test file that uses `canUseTool` via the extended
|
||||
* agent-sdk-runner. This prevents future drift where a skill opts into the
|
||||
* handshake without adding real coverage.
|
||||
* must have at least one test file that drives a real interactive session.
|
||||
* Two valid coverage paths:
|
||||
* 1. `canUseTool` via the agent-sdk-runner (legacy SDK-based path)
|
||||
* 2. `runPlanSkillObservation` via the claude-pty-runner (real-PTY path
|
||||
* added when the SDK harness was found unable to observe plan mode's
|
||||
* native confirmation UI — see test/helpers/claude-pty-runner.ts)
|
||||
*
|
||||
* Runs as a free unit test (no API calls). Pure filesystem scan.
|
||||
*/
|
||||
@@ -76,14 +79,16 @@ function findInteractiveSkills(): string[] {
|
||||
}
|
||||
|
||||
/**
|
||||
* Scan a test file's contents for the canUseTool-via-harness pattern.
|
||||
* Either: direct canUseTool usage in runAgentSdkTest, or usage of the
|
||||
* shared plan-mode-helpers that wrap it.
|
||||
* Scan a test file's contents for any of the supported real-interactive
|
||||
* coverage patterns. One of: direct canUseTool usage in runAgentSdkTest,
|
||||
* the legacy plan-mode-helpers wrapper, or the new real-PTY observation
|
||||
* helper.
|
||||
*/
|
||||
function hasCanUseToolCoverage(testFile: string): boolean {
|
||||
const content = fs.readFileSync(testFile, 'utf-8');
|
||||
if (content.includes('canUseTool')) return true;
|
||||
if (content.includes('runPlanModeSkillTest')) return true;
|
||||
if (content.includes('runPlanSkillObservation')) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@@ -84,14 +84,15 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
|
||||
// Plan-mode smoke tests — gate-tier safety regression tests. Each fires when
|
||||
// any of: the interactive skill's template, the plan-mode resolver
|
||||
// (completion-status now owns generatePlanModeInfo), preamble composition,
|
||||
// the Agent SDK harness, or the shared plan-mode-helpers change.
|
||||
'plan-ceo-review-plan-mode': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/agent-sdk-runner.ts', 'test/helpers/plan-mode-helpers.ts'],
|
||||
'plan-eng-review-plan-mode': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/agent-sdk-runner.ts', 'test/helpers/plan-mode-helpers.ts'],
|
||||
'plan-design-review-plan-mode': ['plan-design-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/agent-sdk-runner.ts', 'test/helpers/plan-mode-helpers.ts'],
|
||||
'plan-devex-review-plan-mode': ['plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/agent-sdk-runner.ts', 'test/helpers/plan-mode-helpers.ts'],
|
||||
'plan-mode-no-op': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/agent-sdk-runner.ts', 'test/helpers/plan-mode-helpers.ts'],
|
||||
'e2e-harness-audit': ['plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'test/helpers/agent-sdk-runner.ts', 'test/helpers/plan-mode-helpers.ts'],
|
||||
// (completion-status owns generatePlanModeInfo), preamble composition, or
|
||||
// the real-PTY runner (which the tests now use instead of the SDK harness)
|
||||
// change.
|
||||
'plan-ceo-review-plan-mode': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
'plan-eng-review-plan-mode': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
'plan-design-review-plan-mode': ['plan-design-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
'plan-devex-review-plan-mode': ['plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
'plan-mode-no-op': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
'e2e-harness-audit': ['plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'test/helpers/agent-sdk-runner.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
'brain-privacy-gate': ['scripts/resolvers/preamble/generate-brain-sync-block.ts', 'scripts/resolvers/preamble.ts', 'bin/gstack-brain-sync', 'bin/gstack-brain-init', 'bin/gstack-config', 'test/helpers/agent-sdk-runner.ts'],
|
||||
|
||||
// AskUserQuestion format regression (RECOMMENDATION + Completeness: N/10)
|
||||
|
||||
@@ -1,38 +1,48 @@
|
||||
/**
|
||||
* plan-ceo-review plan-mode smoke test (gate tier, paid).
|
||||
* plan-ceo-review plan-mode smoke (gate, paid, real-PTY).
|
||||
*
|
||||
* Asserts: when /plan-ceo-review is invoked with the plan-mode distinctive
|
||||
* phrase in the system reminder, the skill goes STRAIGHT to its Step 0
|
||||
* scope-mode AskUserQuestion. Specifically:
|
||||
* 1. First AskUserQuestion is NOT the old vestigial handshake
|
||||
* (A=exit-and-rerun / C=cancel).
|
||||
* 2. No Write or Edit tool fires before the first AskUserQuestion
|
||||
* (catches silent plan-file-write bypass).
|
||||
* 3. ExitPlanMode does not fire before the first AskUserQuestion.
|
||||
* Asserts: when /plan-ceo-review is invoked in plan mode, the skill reaches
|
||||
* a terminal outcome that is either:
|
||||
* - 'asked' — skill emitted its Step 0 numbered prompt (scope mode
|
||||
* selection, or the routing-injection prompt that runs
|
||||
* before Step 0)
|
||||
* - 'plan_ready' — skill ran end-to-end and surfaced claude's native
|
||||
* "Ready to execute" confirmation
|
||||
*
|
||||
* Cost: ~$0.50–$1.00 per run. Gated: EVALS=1 EVALS_TIER=gate.
|
||||
* FAIL conditions: silent Write/Edit before any prompt, claude crash,
|
||||
* timeout.
|
||||
*
|
||||
* Replaces the SDK-based test that never worked: the SDK's canUseTool
|
||||
* interceptor on AskUserQuestion never fires in plan mode because plan
|
||||
* mode renders its native confirmation as TTY UI, not via the
|
||||
* AskUserQuestion tool. The real PTY harness observes the rendered
|
||||
* terminal output directly.
|
||||
*
|
||||
* See test/helpers/claude-pty-runner.ts for runner internals.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import {
|
||||
runPlanModeSkillTest,
|
||||
assertNotHandshakeShape,
|
||||
} from './helpers/plan-mode-helpers';
|
||||
import { runPlanSkillObservation } from './helpers/claude-pty-runner';
|
||||
|
||||
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
|
||||
const describeE2E = shouldRun ? describe : describe.skip;
|
||||
|
||||
describeE2E('plan-ceo-review plan-mode smoke (gate)', () => {
|
||||
test('goes straight to scope-mode question, no handshake, no silent writes', async () => {
|
||||
const result = await runPlanModeSkillTest({
|
||||
test('reaches a terminal outcome (asked or plan_ready) without silent writes', async () => {
|
||||
const obs = await runPlanSkillObservation({
|
||||
skillName: 'plan-ceo-review',
|
||||
// Step 0 asks for review mode; HOLD is the cheapest, most-neutral answer.
|
||||
firstAnswerSubstring: 'HOLD',
|
||||
inPlanMode: true,
|
||||
timeoutMs: 300_000,
|
||||
});
|
||||
|
||||
expect(result.askUserQuestions.length).toBeGreaterThanOrEqual(1);
|
||||
assertNotHandshakeShape(result.askUserQuestions[0]!);
|
||||
expect(result.writeOrEditBeforeAsk).toBe(false);
|
||||
expect(result.exitPlanModeBeforeAsk).toBe(false);
|
||||
}, 120_000);
|
||||
if (obs.outcome === 'silent_write' || obs.outcome === 'exited' || obs.outcome === 'timeout') {
|
||||
throw new Error(
|
||||
`plan-ceo-review plan-mode smoke FAILED: outcome=${obs.outcome}\n` +
|
||||
`summary: ${obs.summary}\n` +
|
||||
`elapsed: ${obs.elapsedMs}ms\n` +
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
expect(['asked', 'plan_ready']).toContain(obs.outcome);
|
||||
}, 360_000);
|
||||
});
|
||||
|
||||
@@ -1,31 +1,36 @@
|
||||
/**
|
||||
* plan-design-review plan-mode smoke test (gate tier, paid).
|
||||
* plan-design-review plan-mode smoke (gate, paid, real-PTY).
|
||||
*
|
||||
* See test/skill-e2e-plan-ceo-plan-mode.test.ts for the shared assertion
|
||||
* contract. Exercises the same assertions against /plan-design-review.
|
||||
* contract. Exercises the same contract against /plan-design-review.
|
||||
*
|
||||
* Note: on no-UI-scope branches plan-design-review legitimately short-
|
||||
* circuits to plan_ready without firing AskUserQuestion. Both 'asked' and
|
||||
* 'plan_ready' are valid pass outcomes.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import {
|
||||
runPlanModeSkillTest,
|
||||
assertNotHandshakeShape,
|
||||
} from './helpers/plan-mode-helpers';
|
||||
import { runPlanSkillObservation } from './helpers/claude-pty-runner';
|
||||
|
||||
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
|
||||
const describeE2E = shouldRun ? describe : describe.skip;
|
||||
|
||||
describeE2E('plan-design-review plan-mode smoke (gate)', () => {
|
||||
test('goes straight to first design question, no handshake, no silent writes', async () => {
|
||||
const result = await runPlanModeSkillTest({
|
||||
test('reaches a terminal outcome (asked or plan_ready) without silent writes', async () => {
|
||||
const obs = await runPlanSkillObservation({
|
||||
skillName: 'plan-design-review',
|
||||
// First question for design review varies; pick any reasonable match.
|
||||
// The substring match falls back to the first option if no match.
|
||||
firstAnswerSubstring: '7',
|
||||
inPlanMode: true,
|
||||
timeoutMs: 300_000,
|
||||
});
|
||||
|
||||
expect(result.askUserQuestions.length).toBeGreaterThanOrEqual(1);
|
||||
assertNotHandshakeShape(result.askUserQuestions[0]!);
|
||||
expect(result.writeOrEditBeforeAsk).toBe(false);
|
||||
expect(result.exitPlanModeBeforeAsk).toBe(false);
|
||||
}, 120_000);
|
||||
if (obs.outcome === 'silent_write' || obs.outcome === 'exited' || obs.outcome === 'timeout') {
|
||||
throw new Error(
|
||||
`plan-design-review plan-mode smoke FAILED: outcome=${obs.outcome}\n` +
|
||||
`summary: ${obs.summary}\n` +
|
||||
`elapsed: ${obs.elapsedMs}ms\n` +
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
expect(['asked', 'plan_ready']).toContain(obs.outcome);
|
||||
}, 360_000);
|
||||
});
|
||||
|
||||
@@ -1,30 +1,32 @@
|
||||
/**
|
||||
* plan-devex-review plan-mode smoke test (gate tier, paid).
|
||||
* plan-devex-review plan-mode smoke (gate, paid, real-PTY).
|
||||
*
|
||||
* See test/skill-e2e-plan-ceo-plan-mode.test.ts for the shared assertion
|
||||
* contract. Exercises the same assertions against /plan-devex-review.
|
||||
* contract. Exercises the same contract against /plan-devex-review.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import {
|
||||
runPlanModeSkillTest,
|
||||
assertNotHandshakeShape,
|
||||
} from './helpers/plan-mode-helpers';
|
||||
import { runPlanSkillObservation } from './helpers/claude-pty-runner';
|
||||
|
||||
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
|
||||
const describeE2E = shouldRun ? describe : describe.skip;
|
||||
|
||||
describeE2E('plan-devex-review plan-mode smoke (gate)', () => {
|
||||
test('goes straight to DX-mode question, no handshake, no silent writes', async () => {
|
||||
const result = await runPlanModeSkillTest({
|
||||
test('reaches a terminal outcome (asked or plan_ready) without silent writes', async () => {
|
||||
const obs = await runPlanSkillObservation({
|
||||
skillName: 'plan-devex-review',
|
||||
// Step 0 asks for DX review mode; TRIAGE is the lightest-weight mode.
|
||||
firstAnswerSubstring: 'TRIAGE',
|
||||
inPlanMode: true,
|
||||
timeoutMs: 300_000,
|
||||
});
|
||||
|
||||
expect(result.askUserQuestions.length).toBeGreaterThanOrEqual(1);
|
||||
assertNotHandshakeShape(result.askUserQuestions[0]!);
|
||||
expect(result.writeOrEditBeforeAsk).toBe(false);
|
||||
expect(result.exitPlanModeBeforeAsk).toBe(false);
|
||||
}, 120_000);
|
||||
if (obs.outcome === 'silent_write' || obs.outcome === 'exited' || obs.outcome === 'timeout') {
|
||||
throw new Error(
|
||||
`plan-devex-review plan-mode smoke FAILED: outcome=${obs.outcome}\n` +
|
||||
`summary: ${obs.summary}\n` +
|
||||
`elapsed: ${obs.elapsedMs}ms\n` +
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
expect(['asked', 'plan_ready']).toContain(obs.outcome);
|
||||
}, 360_000);
|
||||
});
|
||||
|
||||
@@ -1,29 +1,32 @@
|
||||
/**
|
||||
* plan-eng-review plan-mode smoke test (gate tier, paid).
|
||||
* plan-eng-review plan-mode smoke (gate, paid, real-PTY).
|
||||
*
|
||||
* See test/skill-e2e-plan-ceo-plan-mode.test.ts for the shared assertion
|
||||
* contract. This file exercises the same assertions against /plan-eng-review.
|
||||
* contract. This file exercises the same contract against /plan-eng-review.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import {
|
||||
runPlanModeSkillTest,
|
||||
assertNotHandshakeShape,
|
||||
} from './helpers/plan-mode-helpers';
|
||||
import { runPlanSkillObservation } from './helpers/claude-pty-runner';
|
||||
|
||||
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
|
||||
const describeE2E = shouldRun ? describe : describe.skip;
|
||||
|
||||
describeE2E('plan-eng-review plan-mode smoke (gate)', () => {
|
||||
test('goes straight to scope-mode question, no handshake, no silent writes', async () => {
|
||||
const result = await runPlanModeSkillTest({
|
||||
test('reaches a terminal outcome (asked or plan_ready) without silent writes', async () => {
|
||||
const obs = await runPlanSkillObservation({
|
||||
skillName: 'plan-eng-review',
|
||||
firstAnswerSubstring: 'HOLD',
|
||||
inPlanMode: true,
|
||||
timeoutMs: 300_000,
|
||||
});
|
||||
|
||||
expect(result.askUserQuestions.length).toBeGreaterThanOrEqual(1);
|
||||
assertNotHandshakeShape(result.askUserQuestions[0]!);
|
||||
expect(result.writeOrEditBeforeAsk).toBe(false);
|
||||
expect(result.exitPlanModeBeforeAsk).toBe(false);
|
||||
}, 120_000);
|
||||
if (obs.outcome === 'silent_write' || obs.outcome === 'exited' || obs.outcome === 'timeout') {
|
||||
throw new Error(
|
||||
`plan-eng-review plan-mode smoke FAILED: outcome=${obs.outcome}\n` +
|
||||
`summary: ${obs.summary}\n` +
|
||||
`elapsed: ${obs.elapsedMs}ms\n` +
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
expect(['asked', 'plan_ready']).toContain(obs.outcome);
|
||||
}, 360_000);
|
||||
});
|
||||
|
||||
@@ -1,47 +1,48 @@
|
||||
/**
|
||||
* Plan-mode-info no-op regression (gate tier, paid).
|
||||
* Plan-mode-info no-op regression (gate tier, paid, real-PTY).
|
||||
*
|
||||
* Asserts: when /plan-ceo-review is invoked WITHOUT the plan-mode distinctive
|
||||
* phrase in the system reminder, the plan-mode-info preamble section is a
|
||||
* no-op. The skill should proceed to its normal Step 0 flow with no
|
||||
* AskUserQuestion echoing or referencing the plan-mode reminder text.
|
||||
* Asserts: when /plan-ceo-review is invoked OUTSIDE plan mode (no
|
||||
* --permission-mode plan flag, no plan-mode reminder injected), the skill
|
||||
* still reaches a terminal outcome ('asked' or 'plan_ready'). This is the
|
||||
* negative counterpart to the per-skill plan-mode smokes — if the
|
||||
* plan-mode-info preamble section ever starts misfiring for non-plan-mode
|
||||
* sessions (e.g., gating questions on a phrase that isn't there), this
|
||||
* test catches it.
|
||||
*
|
||||
* This guardrails the "outside plan mode, this block doesn't interfere"
|
||||
* case — a different coverage case from the per-skill in-plan-mode smokes.
|
||||
* If the plan-mode-info section ever starts misfiring for non-plan-mode
|
||||
* sessions, this test catches it.
|
||||
*
|
||||
* Cost: ~$0.50 per run. Gated: EVALS=1 EVALS_TIER=gate.
|
||||
* Why this matters: outside plan mode, claude doesn't render a native
|
||||
* confirmation UI. The skill must drive its own AskUserQuestion. Same
|
||||
* runner, same outcome contract — just `inPlanMode: false`.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import {
|
||||
runPlanModeSkillTest,
|
||||
PLAN_MODE_REMINDER,
|
||||
} from './helpers/plan-mode-helpers';
|
||||
import { runPlanSkillObservation } from './helpers/claude-pty-runner';
|
||||
|
||||
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
|
||||
const describeE2E = shouldRun ? describe : describe.skip;
|
||||
|
||||
describeE2E('plan-mode-info no-op outside plan mode (gate regression)', () => {
|
||||
test('no AskUserQuestion echoes the plan-mode reminder when absent', async () => {
|
||||
const result = await runPlanModeSkillTest({
|
||||
test('skill reaches a terminal outcome outside plan mode', async () => {
|
||||
const obs = await runPlanSkillObservation({
|
||||
skillName: 'plan-ceo-review',
|
||||
firstAnswerSubstring: 'HOLD',
|
||||
omitPlanModeReminder: true,
|
||||
maxTurns: 3,
|
||||
inPlanMode: false,
|
||||
timeoutMs: 300_000,
|
||||
});
|
||||
|
||||
// Skill should still hit Step 0 normally outside plan mode.
|
||||
expect(result.askUserQuestions.length).toBeGreaterThanOrEqual(1);
|
||||
|
||||
// No AskUserQuestion should echo the plan-mode distinctive phrase.
|
||||
// If one does, the plan-mode-info section is leaking outside plan mode.
|
||||
for (const aq of result.askUserQuestions) {
|
||||
const questions = aq.input.questions as Array<{ question: string }>;
|
||||
for (const q of questions) {
|
||||
expect(q.question).not.toContain(PLAN_MODE_REMINDER);
|
||||
}
|
||||
if (obs.outcome === 'silent_write' || obs.outcome === 'exited' || obs.outcome === 'timeout') {
|
||||
throw new Error(
|
||||
`plan-mode no-op regression FAILED: outcome=${obs.outcome}\n` +
|
||||
`summary: ${obs.summary}\n` +
|
||||
`elapsed: ${obs.elapsedMs}ms\n` +
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
}, 120_000);
|
||||
expect(['asked', 'plan_ready']).toContain(obs.outcome);
|
||||
|
||||
// Negative regression: the rendered output must NOT echo the plan-mode
|
||||
// distinctive reminder phrase. If it does, the plan-mode preamble
|
||||
// section is leaking outside plan mode.
|
||||
const PLAN_MODE_REMINDER =
|
||||
'Plan mode is active. The user indicated that they do not want you to execute yet';
|
||||
expect(obs.evidence).not.toContain(PLAN_MODE_REMINDER);
|
||||
}, 360_000);
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user