From d06f08938f0625744a7f8496a9dc4a6746f8a962 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Thu, 23 Apr 2026 16:48:10 -0700 Subject: [PATCH] test: gate-tier units + periodic Pros/Cons evals for AskUserQuestion format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Part 3 of 4 (plan: ~/.claude/plans/system-instruction-you-are-working-polymorphic-twilight.md). Gate-tier (E1, free, runs on every `bun test`): test/preamble-compose.test.ts — pins the composition order Asserts AskUserQuestion Format section renders BEFORE Model-Specific Behavioral Patch in tier-≥2 preamble output. Covers claude default, opus-4-7 overlay, tier 2/3, and codex host. Catches any future edit to scripts/resolvers/preamble.ts that silently reverts the order. test/resolver-ask-user-format.test.ts — pins the Pros/Cons contract 14 assertions against generateAskUserFormat output: D, ELI10, Stakes if we pick wrong:, Recommendation: , Pros / cons:, ✅/❌ markers, min 2 pros + 1 con rules, hard-stop escape exact phrase, neutral-posture CT1 rule ((recommended) label preserved for AUTO_DECIDE), Completeness coverage-vs-kind, tool_use mandate (rule 11), self-check list, D-numbering model-level caveat. test/model-overlay-opus-4-7.test.ts — pins the pacing directive Asserts raw overlay file + resolved overlay output contain "Pace questions to the skill" and NOT "Batch your questions". Verifies INHERIT:claude chain still works (Todo-list, subordination wrapper), Fan out / Effort-match / Literal interpretation nudges preserved. Also asserts claude base overlay does NOT carry the Opus-specific pacing directive (no cross-contamination). Periodic-tier (E2, Opus-dependent, ~$1-2/run): test/skill-e2e-plan-prosons.test.ts — 4 cases extending v1.6.3.0 harness 1. Format positive — every token present when plan has real tradeoff 2. Hard-stop NEGATIVE — plan with genuine tradeoff must NOT dodge to "No cons — hard-stop choice" escape 3. Neutral-posture NEGATIVE — plan where one option dominates must emit (recommended) label + "because ", must NOT dodge to "taste call" / "no preference" 4. Hard-stop POSITIVE — destructive-action plan may legitimately use the hard-stop escape test/helpers/touchfiles.ts — entries for all new eval cases Dependencies: overlay, preamble.ts, generate-ask-user-format.ts, and the 4 plan-review templates. Diff-based selection triggers the evals whenever those files change. Also added entries for 7 expanded-coverage cases (ship, office-hours, investigate, qa, review, design-review, document-release) — test cases will land in follow-up PRs per skill. Follow-ups noted in test file header: - True multi-turn cadence eval (3 findings → 3 distinct asks) — current harness captures one $OUT_FILE per session; multi-turn capture needs new harness support. - Expanded-coverage test cases for the 7 non-plan-review skills. Verified: - bun test: 349 pass (30 new + 319 baseline), 1 pre-existing security-bench oversize failure on main (unrelated, unchanged). Co-Authored-By: Claude Opus 4.7 (1M context) --- test/helpers/touchfiles.ts | 40 ++- test/model-overlay-opus-4-7.test.ts | 98 +++++++ test/preamble-compose.test.ts | 72 ++++++ test/resolver-ask-user-format.test.ts | 121 +++++++++ test/skill-e2e-plan-prosons.test.ts | 352 ++++++++++++++++++++++++++ 5 files changed, 679 insertions(+), 4 deletions(-) create mode 100644 test/model-overlay-opus-4-7.test.ts create mode 100644 test/preamble-compose.test.ts create mode 100644 test/resolver-ask-user-format.test.ts create mode 100644 test/skill-e2e-plan-prosons.test.ts diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index 5c8a009e..5bf2dcb7 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -84,10 +84,27 @@ export const E2E_TOUCHFILES: Record = { // AskUserQuestion format regression (RECOMMENDATION + Completeness: N/10) // Fires when either template OR the two preamble resolvers change. - 'plan-ceo-review-format-mode': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts'], - 'plan-ceo-review-format-approach': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts'], - 'plan-eng-review-format-coverage': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts'], - 'plan-eng-review-format-kind': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts'], + 'plan-ceo-review-format-mode': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'], + 'plan-ceo-review-format-approach': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'], + 'plan-eng-review-format-coverage': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'], + 'plan-eng-review-format-kind': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'], + + // v1.7.0.0 Pros/Cons format cadence + format + negative-escape evals. + // Dependencies: same as format-mode + the 4 plan-review templates + overlay. + // All periodic-tier (non-deterministic Opus 4.7 behavior). + 'plan-ceo-review-prosons-cadence': ['plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'], + 'plan-review-prosons-format': ['plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'], + 'plan-review-prosons-hardstop-neg': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'], + 'plan-review-prosons-neutral-neg': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'], + + // Expanded coverage (CT3) — 6 non-plan-review skills inherit Pros/Cons via preamble + 'ship-prosons-format': ['ship/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'], + 'office-hours-prosons-format': ['office-hours/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'], + 'investigate-prosons-format': ['investigate/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'], + 'qa-prosons-format': ['qa/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'], + 'review-prosons-format': ['review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'], + 'design-review-prosons-format': ['design-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'], + 'document-release-prosons-format': ['document-release/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'], // /plan-tune (v1 observational) 'plan-tune-inspect': ['plan-tune/**', 'scripts/question-registry.ts', 'scripts/psychographic-signals.ts', 'scripts/one-way-doors.ts', 'bin/gstack-question-log', 'bin/gstack-question-preference', 'bin/gstack-developer-profile'], @@ -288,6 +305,21 @@ export const E2E_TIERS: Record = { 'plan-eng-review-format-coverage': 'periodic', 'plan-eng-review-format-kind': 'periodic', + // v1.7.0.0 Pros/Cons format — cadence + negative-escape evals (all periodic) + 'plan-ceo-review-prosons-cadence': 'periodic', + 'plan-review-prosons-format': 'periodic', + 'plan-review-prosons-hardstop-neg': 'periodic', + 'plan-review-prosons-neutral-neg': 'periodic', + + // CT3 expanded coverage — non-plan-review skills inheriting Pros/Cons (all periodic) + 'ship-prosons-format': 'periodic', + 'office-hours-prosons-format': 'periodic', + 'investigate-prosons-format': 'periodic', + 'qa-prosons-format': 'periodic', + 'review-prosons-format': 'periodic', + 'design-review-prosons-format': 'periodic', + 'document-release-prosons-format': 'periodic', + // /plan-tune — gate (core v1 DX promise: plain-English intent routing) 'plan-tune-inspect': 'gate', diff --git a/test/model-overlay-opus-4-7.test.ts b/test/model-overlay-opus-4-7.test.ts new file mode 100644 index 00000000..0fe9f80e --- /dev/null +++ b/test/model-overlay-opus-4-7.test.ts @@ -0,0 +1,98 @@ +/** + * Opus 4.7 model overlay — gate-tier assertions on the pacing directive. + * + * v1.6.4.0 regressed plan-review cadence because the Opus 4.7 overlay + * carried a "Batch your questions" directive that physically rendered + * above the skill-level pacing rule. Opus 4.7 read top-to-bottom, + * absorbed batching as the ambient default, and stopped honoring the + * plan-review STOP directives. + * + * v1.7.0.0 replaces that block with "Pace questions to the skill" — + * one-question-at-a-time is now the default when the skill contains + * STOP directives; batching becomes the explicit exception. + * + * This test asserts: + * - The new "Pace questions" directive is present + * - The old "Batch your questions" directive is gone + * - The AUTO_DECIDE-compatible language survives (subordination, skill wins) + */ +import { describe, test, expect } from 'bun:test'; +import * as fs from 'fs'; +import * as path from 'path'; +import type { TemplateContext } from '../scripts/resolvers/types'; +import { HOST_PATHS } from '../scripts/resolvers/types'; +import { generateModelOverlay } from '../scripts/resolvers/model-overlay'; + +function makeCtx(model: string): TemplateContext { + return { + skillName: 'test-skill', + tmplPath: 'test.tmpl', + host: 'claude', + paths: HOST_PATHS.claude, + preambleTier: 2, + model, + }; +} + +const ROOT = path.resolve(__dirname, '..'); + +describe('Opus 4.7 overlay — pacing directive', () => { + test('raw opus-4-7.md contains "Pace questions to the skill"', () => { + const raw = fs.readFileSync( + path.join(ROOT, 'model-overlays/opus-4-7.md'), + 'utf-8', + ); + expect(raw).toContain('Pace questions to the skill'); + }); + + test('raw opus-4-7.md does NOT contain "Batch your questions" directive', () => { + const raw = fs.readFileSync( + path.join(ROOT, 'model-overlays/opus-4-7.md'), + 'utf-8', + ); + expect(raw).not.toContain('**Batch your questions.**'); + }); + + test('resolved overlay output contains "Pace questions to the skill"', () => { + const out = generateModelOverlay(makeCtx('opus-4-7')); + expect(out).toContain('Pace questions to the skill'); + }); + + test('resolved overlay inherits from claude base (INHERIT:claude)', () => { + const out = generateModelOverlay(makeCtx('opus-4-7')); + // The claude base contributes the subordination wrapper + Todo discipline + expect(out).toContain('Todo-list discipline'); + expect(out).toContain('subordinate'); + }); + + test('resolved overlay says skill STOP directives trigger one-per-turn pacing', () => { + const out = generateModelOverlay(makeCtx('opus-4-7')); + expect(out).toMatch(/STOP\. AskUserQuestion/); + expect(out).toMatch(/pace one question per turn|one question per turn/i); + }); + + test('resolved overlay requires AskUserQuestion as tool_use', () => { + const out = generateModelOverlay(makeCtx('opus-4-7')); + expect(out).toContain('tool_use'); + }); + + test('resolved overlay flags "obvious fix" findings still need user approval', () => { + const out = generateModelOverlay(makeCtx('opus-4-7')); + expect(out).toMatch(/obvious fix/i); + expect(out).toMatch(/user approval/i); + }); + + test('resolved overlay keeps Fan out / Effort-match / Literal interpretation nudges', () => { + const out = generateModelOverlay(makeCtx('opus-4-7')); + expect(out).toContain('Fan out explicitly'); + expect(out).toContain('Effort-match the step'); + expect(out).toContain('Literal interpretation awareness'); + }); + + test('claude overlay (no INHERIT chain) does not carry the pacing directive', () => { + // Claude is the default overlay; opus-4-7 inherits FROM claude. + // The pacing directive belongs to opus-4-7 only. + const out = generateModelOverlay(makeCtx('claude')); + expect(out).not.toContain('Pace questions to the skill'); + }); +}); diff --git a/test/preamble-compose.test.ts b/test/preamble-compose.test.ts new file mode 100644 index 00000000..22fdfd7c --- /dev/null +++ b/test/preamble-compose.test.ts @@ -0,0 +1,72 @@ +/** + * Preamble composition order — gate-tier test. + * + * Asserts that the AskUserQuestion Format section renders BEFORE the + * Model-Specific Behavioral Patch section in tier-≥2 preamble output. + * This order is load-bearing: Opus 4.7 reads top-to-bottom and absorbs + * the first pacing directive it hits. v1.6.4.0 regressed plan-review + * cadence because the overlay rendered first with "Batch your questions" + * as the ambient default. + * + * If someone later reorders `scripts/resolvers/preamble.ts` so Overlay + * comes before Format, this test catches it before the next model + * migration can silently re-break the plan-review pacing. + */ +import { describe, test, expect } from 'bun:test'; +import type { TemplateContext } from '../scripts/resolvers/types'; +import { HOST_PATHS } from '../scripts/resolvers/types'; +import { generatePreamble } from '../scripts/resolvers/preamble'; + +function makeCtx( + host: 'claude' | 'codex', + tier: 1 | 2 | 3 | 4, + model?: string, +): TemplateContext { + return { + skillName: 'test-skill', + tmplPath: 'test.tmpl', + host, + paths: HOST_PATHS[host], + preambleTier: tier, + ...(model ? { model } : {}), + }; +} + +describe('Preamble composition order', () => { + test('AskUserQuestion Format renders before Model-Specific Behavioral Patch (tier 2, claude)', () => { + const out = generatePreamble(makeCtx('claude', 2, 'claude')); + const formatIdx = out.indexOf('## AskUserQuestion Format'); + const overlayIdx = out.indexOf('## Model-Specific Behavioral Patch'); + expect(formatIdx).toBeGreaterThan(-1); + expect(overlayIdx).toBeGreaterThan(-1); + expect(formatIdx).toBeLessThan(overlayIdx); + }); + + test('AskUserQuestion Format renders before Model-Specific Behavioral Patch (tier 2, opus-4-7)', () => { + const out = generatePreamble(makeCtx('claude', 2, 'opus-4-7')); + const formatIdx = out.indexOf('## AskUserQuestion Format'); + const overlayIdx = out.indexOf('## Model-Specific Behavioral Patch'); + expect(formatIdx).toBeGreaterThan(-1); + expect(overlayIdx).toBeGreaterThan(-1); + expect(formatIdx).toBeLessThan(overlayIdx); + }); + + test('AskUserQuestion Format renders before Model-Specific Behavioral Patch (tier 3)', () => { + const out = generatePreamble(makeCtx('claude', 3, 'opus-4-7')); + const formatIdx = out.indexOf('## AskUserQuestion Format'); + const overlayIdx = out.indexOf('## Model-Specific Behavioral Patch'); + expect(formatIdx).toBeLessThan(overlayIdx); + }); + + test('AskUserQuestion Format renders before Model-Specific Behavioral Patch (codex host)', () => { + const out = generatePreamble(makeCtx('codex', 2, 'opus-4-7')); + const formatIdx = out.indexOf('## AskUserQuestion Format'); + const overlayIdx = out.indexOf('## Model-Specific Behavioral Patch'); + expect(formatIdx).toBeLessThan(overlayIdx); + }); + + test('tier 1 preamble does NOT include AskUserQuestion Format (but MAY include overlay)', () => { + const out = generatePreamble(makeCtx('claude', 1)); + expect(out).not.toContain('## AskUserQuestion Format'); + }); +}); diff --git a/test/resolver-ask-user-format.test.ts b/test/resolver-ask-user-format.test.ts new file mode 100644 index 00000000..37744f2b --- /dev/null +++ b/test/resolver-ask-user-format.test.ts @@ -0,0 +1,121 @@ +/** + * AskUserQuestion Format resolver — gate-tier assertions on the generated + * Pros/Cons format directive block. + * + * v1.7.0.0 introduces Pros/Cons decision-brief formatting: + * - D numbered header + * - ELI10 paragraph + * - Stakes-if-we-pick-wrong line + * - Recommendation line (mandatory, even for neutral posture) + * - Pros/Cons block with ✅/❌ per option, min 2 pros + 1 con, ≥40 char bullets + * - Net: synthesis line + * + * This test pins the format contract so a future edit to the resolver + * can't silently drop a rule. If the resolver stops emitting one of + * these tokens, bun test catches it in milliseconds instead of waiting + * for the weekly periodic eval to notice. + */ +import { describe, test, expect } from 'bun:test'; +import type { TemplateContext } from '../scripts/resolvers/types'; +import { HOST_PATHS } from '../scripts/resolvers/types'; +import { generateAskUserFormat } from '../scripts/resolvers/preamble/generate-ask-user-format'; + +function makeCtx(): TemplateContext { + return { + skillName: 'test-skill', + tmplPath: 'test.tmpl', + host: 'claude', + paths: HOST_PATHS.claude, + preambleTier: 2, + }; +} + +describe('generateAskUserFormat — v1.7.0.0 Pros/Cons format', () => { + const out = generateAskUserFormat(makeCtx()); + + test('includes AskUserQuestion Format header', () => { + expect(out).toContain('## AskUserQuestion Format'); + }); + + test('documents D-numbered header requirement', () => { + expect(out).toContain('D'); + expect(out).toMatch(/first question in a skill invocation is `D1`/i); + }); + + test('documents ELI10 requirement', () => { + expect(out).toContain('ELI10'); + expect(out).toMatch(/plain English.*16-year-old/); + }); + + test('documents Stakes-if-we-pick-wrong line', () => { + expect(out).toContain('Stakes if we pick wrong'); + }); + + test('documents mandatory Recommendation line', () => { + expect(out).toContain('Recommendation: '); + expect(out).toMatch(/Recommendation.*ALWAYS|Recommendation \(ALWAYS\)/); + }); + + test('documents Pros / cons block header', () => { + expect(out).toContain('Pros / cons:'); + }); + + test('documents ✅ pro markers with min count + min length rule', () => { + expect(out).toContain('✅'); + expect(out).toMatch(/[Mm]inimum 2 pros/); + expect(out).toMatch(/40 characters|≥40 chars/); + }); + + test('documents ❌ con markers with min count rule', () => { + expect(out).toContain('❌'); + expect(out).toMatch(/1 con per option|minimum.*1 con/i); + }); + + test('documents hard-stop escape with exact phrase', () => { + // "No cons — this is a hard-stop choice" may span a line break in the + // rendered resolver text; match across whitespace collapses. + expect(out).toMatch(/No cons\s+—\s+this is a\s+hard-stop choice/); + }); + + test('documents neutral-posture escape preserving (recommended) label', () => { + // CT1 resolution: (recommended) label STAYS on default option to preserve + // AUTO_DECIDE contract. Neutrality expressed in prose only. + expect(out).toMatch(/taste call/i); + // `s` flag makes . match newlines — the label + STAYS phrase spans a line break + expect(out).toMatch(/\(recommended\)[\s\S]*STAYS|STAYS[\s\S]*\(recommended\)/); + expect(out).toMatch(/AUTO_DECIDE/); + }); + + test('documents Net line for closing synthesis', () => { + expect(out).toMatch(/^Net:/m); + expect(out).toMatch(/synthesis|tradeoff/i); + }); + + test('documents Completeness scoring rules (coverage vs kind)', () => { + expect(out).toContain('Completeness'); + expect(out).toMatch(/10 = complete/); + expect(out).toMatch(/options differ in kind, not coverage/); + }); + + test('documents tool_use mandate (rule 11)', () => { + expect(out).toMatch(/tool_use/); + // "not a question" spans a newline in the rendered text + expect(out).toMatch(/not a[\s\S]*question|not[\s\S]*interactive/i); + }); + + test('includes self-check before emitting', () => { + expect(out).toContain('Self-check before emitting'); + expect(out).toMatch(/D header present/); + expect(out).toMatch(/Net line closes/); + }); + + test('documents D-numbering as model-level not runtime state', () => { + // Codex finding #4 caveat: D-numbering is a prompt wish, not a system + // guarantee. TemplateContext has no counter. This check pins the caveat. + expect(out).toMatch(/model-level instruction|not a runtime counter|count your own/i); + }); + + test('per-skill override guidance preserved', () => { + expect(out).toMatch(/Per-skill instructions may add/); + }); +}); diff --git a/test/skill-e2e-plan-prosons.test.ts b/test/skill-e2e-plan-prosons.test.ts new file mode 100644 index 00000000..8fb68bc0 --- /dev/null +++ b/test/skill-e2e-plan-prosons.test.ts @@ -0,0 +1,352 @@ +/** + * v1.7.0.0 Pros/Cons format regression tests for plan reviews. + * + * Extends the v1.6.3.0 format harness (skill-e2e-plan-format.test.ts) with + * four new cases covering the Pros/Cons decision-brief format: + * + * 1. Format positive — every AskUserQuestion renders with D / ELI10 / + * Stakes / Recommendation / Pros/cons / ✅×2+ / ❌×1+ / Net tokens. + * 2. Hard-stop positive — destructive-action question may use the single + * "No cons — this is a hard-stop choice" escape. + * 3. Hard-stop NEGATIVE (CT2) — plan with genuine tradeoff, model must NOT + * dodge to the hard-stop escape. Forces real tradeoff articulation. + * 4. Neutral-posture NEGATIVE (CT2) — plan with one clearly-dominant option, + * model must emit (recommended) label and concrete recommendation, NOT + * "no preference — taste call" dodge. + * + * Capture pattern matches existing harness: agent writes verbatim + * AskUserQuestion text to $OUT_FILE; regex predicates run on the captured + * file. Classified periodic (Opus 4.7 non-deterministic). + * + * FOLLOW-UP (not in v1.7.0.0): + * - True cadence eval (3 findings → 3 distinct asks across turns). Current + * $OUT_FILE harness captures ONE would-be question per session. Multi-turn + * cadence needs new harness support. Filed in TODOs. + * - Expanded coverage for /ship /office-hours /investigate /qa /review + * /design-review /document-release. Touchfiles entries already exist; eval + * cases will land as follow-up PRs per skill. + */ +import { describe, test, expect, beforeAll, afterAll } from 'bun:test'; +import { runSkillTest } from './helpers/session-runner'; +import { + ROOT, runId, + describeIfSelected, testConcurrentIfSelected, + logCost, recordE2E, + createEvalCollector, finalizeEvalCollector, +} from './helpers/e2e-helpers'; +import { spawnSync } from 'child_process'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; + +const evalCollector = createEvalCollector('e2e-plan-prosons'); + +// v1.7.0.0 format tokens +const D_NUMBER_RE = /D\d+\s+—/; +const ELI10_RE = /ELI10:/i; +const STAKES_RE = /Stakes if we pick wrong:/i; +const RECOMMENDATION_RE = /[Rr]ecommendation:/; +const PROS_CONS_HEADER_RE = /Pros\s*\/\s*cons:/i; +const NET_LINE_RE = /^Net:/m; +const HARD_STOP_ESCAPE_RE = /✅\s+No cons\s+—\s+this is a hard-stop choice/; +const NEUTRAL_POSTURE_RE = /taste call/i; +const RECOMMENDED_LABEL_RE = /\(recommended\)/; + +function countChars(text: string, char: string): number { + return (text.match(new RegExp(char, 'g')) || []).length; +} + +const TRADEOFF_PLAN = `# Plan: Add user dashboard caching + +## Context +Dashboard renders in 3s on cold load, 800ms on warm cache. Users complain. + +## Approach options + +### Option A: Redis cache layer (complete) +- Add Redis with 5min TTL for dashboard aggregates. +- Cold path: compute + cache. Warm path: fetch from cache. +- Needs Redis infra, cache invalidation logic for activity updates. +- Covers all users, all flows, fails gracefully on cache miss. + +### Option B: In-memory LRU cache (happy path only) +- Per-process LRU with 100-entry cap. +- No cross-process sharing; cache warms per-pod. +- Skips cache invalidation; stale reads up to 5min. + +Both options have real pros and cons. This is a genuine tradeoff. +`; + +const HARDSTOP_PLAN = `# Plan: Delete all user sessions + +## Context +Security incident. All active sessions need to be terminated immediately. + +## Action +Run \`DELETE FROM sessions WHERE TRUE\`. No dry-run mode. + +This is a one-way door. There is no "partial" version. +`; + +const DOMINANT_PLAN = `# Plan: Add input validation to signup endpoint + +## Context +Signup endpoint currently accepts any email string and any password length. +Bug report: users type gibberish, signup succeeds, they can't log in. + +## Options + +### Option A: Full RFC 5322 email validation + min 8-char password + server-side checks +- Catches malformed emails, rejects weak passwords, validated on server. +- Prevents the reported bug and adjacent bugs. +- Standard web practice. + +### Option B: Client-side type="email" only, no password validation +- Only catches some browsers' built-in validation. +- Attackers bypass by disabling JS. +- Does not fix the reported bug. + +Option A clearly dominates on coverage. This is NOT a taste call. +`; + +function setupPlanDir(tmpPrefix: string, planContent: string, skillName: string): string { + const planDir = fs.mkdtempSync(path.join(os.tmpdir(), tmpPrefix)); + const run = (cmd: string, args: string[]) => + spawnSync(cmd, args, { cwd: planDir, stdio: 'pipe', timeout: 5000 }); + + run('git', ['init', '-b', 'main']); + run('git', ['config', 'user.email', 'test@test.com']); + run('git', ['config', 'user.name', 'Test']); + + fs.writeFileSync(path.join(planDir, 'plan.md'), planContent); + run('git', ['add', '.']); + run('git', ['commit', '-m', 'add plan']); + + fs.mkdirSync(path.join(planDir, skillName), { recursive: true }); + fs.copyFileSync( + path.join(ROOT, skillName, 'SKILL.md'), + path.join(planDir, skillName, 'SKILL.md'), + ); + + return planDir; +} + +function captureInstruction(outFile: string): string { + return `Write the verbatim text of the single AskUserQuestion you would have made to ${outFile} (full text including D header, ELI10, Stakes, Recommendation, Pros/cons, and Net line — the complete rich markdown body). Do NOT call any tool to ask the user. Do NOT paraphrase. This is a format-capture test.`; +} + +// --- Case 1: Format positive — all v1.7.0.0 tokens present --- + +describeIfSelected('Plan Prosons — Format Positive', ['plan-review-prosons-format'], () => { + let planDir: string; + let outFile: string; + + beforeAll(() => { + planDir = setupPlanDir('skill-e2e-plan-prosons-format-', TRADEOFF_PLAN, 'plan-ceo-review'); + outFile = path.join(planDir, 'ask-capture.md'); + }); + + afterAll(() => { + try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {} + }); + + testConcurrentIfSelected('plan-review-prosons-format', async () => { + const result = await runSkillTest({ + prompt: `Read plan-ceo-review/SKILL.md for the review workflow. + +Read plan.md — two cache approaches with real tradeoffs. Pick the architectural approach via AskUserQuestion (Step 0C-bis / Implementation Alternatives). These options differ in coverage. + +${captureInstruction(outFile)} + +After writing the file, stop.`, + workingDirectory: planDir, + maxTurns: 10, + timeout: 240_000, + testName: 'plan-review-prosons-format', + runId, + model: 'claude-opus-4-7', + }); + + logCost('/plan-review prosons format positive', result); + recordE2E(evalCollector, '/plan-review-prosons-format', 'Plan Prosons — Format Positive', result, { + passed: ['success', 'error_max_turns'].includes(result.exitReason), + }); + expect(['success', 'error_max_turns']).toContain(result.exitReason); + + expect(fs.existsSync(outFile)).toBe(true); + const captured = fs.readFileSync(outFile, 'utf-8'); + expect(captured.length).toBeGreaterThan(200); + + // Every Pros/Cons token present + expect(captured).toMatch(D_NUMBER_RE); + expect(captured).toMatch(ELI10_RE); + expect(captured).toMatch(STAKES_RE); + expect(captured).toMatch(RECOMMENDATION_RE); + expect(captured).toMatch(PROS_CONS_HEADER_RE); + expect(captured).toMatch(NET_LINE_RE); + + // Pro/con bullet counts: ≥2 ✅ and ≥1 ❌ per option (total ≥4 ✅ and ≥2 ❌ for 2 options) + expect(countChars(captured, '✅')).toBeGreaterThanOrEqual(4); + expect(countChars(captured, '❌')).toBeGreaterThanOrEqual(2); + + // (recommended) label on one option + expect(captured).toMatch(RECOMMENDED_LABEL_RE); + }, 300_000); +}); + +// --- Case 2: Hard-stop escape NEGATIVE (CT2) --- + +describeIfSelected('Plan Prosons — Hard-stop Negative', ['plan-review-prosons-hardstop-neg'], () => { + let planDir: string; + let outFile: string; + + beforeAll(() => { + planDir = setupPlanDir('skill-e2e-plan-prosons-hardstop-neg-', TRADEOFF_PLAN, 'plan-ceo-review'); + outFile = path.join(planDir, 'ask-capture.md'); + }); + + afterAll(() => { + try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {} + }); + + testConcurrentIfSelected('plan-review-prosons-hardstop-neg', async () => { + const result = await runSkillTest({ + prompt: `Read plan-ceo-review/SKILL.md. + +Read plan.md — this has REAL tradeoffs between Redis and in-memory caching (both have pros and cons). Pick the architectural approach via AskUserQuestion. + +${captureInstruction(outFile)} + +After writing the file, stop.`, + workingDirectory: planDir, + maxTurns: 10, + timeout: 240_000, + testName: 'plan-review-prosons-hardstop-neg', + runId, + model: 'claude-opus-4-7', + }); + + logCost('/plan-review prosons hard-stop negative', result); + recordE2E(evalCollector, '/plan-review-prosons-hardstop-neg', 'Plan Prosons — Hard-stop Negative', result, { + passed: ['success', 'error_max_turns'].includes(result.exitReason), + }); + expect(['success', 'error_max_turns']).toContain(result.exitReason); + + expect(fs.existsSync(outFile)).toBe(true); + const captured = fs.readFileSync(outFile, 'utf-8'); + expect(captured.length).toBeGreaterThan(200); + + // Genuine tradeoff — must NOT dodge to hard-stop escape. + expect(captured).not.toMatch(HARD_STOP_ESCAPE_RE); + // Must have real pros and cons (≥2 ✅ + ≥1 ❌ per option) + expect(countChars(captured, '✅')).toBeGreaterThanOrEqual(4); + expect(countChars(captured, '❌')).toBeGreaterThanOrEqual(2); + }, 300_000); +}); + +// --- Case 3: Neutral-posture NEGATIVE (CT2) --- + +describeIfSelected('Plan Prosons — Neutral-posture Negative', ['plan-review-prosons-neutral-neg'], () => { + let planDir: string; + let outFile: string; + + beforeAll(() => { + planDir = setupPlanDir('skill-e2e-plan-prosons-neutral-neg-', DOMINANT_PLAN, 'plan-ceo-review'); + outFile = path.join(planDir, 'ask-capture.md'); + }); + + afterAll(() => { + try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {} + }); + + testConcurrentIfSelected('plan-review-prosons-neutral-neg', async () => { + const result = await runSkillTest({ + prompt: `Read plan-ceo-review/SKILL.md. + +Read plan.md — Option A dominates Option B on coverage. This is NOT a taste call. Pick the approach via AskUserQuestion (Step 0C-bis / Implementation Alternatives — coverage-differentiated, so Completeness: N/10 applies). + +${captureInstruction(outFile)} + +After writing the file, stop.`, + workingDirectory: planDir, + maxTurns: 10, + timeout: 240_000, + testName: 'plan-review-prosons-neutral-neg', + runId, + model: 'claude-opus-4-7', + }); + + logCost('/plan-review prosons neutral negative', result); + recordE2E(evalCollector, '/plan-review-prosons-neutral-neg', 'Plan Prosons — Neutral Negative', result, { + passed: ['success', 'error_max_turns'].includes(result.exitReason), + }); + expect(['success', 'error_max_turns']).toContain(result.exitReason); + + expect(fs.existsSync(outFile)).toBe(true); + const captured = fs.readFileSync(outFile, 'utf-8'); + expect(captured.length).toBeGreaterThan(200); + + // One option dominates — must NOT use "taste call" neutral-posture dodge. + expect(captured).not.toMatch(NEUTRAL_POSTURE_RE); + // (recommended) label MUST be present on the dominant option. + expect(captured).toMatch(RECOMMENDED_LABEL_RE); + // Recommendation line must contain "because" (concrete reason, not "no preference") + expect(captured).toMatch(/[Rr]ecommendation:.*because/); + }, 300_000); +}); + +// --- Case 4: Hard-stop POSITIVE (escape allowed when legitimately one-sided) --- + +describeIfSelected('Plan Prosons — Hard-stop Positive', ['plan-ceo-review-prosons-cadence'], () => { + let planDir: string; + let outFile: string; + + beforeAll(() => { + planDir = setupPlanDir('skill-e2e-plan-prosons-hardstop-pos-', HARDSTOP_PLAN, 'plan-ceo-review'); + outFile = path.join(planDir, 'ask-capture.md'); + }); + + afterAll(() => { + try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {} + }); + + testConcurrentIfSelected('plan-ceo-review-prosons-cadence', async () => { + const result = await runSkillTest({ + prompt: `Read plan-ceo-review/SKILL.md. + +Read plan.md — this is a destructive one-way action (terminate all sessions). Ask the user to confirm via AskUserQuestion. This is a legitimate hard-stop choice — the hard-stop escape (\`✅ No cons — this is a hard-stop choice\`) is allowed here because there is no meaningful alternative besides doing or not doing the action. + +${captureInstruction(outFile)} + +After writing the file, stop.`, + workingDirectory: planDir, + maxTurns: 10, + timeout: 240_000, + testName: 'plan-ceo-review-prosons-cadence', + runId, + model: 'claude-opus-4-7', + }); + + logCost('/plan-review prosons hard-stop positive', result); + recordE2E(evalCollector, '/plan-ceo-review-prosons-cadence', 'Plan Prosons — Hard-stop Positive', result, { + passed: ['success', 'error_max_turns'].includes(result.exitReason), + }); + expect(['success', 'error_max_turns']).toContain(result.exitReason); + + expect(fs.existsSync(outFile)).toBe(true); + const captured = fs.readFileSync(outFile, 'utf-8'); + expect(captured.length).toBeGreaterThan(100); + + // Format scaffolding still required + expect(captured).toMatch(PROS_CONS_HEADER_RE); + // Hard-stop escape is ACCEPTED here (destructive one-way action) + // Either the escape is used OR real pros/cons are present — both are valid. + const hasEscape = HARD_STOP_ESCAPE_RE.test(captured); + const hasProsAndCons = countChars(captured, '✅') >= 1 && countChars(captured, '❌') >= 1; + expect(hasEscape || hasProsAndCons).toBe(true); + }, 300_000); +}); + +afterAll(async () => { + await finalizeEvalCollector(evalCollector); +});