diff --git a/test/helpers/claude-pty-runner.ts b/test/helpers/claude-pty-runner.ts index 92209d3e..b5b24141 100644 --- a/test/helpers/claude-pty-runner.ts +++ b/test/helpers/claude-pty-runner.ts @@ -376,6 +376,218 @@ export function classifyVisible(visible: string): ClassifyResult { return null; } +// ──────────────────────────────────────────────────────────────────────────── +// Per-finding AskUserQuestion count primitives (used by runPlanSkillCounting). +// +// These are pure helpers extracted up-front so the unit suite can exercise +// them deterministically before the live-PTY counter runs them. Each one is +// independently unit-testable against synthetic visible-buffer strings. +// ──────────────────────────────────────────────────────────────────────────── + +/** + * Captured identity of an AskUserQuestion — the rendered question text plus + * its numbered options. Used by `runPlanSkillCounting` to dedupe redrawn + * prompts and to feed `Step0BoundaryPredicate` callers. + * + * `signature` is the stable hash. Two AUQs with identical prompt + options + * produce the same signature; differences in either field produce different + * signatures. Critically: two AUQs with shared option labels (e.g. the + * generic "A) Add to plan / B) Defer / C) Build now" menu) but different + * question text get DIFFERENT signatures because the prompt is in the hash. + */ +export interface AskUserQuestionFingerprint { + /** Stable hash combining normalized prompt text + options signature. */ + signature: string; + /** First 240 chars of the rendered question prompt (post-normalization). */ + promptSnippet: string; + /** Captured option labels, in index order. */ + options: Array<{ index: number; label: string }>; + /** Wall-clock when first observed (ms since the helper started polling). */ + observedAtMs: number; + /** True if observed BEFORE the Step-0 boundary fired. */ + preReview: boolean; +} + +/** + * Predicate fired against the AUQ we just answered (not the visible buffer). + * Returns true if this AUQ's fingerprint marks the LAST Step-0 question for + * its skill — all subsequent AUQs are review-phase findings. + * + * Event-based by design: matching against an answered AUQ's fingerprint + * (prompt + options) is deterministic, whereas matching against later + * rendered content (section headers, summary text) races with the agent's + * output cadence. See plan §D14 for the rationale. + */ +export type Step0BoundaryPredicate = ( + answeredFingerprint: AskUserQuestionFingerprint, +) => boolean; + +/** + * Parse the rendered question prompt out of a visible TTY buffer. The prompt + * is the 1–3 lines of text immediately ABOVE the latest `❯ 1.` cursor line — + * not part of the option list, not the permission-dialog header. + * + * Returns the prompt normalized to a single-spaced 240-char snippet (strip + * ANSI residue, collapse internal whitespace, trim) — short enough to use as + * a hash key, long enough to disambiguate distinct questions. + * + * Returns "" when no prompt could be parsed (cursor not yet rendered, or + * cursor is at the top of the buffer with no preceding text). Callers that + * use the empty string as a fingerprint input should treat empty-prompt + * AUQs as "wait one more poll" rather than fingerprinting them — otherwise + * the same options + empty prompt across two distinct questions collide. + */ +export function parseQuestionPrompt(visible: string): string { + // Tail-only — older prompts higher in the buffer are stale. + const tail = visible.length > 4096 ? visible.slice(-4096) : visible; + const lines = tail.split('\n'); + + // Find the latest `❯ 1.` cursor line (matching parseNumberedOptions). + let cursorLineIdx = -1; + for (let i = lines.length - 1; i >= 0; i--) { + if (/^\s*❯\s*1\./.test(lines[i] ?? '')) { + cursorLineIdx = i; + break; + } + } + if (cursorLineIdx < 0) return ''; + + // Walk up at most 6 lines collecting prompt text. Stop at: + // - a blank line preceded by another blank line (paragraph break) + // - top of buffer + // - a line that itself starts with `N.` (we're inside an option list) + const promptLines: string[] = []; + let blankRun = 0; + for (let i = cursorLineIdx - 1; i >= 0 && promptLines.length < 6; i--) { + const raw = lines[i] ?? ''; + const trimmed = raw.trim(); + if (trimmed === '') { + blankRun += 1; + if (blankRun >= 2 && promptLines.length > 0) break; + continue; + } + blankRun = 0; + // Stop if we hit what looks like a previous numbered list. + if (/^[\s❯]*[1-9]\.\s+\S/.test(raw)) break; + promptLines.unshift(trimmed); + } + + const joined = promptLines.join(' ').replace(/\s+/g, ' ').trim(); + return joined.slice(0, 240); +} + +/** + * Stable hash for an AskUserQuestion's identity — combines normalized prompt + * text with the options signature so two distinct questions with shared menu + * labels (the generic A/B/C TODO-proposal menu, for instance) get different + * fingerprints. + * + * Uses Bun's fast non-crypto hash since these strings are short and we only + * need collision resistance against accidental TTY redraws, not adversaries. + * Hex-encoded for diagnostic dumps. + */ +export function auqFingerprint( + promptSnippet: string, + opts: Array<{ index: number; label: string }>, +): string { + const normalized = promptSnippet.replace(/\s+/g, ' ').trim(); + const sig = optionsSignature(opts); + // eslint-disable-next-line @typescript-eslint/no-explicit-any + return (Bun as any).hash(normalized + '||' + sig).toString(16); +} + +/** + * Detects when a plan-* skill has reached its Completion Summary / Review + * Report — a terminal signal complementary to plan-mode's "Ready to execute" + * confirmation. Each plan-review skill writes one of these phrasings near + * the end of its run; matching any one is enough to stop counting. + * + * Best-effort: this is a content marker, not a deterministic event. Hard + * ceiling (`reviewCountCeiling` in `runPlanSkillCounting`) is the reliable + * stop signal; this regex is the "we're done, go gracefully" hint. + */ +export const COMPLETION_SUMMARY_RE = + /(GSTACK REVIEW REPORT|## Completion [Ss]ummary|Status:\s*(clean|issues_open)|^VERDICT:)/m; + +/** + * Result of asserting that a plan file ends with `## GSTACK REVIEW REPORT` + * as its last `## ` heading. `ok` is true iff the report is present AND no + * other `## ` heading appears after it. Diagnostic fields are populated only + * on failure to keep the success path cheap. + */ +export interface ReviewReportAtBottomResult { + ok: boolean; + reason?: string; + trailingHeadings?: string[]; +} + +/** + * Assert that `## GSTACK REVIEW REPORT` is the last `## ` heading in a plan + * file's content. Pure string operation — no filesystem access. Used by the + * finding-count E2E tests as a second assertion on each test's produced plan. + * + * The plan-mode skill template mandates the agent move/append the review + * report so it's always the last `##` section. A regression where the agent + * appends additional sections after the report (or skips it entirely) ships + * silently today; this assertion catches both. + */ +export function assertReviewReportAtBottom( + content: string, +): ReviewReportAtBottomResult { + const re = /^## GSTACK REVIEW REPORT\s*$/m; + const match = re.exec(content); + if (!match) { + return { ok: false, reason: 'no GSTACK REVIEW REPORT section' }; + } + const after = content.slice(match.index + match[0].length); + // Match any `## ` heading after the report. Reject `## ` followed by + // newline-only (trailing-whitespace ## headers) to avoid false positives. + const trailingHeadings = Array.from( + after.matchAll(/^## \S.*$/gm), + ).map((m) => m[0]); + if (trailingHeadings.length > 0) { + return { + ok: false, + reason: 'trailing ## heading(s) after GSTACK REVIEW REPORT', + trailingHeadings, + }; + } + return { ok: true }; +} + +/** + * Per-skill Step-0 boundary predicates. Each fires `true` when the answered + * AUQ's fingerprint matches the LAST question of that skill's Step 0 phase. + * + * - `ceoStep0Boundary`: matches the mode-pick AUQ (options match `MODE_RE`). + * - `engStep0Boundary`: matches the cross-project-learnings or scope-reduction + * AUQ that closes plan-eng-review's preamble. + * - `designStep0Boundary`: matches plan-design-review's first dimension / + * posture AUQ. + * - `devexStep0Boundary`: matches plan-devex-review's persona-selection AUQ. + * + * Predicates live alongside the helper so the unit suite can exercise each + * against synthetic fingerprints (positive AND negative cases). Skill test + * files import them directly. + */ +export const ceoStep0Boundary: Step0BoundaryPredicate = (fp) => + fp.options.some((o) => MODE_RE.test(o.label)); + +export const engStep0Boundary: Step0BoundaryPredicate = (fp) => + /scope reduction recommendation|cross[\s-]?project learnings/i.test( + fp.promptSnippet, + ); + +export const designStep0Boundary: Step0BoundaryPredicate = (fp) => + /design system|design posture|design score|first dimension/i.test( + fp.promptSnippet, + ); + +export const devexStep0Boundary: Step0BoundaryPredicate = (fp) => + /developer persona|target persona|persona selection|TTHW target/i.test( + fp.promptSnippet, + ); + /** * Spawn `claude --permission-mode plan` in a real PTY and return a session * handle. Caller is responsible for `await session.close()` to release the diff --git a/test/helpers/claude-pty-runner.unit.test.ts b/test/helpers/claude-pty-runner.unit.test.ts index c88fb8d0..28b96b83 100644 --- a/test/helpers/claude-pty-runner.unit.test.ts +++ b/test/helpers/claude-pty-runner.unit.test.ts @@ -30,7 +30,17 @@ import { parseNumberedOptions, classifyVisible, TAIL_SCAN_BYTES, + optionsSignature, + parseQuestionPrompt, + auqFingerprint, + COMPLETION_SUMMARY_RE, + assertReviewReportAtBottom, + ceoStep0Boundary, + engStep0Boundary, + designStep0Boundary, + devexStep0Boundary, type ClaudePtyOptions, + type AskUserQuestionFingerprint, } from './claude-pty-runner'; describe('isPermissionDialogVisible', () => { @@ -318,3 +328,333 @@ describe('runPlanSkillObservation env passthrough surface', () => { expect(opts.env).toEqual({ QUESTION_TUNING: 'false', EXPLAIN_LEVEL: 'default' }); }); }); + +// ──────────────────────────────────────────────────────────────────────────── +// Per-finding count primitives — Section 3 unit tests #1–#5, #7, #12. +// ──────────────────────────────────────────────────────────────────────────── + +describe('optionsSignature', () => { + test('returns a "|"-joined `index:label` string for a clean list', () => { + const sig = optionsSignature([ + { index: 1, label: 'HOLD SCOPE' }, + { index: 2, label: 'SCOPE EXPANSION' }, + ]); + expect(sig).toBe('1:HOLD SCOPE|2:SCOPE EXPANSION'); + }); + + test('order-independent: shuffled inputs produce the same signature', () => { + // parseNumberedOptions already returns sorted, but defensive sort means + // a future caller that hands us shuffled input still produces a stable + // dedupe signature. + const a = optionsSignature([ + { index: 2, label: 'B' }, + { index: 1, label: 'A' }, + { index: 3, label: 'C' }, + ]); + const b = optionsSignature([ + { index: 1, label: 'A' }, + { index: 2, label: 'B' }, + { index: 3, label: 'C' }, + ]); + expect(a).toBe(b); + }); + + test('empty list returns empty string', () => { + expect(optionsSignature([])).toBe(''); + }); + + test('single-item list returns just that entry', () => { + expect(optionsSignature([{ index: 1, label: 'Only' }])).toBe('1:Only'); + }); +}); + +describe('parseQuestionPrompt', () => { + test('captures 1-line prompt above the cursor', () => { + const visible = ` + D1 — Pick a mode + + ❯ 1. HOLD SCOPE + 2. SCOPE EXPANSION + `; + const prompt = parseQuestionPrompt(visible); + expect(prompt).toBe('D1 — Pick a mode'); + }); + + test('captures multi-line prompt above the cursor', () => { + const visible = ` + D2 — Approach selection + + Which architecture should we follow? + + ❯ 1. Bypass existing helper + 2. Reuse existing helper + `; + const prompt = parseQuestionPrompt(visible); + // Multi-line prompts get joined with single spaces. + expect(prompt).toContain('D2 — Approach selection'); + expect(prompt).toContain('Which architecture should we follow?'); + }); + + test('returns "" when no cursor is rendered', () => { + expect(parseQuestionPrompt('Just some prose.\nNo cursor.')).toBe(''); + }); + + test('truncates to 240 chars', () => { + const longPrompt = 'A'.repeat(500); + const visible = `${longPrompt}\n\n ❯ 1. yes\n 2. no`; + expect(parseQuestionPrompt(visible).length).toBeLessThanOrEqual(240); + }); + + test('does not pull text from a previous numbered list above', () => { + const visible = ` + ❯ 1. previous answered question + 2. previous option two + + D2 — A new question text + + ❯ 1. fresh option A + 2. fresh option B + `; + const prompt = parseQuestionPrompt(visible); + // Stops at the previous numbered-list line; should NOT contain "previous answered question". + expect(prompt).toContain('D2 — A new question text'); + expect(prompt).not.toContain('previous answered question'); + }); + + test('normalizes whitespace (collapses runs of spaces and tabs)', () => { + const visible = `D1 — Spaced out + + ❯ 1. yes + 2. no`; + expect(parseQuestionPrompt(visible)).toBe('D1 — Spaced out'); + }); +}); + +describe('auqFingerprint', () => { + test('returns the same fingerprint for identical inputs', () => { + const opts = [ + { index: 1, label: 'A' }, + { index: 2, label: 'B' }, + ]; + expect(auqFingerprint('hello', opts)).toBe(auqFingerprint('hello', opts)); + }); + + test('different prompts with shared option labels produce DIFFERENT fingerprints', () => { + // The collision regression Codex F1 caught: option-label-only fingerprints + // collapsed multiple distinct findings into one when they shared menu shape. + const sharedOpts = [ + { index: 1, label: 'Add to plan' }, + { index: 2, label: 'Defer' }, + { index: 3, label: 'Build now' }, + ]; + const fpFinding1 = auqFingerprint('D5 — Architecture: bypass helper?', sharedOpts); + const fpFinding2 = auqFingerprint('D6 — Tests: zero coverage?', sharedOpts); + expect(fpFinding1).not.toBe(fpFinding2); + }); + + test('same prompt with different options produces DIFFERENT fingerprints', () => { + const prompt = 'D1 — Pick a mode'; + const fpA = auqFingerprint(prompt, [ + { index: 1, label: 'HOLD SCOPE' }, + { index: 2, label: 'SCOPE EXPANSION' }, + ]); + const fpB = auqFingerprint(prompt, [ + { index: 1, label: 'HOLD SCOPE' }, + { index: 2, label: 'SCOPE REDUCTION' }, + ]); + expect(fpA).not.toBe(fpB); + }); + + test('whitespace-only differences in prompt do NOT change the fingerprint', () => { + // Same content, different rendering whitespace (TTY redraw artifact) + // must produce the same fingerprint so dedupe survives reflow. + const opts = [{ index: 1, label: 'A' }, { index: 2, label: 'B' }]; + const fpA = auqFingerprint('Pick a mode', opts); + const fpB = auqFingerprint('Pick a mode', opts); + expect(fpA).toBe(fpB); + }); + + test('empty prompt + same options collide (caller must guard against this)', () => { + // Documents the contract: empty-prompt fingerprints WILL collide if the + // caller fingerprints them. runPlanSkillCounting must skip empty-prompt + // AUQs and re-poll instead. + const opts = [{ index: 1, label: 'A' }]; + expect(auqFingerprint('', opts)).toBe(auqFingerprint('', opts)); + }); +}); + +describe('COMPLETION_SUMMARY_RE', () => { + test('matches GSTACK REVIEW REPORT heading', () => { + expect(COMPLETION_SUMMARY_RE.test('## GSTACK REVIEW REPORT')).toBe(true); + }); + + test('matches Completion Summary heading (ceo + eng)', () => { + expect(COMPLETION_SUMMARY_RE.test('## Completion Summary')).toBe(true); + expect(COMPLETION_SUMMARY_RE.test('## Completion summary')).toBe(true); + }); + + test('matches Status: clean (CEO review-log shape)', () => { + expect(COMPLETION_SUMMARY_RE.test('Status: clean')).toBe(true); + expect(COMPLETION_SUMMARY_RE.test('Status: issues_open')).toBe(true); + }); + + test('matches VERDICT: line', () => { + expect(COMPLETION_SUMMARY_RE.test('VERDICT: CLEARED — Eng Review passed')).toBe(true); + }); + + test('does NOT match prose mentions of "verdict" mid-line', () => { + // VERDICT must be at the start of a line to count. + expect(COMPLETION_SUMMARY_RE.test('the final verdict: undecided')).toBe(false); + }); +}); + +describe('assertReviewReportAtBottom', () => { + test('passes when REVIEW REPORT is the only/last ## heading', () => { + const content = `# Plan + +## Context +stuff + +## Approach +more stuff + +## GSTACK REVIEW REPORT + +| col | col | +`; + const r = assertReviewReportAtBottom(content); + expect(r.ok).toBe(true); + }); + + test('fails when REVIEW REPORT is missing', () => { + const content = `# Plan + +## Context +stuff +`; + const r = assertReviewReportAtBottom(content); + expect(r.ok).toBe(false); + expect(r.reason).toMatch(/no GSTACK REVIEW REPORT/); + }); + + test('fails when REVIEW REPORT exists but a ## heading follows it', () => { + const content = `# Plan + +## GSTACK REVIEW REPORT + +| col | col | + +## Late Section +oops +`; + const r = assertReviewReportAtBottom(content); + expect(r.ok).toBe(false); + expect(r.reason).toMatch(/trailing ## heading/); + expect(r.trailingHeadings).toEqual(['## Late Section']); + }); + + test('passes when only ### subheadings follow REVIEW REPORT (deeper nesting allowed)', () => { + const content = `## GSTACK REVIEW REPORT + +### Cross-model tension +- F1: resolved +- F2: resolved +`; + const r = assertReviewReportAtBottom(content); + expect(r.ok).toBe(true); + }); + + test('fails with multiple trailing ## headings reported', () => { + const content = `## GSTACK REVIEW REPORT + +## First trailing + +## Second trailing +`; + const r = assertReviewReportAtBottom(content); + expect(r.ok).toBe(false); + expect(r.trailingHeadings).toHaveLength(2); + }); +}); + +describe('Step0BoundaryPredicate per-skill', () => { + // Helper to build a synthetic fingerprint for predicate tests. + function fp(promptSnippet: string, optionLabels: string[]): AskUserQuestionFingerprint { + const options = optionLabels.map((label, i) => ({ index: i + 1, label })); + return { + signature: auqFingerprint(promptSnippet, options), + promptSnippet, + options, + observedAtMs: 0, + preReview: true, + }; + } + + describe('ceoStep0Boundary', () => { + test('FIRES on Step 0F mode-pick AUQ (HOLD SCOPE in options)', () => { + const f = fp('Pick a mode', ['HOLD SCOPE', 'SCOPE EXPANSION', 'SELECTIVE EXPANSION', 'SCOPE REDUCTION']); + expect(ceoStep0Boundary(f)).toBe(true); + }); + + test('does NOT fire on premise challenge AUQs', () => { + const f = fp('D1 — Premise check: is this the right problem?', ['Yes', 'No', 'Other']); + expect(ceoStep0Boundary(f)).toBe(false); + }); + + test('does NOT fire on review-section AUQs', () => { + const f = fp('Architecture: bypass helper?', ['Reuse existing', 'Roll new', 'Defer']); + expect(ceoStep0Boundary(f)).toBe(false); + }); + }); + + describe('engStep0Boundary', () => { + test('FIRES on cross-project learnings prompt', () => { + const f = fp('Enable cross-project learnings on this machine?', ['Yes', 'No']); + expect(engStep0Boundary(f)).toBe(true); + }); + + test('FIRES on scope reduction recommendation', () => { + const f = fp('Scope reduction recommendation: cut to MVP?', ['Reduce', 'Proceed', 'Modify']); + expect(engStep0Boundary(f)).toBe(true); + }); + + test('does NOT fire on review-section AUQs', () => { + const f = fp('Architecture: shared mutable state?', ['Refactor', 'Defer', 'Skip']); + expect(engStep0Boundary(f)).toBe(false); + }); + }); + + describe('designStep0Boundary', () => { + test('FIRES on design system / posture mention', () => { + const f = fp('Pick a design posture for this review', ['Polish', 'Triage', 'Expansion']); + expect(designStep0Boundary(f)).toBe(true); + }); + + test('FIRES on first-dimension prompt', () => { + const f = fp('First dimension: visual hierarchy. Score?', ['7', '8', '9']); + expect(designStep0Boundary(f)).toBe(true); + }); + + test('does NOT fire on later dimension AUQs', () => { + const f = fp('Spacing dimension score?', ['7', '8', '9']); + expect(designStep0Boundary(f)).toBe(false); + }); + }); + + describe('devexStep0Boundary', () => { + test('FIRES on developer persona selection', () => { + const f = fp('Pick the target persona for this review', ['Senior backend', 'Junior frontend', 'Other']); + expect(devexStep0Boundary(f)).toBe(true); + }); + + test('FIRES on TTHW target prompt', () => { + const f = fp('What is the TTHW target for first run?', ['<5 min', '<15 min', '<30 min']); + expect(devexStep0Boundary(f)).toBe(true); + }); + + test('does NOT fire on review-section AUQs', () => { + const f = fp('Friction point: 5-min CI wait. Address?', ['Now', 'Defer', 'Skip']); + expect(devexStep0Boundary(f)).toBe(false); + }); + }); +});