test: add per-finding count primitives + unit tests

Pure helpers landing ahead of runPlanSkillCounting: - parseQuestionPrompt(visible) — extract the 1-3 line prompt above the latest "❯ 1." cursor, normalize to a 240-char snippet - auqFingerprint(prompt, opts) — Bun.hash of normalized prompt + sorted options signature; distinct prompts with shared option labels (the generic A/B/C TODO menu) get distinct fingerprints - COMPLETION_SUMMARY_RE — terminal-signal regex matching all four plan-review skills' completion / verdict markers - assertReviewReportAtBottom(content) — checks "## GSTACK REVIEW REPORT" is present and is the last "## " heading in a plan file - Step0BoundaryPredicate type + four per-skill predicates (ceo / eng / design / devex) — fire on the answered AUQ's fingerprint, marking the end of Step 0 deterministically (event-based, not content-based, per Codex F7) Plus 37 deterministic unit tests covering option-label collision regression, prompt extraction edge cases, predicate positive AND negative cases, and review-report-at-bottom triple-check (missing / mid-file / multiple trailing). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-02 03:35:09 +02:00 · 2026-04-28 09:10:58 -07:00
parent e6ce1dca70
commit fa07012845
2 changed files with 552 additions and 0 deletions
@@ -376,6 +376,218 @@ export function classifyVisible(visible: string): ClassifyResult {
  return null;
 }

+// ────────────────────────────────────────────────────────────────────────────
+// Per-finding AskUserQuestion count primitives (used by runPlanSkillCounting).
+//
+// These are pure helpers extracted up-front so the unit suite can exercise
+// them deterministically before the live-PTY counter runs them. Each one is
+// independently unit-testable against synthetic visible-buffer strings.
+// ────────────────────────────────────────────────────────────────────────────
+
+/**
+ * Captured identity of an AskUserQuestion — the rendered question text plus
+ * its numbered options. Used by `runPlanSkillCounting` to dedupe redrawn
+ * prompts and to feed `Step0BoundaryPredicate` callers.
+ *
+ * `signature` is the stable hash. Two AUQs with identical prompt + options
+ * produce the same signature; differences in either field produce different
+ * signatures. Critically: two AUQs with shared option labels (e.g. the
+ * generic "A) Add to plan / B) Defer / C) Build now" menu) but different
+ * question text get DIFFERENT signatures because the prompt is in the hash.
+ */
+export interface AskUserQuestionFingerprint {
+  /** Stable hash combining normalized prompt text + options signature. */
+  signature: string;
+  /** First 240 chars of the rendered question prompt (post-normalization). */
+  promptSnippet: string;
+  /** Captured option labels, in index order. */
+  options: Array<{ index: number; label: string }>;
+  /** Wall-clock when first observed (ms since the helper started polling). */
+  observedAtMs: number;
+  /** True if observed BEFORE the Step-0 boundary fired. */
+  preReview: boolean;
+}
+
+/**
+ * Predicate fired against the AUQ we just answered (not the visible buffer).
+ * Returns true if this AUQ's fingerprint marks the LAST Step-0 question for
+ * its skill — all subsequent AUQs are review-phase findings.
+ *
+ * Event-based by design: matching against an answered AUQ's fingerprint
+ * (prompt + options) is deterministic, whereas matching against later
+ * rendered content (section headers, summary text) races with the agent's
+ * output cadence. See plan §D14 for the rationale.
+ */
+export type Step0BoundaryPredicate = (
+  answeredFingerprint: AskUserQuestionFingerprint,
+) => boolean;
+
+/**
+ * Parse the rendered question prompt out of a visible TTY buffer. The prompt
+ * is the 1–3 lines of text immediately ABOVE the latest `❯ 1.` cursor line —
+ * not part of the option list, not the permission-dialog header.
+ *
+ * Returns the prompt normalized to a single-spaced 240-char snippet (strip
+ * ANSI residue, collapse internal whitespace, trim) — short enough to use as
+ * a hash key, long enough to disambiguate distinct questions.
+ *
+ * Returns "" when no prompt could be parsed (cursor not yet rendered, or
+ * cursor is at the top of the buffer with no preceding text). Callers that
+ * use the empty string as a fingerprint input should treat empty-prompt
+ * AUQs as "wait one more poll" rather than fingerprinting them — otherwise
+ * the same options + empty prompt across two distinct questions collide.
+ */
+export function parseQuestionPrompt(visible: string): string {
+  // Tail-only — older prompts higher in the buffer are stale.
+  const tail = visible.length > 4096 ? visible.slice(-4096) : visible;
+  const lines = tail.split('\n');
+
+  // Find the latest `❯ 1.` cursor line (matching parseNumberedOptions).
+  let cursorLineIdx = -1;
+  for (let i = lines.length - 1; i >= 0; i--) {
+    if (/^\s*❯\s*1\./.test(lines[i] ?? '')) {
+      cursorLineIdx = i;
+      break;
+    }
+  }
+  if (cursorLineIdx < 0) return '';
+
+  // Walk up at most 6 lines collecting prompt text. Stop at:
+  //   - a blank line preceded by another blank line (paragraph break)
+  //   - top of buffer
+  //   - a line that itself starts with `N.` (we're inside an option list)
+  const promptLines: string[] = [];
+  let blankRun = 0;
+  for (let i = cursorLineIdx - 1; i >= 0 && promptLines.length < 6; i--) {
+    const raw = lines[i] ?? '';
+    const trimmed = raw.trim();
+    if (trimmed === '') {
+      blankRun += 1;
+      if (blankRun >= 2 && promptLines.length > 0) break;
+      continue;
+    }
+    blankRun = 0;
+    // Stop if we hit what looks like a previous numbered list.
+    if (/^[\s❯]*[1-9]\.\s+\S/.test(raw)) break;
+    promptLines.unshift(trimmed);
+  }
+
+  const joined = promptLines.join(' ').replace(/\s+/g, ' ').trim();
+  return joined.slice(0, 240);
+}
+
+/**
+ * Stable hash for an AskUserQuestion's identity — combines normalized prompt
+ * text with the options signature so two distinct questions with shared menu
+ * labels (the generic A/B/C TODO-proposal menu, for instance) get different
+ * fingerprints.
+ *
+ * Uses Bun's fast non-crypto hash since these strings are short and we only
+ * need collision resistance against accidental TTY redraws, not adversaries.
+ * Hex-encoded for diagnostic dumps.
+ */
+export function auqFingerprint(
+  promptSnippet: string,
+  opts: Array<{ index: number; label: string }>,
+): string {
+  const normalized = promptSnippet.replace(/\s+/g, ' ').trim();
+  const sig = optionsSignature(opts);
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  return (Bun as any).hash(normalized + '||' + sig).toString(16);
+}
+
+/**
+ * Detects when a plan-* skill has reached its Completion Summary / Review
+ * Report — a terminal signal complementary to plan-mode's "Ready to execute"
+ * confirmation. Each plan-review skill writes one of these phrasings near
+ * the end of its run; matching any one is enough to stop counting.
+ *
+ * Best-effort: this is a content marker, not a deterministic event. Hard
+ * ceiling (`reviewCountCeiling` in `runPlanSkillCounting`) is the reliable
+ * stop signal; this regex is the "we're done, go gracefully" hint.
+ */
+export const COMPLETION_SUMMARY_RE =
+  /(GSTACK REVIEW REPORT|## Completion [Ss]ummary|Status:\s*(clean|issues_open)|^VERDICT:)/m;
+
+/**
+ * Result of asserting that a plan file ends with `## GSTACK REVIEW REPORT`
+ * as its last `## ` heading. `ok` is true iff the report is present AND no
+ * other `## ` heading appears after it. Diagnostic fields are populated only
+ * on failure to keep the success path cheap.
+ */
+export interface ReviewReportAtBottomResult {
+  ok: boolean;
+  reason?: string;
+  trailingHeadings?: string[];
+}
+
+/**
+ * Assert that `## GSTACK REVIEW REPORT` is the last `## ` heading in a plan
+ * file's content. Pure string operation — no filesystem access. Used by the
+ * finding-count E2E tests as a second assertion on each test's produced plan.
+ *
+ * The plan-mode skill template mandates the agent move/append the review
+ * report so it's always the last `##` section. A regression where the agent
+ * appends additional sections after the report (or skips it entirely) ships
+ * silently today; this assertion catches both.
+ */
+export function assertReviewReportAtBottom(
+  content: string,
+): ReviewReportAtBottomResult {
+  const re = /^## GSTACK REVIEW REPORT\s*$/m;
+  const match = re.exec(content);
+  if (!match) {
+    return { ok: false, reason: 'no GSTACK REVIEW REPORT section' };
+  }
+  const after = content.slice(match.index + match[0].length);
+  // Match any `## ` heading after the report. Reject `## ` followed by
+  // newline-only (trailing-whitespace ## headers) to avoid false positives.
+  const trailingHeadings = Array.from(
+    after.matchAll(/^## \S.*$/gm),
+  ).map((m) => m[0]);
+  if (trailingHeadings.length > 0) {
+    return {
+      ok: false,
+      reason: 'trailing ## heading(s) after GSTACK REVIEW REPORT',
+      trailingHeadings,
+    };
+  }
+  return { ok: true };
+}
+
+/**
+ * Per-skill Step-0 boundary predicates. Each fires `true` when the answered
+ * AUQ's fingerprint matches the LAST question of that skill's Step 0 phase.
+ *
+ * - `ceoStep0Boundary`: matches the mode-pick AUQ (options match `MODE_RE`).
+ * - `engStep0Boundary`: matches the cross-project-learnings or scope-reduction
+ *   AUQ that closes plan-eng-review's preamble.
+ * - `designStep0Boundary`: matches plan-design-review's first dimension /
+ *   posture AUQ.
+ * - `devexStep0Boundary`: matches plan-devex-review's persona-selection AUQ.
+ *
+ * Predicates live alongside the helper so the unit suite can exercise each
+ * against synthetic fingerprints (positive AND negative cases). Skill test
+ * files import them directly.
+ */
+export const ceoStep0Boundary: Step0BoundaryPredicate = (fp) =>
+  fp.options.some((o) => MODE_RE.test(o.label));
+
+export const engStep0Boundary: Step0BoundaryPredicate = (fp) =>
+  /scope reduction recommendation|cross[\s-]?project learnings/i.test(
+    fp.promptSnippet,
+  );
+
+export const designStep0Boundary: Step0BoundaryPredicate = (fp) =>
+  /design system|design posture|design score|first dimension/i.test(
+    fp.promptSnippet,
+  );
+
+export const devexStep0Boundary: Step0BoundaryPredicate = (fp) =>
+  /developer persona|target persona|persona selection|TTHW target/i.test(
+    fp.promptSnippet,
+  );
+
 /**
 * Spawn `claude --permission-mode plan` in a real PTY and return a session
 * handle. Caller is responsible for `await session.close()` to release the
@@ -30,7 +30,17 @@ import {
  parseNumberedOptions,
  classifyVisible,
  TAIL_SCAN_BYTES,
+  optionsSignature,
+  parseQuestionPrompt,
+  auqFingerprint,
+  COMPLETION_SUMMARY_RE,
+  assertReviewReportAtBottom,
+  ceoStep0Boundary,
+  engStep0Boundary,
+  designStep0Boundary,
+  devexStep0Boundary,
  type ClaudePtyOptions,
+  type AskUserQuestionFingerprint,
 } from './claude-pty-runner';

 describe('isPermissionDialogVisible', () => {
@@ -318,3 +328,333 @@ describe('runPlanSkillObservation env passthrough surface', () => {
    expect(opts.env).toEqual({ QUESTION_TUNING: 'false', EXPLAIN_LEVEL: 'default' });
  });
 });
+
+// ────────────────────────────────────────────────────────────────────────────
+// Per-finding count primitives — Section 3 unit tests #1–#5, #7, #12.
+// ────────────────────────────────────────────────────────────────────────────
+
+describe('optionsSignature', () => {
+  test('returns a "|"-joined `index:label` string for a clean list', () => {
+    const sig = optionsSignature([
+      { index: 1, label: 'HOLD SCOPE' },
+      { index: 2, label: 'SCOPE EXPANSION' },
+    ]);
+    expect(sig).toBe('1:HOLD SCOPE|2:SCOPE EXPANSION');
+  });
+
+  test('order-independent: shuffled inputs produce the same signature', () => {
+    // parseNumberedOptions already returns sorted, but defensive sort means
+    // a future caller that hands us shuffled input still produces a stable
+    // dedupe signature.
+    const a = optionsSignature([
+      { index: 2, label: 'B' },
+      { index: 1, label: 'A' },
+      { index: 3, label: 'C' },
+    ]);
+    const b = optionsSignature([
+      { index: 1, label: 'A' },
+      { index: 2, label: 'B' },
+      { index: 3, label: 'C' },
+    ]);
+    expect(a).toBe(b);
+  });
+
+  test('empty list returns empty string', () => {
+    expect(optionsSignature([])).toBe('');
+  });
+
+  test('single-item list returns just that entry', () => {
+    expect(optionsSignature([{ index: 1, label: 'Only' }])).toBe('1:Only');
+  });
+});
+
+describe('parseQuestionPrompt', () => {
+  test('captures 1-line prompt above the cursor', () => {
+    const visible = `
+      D1 — Pick a mode
+
+      ❯ 1. HOLD SCOPE
+        2. SCOPE EXPANSION
+    `;
+    const prompt = parseQuestionPrompt(visible);
+    expect(prompt).toBe('D1 — Pick a mode');
+  });
+
+  test('captures multi-line prompt above the cursor', () => {
+    const visible = `
+      D2 — Approach selection
+
+      Which architecture should we follow?
+
+      ❯ 1. Bypass existing helper
+        2. Reuse existing helper
+    `;
+    const prompt = parseQuestionPrompt(visible);
+    // Multi-line prompts get joined with single spaces.
+    expect(prompt).toContain('D2 — Approach selection');
+    expect(prompt).toContain('Which architecture should we follow?');
+  });
+
+  test('returns "" when no cursor is rendered', () => {
+    expect(parseQuestionPrompt('Just some prose.\nNo cursor.')).toBe('');
+  });
+
+  test('truncates to 240 chars', () => {
+    const longPrompt = 'A'.repeat(500);
+    const visible = `${longPrompt}\n\n      ❯ 1. yes\n        2. no`;
+    expect(parseQuestionPrompt(visible).length).toBeLessThanOrEqual(240);
+  });
+
+  test('does not pull text from a previous numbered list above', () => {
+    const visible = `
+      ❯ 1. previous answered question
+        2. previous option two
+
+      D2 — A new question text
+
+      ❯ 1. fresh option A
+        2. fresh option B
+    `;
+    const prompt = parseQuestionPrompt(visible);
+    // Stops at the previous numbered-list line; should NOT contain "previous answered question".
+    expect(prompt).toContain('D2 — A new question text');
+    expect(prompt).not.toContain('previous answered question');
+  });
+
+  test('normalizes whitespace (collapses runs of spaces and tabs)', () => {
+    const visible = `D1   —    Spaced     out
+
+      ❯ 1. yes
+        2. no`;
+    expect(parseQuestionPrompt(visible)).toBe('D1 — Spaced out');
+  });
+});
+
+describe('auqFingerprint', () => {
+  test('returns the same fingerprint for identical inputs', () => {
+    const opts = [
+      { index: 1, label: 'A' },
+      { index: 2, label: 'B' },
+    ];
+    expect(auqFingerprint('hello', opts)).toBe(auqFingerprint('hello', opts));
+  });
+
+  test('different prompts with shared option labels produce DIFFERENT fingerprints', () => {
+    // The collision regression Codex F1 caught: option-label-only fingerprints
+    // collapsed multiple distinct findings into one when they shared menu shape.
+    const sharedOpts = [
+      { index: 1, label: 'Add to plan' },
+      { index: 2, label: 'Defer' },
+      { index: 3, label: 'Build now' },
+    ];
+    const fpFinding1 = auqFingerprint('D5 — Architecture: bypass helper?', sharedOpts);
+    const fpFinding2 = auqFingerprint('D6 — Tests: zero coverage?', sharedOpts);
+    expect(fpFinding1).not.toBe(fpFinding2);
+  });
+
+  test('same prompt with different options produces DIFFERENT fingerprints', () => {
+    const prompt = 'D1 — Pick a mode';
+    const fpA = auqFingerprint(prompt, [
+      { index: 1, label: 'HOLD SCOPE' },
+      { index: 2, label: 'SCOPE EXPANSION' },
+    ]);
+    const fpB = auqFingerprint(prompt, [
+      { index: 1, label: 'HOLD SCOPE' },
+      { index: 2, label: 'SCOPE REDUCTION' },
+    ]);
+    expect(fpA).not.toBe(fpB);
+  });
+
+  test('whitespace-only differences in prompt do NOT change the fingerprint', () => {
+    // Same content, different rendering whitespace (TTY redraw artifact)
+    // must produce the same fingerprint so dedupe survives reflow.
+    const opts = [{ index: 1, label: 'A' }, { index: 2, label: 'B' }];
+    const fpA = auqFingerprint('Pick   a     mode', opts);
+    const fpB = auqFingerprint('Pick a mode', opts);
+    expect(fpA).toBe(fpB);
+  });
+
+  test('empty prompt + same options collide (caller must guard against this)', () => {
+    // Documents the contract: empty-prompt fingerprints WILL collide if the
+    // caller fingerprints them. runPlanSkillCounting must skip empty-prompt
+    // AUQs and re-poll instead.
+    const opts = [{ index: 1, label: 'A' }];
+    expect(auqFingerprint('', opts)).toBe(auqFingerprint('', opts));
+  });
+});
+
+describe('COMPLETION_SUMMARY_RE', () => {
+  test('matches GSTACK REVIEW REPORT heading', () => {
+    expect(COMPLETION_SUMMARY_RE.test('## GSTACK REVIEW REPORT')).toBe(true);
+  });
+
+  test('matches Completion Summary heading (ceo + eng)', () => {
+    expect(COMPLETION_SUMMARY_RE.test('## Completion Summary')).toBe(true);
+    expect(COMPLETION_SUMMARY_RE.test('## Completion summary')).toBe(true);
+  });
+
+  test('matches Status: clean (CEO review-log shape)', () => {
+    expect(COMPLETION_SUMMARY_RE.test('Status: clean')).toBe(true);
+    expect(COMPLETION_SUMMARY_RE.test('Status: issues_open')).toBe(true);
+  });
+
+  test('matches VERDICT: line', () => {
+    expect(COMPLETION_SUMMARY_RE.test('VERDICT: CLEARED — Eng Review passed')).toBe(true);
+  });
+
+  test('does NOT match prose mentions of "verdict" mid-line', () => {
+    // VERDICT must be at the start of a line to count.
+    expect(COMPLETION_SUMMARY_RE.test('the final verdict: undecided')).toBe(false);
+  });
+});
+
+describe('assertReviewReportAtBottom', () => {
+  test('passes when REVIEW REPORT is the only/last ## heading', () => {
+    const content = `# Plan
+
+## Context
+stuff
+
+## Approach
+more stuff
+
+## GSTACK REVIEW REPORT
+
+| col | col |
+`;
+    const r = assertReviewReportAtBottom(content);
+    expect(r.ok).toBe(true);
+  });
+
+  test('fails when REVIEW REPORT is missing', () => {
+    const content = `# Plan
+
+## Context
+stuff
+`;
+    const r = assertReviewReportAtBottom(content);
+    expect(r.ok).toBe(false);
+    expect(r.reason).toMatch(/no GSTACK REVIEW REPORT/);
+  });
+
+  test('fails when REVIEW REPORT exists but a ## heading follows it', () => {
+    const content = `# Plan
+
+## GSTACK REVIEW REPORT
+
+| col | col |
+
+## Late Section
+oops
+`;
+    const r = assertReviewReportAtBottom(content);
+    expect(r.ok).toBe(false);
+    expect(r.reason).toMatch(/trailing ## heading/);
+    expect(r.trailingHeadings).toEqual(['## Late Section']);
+  });
+
+  test('passes when only ### subheadings follow REVIEW REPORT (deeper nesting allowed)', () => {
+    const content = `## GSTACK REVIEW REPORT
+
+### Cross-model tension
+- F1: resolved
+- F2: resolved
+`;
+    const r = assertReviewReportAtBottom(content);
+    expect(r.ok).toBe(true);
+  });
+
+  test('fails with multiple trailing ## headings reported', () => {
+    const content = `## GSTACK REVIEW REPORT
+
+## First trailing
+
+## Second trailing
+`;
+    const r = assertReviewReportAtBottom(content);
+    expect(r.ok).toBe(false);
+    expect(r.trailingHeadings).toHaveLength(2);
+  });
+});
+
+describe('Step0BoundaryPredicate per-skill', () => {
+  // Helper to build a synthetic fingerprint for predicate tests.
+  function fp(promptSnippet: string, optionLabels: string[]): AskUserQuestionFingerprint {
+    const options = optionLabels.map((label, i) => ({ index: i + 1, label }));
+    return {
+      signature: auqFingerprint(promptSnippet, options),
+      promptSnippet,
+      options,
+      observedAtMs: 0,
+      preReview: true,
+    };
+  }
+
+  describe('ceoStep0Boundary', () => {
+    test('FIRES on Step 0F mode-pick AUQ (HOLD SCOPE in options)', () => {
+      const f = fp('Pick a mode', ['HOLD SCOPE', 'SCOPE EXPANSION', 'SELECTIVE EXPANSION', 'SCOPE REDUCTION']);
+      expect(ceoStep0Boundary(f)).toBe(true);
+    });
+
+    test('does NOT fire on premise challenge AUQs', () => {
+      const f = fp('D1 — Premise check: is this the right problem?', ['Yes', 'No', 'Other']);
+      expect(ceoStep0Boundary(f)).toBe(false);
+    });
+
+    test('does NOT fire on review-section AUQs', () => {
+      const f = fp('Architecture: bypass helper?', ['Reuse existing', 'Roll new', 'Defer']);
+      expect(ceoStep0Boundary(f)).toBe(false);
+    });
+  });
+
+  describe('engStep0Boundary', () => {
+    test('FIRES on cross-project learnings prompt', () => {
+      const f = fp('Enable cross-project learnings on this machine?', ['Yes', 'No']);
+      expect(engStep0Boundary(f)).toBe(true);
+    });
+
+    test('FIRES on scope reduction recommendation', () => {
+      const f = fp('Scope reduction recommendation: cut to MVP?', ['Reduce', 'Proceed', 'Modify']);
+      expect(engStep0Boundary(f)).toBe(true);
+    });
+
+    test('does NOT fire on review-section AUQs', () => {
+      const f = fp('Architecture: shared mutable state?', ['Refactor', 'Defer', 'Skip']);
+      expect(engStep0Boundary(f)).toBe(false);
+    });
+  });
+
+  describe('designStep0Boundary', () => {
+    test('FIRES on design system / posture mention', () => {
+      const f = fp('Pick a design posture for this review', ['Polish', 'Triage', 'Expansion']);
+      expect(designStep0Boundary(f)).toBe(true);
+    });
+
+    test('FIRES on first-dimension prompt', () => {
+      const f = fp('First dimension: visual hierarchy. Score?', ['7', '8', '9']);
+      expect(designStep0Boundary(f)).toBe(true);
+    });
+
+    test('does NOT fire on later dimension AUQs', () => {
+      const f = fp('Spacing dimension score?', ['7', '8', '9']);
+      expect(designStep0Boundary(f)).toBe(false);
+    });
+  });
+
+  describe('devexStep0Boundary', () => {
+    test('FIRES on developer persona selection', () => {
+      const f = fp('Pick the target persona for this review', ['Senior backend', 'Junior frontend', 'Other']);
+      expect(devexStep0Boundary(f)).toBe(true);
+    });
+
+    test('FIRES on TTHW target prompt', () => {
+      const f = fp('What is the TTHW target for first run?', ['<5 min', '<15 min', '<30 min']);
+      expect(devexStep0Boundary(f)).toBe(true);
+    });
+
+    test('does NOT fire on review-section AUQs', () => {
+      const f = fp('Friction point: 5-min CI wait. Address?', ['Now', 'Defer', 'Skip']);
+      expect(devexStep0Boundary(f)).toBe(false);
+    });
+  });
+});