test: add per-finding count primitives + unit tests

Pure helpers landing ahead of runPlanSkillCounting:

  - parseQuestionPrompt(visible) — extract the 1-3 line prompt above
    the latest "❯ 1." cursor, normalize to a 240-char snippet
  - auqFingerprint(prompt, opts) — Bun.hash of normalized prompt + sorted
    options signature; distinct prompts with shared option labels
    (the generic A/B/C TODO menu) get distinct fingerprints
  - COMPLETION_SUMMARY_RE — terminal-signal regex matching all four
    plan-review skills' completion / verdict markers
  - assertReviewReportAtBottom(content) — checks "## GSTACK REVIEW
    REPORT" is present and is the last "## " heading in a plan file
  - Step0BoundaryPredicate type + four per-skill predicates
    (ceo / eng / design / devex) — fire on the answered AUQ's
    fingerprint, marking the end of Step 0 deterministically
    (event-based, not content-based, per Codex F7)

Plus 37 deterministic unit tests covering option-label collision
regression, prompt extraction edge cases, predicate positive AND
negative cases, and review-report-at-bottom triple-check
(missing / mid-file / multiple trailing).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-04-28 09:10:58 -07:00
parent e6ce1dca70
commit fa07012845
2 changed files with 552 additions and 0 deletions
+212
View File
@@ -376,6 +376,218 @@ export function classifyVisible(visible: string): ClassifyResult {
return null;
}
// ────────────────────────────────────────────────────────────────────────────
// Per-finding AskUserQuestion count primitives (used by runPlanSkillCounting).
//
// These are pure helpers extracted up-front so the unit suite can exercise
// them deterministically before the live-PTY counter runs them. Each one is
// independently unit-testable against synthetic visible-buffer strings.
// ────────────────────────────────────────────────────────────────────────────
/**
* Captured identity of an AskUserQuestion — the rendered question text plus
* its numbered options. Used by `runPlanSkillCounting` to dedupe redrawn
* prompts and to feed `Step0BoundaryPredicate` callers.
*
* `signature` is the stable hash. Two AUQs with identical prompt + options
* produce the same signature; differences in either field produce different
* signatures. Critically: two AUQs with shared option labels (e.g. the
* generic "A) Add to plan / B) Defer / C) Build now" menu) but different
* question text get DIFFERENT signatures because the prompt is in the hash.
*/
export interface AskUserQuestionFingerprint {
/** Stable hash combining normalized prompt text + options signature. */
signature: string;
/** First 240 chars of the rendered question prompt (post-normalization). */
promptSnippet: string;
/** Captured option labels, in index order. */
options: Array<{ index: number; label: string }>;
/** Wall-clock when first observed (ms since the helper started polling). */
observedAtMs: number;
/** True if observed BEFORE the Step-0 boundary fired. */
preReview: boolean;
}
/**
* Predicate fired against the AUQ we just answered (not the visible buffer).
* Returns true if this AUQ's fingerprint marks the LAST Step-0 question for
* its skill — all subsequent AUQs are review-phase findings.
*
* Event-based by design: matching against an answered AUQ's fingerprint
* (prompt + options) is deterministic, whereas matching against later
* rendered content (section headers, summary text) races with the agent's
* output cadence. See plan §D14 for the rationale.
*/
export type Step0BoundaryPredicate = (
answeredFingerprint: AskUserQuestionFingerprint,
) => boolean;
/**
* Parse the rendered question prompt out of a visible TTY buffer. The prompt
* is the 13 lines of text immediately ABOVE the latest ` 1.` cursor line —
* not part of the option list, not the permission-dialog header.
*
* Returns the prompt normalized to a single-spaced 240-char snippet (strip
* ANSI residue, collapse internal whitespace, trim) — short enough to use as
* a hash key, long enough to disambiguate distinct questions.
*
* Returns "" when no prompt could be parsed (cursor not yet rendered, or
* cursor is at the top of the buffer with no preceding text). Callers that
* use the empty string as a fingerprint input should treat empty-prompt
* AUQs as "wait one more poll" rather than fingerprinting them — otherwise
* the same options + empty prompt across two distinct questions collide.
*/
export function parseQuestionPrompt(visible: string): string {
// Tail-only — older prompts higher in the buffer are stale.
const tail = visible.length > 4096 ? visible.slice(-4096) : visible;
const lines = tail.split('\n');
// Find the latest ` 1.` cursor line (matching parseNumberedOptions).
let cursorLineIdx = -1;
for (let i = lines.length - 1; i >= 0; i--) {
if (/^\s*\s*1\./.test(lines[i] ?? '')) {
cursorLineIdx = i;
break;
}
}
if (cursorLineIdx < 0) return '';
// Walk up at most 6 lines collecting prompt text. Stop at:
// - a blank line preceded by another blank line (paragraph break)
// - top of buffer
// - a line that itself starts with `N.` (we're inside an option list)
const promptLines: string[] = [];
let blankRun = 0;
for (let i = cursorLineIdx - 1; i >= 0 && promptLines.length < 6; i--) {
const raw = lines[i] ?? '';
const trimmed = raw.trim();
if (trimmed === '') {
blankRun += 1;
if (blankRun >= 2 && promptLines.length > 0) break;
continue;
}
blankRun = 0;
// Stop if we hit what looks like a previous numbered list.
if (/^[\s]*[1-9]\.\s+\S/.test(raw)) break;
promptLines.unshift(trimmed);
}
const joined = promptLines.join(' ').replace(/\s+/g, ' ').trim();
return joined.slice(0, 240);
}
/**
* Stable hash for an AskUserQuestion's identity — combines normalized prompt
* text with the options signature so two distinct questions with shared menu
* labels (the generic A/B/C TODO-proposal menu, for instance) get different
* fingerprints.
*
* Uses Bun's fast non-crypto hash since these strings are short and we only
* need collision resistance against accidental TTY redraws, not adversaries.
* Hex-encoded for diagnostic dumps.
*/
export function auqFingerprint(
promptSnippet: string,
opts: Array<{ index: number; label: string }>,
): string {
const normalized = promptSnippet.replace(/\s+/g, ' ').trim();
const sig = optionsSignature(opts);
// eslint-disable-next-line @typescript-eslint/no-explicit-any
return (Bun as any).hash(normalized + '||' + sig).toString(16);
}
/**
* Detects when a plan-* skill has reached its Completion Summary / Review
* Report — a terminal signal complementary to plan-mode's "Ready to execute"
* confirmation. Each plan-review skill writes one of these phrasings near
* the end of its run; matching any one is enough to stop counting.
*
* Best-effort: this is a content marker, not a deterministic event. Hard
* ceiling (`reviewCountCeiling` in `runPlanSkillCounting`) is the reliable
* stop signal; this regex is the "we're done, go gracefully" hint.
*/
export const COMPLETION_SUMMARY_RE =
/(GSTACK REVIEW REPORT|## Completion [Ss]ummary|Status:\s*(clean|issues_open)|^VERDICT:)/m;
/**
* Result of asserting that a plan file ends with `## GSTACK REVIEW REPORT`
* as its last `## ` heading. `ok` is true iff the report is present AND no
* other `## ` heading appears after it. Diagnostic fields are populated only
* on failure to keep the success path cheap.
*/
export interface ReviewReportAtBottomResult {
ok: boolean;
reason?: string;
trailingHeadings?: string[];
}
/**
* Assert that `## GSTACK REVIEW REPORT` is the last `## ` heading in a plan
* file's content. Pure string operation — no filesystem access. Used by the
* finding-count E2E tests as a second assertion on each test's produced plan.
*
* The plan-mode skill template mandates the agent move/append the review
* report so it's always the last `##` section. A regression where the agent
* appends additional sections after the report (or skips it entirely) ships
* silently today; this assertion catches both.
*/
export function assertReviewReportAtBottom(
content: string,
): ReviewReportAtBottomResult {
const re = /^## GSTACK REVIEW REPORT\s*$/m;
const match = re.exec(content);
if (!match) {
return { ok: false, reason: 'no GSTACK REVIEW REPORT section' };
}
const after = content.slice(match.index + match[0].length);
// Match any `## ` heading after the report. Reject `## ` followed by
// newline-only (trailing-whitespace ## headers) to avoid false positives.
const trailingHeadings = Array.from(
after.matchAll(/^## \S.*$/gm),
).map((m) => m[0]);
if (trailingHeadings.length > 0) {
return {
ok: false,
reason: 'trailing ## heading(s) after GSTACK REVIEW REPORT',
trailingHeadings,
};
}
return { ok: true };
}
/**
* Per-skill Step-0 boundary predicates. Each fires `true` when the answered
* AUQ's fingerprint matches the LAST question of that skill's Step 0 phase.
*
* - `ceoStep0Boundary`: matches the mode-pick AUQ (options match `MODE_RE`).
* - `engStep0Boundary`: matches the cross-project-learnings or scope-reduction
* AUQ that closes plan-eng-review's preamble.
* - `designStep0Boundary`: matches plan-design-review's first dimension /
* posture AUQ.
* - `devexStep0Boundary`: matches plan-devex-review's persona-selection AUQ.
*
* Predicates live alongside the helper so the unit suite can exercise each
* against synthetic fingerprints (positive AND negative cases). Skill test
* files import them directly.
*/
export const ceoStep0Boundary: Step0BoundaryPredicate = (fp) =>
fp.options.some((o) => MODE_RE.test(o.label));
export const engStep0Boundary: Step0BoundaryPredicate = (fp) =>
/scope reduction recommendation|cross[\s-]?project learnings/i.test(
fp.promptSnippet,
);
export const designStep0Boundary: Step0BoundaryPredicate = (fp) =>
/design system|design posture|design score|first dimension/i.test(
fp.promptSnippet,
);
export const devexStep0Boundary: Step0BoundaryPredicate = (fp) =>
/developer persona|target persona|persona selection|TTHW target/i.test(
fp.promptSnippet,
);
/**
* Spawn `claude --permission-mode plan` in a real PTY and return a session
* handle. Caller is responsible for `await session.close()` to release the
+340
View File
@@ -30,7 +30,17 @@ import {
parseNumberedOptions,
classifyVisible,
TAIL_SCAN_BYTES,
optionsSignature,
parseQuestionPrompt,
auqFingerprint,
COMPLETION_SUMMARY_RE,
assertReviewReportAtBottom,
ceoStep0Boundary,
engStep0Boundary,
designStep0Boundary,
devexStep0Boundary,
type ClaudePtyOptions,
type AskUserQuestionFingerprint,
} from './claude-pty-runner';
describe('isPermissionDialogVisible', () => {
@@ -318,3 +328,333 @@ describe('runPlanSkillObservation env passthrough surface', () => {
expect(opts.env).toEqual({ QUESTION_TUNING: 'false', EXPLAIN_LEVEL: 'default' });
});
});
// ────────────────────────────────────────────────────────────────────────────
// Per-finding count primitives — Section 3 unit tests #1#5, #7, #12.
// ────────────────────────────────────────────────────────────────────────────
describe('optionsSignature', () => {
test('returns a "|"-joined `index:label` string for a clean list', () => {
const sig = optionsSignature([
{ index: 1, label: 'HOLD SCOPE' },
{ index: 2, label: 'SCOPE EXPANSION' },
]);
expect(sig).toBe('1:HOLD SCOPE|2:SCOPE EXPANSION');
});
test('order-independent: shuffled inputs produce the same signature', () => {
// parseNumberedOptions already returns sorted, but defensive sort means
// a future caller that hands us shuffled input still produces a stable
// dedupe signature.
const a = optionsSignature([
{ index: 2, label: 'B' },
{ index: 1, label: 'A' },
{ index: 3, label: 'C' },
]);
const b = optionsSignature([
{ index: 1, label: 'A' },
{ index: 2, label: 'B' },
{ index: 3, label: 'C' },
]);
expect(a).toBe(b);
});
test('empty list returns empty string', () => {
expect(optionsSignature([])).toBe('');
});
test('single-item list returns just that entry', () => {
expect(optionsSignature([{ index: 1, label: 'Only' }])).toBe('1:Only');
});
});
describe('parseQuestionPrompt', () => {
test('captures 1-line prompt above the cursor', () => {
const visible = `
D1 — Pick a mode
1. HOLD SCOPE
2. SCOPE EXPANSION
`;
const prompt = parseQuestionPrompt(visible);
expect(prompt).toBe('D1 — Pick a mode');
});
test('captures multi-line prompt above the cursor', () => {
const visible = `
D2 — Approach selection
Which architecture should we follow?
1. Bypass existing helper
2. Reuse existing helper
`;
const prompt = parseQuestionPrompt(visible);
// Multi-line prompts get joined with single spaces.
expect(prompt).toContain('D2 — Approach selection');
expect(prompt).toContain('Which architecture should we follow?');
});
test('returns "" when no cursor is rendered', () => {
expect(parseQuestionPrompt('Just some prose.\nNo cursor.')).toBe('');
});
test('truncates to 240 chars', () => {
const longPrompt = 'A'.repeat(500);
const visible = `${longPrompt}\n\n 1. yes\n 2. no`;
expect(parseQuestionPrompt(visible).length).toBeLessThanOrEqual(240);
});
test('does not pull text from a previous numbered list above', () => {
const visible = `
1. previous answered question
2. previous option two
D2 — A new question text
1. fresh option A
2. fresh option B
`;
const prompt = parseQuestionPrompt(visible);
// Stops at the previous numbered-list line; should NOT contain "previous answered question".
expect(prompt).toContain('D2 — A new question text');
expect(prompt).not.toContain('previous answered question');
});
test('normalizes whitespace (collapses runs of spaces and tabs)', () => {
const visible = `D1 — Spaced out
1. yes
2. no`;
expect(parseQuestionPrompt(visible)).toBe('D1 — Spaced out');
});
});
describe('auqFingerprint', () => {
test('returns the same fingerprint for identical inputs', () => {
const opts = [
{ index: 1, label: 'A' },
{ index: 2, label: 'B' },
];
expect(auqFingerprint('hello', opts)).toBe(auqFingerprint('hello', opts));
});
test('different prompts with shared option labels produce DIFFERENT fingerprints', () => {
// The collision regression Codex F1 caught: option-label-only fingerprints
// collapsed multiple distinct findings into one when they shared menu shape.
const sharedOpts = [
{ index: 1, label: 'Add to plan' },
{ index: 2, label: 'Defer' },
{ index: 3, label: 'Build now' },
];
const fpFinding1 = auqFingerprint('D5 — Architecture: bypass helper?', sharedOpts);
const fpFinding2 = auqFingerprint('D6 — Tests: zero coverage?', sharedOpts);
expect(fpFinding1).not.toBe(fpFinding2);
});
test('same prompt with different options produces DIFFERENT fingerprints', () => {
const prompt = 'D1 — Pick a mode';
const fpA = auqFingerprint(prompt, [
{ index: 1, label: 'HOLD SCOPE' },
{ index: 2, label: 'SCOPE EXPANSION' },
]);
const fpB = auqFingerprint(prompt, [
{ index: 1, label: 'HOLD SCOPE' },
{ index: 2, label: 'SCOPE REDUCTION' },
]);
expect(fpA).not.toBe(fpB);
});
test('whitespace-only differences in prompt do NOT change the fingerprint', () => {
// Same content, different rendering whitespace (TTY redraw artifact)
// must produce the same fingerprint so dedupe survives reflow.
const opts = [{ index: 1, label: 'A' }, { index: 2, label: 'B' }];
const fpA = auqFingerprint('Pick a mode', opts);
const fpB = auqFingerprint('Pick a mode', opts);
expect(fpA).toBe(fpB);
});
test('empty prompt + same options collide (caller must guard against this)', () => {
// Documents the contract: empty-prompt fingerprints WILL collide if the
// caller fingerprints them. runPlanSkillCounting must skip empty-prompt
// AUQs and re-poll instead.
const opts = [{ index: 1, label: 'A' }];
expect(auqFingerprint('', opts)).toBe(auqFingerprint('', opts));
});
});
describe('COMPLETION_SUMMARY_RE', () => {
test('matches GSTACK REVIEW REPORT heading', () => {
expect(COMPLETION_SUMMARY_RE.test('## GSTACK REVIEW REPORT')).toBe(true);
});
test('matches Completion Summary heading (ceo + eng)', () => {
expect(COMPLETION_SUMMARY_RE.test('## Completion Summary')).toBe(true);
expect(COMPLETION_SUMMARY_RE.test('## Completion summary')).toBe(true);
});
test('matches Status: clean (CEO review-log shape)', () => {
expect(COMPLETION_SUMMARY_RE.test('Status: clean')).toBe(true);
expect(COMPLETION_SUMMARY_RE.test('Status: issues_open')).toBe(true);
});
test('matches VERDICT: line', () => {
expect(COMPLETION_SUMMARY_RE.test('VERDICT: CLEARED — Eng Review passed')).toBe(true);
});
test('does NOT match prose mentions of "verdict" mid-line', () => {
// VERDICT must be at the start of a line to count.
expect(COMPLETION_SUMMARY_RE.test('the final verdict: undecided')).toBe(false);
});
});
describe('assertReviewReportAtBottom', () => {
test('passes when REVIEW REPORT is the only/last ## heading', () => {
const content = `# Plan
## Context
stuff
## Approach
more stuff
## GSTACK REVIEW REPORT
| col | col |
`;
const r = assertReviewReportAtBottom(content);
expect(r.ok).toBe(true);
});
test('fails when REVIEW REPORT is missing', () => {
const content = `# Plan
## Context
stuff
`;
const r = assertReviewReportAtBottom(content);
expect(r.ok).toBe(false);
expect(r.reason).toMatch(/no GSTACK REVIEW REPORT/);
});
test('fails when REVIEW REPORT exists but a ## heading follows it', () => {
const content = `# Plan
## GSTACK REVIEW REPORT
| col | col |
## Late Section
oops
`;
const r = assertReviewReportAtBottom(content);
expect(r.ok).toBe(false);
expect(r.reason).toMatch(/trailing ## heading/);
expect(r.trailingHeadings).toEqual(['## Late Section']);
});
test('passes when only ### subheadings follow REVIEW REPORT (deeper nesting allowed)', () => {
const content = `## GSTACK REVIEW REPORT
### Cross-model tension
- F1: resolved
- F2: resolved
`;
const r = assertReviewReportAtBottom(content);
expect(r.ok).toBe(true);
});
test('fails with multiple trailing ## headings reported', () => {
const content = `## GSTACK REVIEW REPORT
## First trailing
## Second trailing
`;
const r = assertReviewReportAtBottom(content);
expect(r.ok).toBe(false);
expect(r.trailingHeadings).toHaveLength(2);
});
});
describe('Step0BoundaryPredicate per-skill', () => {
// Helper to build a synthetic fingerprint for predicate tests.
function fp(promptSnippet: string, optionLabels: string[]): AskUserQuestionFingerprint {
const options = optionLabels.map((label, i) => ({ index: i + 1, label }));
return {
signature: auqFingerprint(promptSnippet, options),
promptSnippet,
options,
observedAtMs: 0,
preReview: true,
};
}
describe('ceoStep0Boundary', () => {
test('FIRES on Step 0F mode-pick AUQ (HOLD SCOPE in options)', () => {
const f = fp('Pick a mode', ['HOLD SCOPE', 'SCOPE EXPANSION', 'SELECTIVE EXPANSION', 'SCOPE REDUCTION']);
expect(ceoStep0Boundary(f)).toBe(true);
});
test('does NOT fire on premise challenge AUQs', () => {
const f = fp('D1 — Premise check: is this the right problem?', ['Yes', 'No', 'Other']);
expect(ceoStep0Boundary(f)).toBe(false);
});
test('does NOT fire on review-section AUQs', () => {
const f = fp('Architecture: bypass helper?', ['Reuse existing', 'Roll new', 'Defer']);
expect(ceoStep0Boundary(f)).toBe(false);
});
});
describe('engStep0Boundary', () => {
test('FIRES on cross-project learnings prompt', () => {
const f = fp('Enable cross-project learnings on this machine?', ['Yes', 'No']);
expect(engStep0Boundary(f)).toBe(true);
});
test('FIRES on scope reduction recommendation', () => {
const f = fp('Scope reduction recommendation: cut to MVP?', ['Reduce', 'Proceed', 'Modify']);
expect(engStep0Boundary(f)).toBe(true);
});
test('does NOT fire on review-section AUQs', () => {
const f = fp('Architecture: shared mutable state?', ['Refactor', 'Defer', 'Skip']);
expect(engStep0Boundary(f)).toBe(false);
});
});
describe('designStep0Boundary', () => {
test('FIRES on design system / posture mention', () => {
const f = fp('Pick a design posture for this review', ['Polish', 'Triage', 'Expansion']);
expect(designStep0Boundary(f)).toBe(true);
});
test('FIRES on first-dimension prompt', () => {
const f = fp('First dimension: visual hierarchy. Score?', ['7', '8', '9']);
expect(designStep0Boundary(f)).toBe(true);
});
test('does NOT fire on later dimension AUQs', () => {
const f = fp('Spacing dimension score?', ['7', '8', '9']);
expect(designStep0Boundary(f)).toBe(false);
});
});
describe('devexStep0Boundary', () => {
test('FIRES on developer persona selection', () => {
const f = fp('Pick the target persona for this review', ['Senior backend', 'Junior frontend', 'Other']);
expect(devexStep0Boundary(f)).toBe(true);
});
test('FIRES on TTHW target prompt', () => {
const f = fp('What is the TTHW target for first run?', ['<5 min', '<15 min', '<30 min']);
expect(devexStep0Boundary(f)).toBe(true);
});
test('does NOT fire on review-section AUQs', () => {
const f = fp('Friction point: 5-min CI wait. Address?', ['Now', 'Defer', 'Skip']);
expect(devexStep0Boundary(f)).toBe(false);
});
});
});