test: add runPlanSkillCounting PTY helper

Drives a plan-* skill end-to-end and counts distinct review-phase AskUserQuestions. Composes the primitives from the previous commit: - Boot + auto-trust handler (existing launchClaudePty) - Send slash command alone, sleep 3s, send plan content as follow-up message (proven pattern from skill-e2e-plan-design-with-ui) - Poll loop with permission-dialog auto-grant, same-redraw skip, empty-prompt re-poll - Event-based Step-0 boundary via isLastStep0AUQ predicate fired on the answered AUQ's fingerprint (Codex F7 — boundary is observed event, not later rendered content) - Multi-signal terminals: hard ceiling, COMPLETION_SUMMARY_RE, plan_ready, silent_write, exited, timeout Empty-prompt fingerprints are skipped per the contract documented in auqFingerprint's unit tests — fingerprinting them would re-introduce the option-label collision regression Codex F1 caught. No E2E tests yet — those land in commit 5 with the four skill fixtures. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-02 03:35:09 +02:00 · 2026-04-28 09:18:24 -07:00
parent fa07012845
commit 85e27216e8
1 changed files with 262 additions and 0 deletions
@@ -958,3 +958,265 @@ export async function runPlanSkillObservation(opts: {
    await session.close();
  }
 }
+
+// ────────────────────────────────────────────────────────────────────────────
+// runPlanSkillCounting — drives a plan-* skill end-to-end through Step 0 then
+// counts distinct review-phase AskUserQuestion fingerprints. The actual
+// product asserted by the per-finding-count tests.
+// ────────────────────────────────────────────────────────────────────────────
+
+/**
+ * Result of a `runPlanSkillCounting` run. Includes both the count summary
+ * (`step0Count`, `reviewCount`) and the full fingerprint list for diagnostic
+ * dumps when an assertion fails.
+ */
+export interface PlanSkillCountObservation {
+  outcome:
+    | 'plan_ready'
+    | 'completion_summary'
+    | 'ceiling_reached'
+    | 'silent_write'
+    | 'exited'
+    | 'timeout';
+  summary: string;
+  /** Visible terminal text at terminal time (last 3KB). */
+  evidence: string;
+  /** Wall time (ms) until the outcome was decided. */
+  elapsedMs: number;
+  /** All distinct AskUserQuestions observed, in observation order. */
+  fingerprints: AskUserQuestionFingerprint[];
+  /** Count of fingerprints with `preReview === true`. */
+  step0Count: number;
+  /** Count of fingerprints with `preReview === false`. */
+  reviewCount: number;
+}
+
+/**
+ * Drive a plan-* skill in plan mode and count distinct review-phase
+ * AskUserQuestions until a terminal signal fires.
+ *
+ * Flow:
+ *   1. Boot PTY in plan mode (8s grace + auto-trust dialog).
+ *   2. Send `slashCommand` alone. Sleep ~3s.
+ *   3. Send `followUpPrompt` as a chat message — this is the plan content
+ *      the skill reviews. Slash commands with trailing args are rejected by
+ *      Claude Code unless the skill defines them, so the plan goes as a
+ *      follow-up message (the proven pattern at
+ *      skill-e2e-plan-design-with-ui.test.ts:57-71).
+ *   4. Poll loop:
+ *      - Skip permission dialogs (auto-grant with `defaultPick`).
+ *      - On a new numbered-option list, parse prompt + options, build
+ *        fingerprint via `auqFingerprint`. Empty-prompt parses are skipped
+ *        and re-polled (avoids the empty-prompt collision documented in
+ *        the auqFingerprint contract).
+ *      - First time we see a fingerprint: push it, classify as Step 0 or
+ *        review-phase based on `boundaryFired`, press `defaultPick` to
+ *        advance.
+ *      - After pressing, evaluate `isLastStep0AUQ(fingerprint)`. If true,
+ *        all subsequent AUQs are review-phase.
+ *      - Hard ceiling: if `reviewCount >= reviewCountCeiling`, return
+ *        `ceiling_reached`. This bounds runaway counts; tests should set
+ *        the ceiling above their assertion CEILING.
+ *      - Soft terminals: `COMPLETION_SUMMARY_RE` match → `completion_summary`;
+ *        plan-ready confirmation → `plan_ready`; silent write outside
+ *        sanctioned dirs → `silent_write`; process exited → `exited`;
+ *        wall clock exceeded → `timeout`.
+ *
+ * Boundary detection (D14): event-based, fired against the answered AUQ's
+ * fingerprint, not against later rendered content. This avoids the race
+ * where Step-0-final and Section-1-first AUQs straddle a section header
+ * regex match.
+ *
+ * Fingerprint composition (D9): `auqFingerprint(prompt, options)` mixes
+ * normalized prompt text with the options signature so distinct findings
+ * with shared menu structure (the generic A/B/C TODO menu) get distinct
+ * fingerprints.
+ */
+export async function runPlanSkillCounting(opts: {
+  /** Skill name, e.g. 'plan-ceo-review'. Used for diagnostic strings only. */
+  skillName: string;
+  /** Slash command to send alone, e.g. '/plan-ceo-review'. No trailing args. */
+  slashCommand: string;
+  /** Plan content sent as a follow-up message ~3s after the slash command. */
+  followUpPrompt: string;
+  /** Per-skill predicate: which answered AUQ is the last Step-0 question. */
+  isLastStep0AUQ: Step0BoundaryPredicate;
+  /** Hard cap on review-phase count; helper returns when reached. Should be
+   *  set ABOVE the test's assertion ceiling so the test sees the cap as a
+   *  failure rather than a silent stop. */
+  reviewCountCeiling: number;
+  /** Numbered option to press by default. Defaults to 1 (recommended). */
+  defaultPick?: number;
+  /** Working directory. Default process.cwd() (repo cwd holds skill registry). */
+  cwd?: string;
+  /** Total budget for skill to reach a terminal outcome. Default 1_500_000 (25 min). */
+  timeoutMs?: number;
+  /** Extra env merged into the spawned `claude` process. */
+  env?: Record<string, string>;
+}): Promise<PlanSkillCountObservation> {
+  const startedAt = Date.now();
+  const defaultPick = opts.defaultPick ?? 1;
+  const timeoutMs = opts.timeoutMs ?? 1_500_000;
+
+  const session = await launchClaudePty({
+    permissionMode: 'plan',
+    cwd: opts.cwd,
+    timeoutMs: timeoutMs + 60_000,
+    env: opts.env,
+  });
+
+  const fingerprints: AskUserQuestionFingerprint[] = [];
+  const seen = new Set<string>();
+  let boundaryFired = false;
+  let step0Count = 0;
+  let reviewCount = 0;
+  let lastSig = '';
+
+  function snapshot(
+    outcome: PlanSkillCountObservation['outcome'],
+    summary: string,
+    visible: string,
+  ): PlanSkillCountObservation {
+    return {
+      outcome,
+      summary,
+      evidence: visible.slice(-3000),
+      elapsedMs: Date.now() - startedAt,
+      fingerprints,
+      step0Count,
+      reviewCount,
+    };
+  }
+
+  try {
+    await Bun.sleep(8000); // boot grace + auto-trust handler window
+    const since = session.mark();
+    session.send(`${opts.slashCommand}\r`);
+    await Bun.sleep(3000);
+    session.send(`${opts.followUpPrompt}\r`);
+
+    const budgetStart = Date.now();
+    while (Date.now() - budgetStart < timeoutMs) {
+      await Bun.sleep(2000);
+      const visible = session.visibleSince(since);
+
+      // Process exited?
+      if (session.exited()) {
+        return snapshot(
+          'exited',
+          `claude exited (code=${session.exitCode()}) during counting (step0=${step0Count}, review=${reviewCount})`,
+          visible,
+        );
+      }
+      if (visible.includes('Unknown command:')) {
+        return snapshot(
+          'exited',
+          `claude rejected ${opts.slashCommand} as unknown command (skill not registered in this cwd)`,
+          visible,
+        );
+      }
+
+      // Silent write detection — only fires if no numbered prompt is on
+      // screen (otherwise the write is gated by a permission/AUQ).
+      const writeRe = /⏺\s*(?:Write|Edit)\(([^)]+)\)/g;
+      let m: RegExpExecArray | null;
+      while ((m = writeRe.exec(visible)) !== null) {
+        const target = m[1] ?? '';
+        const sanctioned = SANCTIONED_WRITE_SUBSTRINGS.some((s) =>
+          target.includes(s),
+        );
+        if (!sanctioned && !isNumberedOptionListVisible(visible)) {
+          return snapshot(
+            'silent_write',
+            `Write/Edit to ${target} fired before any AskUserQuestion`,
+            visible,
+          );
+        }
+      }
+
+      // Soft terminal signals — check before AUQ processing so a final
+      // completion-summary doesn't get misclassified as a bonus AUQ.
+      if (COMPLETION_SUMMARY_RE.test(visible)) {
+        return snapshot(
+          'completion_summary',
+          `skill emitted completion summary / verdict / status line (step0=${step0Count}, review=${reviewCount})`,
+          visible,
+        );
+      }
+      if (isPlanReadyVisible(visible)) {
+        return snapshot(
+          'plan_ready',
+          `skill emitted plan-mode "Ready to execute" confirmation (step0=${step0Count}, review=${reviewCount})`,
+          visible,
+        );
+      }
+
+      // Numbered option list?
+      if (!isNumberedOptionListVisible(visible)) continue;
+
+      // Permission dialog? Auto-grant with defaultPick. Only act on the
+      // recent tail to avoid re-triggering on stale dialogs in scrollback.
+      if (isPermissionDialogVisible(visible.slice(-TAIL_SCAN_BYTES))) {
+        session.send(`${defaultPick}\r`);
+        await Bun.sleep(1500);
+        continue;
+      }
+
+      // Parse the active AUQ. Skip same-redraw and empty-prompt cases.
+      const options = parseNumberedOptions(visible);
+      if (options.length < 2) continue;
+      const sig = optionsSignature(options);
+      if (sig === lastSig) continue;
+      const promptSnippet = parseQuestionPrompt(visible);
+      if (promptSnippet === '') continue; // not yet rendered, poll again
+      lastSig = sig;
+
+      const fingerprintHash = auqFingerprint(promptSnippet, options);
+      if (seen.has(fingerprintHash)) {
+        // Same content, already counted (TTY redrew with whitespace diff).
+        continue;
+      }
+      seen.add(fingerprintHash);
+
+      const fp: AskUserQuestionFingerprint = {
+        signature: fingerprintHash,
+        promptSnippet,
+        options,
+        observedAtMs: Date.now() - startedAt,
+        preReview: !boundaryFired,
+      };
+      fingerprints.push(fp);
+      if (boundaryFired) reviewCount += 1;
+      else step0Count += 1;
+
+      // Press to advance.
+      session.send(`${defaultPick}\r`);
+
+      // Evaluate boundary AFTER pressing — if THIS AUQ was the last Step 0
+      // question, all subsequent AUQs go to reviewCount.
+      if (!boundaryFired && opts.isLastStep0AUQ(fp)) {
+        boundaryFired = true;
+      }
+
+      // Hard ceiling — runaway protection.
+      if (reviewCount >= opts.reviewCountCeiling) {
+        return snapshot(
+          'ceiling_reached',
+          `review-phase AUQ count reached ceiling (${opts.reviewCountCeiling})`,
+          session.visibleSince(since),
+        );
+      }
+
+      // Give the agent a beat to advance to the next state.
+      await Bun.sleep(2000);
+    }
+
+    return snapshot(
+      'timeout',
+      `no terminal outcome within ${timeoutMs}ms (step0=${step0Count}, review=${reviewCount})`,
+      session.visibleSince(since),
+    );
+  } finally {
+    await session.close();
+  }
+}