test(harness): plumb extraArgs and auto_decided outcome through PTY runner

runPlanSkillObservation now accepts extraArgs that pass through to
launchClaudePty (which already supported them at the lower level), and
exposes a new 'auto_decided' outcome detected via isAutoDecidedVisible
when the AUTO_DECIDE preamble template fires (Auto-decided ... (your
preference)).

Both pieces are needed for the v1.21+ AskUserQuestion-blocked regression
tests in the next commit. Detection order is deliberate: 'asked' (rendered
numbered list) wins over 'auto_decided' (text only, no list), which wins
over 'plan_ready' so the auto-decide evidence isn't masked by a downstream
plan-mode confirmation.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-04-30 21:13:30 -07:00
parent e8893a18b1
commit 6c2db0bec6
+55 -16
View File
@@ -138,6 +138,19 @@ export function isPlanReadyVisible(visible: string): boolean {
return /ready to execute|Would you like to proceed/i.test(visible);
}
/**
* Detect the AUTO_DECIDE preamble template firing. The model prints
* "Auto-decided <summary> → <option> (your preference). Change with /plan-tune."
* when it short-circuits an AskUserQuestion via the question-tuning resolver
* (`scripts/resolvers/question-tuning.ts:26`). We detect any of those phrases
* — the wording can drift slightly between model invocations, so each cue is
* checked independently. The arrow + "(your preference)" combination is the
* tightest signal.
*/
export function isAutoDecidedVisible(visible: string): boolean {
return /Auto-decided\b/i.test(visible) && /\(your preference\)/i.test(visible);
}
/**
* Detect a Claude Code permission dialog. These render as a numbered
* option list (so isNumberedOptionListVisible matches them) but they
@@ -521,16 +534,23 @@ export async function invokeAndObserve(
export interface PlanSkillObservation {
/**
* What happened first. One of:
* - 'asked' — skill emitted a numbered-option prompt (its Step 0
* AskUserQuestion or the routing-injection prompt)
* - 'plan_ready' — claude wrote a plan and emitted its native
* "Ready to execute" confirmation
* - 'asked' — skill emitted a numbered-option prompt (its Step 0
* AskUserQuestion or the routing-injection prompt)
* - 'auto_decided' — visible TTY shows "Auto-decided ... → ..." (the
* AUTO_DECIDE preamble template fired). Distinguishes
* "the regression we're tracking" (auto-mode silently
* auto-deciding questions the user wanted to see) from
* "skill legitimately reached plan_ready". Detected
* before plan_ready/silent_write so the auto-decide
* evidence wins when both are present.
* - 'plan_ready' — claude wrote a plan and emitted its native
* "Ready to execute" confirmation
* - 'silent_write' — a Write/Edit landed BEFORE any prompt, to a path
* outside the sanctioned plan/project directories
* - 'exited' — claude process died before any of the above
* - 'timeout' — none of the above within budget
* outside the sanctioned plan/project directories
* - 'exited' — claude process died before any of the above
* - 'timeout' — none of the above within budget
*/
outcome: 'asked' | 'plan_ready' | 'silent_write' | 'exited' | 'timeout';
outcome: 'asked' | 'auto_decided' | 'plan_ready' | 'silent_write' | 'exited' | 'timeout';
/** Human-readable summary. */
summary: string;
/** Visible terminal text since the slash command was sent (last 2KB). */
@@ -566,12 +586,19 @@ export async function runPlanSkillObservation(opts: {
cwd?: string;
/** Total budget for skill to reach a terminal outcome. Default 180000. */
timeoutMs?: number;
/** Extra CLI args appended after --permission-mode. Used by the v1.21+
* AskUserQuestion-blocked regression tests to pass
* `['--disallowedTools', 'AskUserQuestion']` (the flag set Conductor
* uses to remove native AskUserQuestion in favor of its MCP variant).
* Plumbs straight through to launchClaudePty. */
extraArgs?: string[];
}): Promise<PlanSkillObservation> {
const startedAt = Date.now();
const session = await launchClaudePty({
permissionMode: opts.inPlanMode === false ? null : 'plan',
cwd: opts.cwd,
timeoutMs: (opts.timeoutMs ?? 180_000) + 30_000,
extraArgs: opts.extraArgs,
});
try {
@@ -624,14 +651,10 @@ export async function runPlanSkillObservation(opts: {
};
}
}
if (isPlanReadyVisible(visible)) {
return {
outcome: 'plan_ready',
summary: 'skill ran end-to-end and emitted plan-mode "Ready to execute" confirmation',
evidence: visible.slice(-2000),
elapsedMs: Date.now() - startedAt,
};
}
// Order: 'asked' first (rendered numbered list = user being asked),
// then 'auto_decided' (auto-decide text fired upstream of plan_ready
// — surfacing this distinguishes the auto-mode regression from a
// legitimate plan_ready outcome), then 'plan_ready'.
if (isNumberedOptionListVisible(visible)) {
return {
outcome: 'asked',
@@ -640,6 +663,22 @@ export async function runPlanSkillObservation(opts: {
elapsedMs: Date.now() - startedAt,
};
}
if (isAutoDecidedVisible(visible)) {
return {
outcome: 'auto_decided',
summary: 'skill auto-decided an AskUserQuestion via the AUTO_DECIDE preamble (the user never saw the prompt)',
evidence: visible.slice(-2000),
elapsedMs: Date.now() - startedAt,
};
}
if (isPlanReadyVisible(visible)) {
return {
outcome: 'plan_ready',
summary: 'skill ran end-to-end and emitted plan-mode "Ready to execute" confirmation',
evidence: visible.slice(-2000),
elapsedMs: Date.now() - startedAt,
};
}
}
return {