test(periodic): AUTO_DECIDE opt-in preserved under Conductor flags

Periodic-tier eval that exercises the legitimate /plan-tune AUTO_DECIDE
path under the same flags Conductor uses (--disallowedTools
AskUserQuestion). Confirms the new Tool resolution preamble doesn't trip
opt-in users: when the user has set a never-ask preference for a
question, the model should auto-pick (outcome 'auto_decided' or
'plan_ready') rather than surface the prompt.

Setup runs in an isolated GSTACK_HOME tmpdir — never touches the user's
real ~/.gstack state. Writes question_tuning=true + a never-ask
preference for plan-ceo-review-mode (source: 'plan-tune', which bypasses
the inline-user origin gate). Spawns claude with
--disallowedTools AskUserQuestion in plan mode, runs /plan-ceo-review,
asserts outcome is NOT 'asked' (i.e., the model honored the preference).

Periodic tier because AUTO_DECIDE behavior depends on the model adhering
to the QUESTION_TUNING preamble injection — non-deterministic, weekly
cron is the right cadence rather than CI gating.

Touchfiles cover the AUTO_DECIDE-bearing resolvers + the question-tuning
binaries the test setup invokes. touchfiles.test.ts count updates 19 ->
20 because auto-decide-preserved also depends on plan-ceo-review/**.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-04-30 21:28:58 -07:00
parent bec54c2b40
commit 916b6ff50f
3 changed files with 143 additions and 3 deletions
+7
View File
@@ -105,6 +105,12 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
// skills with no prior plan-mode test:
'autoplan-auto-mode': ['autoplan/**', 'plan-ceo-review/**', 'plan-design-review/**', 'plan-eng-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
'office-hours-auto-mode': ['office-hours/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
// v1.21+ AUTO_DECIDE preserve eval (periodic). Verifies the Tool resolution
// fix doesn't trip the legitimate /plan-tune opt-in path: when the user has
// written a never-ask preference, AUQ should still auto-decide rather than
// surfacing the question. Touches the question-tuning + preference
// infrastructure plus the resolvers that own the AUTO_DECIDE preamble.
'auto-decide-preserved': ['scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'plan-ceo-review/**', 'bin/gstack-question-preference', 'bin/gstack-config', 'bin/gstack-slug', 'test/helpers/claude-pty-runner.ts'],
// Real-PTY E2E batch (#6 new tests on the harness).
// Each one tests behavior the SDK harness can't observe (rendered TTY,
@@ -385,6 +391,7 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
// v1.21+ auto-mode regression tests
'autoplan-auto-mode': 'gate',
'office-hours-auto-mode': 'gate',
'auto-decide-preserved': 'periodic',
'e2e-harness-audit': 'gate',
// Real-PTY E2E batch — tier classification:
@@ -0,0 +1,131 @@
/**
* AUTO_DECIDE opt-in preserved under Conductor flags (periodic-tier, paid, real-PTY).
*
* Regression test for v1.21+ fix: the new "Tool resolution" preamble
* (scripts/resolvers/preamble/generate-ask-user-format.ts) tells the model
* to prefer mcp__*__AskUserQuestion variants and fall back to plan-file
* decisions when neither is callable. This must NOT break the legitimate
* `/plan-tune` AUTO_DECIDE path: when the user has explicitly opted into
* auto-deciding a specific question via `gstack-question-preference --write
* never-ask`, the model is supposed to honor that — it should still
* auto-pick the recommended option and emit the AUTO_DECIDE annotation
* ("Auto-decided <summary> → <option> (your preference). Change with
* /plan-tune.") instead of opening a question prompt.
*
* Periodic tier: AUTO_DECIDE behavior depends on the model adhering to
* the QUESTION_TUNING preamble injection. Non-deterministic; runs weekly
* or manually rather than gating CI.
*
* Set up:
* - tmpDir as GSTACK_HOME (isolated state, doesn't touch the user's
* real ~/.gstack)
* - question_tuning=true in the tmp config
* - preference for plan-ceo-review-mode → never-ask (source: plan-tune)
*
* Spawn:
* claude --permission-mode plan --disallowedTools AskUserQuestion
* /plan-ceo-review
*
* Expected:
* - outcome === 'auto_decided' (the AUTO_DECIDE preamble fired and the
* "Auto-decided ... (your preference)" text rendered)
*
* If outcome is 'asked', the model ignored the user's `/plan-tune`
* preference — that's a regression against the opt-in feature. If outcome
* is 'plan_ready' with no AUTO_DECIDE text, the model auto-decided BUT
* skipped the annotation (acceptable; AUTO_DECIDE annotation is good
* practice but not the load-bearing behavior).
*/
import { describe, test, expect } from 'bun:test';
import { runPlanSkillObservation } from './helpers/claude-pty-runner';
import * as fs from 'fs';
import * as os from 'os';
import * as path from 'path';
import { spawnSync } from 'child_process';
// Paid, real-PTY eval: only runs when evals are switched on AND the periodic
// tier has been selected; otherwise the whole suite is registered as skipped.
const shouldRun = process.env.EVALS_TIER === 'periodic' && Boolean(process.env.EVALS);
const describeE2E = shouldRun ? describe : describe.skip;
// Repository root, one directory above this test file; used to locate bin/*.
const ROOT = path.resolve(import.meta.dir, '..');
describeE2E('AUTO_DECIDE opt-in preserved under Conductor flags (periodic)', () => {
  /**
   * Seeds a throwaway GSTACK_HOME with question_tuning=true and a never-ask
   * preference for `plan-ceo-review-mode`, runs /plan-ceo-review with
   * `--disallowedTools AskUserQuestion`, and fails if the observed outcome is
   * 'asked' (preference ignored) or inconclusive ('silent_write', 'exited',
   * 'timeout'). Passing outcomes are 'auto_decided' and 'plan_ready'.
   */
  test('user-opted-in question still auto-decides when AskUserQuestion is --disallowedTools', async () => {
    const tmpHome = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-auto-decide-'));
    // Environment shared by every setup binary so they all write into tmpHome.
    const isolatedEnv = { ...process.env, GSTACK_HOME: tmpHome };
    // Remembered so the finally block can restore the caller's value.
    const previousGstackHome = process.env.GSTACK_HOME;
    try {
      // 1. Bootstrap the tmp GSTACK_HOME with question_tuning=true.
      const configBin = path.join(ROOT, 'bin', 'gstack-config');
      const setRes = spawnSync(configBin, ['set', 'question_tuning', 'true'], {
        env: isolatedEnv,
        encoding: 'utf-8',
      });
      if (setRes.status !== 0) {
        throw new Error(`gstack-config set failed: ${setRes.stderr || setRes.stdout}`);
      }

      // 2. Resolve slug for the project (uses git remote — same as the spawned
      //    claude would resolve). The preference file path keys on this slug.
      const slugBin = path.join(ROOT, 'bin', 'gstack-slug');
      const slugRes = spawnSync(slugBin, [], {
        cwd: ROOT,
        env: isolatedEnv,
        encoding: 'utf-8',
      });
      // A failed spawn leaves status/stdout null; fail with a readable message
      // instead of a TypeError on `.match` below.
      if (slugRes.status !== 0) {
        throw new Error(
          `gstack-slug failed: ${slugRes.stderr || slugRes.stdout || slugRes.error?.message}`,
        );
      }
      // gstack-slug emits `eval`-able shell exports like `SLUG=garrytan-gstack`.
      const slug = (slugRes.stdout.match(/SLUG=([^\s;]+)/)?.[1] ?? 'unknown').replace(/['"]/g, '');

      // 3. Write the preference: plan-ceo-review-mode → never-ask. The
      //    'plan-tune' source bypasses the inline-user origin gate.
      const prefBin = path.join(ROOT, 'bin', 'gstack-question-preference');
      const writeRes = spawnSync(
        prefBin,
        ['--write', JSON.stringify({
          question_id: 'plan-ceo-review-mode',
          preference: 'never-ask',
          source: 'plan-tune',
        })],
        {
          env: isolatedEnv,
          encoding: 'utf-8',
        },
      );
      if (writeRes.status !== 0) {
        throw new Error(`gstack-question-preference --write failed: ${writeRes.stderr || writeRes.stdout}`);
      }

      // Sanity: the preference file landed where we expect.
      const prefFile = path.join(tmpHome, 'projects', slug, 'question-preferences.json');
      if (!fs.existsSync(prefFile)) {
        throw new Error(`expected preference file at ${prefFile}; not found. slug=${slug}`);
      }

      // 4. Run /plan-ceo-review with the Conductor flag set + isolated state.
      //    The spawned claude must see the SAME GSTACK_HOME the setup wrote to,
      //    otherwise it reads the real ~/.gstack and the never-ask preference
      //    is never exercised. Exported on process.env for the duration of the
      //    run and restored in the finally block.
      //    NOTE(review): assumes the PTY runner inherits the parent process
      //    environment — confirm; if runPlanSkillObservation exposes an env
      //    option, prefer passing GSTACK_HOME through that instead.
      process.env.GSTACK_HOME = tmpHome;
      const obs = await runPlanSkillObservation({
        skillName: 'plan-ceo-review',
        inPlanMode: true,
        extraArgs: ['--disallowedTools', 'AskUserQuestion'],
        timeoutMs: 300_000,
      });

      // 5. Pass: 'auto_decided' (the strongest signal) or 'plan_ready' with
      //    no question rendered. Fail: 'asked' (model ignored the opt-in).
      if (obs.outcome === 'asked') {
        throw new Error(
          `AUTO_DECIDE regression: the model surfaced an AskUserQuestion despite the user's never-ask preference.\n` +
          `summary: ${obs.summary}\n` +
          `--- evidence (last 2KB visible) ---\n${obs.evidence}`,
        );
      }
      // These outcomes say nothing about whether the preference was honored.
      if (obs.outcome === 'silent_write' || obs.outcome === 'exited' || obs.outcome === 'timeout') {
        throw new Error(
          `AUTO_DECIDE preserve test inconclusive: outcome=${obs.outcome}\n` +
          `summary: ${obs.summary}\n` +
          `--- evidence (last 2KB visible) ---\n${obs.evidence}`,
        );
      }
      expect(['auto_decided', 'plan_ready']).toContain(obs.outcome);
    } finally {
      // Restore the caller's GSTACK_HOME. Assigning undefined would store the
      // string "undefined", so delete when there was no prior value.
      if (previousGstackHome === undefined) {
        delete process.env.GSTACK_HOME;
      } else {
        process.env.GSTACK_HOME = previousGstackHome;
      }
      try { fs.rmSync(tmpHome, { recursive: true, force: true }); } catch { /* best-effort */ }
    }
  }, 360_000);
});
+5 -3
View File
@@ -97,10 +97,12 @@ describe('selectTests', () => {
expect(result.selected).toContain('ask-user-question-format-pty');
expect(result.selected).toContain('plan-ceo-mode-routing');
expect(result.selected).toContain('autoplan-chain-pty');
// v1.21+ auto-mode regression: autoplan-auto-mode also depends on plan-ceo-review/**
// v1.21+ regression: autoplan-auto-mode + auto-decide-preserved also
// depend on plan-ceo-review/**
expect(result.selected).toContain('autoplan-auto-mode');
expect(result.selected.length).toBe(19);
expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 19);
expect(result.selected).toContain('auto-decide-preserved');
expect(result.selected.length).toBe(20);
expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 20);
});
test('global touchfile triggers ALL tests', () => {