From 916b6ff50fce07ba3dee845a86011c823f6fec10 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Thu, 30 Apr 2026 21:28:58 -0700 Subject: [PATCH] test(periodic): AUTO_DECIDE opt-in preserved under Conductor flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Periodic-tier eval that exercises the legitimate /plan-tune AUTO_DECIDE path under the same flags Conductor uses (--disallowedTools AskUserQuestion). Confirms the new Tool resolution preamble doesn't trip opt-in users: when the user has set a never-ask preference for a question, the model should auto-pick (outcome 'auto_decided' or 'plan_ready') rather than surface the prompt. Setup runs in an isolated GSTACK_HOME tmpdir — never touches the user's real ~/.gstack state. Writes question_tuning=true + a never-ask preference for plan-ceo-review-mode (source: 'plan-tune', which bypasses the inline-user origin gate). Spawns claude with --disallowedTools AskUserQuestion in plan mode, runs /plan-ceo-review, asserts outcome is NOT 'asked' (i.e., the model honored the preference). Periodic tier because AUTO_DECIDE behavior depends on the model adhering to the QUESTION_TUNING preamble injection — non-deterministic, weekly cron is the right cadence rather than CI gating. Touchfiles cover the AUTO_DECIDE-bearing resolvers + the question-tuning binaries the test setup invokes. touchfiles.test.ts count updates 19 -> 20 because auto-decide-preserved also depends on plan-ceo-review/**. Co-Authored-By: Claude Opus 4.7 (1M context) --- test/helpers/touchfiles.ts | 7 + test/skill-e2e-auto-decide-preserved.test.ts | 131 +++++++++++++++++++ test/touchfiles.test.ts | 8 +- 3 files changed, 143 insertions(+), 3 deletions(-) create mode 100644 test/skill-e2e-auto-decide-preserved.test.ts diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index 79eed956..e0c6ebcf 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -105,6 +105,12 @@ export const E2E_TOUCHFILES: Record = { // skills with no prior plan-mode test: 'autoplan-auto-mode': ['autoplan/**', 'plan-ceo-review/**', 'plan-design-review/**', 'plan-eng-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], 'office-hours-auto-mode': ['office-hours/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], + // v1.21+ AUTO_DECIDE preserve eval (periodic). Verifies the Tool resolution + // fix doesn't trip the legitimate /plan-tune opt-in path: when the user has + // written a never-ask preference, AUQ should still auto-decide rather than + // surfacing the question. Touches the question-tuning + preference + // infrastructure plus the resolvers that own the AUTO_DECIDE preamble. + 'auto-decide-preserved': ['scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'plan-ceo-review/**', 'bin/gstack-question-preference', 'bin/gstack-config', 'bin/gstack-slug', 'test/helpers/claude-pty-runner.ts'], // Real-PTY E2E batch (#6 new tests on the harness). // Each one tests behavior the SDK harness can't observe (rendered TTY, @@ -385,6 +391,7 @@ export const E2E_TIERS: Record = { // v1.21+ auto-mode regression tests 'autoplan-auto-mode': 'gate', 'office-hours-auto-mode': 'gate', + 'auto-decide-preserved': 'periodic', 'e2e-harness-audit': 'gate', // Real-PTY E2E batch — tier classification: diff --git a/test/skill-e2e-auto-decide-preserved.test.ts b/test/skill-e2e-auto-decide-preserved.test.ts new file mode 100644 index 00000000..8b773d5f --- /dev/null +++ b/test/skill-e2e-auto-decide-preserved.test.ts @@ -0,0 +1,131 @@ +/** + * AUTO_DECIDE opt-in preserved under Conductor flags (periodic-tier, paid, real-PTY). + * + * Regression test for v1.21+ fix: the new "Tool resolution" preamble + * (scripts/resolvers/preamble/generate-ask-user-format.ts) tells the model + * to prefer mcp__*__AskUserQuestion variants and fall back to plan-file + * decisions when neither is callable. This must NOT break the legitimate + * `/plan-tune` AUTO_DECIDE path: when the user has explicitly opted into + * auto-deciding a specific question via `gstack-question-preference --write + * never-ask`, the model is supposed to honor that — it should still + * auto-pick the recommended option and emit the AUTO_DECIDE annotation + * ("Auto-decided