mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-07 14:06:42 +02:00
test(periodic): AUTO_DECIDE opt-in preserved under Conductor flags
Periodic-tier eval that exercises the legitimate /plan-tune AUTO_DECIDE path under the same flags Conductor uses (--disallowedTools AskUserQuestion). Confirms the new Tool resolution preamble doesn't trip opt-in users: when the user has set a never-ask preference for a question, the model should auto-pick (outcome 'auto_decided' or 'plan_ready') rather than surface the prompt. Setup runs in an isolated GSTACK_HOME tmpdir — never touches the user's real ~/.gstack state. Writes question_tuning=true + a never-ask preference for plan-ceo-review-mode (source: 'plan-tune', which bypasses the inline-user origin gate). Spawns claude with --disallowedTools AskUserQuestion in plan mode, runs /plan-ceo-review, asserts outcome is NOT 'asked' (i.e., the model honored the preference). Periodic tier because AUTO_DECIDE behavior depends on the model adhering to the QUESTION_TUNING preamble injection — non-deterministic, weekly cron is the right cadence rather than CI gating. Touchfiles cover the AUTO_DECIDE-bearing resolvers + the question-tuning binaries the test setup invokes. touchfiles.test.ts count updates 19 -> 20 because auto-decide-preserved also depends on plan-ceo-review/**. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -105,6 +105,12 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
// skills with no prior plan-mode test:
|
||||
'autoplan-auto-mode': ['autoplan/**', 'plan-ceo-review/**', 'plan-design-review/**', 'plan-eng-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
'office-hours-auto-mode': ['office-hours/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
// v1.21+ AUTO_DECIDE preserve eval (periodic). Verifies the Tool resolution
|
||||
// fix doesn't trip the legitimate /plan-tune opt-in path: when the user has
|
||||
// written a never-ask preference, AUQ should still auto-decide rather than
|
||||
// surfacing the question. Touches the question-tuning + preference
|
||||
// infrastructure plus the resolvers that own the AUTO_DECIDE preamble.
|
||||
'auto-decide-preserved': ['scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'plan-ceo-review/**', 'bin/gstack-question-preference', 'bin/gstack-config', 'bin/gstack-slug', 'test/helpers/claude-pty-runner.ts'],
|
||||
|
||||
// Real-PTY E2E batch (#6 new tests on the harness).
|
||||
// Each one tests behavior the SDK harness can't observe (rendered TTY,
|
||||
@@ -385,6 +391,7 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
|
||||
// v1.21+ auto-mode regression tests
|
||||
'autoplan-auto-mode': 'gate',
|
||||
'office-hours-auto-mode': 'gate',
|
||||
'auto-decide-preserved': 'periodic',
|
||||
'e2e-harness-audit': 'gate',
|
||||
|
||||
// Real-PTY E2E batch — tier classification:
|
||||
|
||||
Reference in New Issue
Block a user