mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-05 05:05:08 +02:00
test(periodic): AUTO_DECIDE opt-in preserved under Conductor flags
Periodic-tier eval that exercises the legitimate /plan-tune AUTO_DECIDE path under the same flags Conductor uses (--disallowedTools AskUserQuestion). Confirms the new Tool resolution preamble doesn't trip opt-in users: when the user has set a never-ask preference for a question, the model should auto-pick (outcome 'auto_decided' or 'plan_ready') rather than surface the prompt. Setup runs in an isolated GSTACK_HOME tmpdir — never touches the user's real ~/.gstack state. Writes question_tuning=true + a never-ask preference for plan-ceo-review-mode (source: 'plan-tune', which bypasses the inline-user origin gate). Spawns claude with --disallowedTools AskUserQuestion in plan mode, runs /plan-ceo-review, asserts outcome is NOT 'asked' (i.e., the model honored the preference). Periodic tier because AUTO_DECIDE behavior depends on the model adhering to the QUESTION_TUNING preamble injection — non-deterministic, weekly cron is the right cadence rather than CI gating. Touchfiles cover the AUTO_DECIDE-bearing resolvers + the question-tuning binaries the test setup invokes. touchfiles.test.ts count updates 19 -> 20 because auto-decide-preserved also depends on plan-ceo-review/**. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -105,6 +105,12 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
// skills with no prior plan-mode test:
|
||||
'autoplan-auto-mode': ['autoplan/**', 'plan-ceo-review/**', 'plan-design-review/**', 'plan-eng-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
'office-hours-auto-mode': ['office-hours/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
// v1.21+ AUTO_DECIDE preserve eval (periodic). Verifies the Tool resolution
|
||||
// fix doesn't trip the legitimate /plan-tune opt-in path: when the user has
|
||||
// written a never-ask preference, AUQ should still auto-decide rather than
|
||||
// surfacing the question. Touches the question-tuning + preference
|
||||
// infrastructure plus the resolvers that own the AUTO_DECIDE preamble.
|
||||
'auto-decide-preserved': ['scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'plan-ceo-review/**', 'bin/gstack-question-preference', 'bin/gstack-config', 'bin/gstack-slug', 'test/helpers/claude-pty-runner.ts'],
|
||||
|
||||
// Real-PTY E2E batch (#6 new tests on the harness).
|
||||
// Each one tests behavior the SDK harness can't observe (rendered TTY,
|
||||
@@ -385,6 +391,7 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
|
||||
// v1.21+ auto-mode regression tests
|
||||
'autoplan-auto-mode': 'gate',
|
||||
'office-hours-auto-mode': 'gate',
|
||||
'auto-decide-preserved': 'periodic',
|
||||
'e2e-harness-audit': 'gate',
|
||||
|
||||
// Real-PTY E2E batch — tier classification:
|
||||
|
||||
@@ -0,0 +1,131 @@
|
||||
/**
|
||||
* AUTO_DECIDE opt-in preserved under Conductor flags (periodic-tier, paid, real-PTY).
|
||||
*
|
||||
* Regression test for v1.21+ fix: the new "Tool resolution" preamble
|
||||
* (scripts/resolvers/preamble/generate-ask-user-format.ts) tells the model
|
||||
* to prefer mcp__*__AskUserQuestion variants and fall back to plan-file
|
||||
* decisions when neither is callable. This must NOT break the legitimate
|
||||
* `/plan-tune` AUTO_DECIDE path: when the user has explicitly opted into
|
||||
* auto-deciding a specific question via `gstack-question-preference --write
|
||||
* never-ask`, the model is supposed to honor that — it should still
|
||||
* auto-pick the recommended option and emit the AUTO_DECIDE annotation
|
||||
* ("Auto-decided <summary> → <option> (your preference). Change with
|
||||
* /plan-tune.") instead of opening a question prompt.
|
||||
*
|
||||
* Periodic tier: AUTO_DECIDE behavior depends on the model adhering to
|
||||
* the QUESTION_TUNING preamble injection. Non-deterministic; runs weekly
|
||||
* or manually rather than gating CI.
|
||||
*
|
||||
* Set up:
|
||||
* - tmpDir as GSTACK_HOME (isolated state, doesn't touch the user's
|
||||
* real ~/.gstack)
|
||||
* - question_tuning=true in the tmp config
|
||||
* - preference for plan-ceo-review-mode → never-ask (source: plan-tune)
|
||||
*
|
||||
* Spawn:
|
||||
* claude --permission-mode plan --disallowedTools AskUserQuestion
|
||||
* /plan-ceo-review
|
||||
*
|
||||
* Expected:
|
||||
* - outcome === 'auto_decided' or 'plan_ready' ('auto_decided' means the AUTO_DECIDE preamble fired and the
|
||||
* "Auto-decided ... (your preference)" text rendered)
|
||||
*
|
||||
* If outcome is 'asked', the model ignored the user's `/plan-tune`
|
||||
* preference — that's a regression against the opt-in feature. If outcome
|
||||
* is 'plan_ready' with no AUTO_DECIDE text, the model auto-decided BUT
|
||||
* skipped the annotation (acceptable; AUTO_DECIDE annotation is good
|
||||
* practice but not the load-bearing behavior).
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import { runPlanSkillObservation } from './helpers/claude-pty-runner';
|
||||
import * as fs from 'fs';
|
||||
import * as os from 'os';
|
||||
import * as path from 'path';
|
||||
import { spawnSync } from 'child_process';
|
||||
|
||||
// Opt-in gate: the suite only executes when EVALS is set to a non-empty value
// AND EVALS_TIER is exactly 'periodic'; otherwise it is registered as skipped.
const shouldRun = process.env.EVALS_TIER === 'periodic' && Boolean(process.env.EVALS);
const describeE2E = shouldRun ? describe : describe.skip;

// Parent of the directory that contains this test file; the bin/ scripts used
// during setup are resolved relative to it.
const ROOT = path.resolve(import.meta.dir, '..');
|
||||
|
||||
describeE2E('AUTO_DECIDE opt-in preserved under Conductor flags (periodic)', () => {
  /**
   * Seeds a throwaway GSTACK_HOME with question_tuning=true and a never-ask
   * preference for `plan-ceo-review-mode`, then runs /plan-ceo-review in plan
   * mode with `--disallowedTools AskUserQuestion`.
   *
   * Passes when the observed outcome is 'auto_decided' or 'plan_ready'.
   * Throws when the outcome is 'asked' (preference ignored) or when it is
   * 'silent_write' / 'exited' / 'timeout' (run inconclusive).
   */
  test('user-opted-in question still auto-decides when AskUserQuestion is --disallowedTools', async () => {
    const tmpHome = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-auto-decide-'));
    // Remembered so the finally block can put the caller's environment back.
    const savedGstackHome = process.env.GSTACK_HOME;

    // Runs one of the repo's bin/ scripts from the repo root against the
    // throwaway GSTACK_HOME. Using the same cwd for every script keeps the
    // slug they resolve consistent with the one computed in step 2.
    const runBin = (name: string, args: string[]) =>
      spawnSync(path.join(ROOT, 'bin', name), args, {
        cwd: ROOT,
        env: { ...process.env, GSTACK_HOME: tmpHome },
        encoding: 'utf-8',
      });

    // Best available description of why a script failed. `error` is set when
    // the process could not be spawned at all, in which case stderr/stdout
    // carry nothing useful.
    const describeFailure = (res: ReturnType<typeof runBin>): string =>
      res.error?.message || res.stderr || res.stdout || `exit status ${res.status}`;

    try {
      // 1. Bootstrap the tmp GSTACK_HOME with question_tuning=true.
      const setRes = runBin('gstack-config', ['set', 'question_tuning', 'true']);
      if (setRes.status !== 0) {
        throw new Error(`gstack-config set failed: ${describeFailure(setRes)}`);
      }

      // 2. Resolve the project slug; the preference file path keys on it.
      //    gstack-slug emits `eval`-able shell exports like `SLUG=garrytan-gstack`.
      //    The 'unknown' fallback is kept as-is; stdout is guarded so a failed
      //    spawn reaches the preference-file sanity check below instead of
      //    raising an unrelated TypeError here.
      const slugRes = runBin('gstack-slug', []);
      const slugMatch = (slugRes.stdout ?? '').match(/SLUG=([^\s;]+)/);
      const slug = (slugMatch?.[1] ?? 'unknown').replace(/['"]/g, '');

      // 3. Write the preference: plan-ceo-review-mode → never-ask. The
      //    'plan-tune' source bypasses the inline-user origin gate.
      const writeRes = runBin('gstack-question-preference', [
        '--write',
        JSON.stringify({
          question_id: 'plan-ceo-review-mode',
          preference: 'never-ask',
          source: 'plan-tune',
        }),
      ]);
      if (writeRes.status !== 0) {
        throw new Error(`gstack-question-preference --write failed: ${describeFailure(writeRes)}`);
      }

      // Sanity: the preference file landed where we expect.
      const prefFile = path.join(tmpHome, 'projects', slug, 'question-preferences.json');
      if (!fs.existsSync(prefFile)) {
        throw new Error(`expected preference file at ${prefFile}; not found. slug=${slug}`);
      }

      // 4. Run /plan-ceo-review with the Conductor flag set + isolated state.
      //    The spawned session must see the same GSTACK_HOME the setup wrote
      //    to, otherwise it reads the real user state and the never-ask
      //    preference is invisible to it.
      //    NOTE(review): assumes the runner's child process inherits
      //    process.env — confirm. If runPlanSkillObservation accepts an env
      //    override, pass GSTACK_HOME through that instead of mutating
      //    process.env.
      process.env.GSTACK_HOME = tmpHome;
      const obs = await runPlanSkillObservation({
        skillName: 'plan-ceo-review',
        inPlanMode: true,
        extraArgs: ['--disallowedTools', 'AskUserQuestion'],
        timeoutMs: 300_000,
      });

      // 5. Pass: 'auto_decided' (the strongest signal) or 'plan_ready' with
      //    no question rendered. Fail: 'asked' (model ignored the opt-in).
      if (obs.outcome === 'asked') {
        throw new Error(
          `AUTO_DECIDE regression: the model surfaced an AskUserQuestion despite the user's never-ask preference.\n` +
            `summary: ${obs.summary}\n` +
            `--- evidence (last 2KB visible) ---\n${obs.evidence}`,
        );
      }
      if (obs.outcome === 'silent_write' || obs.outcome === 'exited' || obs.outcome === 'timeout') {
        throw new Error(
          `AUTO_DECIDE preserve test inconclusive: outcome=${obs.outcome}\n` +
            `summary: ${obs.summary}\n` +
            `--- evidence (last 2KB visible) ---\n${obs.evidence}`,
        );
      }
      expect(['auto_decided', 'plan_ready']).toContain(obs.outcome);
    } finally {
      // Restore the caller's GSTACK_HOME exactly (including "was unset").
      if (savedGstackHome === undefined) {
        delete process.env.GSTACK_HOME;
      } else {
        process.env.GSTACK_HOME = savedGstackHome;
      }
      try { fs.rmSync(tmpHome, { recursive: true, force: true }); } catch { /* best-effort */ }
    }
  }, 360_000);
});
|
||||
@@ -97,10 +97,12 @@ describe('selectTests', () => {
|
||||
expect(result.selected).toContain('ask-user-question-format-pty');
|
||||
expect(result.selected).toContain('plan-ceo-mode-routing');
|
||||
expect(result.selected).toContain('autoplan-chain-pty');
|
||||
// v1.21+ auto-mode regression: autoplan-auto-mode also depends on plan-ceo-review/**
|
||||
// v1.21+ regression: autoplan-auto-mode + auto-decide-preserved also
|
||||
// depend on plan-ceo-review/**
|
||||
expect(result.selected).toContain('autoplan-auto-mode');
|
||||
expect(result.selected.length).toBe(19);
|
||||
expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 19);
|
||||
expect(result.selected).toContain('auto-decide-preserved');
|
||||
expect(result.selected.length).toBe(20);
|
||||
expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 20);
|
||||
});
|
||||
|
||||
test('global touchfile triggers ALL tests', () => {
|
||||
|
||||
Reference in New Issue
Block a user