Merge branch 'main' into garrytan/gbrain-support

Resolves:
- VERSION: keep 1.7.0.0 (my branch's bump is higher than main's 1.6.3.0)
- package.json: keep 1.7.0.0 (same logic)
- CHANGELOG.md: keep 1.7.0.0 entry on top, preserve main's 1.6.2.0 +
  1.6.3.0 entries chronologically between 1.7.0.0 and the shared 1.6.1.0
  tail
- context-save/SKILL.md.tmpl: accept main's deletion of the "Resume flow"
  section (logic moved to the separate /context-restore skill)
- Regenerated all SKILL.md files via bun run gen:skill-docs so they match
  both branches' template state post-merge
This commit is contained in:
Garry Tan
2026-04-23 07:30:58 -07:00
45 changed files with 981 additions and 373 deletions
+320
View File
@@ -0,0 +1,320 @@
/**
* AskUserQuestion format regression test for /plan-ceo-review and /plan-eng-review
* running under Codex CLI (GPT-5.4).
*
* Context: GPT-class models under the "No preamble / Prefer doing over listing"
* gpt.md overlay tend to skip the Simplify (ELI10) paragraph and the RECOMMENDATION
* line on AskUserQuestion calls. The user has to manually re-prompt "ELI10 and don't
* forget to recommend" almost every time. This test pins that behavior so future
* regressions surface automatically.
*
* Mirrors test/skill-e2e-plan-format.test.ts (the Claude version) but uses
* test/helpers/codex-session-runner.ts to drive `codex exec` instead of `claude -p`.
*
* Four cases:
* 1. plan-ceo-review mode selection (kind-differentiated)
* 2. plan-ceo-review approach menu (coverage-differentiated)
* 3. plan-eng-review per-issue coverage decision
* 4. plan-eng-review per-issue architectural choice (kind-differentiated)
*
* Assertions on captured AskUserQuestion text:
* - RECOMMENDATION: Choose present (all cases)
* - Completeness: N/10 present on coverage cases, absent on kind cases
* - "options differ in kind" note present on kind cases
* - ELI10-style plain-English explanation present (length floor + no raw jargon)
*
* Periodic tier (Codex non-determinism). Cost: ~$2-3 per full run.
*/
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
import { runCodexSkill, installSkillToTempHome } from './helpers/codex-session-runner';
import type { CodexResult } from './helpers/codex-session-runner';
import { EvalCollector } from './helpers/eval-store';
import type { EvalTestEntry } from './helpers/eval-store';
import { selectTests, detectBaseBranch, getChangedFiles, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
import { spawnSync } from 'child_process';
const ROOT = path.resolve(import.meta.dir, '..');

// --- Prerequisites ---

/** True when a `codex` binary is resolvable on PATH (probed via `which`). */
function codexOnPath(): boolean {
  try {
    return Bun.spawnSync(['which', 'codex']).exitCode === 0;
  } catch {
    return false;
  }
}

const CODEX_AVAILABLE = codexOnPath();
const evalsEnabled = !!process.env.EVALS;
// Run the suite only when codex is installed AND the EVALS env flag opted in.
const SKIP = !CODEX_AVAILABLE || !evalsEnabled;
const describeCodex = SKIP ? describe.skip : describe;
// --- Touchfiles ---
// All four format cases depend on the same generator scripts and GPT model
// overlays; only the skill-template glob differs per review skill. Sharing
// the common tail keeps the four entries from silently drifting apart.
const FORMAT_DEPS = [
  'scripts/resolvers/preamble/generate-ask-user-format.ts',
  'scripts/resolvers/preamble/generate-completeness-section.ts',
  'model-overlays/gpt.md',
  'model-overlays/gpt-5.4.md',
];
const CEO_SKILL_GLOB = '.agents/skills/gstack-plan-ceo-review/**';
const ENG_SKILL_GLOB = '.agents/skills/gstack-plan-eng-review/**';
// Maps each test name to the file globs whose changes should trigger it.
const CODEX_FORMAT_TOUCHFILES: Record<string, string[]> = {
  'codex-plan-ceo-format-mode': [CEO_SKILL_GLOB, ...FORMAT_DEPS],
  'codex-plan-ceo-format-approach': [CEO_SKILL_GLOB, ...FORMAT_DEPS],
  'codex-plan-eng-format-coverage': [ENG_SKILL_GLOB, ...FORMAT_DEPS],
  'codex-plan-eng-format-kind': [ENG_SKILL_GLOB, ...FORMAT_DEPS],
};
let selectedTests: string[] | null = null;
if (evalsEnabled && !process.env.EVALS_ALL) {
const baseBranch = process.env.EVALS_BASE || detectBaseBranch(ROOT) || 'main';
const changedFiles = getChangedFiles(baseBranch, ROOT);
if (changedFiles.length > 0) {
const selection = selectTests(changedFiles, CODEX_FORMAT_TOUCHFILES, GLOBAL_TOUCHFILES);
selectedTests = selection.selected;
}
}
/**
 * Register `fn` as a runnable test when it survives touchfile selection,
 * otherwise register it as skipped (so it still appears in the report).
 */
function testIfSelected(name: string, fn: () => Promise<void>, timeout?: number) {
  const shouldRun = selectedTests === null || selectedTests.includes(name);
  const register = shouldRun ? test : test.skip;
  register(name, fn, timeout);
}
// --- Eval collector ---
// Instantiated only when the suite will actually run; stays null when
// skipped so the recording helpers can no-op via optional chaining.
let evalCollector: EvalCollector | null = SKIP ? null : new EvalCollector('codex-e2e-plan-format');
/**
 * Record one codex run into the eval store. No-op when evals are disabled
 * (collector is null).
 */
function recordCodexResult(testName: string, result: CodexResult, passed: boolean) {
  if (!evalCollector) return;
  const succeeded = result.exitCode === 0;
  evalCollector.addTest({
    name: testName,
    suite: 'codex-e2e-plan-format',
    tier: 'e2e',
    passed,
    duration_ms: result.durationMs,
    cost_usd: 0, // Codex doesn't report cost in the same way; tokens tracked separately
    output: result.output?.slice(0, 2000),
    turns_used: result.toolCalls.length,
    exit_reason: succeeded ? 'success' : `exit_code_${result.exitCode}`,
  });
}
// Flush collected eval records once the whole suite finishes (finalize is async).
afterAll(async () => {
  await evalCollector?.finalize();
});
// --- Fixtures ---
// Minimal but realistic plan document used as the review input for all four
// cases. It deliberately contains both coverage-style material (tests, cache
// depth) and kind-style material (Redis vs Postgres) so either assertion
// path has something to ask about.
const SAMPLE_PLAN = `# Plan: Add User Dashboard
## Context
We're building a new user dashboard that shows recent activity, notifications, and quick actions.
## Changes
1. New React component \`UserDashboard\` in \`src/components/\`
2. REST API endpoint \`GET /api/dashboard\` returning user stats
3. PostgreSQL query for activity aggregation
4. Redis cache layer for dashboard data (5min TTL)
## Architecture
- Frontend: React + TailwindCSS
- Backend: Express.js REST API
- Database: PostgreSQL with existing user/activity tables
- Cache: Redis for dashboard aggregates
`;
/**
 * Build a throwaway git repo containing plan.md plus a copy of the requested
 * Codex skill, and return the paths the test needs.
 *
 * NOTE(review): git command failures are deliberately ignored (best-effort),
 * matching the rest of the harness.
 */
function setupCodexSkillDir(tmpPrefix: string, skillName: 'plan-ceo-review' | 'plan-eng-review'): { skillDir: string; planDir: string; outFile: string } {
  const planDir = fs.mkdtempSync(path.join(os.tmpdir(), tmpPrefix));
  const git = (...args: string[]) =>
    spawnSync('git', args, { cwd: planDir, stdio: 'pipe', timeout: 5000 });
  git('init', '-b', 'main');
  git('config', 'user.email', 'test@test.com');
  git('config', 'user.name', 'Test');
  fs.writeFileSync(path.join(planDir, 'plan.md'), SAMPLE_PLAN);
  git('add', '.');
  git('commit', '-m', 'add plan');
  // Codex skill lives in .agents/skills/gstack-{name}/ per the gstack host convention.
  const sourceDir = path.join(ROOT, '.agents', 'skills', `gstack-${skillName}`);
  const skillDir = path.join(planDir, '.agents', 'skills', `gstack-${skillName}`);
  fs.mkdirSync(skillDir, { recursive: true });
  fs.cpSync(sourceDir, skillDir, { recursive: true });
  return { skillDir, planDir, outFile: path.join(planDir, 'ask-capture.md') };
}
// Capture instruction — same shape as the Claude version. Codex may ignore tool calls,
// so we tell it to write prose to the file directly.
function captureInstruction(outFile: string): string {
  const directive =
    `Write the verbatim text of every AskUserQuestion you would have presented to the user to the file ${outFile} (one question per session, full text including the re-ground, ELI10 paragraph, RECOMMENDATION line, and options).`;
  const guardrails =
    'Do NOT ask the user interactively. Do NOT paraphrase. This is a format-capture test, not an interactive session.';
  return `${directive} ${guardrails}`;
}
// --- Regex predicates ---
// Match RECOMMENDATION leniently: agents often wrap it in markdown bolding
// (e.g. `**RECOMMENDATION:** Choose`), so allow `*` and whitespace before "Choose".
const RECOMMENDATION_RE = /RECOMMENDATION:[*\s]*Choose/;
// Per-option completeness score, e.g. "Completeness: 7/10".
const COMPLETENESS_RE = /Completeness:\s*\d{1,2}\/10/;
// One-line note required on kind-differentiated questions in place of scores.
const KIND_NOTE_RE = /options differ in kind/i;
// ELI10 signal: "plain English" cannot be detected reliably with a regex, so
// the only automated proxy used below is a length floor on the full captured
// question — a bare options-only dump comes in well under it. (There is no
// keyword/hint check; assertions test the floor alone.)
const ELI10_LENGTH_FLOOR = 400; // full AskUserQuestion content should be at least this long
// --- Tests ---

// Case 1: CEO review mode selection. Kind-differentiated — the four modes are
// different review postures, so assertions require the "differ in kind" note
// and forbid a fabricated Completeness score.
describeCodex('Codex Plan Format — CEO Mode Selection', () => {
  let skillDir: string, planDir: string, outFile: string;
  beforeAll(() => {
    ({ skillDir, planDir, outFile } = setupCodexSkillDir('codex-e2e-plan-format-ceo-mode-', 'plan-ceo-review'));
  });
  afterAll(() => {
    // Best-effort temp-dir cleanup; failures are irrelevant to the result.
    try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
  });
  testIfSelected('codex-plan-ceo-format-mode', async () => {
    const result = await runCodexSkill({
      skillDir,
      prompt: `Read the plan-ceo-review skill. Read plan.md (the plan to review). Proceed to Step 0F (Mode Selection) where the skill presents 4 mode options (SCOPE EXPANSION, SELECTIVE EXPANSION, HOLD SCOPE, SCOPE REDUCTION) via AskUserQuestion. These options differ in kind (review posture), not coverage. ${captureInstruction(outFile)}`,
      timeoutMs: 300_000,
      cwd: planDir,
      skillName: 'gstack-plan-ceo-review',
      sandbox: 'workspace-write', // codex must be able to write ask-capture.md
    });
    recordCodexResult('codex-plan-ceo-format-mode', result, result.exitCode === 0);
    console.log(`codex-plan-ceo-format-mode: ${result.tokens}t, ${Math.round(result.durationMs/1000)}s, exit=${result.exitCode}`);
    // Codex may timeout — accept as non-fatal (same pattern as existing codex-e2e tests)
    if (result.exitCode === 124 || result.exitCode === 137) {
      console.warn(`codex timed out (exit ${result.exitCode}) — skipping assertions`);
      return;
    }
    expect(fs.existsSync(outFile)).toBe(true);
    const captured = fs.readFileSync(outFile, 'utf-8');
    // Length floor is the ELI10 proxy: an options-only dump is far shorter.
    expect(captured.length).toBeGreaterThan(ELI10_LENGTH_FLOOR);
    expect(captured).toMatch(RECOMMENDATION_RE);
    // kind-differentiated: no fabricated score, must have note
    expect(captured).not.toMatch(COMPLETENESS_RE);
    expect(captured).toMatch(KIND_NOTE_RE);
  }, 360_000); // outer bun timeout exceeds codex timeoutMs so codex can exit cleanly
});
// Case 2: CEO approach menu. Coverage-differentiated — approaches range from
// minimal-viable to ideal architecture, so Completeness: N/10 must appear.
describeCodex('Codex Plan Format — CEO Approach Menu', () => {
  let skillDir: string, planDir: string, outFile: string;
  beforeAll(() => {
    ({ skillDir, planDir, outFile } = setupCodexSkillDir('codex-e2e-plan-format-ceo-approach-', 'plan-ceo-review'));
  });
  afterAll(() => {
    // Best-effort temp-dir cleanup.
    try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
  });
  testIfSelected('codex-plan-ceo-format-approach', async () => {
    const result = await runCodexSkill({
      skillDir,
      prompt: `Read the plan-ceo-review skill. Read plan.md. Proceed to Step 0C-bis (Implementation Alternatives / Approach Menu) where the skill generates 2-3 approaches (minimal viable vs ideal architecture) and presents them via AskUserQuestion. These options differ in coverage so Completeness: N/10 applies. ${captureInstruction(outFile)}`,
      timeoutMs: 300_000,
      cwd: planDir,
      skillName: 'gstack-plan-ceo-review',
      sandbox: 'workspace-write', // codex must be able to write ask-capture.md
    });
    recordCodexResult('codex-plan-ceo-format-approach', result, result.exitCode === 0);
    console.log(`codex-plan-ceo-format-approach: ${result.tokens}t, ${Math.round(result.durationMs/1000)}s, exit=${result.exitCode}`);
    // Codex timeouts are non-fatal (same pattern as the other codex-e2e tests).
    if (result.exitCode === 124 || result.exitCode === 137) {
      console.warn(`codex timed out (exit ${result.exitCode}) — skipping assertions`);
      return;
    }
    expect(fs.existsSync(outFile)).toBe(true);
    const captured = fs.readFileSync(outFile, 'utf-8');
    expect(captured.length).toBeGreaterThan(ELI10_LENGTH_FLOOR); // ELI10 length proxy
    expect(captured).toMatch(RECOMMENDATION_RE);
    // coverage-differentiated: per-option Completeness score is required
    expect(captured).toMatch(COMPLETENESS_RE);
  }, 360_000); // outer bun timeout exceeds codex timeoutMs
});
// Case 3: Eng review, per-issue coverage decision. Coverage-differentiated —
// the prompt seeds an explicit 10/7/3 coverage ladder, so the score must appear.
describeCodex('Codex Plan Format — Eng Coverage Issue', () => {
  let skillDir: string, planDir: string, outFile: string;
  beforeAll(() => {
    ({ skillDir, planDir, outFile } = setupCodexSkillDir('codex-e2e-plan-format-eng-cov-', 'plan-eng-review'));
  });
  afterAll(() => {
    // Best-effort temp-dir cleanup.
    try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
  });
  testIfSelected('codex-plan-eng-format-coverage', async () => {
    const result = await runCodexSkill({
      skillDir,
      prompt: `Read the plan-eng-review skill. Read plan.md. In your Section 3 Test Review, generate ONE AskUserQuestion about test coverage depth where options are clearly coverage-differentiated: A) full coverage incl. edge + error paths (Completeness 10/10), B) happy path only (7/10), C) smoke test (3/10). ${captureInstruction(outFile)}`,
      timeoutMs: 300_000,
      cwd: planDir,
      skillName: 'gstack-plan-eng-review',
      sandbox: 'workspace-write', // codex must be able to write ask-capture.md
    });
    recordCodexResult('codex-plan-eng-format-coverage', result, result.exitCode === 0);
    console.log(`codex-plan-eng-format-coverage: ${result.tokens}t, ${Math.round(result.durationMs/1000)}s, exit=${result.exitCode}`);
    // Codex timeouts are non-fatal (same pattern as the other codex-e2e tests).
    if (result.exitCode === 124 || result.exitCode === 137) {
      console.warn(`codex timed out (exit ${result.exitCode}) — skipping assertions`);
      return;
    }
    expect(fs.existsSync(outFile)).toBe(true);
    const captured = fs.readFileSync(outFile, 'utf-8');
    expect(captured.length).toBeGreaterThan(ELI10_LENGTH_FLOOR); // ELI10 length proxy
    expect(captured).toMatch(RECOMMENDATION_RE);
    // coverage-differentiated: per-option Completeness score is required
    expect(captured).toMatch(COMPLETENESS_RE);
  }, 360_000); // outer bun timeout exceeds codex timeoutMs
});
// Case 4: Eng review, per-issue architectural choice. Kind-differentiated —
// different kinds of systems, so the score must be absent and the note present.
describeCodex('Codex Plan Format — Eng Kind Issue', () => {
  let skillDir: string, planDir: string, outFile: string;
  beforeAll(() => {
    ({ skillDir, planDir, outFile } = setupCodexSkillDir('codex-e2e-plan-format-eng-kind-', 'plan-eng-review'));
  });
  afterAll(() => {
    // Best-effort temp-dir cleanup.
    try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
  });
  testIfSelected('codex-plan-eng-format-kind', async () => {
    const result = await runCodexSkill({
      skillDir,
      prompt: `Read the plan-eng-review skill. Read plan.md. In your Section 1 Architecture review, generate ONE AskUserQuestion about an architectural choice where the options differ in kind (e.g. Redis vs Postgres materialized view vs in-process cache — different kinds of systems with different tradeoffs, NOT more-or-less-complete versions of the same thing). ${captureInstruction(outFile)}`,
      timeoutMs: 300_000,
      cwd: planDir,
      skillName: 'gstack-plan-eng-review',
      sandbox: 'workspace-write', // codex must be able to write ask-capture.md
    });
    recordCodexResult('codex-plan-eng-format-kind', result, result.exitCode === 0);
    console.log(`codex-plan-eng-format-kind: ${result.tokens}t, ${Math.round(result.durationMs/1000)}s, exit=${result.exitCode}`);
    // Codex timeouts are non-fatal (same pattern as the other codex-e2e tests).
    if (result.exitCode === 124 || result.exitCode === 137) {
      console.warn(`codex timed out (exit ${result.exitCode}) — skipping assertions`);
      return;
    }
    expect(fs.existsSync(outFile)).toBe(true);
    const captured = fs.readFileSync(outFile, 'utf-8');
    expect(captured.length).toBeGreaterThan(ELI10_LENGTH_FLOOR); // ELI10 length proxy
    expect(captured).toMatch(RECOMMENDATION_RE);
    // kind-differentiated: no fabricated score
    expect(captured).not.toMatch(COMPLETENESS_RE);
    expect(captured).toMatch(KIND_NOTE_RE);
  }, 360_000); // outer bun timeout exceeds codex timeoutMs
});
+7 -5
View File
@@ -470,11 +470,13 @@ available]. [Health score if available]." Keep it to 2-3 sentences.
## AskUserQuestion Format
**ALWAYS follow this structure for every AskUserQuestion call:**
**ALWAYS follow this structure for every AskUserQuestion call. All four elements are non-skippable. If you find yourself about to skip any of them, stop and back up.**
1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences)
2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called.
3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it.
4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)`
2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time.
3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind.
4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores.
5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)`
Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex.
@@ -594,7 +596,7 @@ AI makes completeness near-free. Always recommend the complete option over short
| Feature | 1 week | 30 min | ~30x |
| Bug fix | 4 hours | 15 min | ~20x |
Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut).
When options differ in coverage (e.g. full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores.
## Confusion Protocol
+7 -5
View File
@@ -459,11 +459,13 @@ available]. [Health score if available]." Keep it to 2-3 sentences.
## AskUserQuestion Format
**ALWAYS follow this structure for every AskUserQuestion call:**
**ALWAYS follow this structure for every AskUserQuestion call. All four elements are non-skippable. If you find yourself about to skip any of them, stop and back up.**
1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences)
2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called.
3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it.
4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)`
2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time.
3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind.
4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores.
5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)`
Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex.
@@ -583,7 +585,7 @@ AI makes completeness near-free. Always recommend the complete option over short
| Feature | 1 week | 30 min | ~30x |
| Bug fix | 4 hours | 15 min | ~20x |
Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut).
When options differ in coverage (e.g. full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores.
## Confusion Protocol
+7 -5
View File
@@ -461,11 +461,13 @@ available]. [Health score if available]." Keep it to 2-3 sentences.
## AskUserQuestion Format
**ALWAYS follow this structure for every AskUserQuestion call:**
**ALWAYS follow this structure for every AskUserQuestion call. All four elements are non-skippable. If you find yourself about to skip any of them, stop and back up.**
1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences)
2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called.
3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it.
4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)`
2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time.
3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind.
4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores.
5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)`
Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex.
@@ -585,7 +587,7 @@ AI makes completeness near-free. Always recommend the complete option over short
| Feature | 1 week | 30 min | ~30x |
| Bug fix | 4 hours | 15 min | ~20x |
Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut).
When options differ in coverage (e.g. full vs happy-path vs shortcut), include `Completeness: X/10` on each option (10 = all edge cases, 7 = happy path, 3 = shortcut). When options differ in kind (mode posture, architectural choice, cherry-pick A/B/C where each is a different kind of thing, not a more-or-less-complete version of the same thing), skip the score and write one line explaining why: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate scores.
## Confusion Protocol
+2 -1
View File
@@ -244,8 +244,9 @@ describe('gen-skill-docs', () => {
test('tier 2+ skills contain ELI16 simplification rules (AskUserQuestion format)', () => {
// Root SKILL.md is tier 1 (no AskUserQuestion format). Check a tier 2+ skill instead.
const content = fs.readFileSync(path.join(ROOT, 'cso', 'SKILL.md'), 'utf-8');
expect(content).toContain('No raw function names');
expect(content).toContain('Simplify (ELI10');
expect(content).toContain('plain English');
expect(content).toContain('not function names');
});
test('tier 1 skills do NOT contain AskUserQuestion format', () => {
+13
View File
@@ -82,6 +82,13 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
'plan-eng-review-artifact': ['plan-eng-review/**'],
'plan-review-report': ['plan-eng-review/**', 'scripts/gen-skill-docs.ts'],
// AskUserQuestion format regression (RECOMMENDATION + Completeness: N/10)
// Fires when either template OR the two preamble resolvers change.
'plan-ceo-review-format-mode': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts'],
'plan-ceo-review-format-approach': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts'],
'plan-eng-review-format-coverage': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts'],
'plan-eng-review-format-kind': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts'],
// /plan-tune (v1 observational)
'plan-tune-inspect': ['plan-tune/**', 'scripts/question-registry.ts', 'scripts/psychographic-signals.ts', 'scripts/one-way-doors.ts', 'bin/gstack-question-log', 'bin/gstack-question-preference', 'bin/gstack-developer-profile'],
@@ -275,6 +282,12 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
'plan-eng-coverage-audit': 'gate',
'plan-review-report': 'gate',
// AskUserQuestion format regression — periodic (Opus 4.7 non-deterministic benchmark)
'plan-ceo-review-format-mode': 'periodic',
'plan-ceo-review-format-approach': 'periodic',
'plan-eng-review-format-coverage': 'periodic',
'plan-eng-review-format-kind': 'periodic',
// /plan-tune — gate (core v1 DX promise: plain-English intent routing)
'plan-tune-inspect': 'gate',
+297
View File
@@ -0,0 +1,297 @@
/**
* AskUserQuestion format regression test for /plan-ceo-review and /plan-eng-review.
*
* Context: a user on Opus 4.7 reported the RECOMMENDATION line and the
* `Completeness: N/10` per-option score stopped appearing on AskUserQuestion
* prompts. This test captures the agent's AskUserQuestion output verbatim
* and asserts the format rule is applied.
*
* Capture shape: `claude -p` sessions inside this harness do not have the
* AskUserQuestion MCP tool wired. We instruct the agent to write the verbatim
* AskUserQuestion text it would have made to $OUT_FILE instead of calling
* any tool. Assertions read that file.
*
* Coverage-vs-kind split: the format rule says to include `Completeness: N/10`
* only when options differ in coverage. When options differ in kind (mode
* selection, posture choice, cherry-pick Add/Defer/Skip), the score is
* intentionally absent and a one-line note explains why. Assertions split
* accordingly.
*/
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
import { runSkillTest } from './helpers/session-runner';
import {
ROOT, runId,
describeIfSelected, testConcurrentIfSelected,
logCost, recordE2E,
createEvalCollector, finalizeEvalCollector,
} from './helpers/e2e-helpers';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
// Collector for this (Claude-driven) variant of the format regression suite.
const evalCollector = createEvalCollector('e2e-plan-format');
// Regex predicates applied to captured AskUserQuestion content.
// RECOMMENDATION regex is lenient on intervening markdown markers (e.g.
// agent writes `**RECOMMENDATION:** Choose` — the `**` closers are benign).
const RECOMMENDATION_RE = /RECOMMENDATION:[*\s]*Choose/;
// Per-option completeness score, e.g. "Completeness: 7/10".
const COMPLETENESS_RE = /Completeness:\s*\d{1,2}\/10/;
// One-line note expected when options differ in kind instead of coverage.
const KIND_NOTE_RE = /options differ in kind/i;
// Minimal but realistic plan document used as the review input. Contains both
// coverage-style material (tests, cache depth) and kind-style material
// (Redis vs Postgres) so either assertion path has something to ask about.
const SAMPLE_PLAN = `# Plan: Add User Dashboard
## Context
We're building a new user dashboard that shows recent activity, notifications, and quick actions.
## Changes
1. New React component \`UserDashboard\` in \`src/components/\`
2. REST API endpoint \`GET /api/dashboard\` returning user stats
3. PostgreSQL query for activity aggregation
4. Redis cache layer for dashboard data (5min TTL)
## Architecture
- Frontend: React + TailwindCSS
- Backend: Express.js REST API
- Database: PostgreSQL with existing user/activity tables
- Cache: Redis for dashboard aggregates
`;
/**
 * Create a throwaway git repo containing plan.md plus a copy of the skill's
 * SKILL.md, returning the repo path.
 *
 * NOTE(review): git command failures are deliberately ignored (best-effort),
 * matching the rest of the harness.
 */
function setupPlanDir(tmpPrefix: string, skillName: 'plan-ceo-review' | 'plan-eng-review'): string {
  const planDir = fs.mkdtempSync(path.join(os.tmpdir(), tmpPrefix));
  const git = (...args: string[]) =>
    spawnSync('git', args, { cwd: planDir, stdio: 'pipe', timeout: 5000 });
  git('init', '-b', 'main');
  git('config', 'user.email', 'test@test.com');
  git('config', 'user.name', 'Test');
  fs.writeFileSync(path.join(planDir, 'plan.md'), SAMPLE_PLAN);
  git('add', '.');
  git('commit', '-m', 'add plan');
  const skillDest = path.join(planDir, skillName);
  fs.mkdirSync(skillDest, { recursive: true });
  fs.copyFileSync(
    path.join(ROOT, skillName, 'SKILL.md'),
    path.join(skillDest, 'SKILL.md'),
  );
  return planDir;
}
// The capture instruction passed to every case. Tells the agent to dump
// AskUserQuestion content to a file instead of calling a tool.
function captureInstruction(outFile: string): string {
return `Write the verbatim text of every AskUserQuestion you would have made to ${outFile} (one question per session, full text including options and recommendation line). Do NOT call any tool to ask the user. Do NOT paraphrase — include the exact prose you would have shown. This is a format-capture test, not an interactive session.`;
}
// --- Case 1: plan-ceo-review mode selection (kind-differentiated) ---
describeIfSelected('Plan Format — CEO Mode Selection', ['plan-ceo-review-format-mode'], () => {
let planDir: string;
let outFile: string;
beforeAll(() => {
planDir = setupPlanDir('skill-e2e-plan-format-ceo-mode-', 'plan-ceo-review');
outFile = path.join(planDir, 'ask-capture.md');
});
afterAll(() => {
try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
});
testConcurrentIfSelected('plan-ceo-review-format-mode', async () => {
const result = await runSkillTest({
prompt: `Read plan-ceo-review/SKILL.md for the review workflow.
Read plan.md — that's the plan to review. This is a standalone plan document, not a codebase — skip any codebase exploration or system audit steps.
Proceed to Step 0F (Mode Selection). This is where the skill presents 4 mode options (SCOPE EXPANSION, SELECTIVE EXPANSION, HOLD SCOPE, SCOPE REDUCTION) to the user via AskUserQuestion. These options differ in kind (review posture), not in coverage.
${captureInstruction(outFile)}
After writing the file, stop. Do not continue the review.`,
workingDirectory: planDir,
maxTurns: 10,
timeout: 240_000,
testName: 'plan-ceo-review-format-mode',
runId,
model: 'claude-opus-4-7',
});
logCost('/plan-ceo-review format (mode)', result);
recordE2E(evalCollector, '/plan-ceo-review-format-mode', 'Plan Format — CEO Mode Selection', result, {
passed: ['success', 'error_max_turns'].includes(result.exitReason),
});
expect(['success', 'error_max_turns']).toContain(result.exitReason);
expect(fs.existsSync(outFile)).toBe(true);
const captured = fs.readFileSync(outFile, 'utf-8');
expect(captured.length).toBeGreaterThan(100);
// Kind-differentiated: RECOMMENDATION required, Completeness: N/10 must NOT appear,
// "options differ in kind" note must appear.
expect(captured).toMatch(RECOMMENDATION_RE);
expect(captured).not.toMatch(COMPLETENESS_RE);
expect(captured).toMatch(KIND_NOTE_RE);
}, 300_000);
});
// --- Case 2: plan-ceo-review approach menu (coverage-differentiated) ---
describeIfSelected('Plan Format — CEO Approach Menu', ['plan-ceo-review-format-approach'], () => {
let planDir: string;
let outFile: string;
beforeAll(() => {
planDir = setupPlanDir('skill-e2e-plan-format-ceo-approach-', 'plan-ceo-review');
outFile = path.join(planDir, 'ask-capture.md');
});
afterAll(() => {
try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
});
testConcurrentIfSelected('plan-ceo-review-format-approach', async () => {
const result = await runSkillTest({
prompt: `Read plan-ceo-review/SKILL.md for the review workflow.
Read plan.md — that's the plan to review. This is a standalone plan document, not a codebase — skip any codebase exploration or system audit steps.
Proceed to Step 0C-bis (Implementation Alternatives / Approach Menu). This is where the skill generates 2-3 approaches (minimal viable vs ideal architecture) and presents them via AskUserQuestion. These options differ in coverage (complete vs shortcut), so Completeness: N/10 applies.
${captureInstruction(outFile)}
After writing the file, stop. Do not continue the review.`,
workingDirectory: planDir,
maxTurns: 10,
timeout: 240_000,
testName: 'plan-ceo-review-format-approach',
runId,
model: 'claude-opus-4-7',
});
logCost('/plan-ceo-review format (approach)', result);
recordE2E(evalCollector, '/plan-ceo-review-format-approach', 'Plan Format — CEO Approach Menu', result, {
passed: ['success', 'error_max_turns'].includes(result.exitReason),
});
expect(['success', 'error_max_turns']).toContain(result.exitReason);
expect(fs.existsSync(outFile)).toBe(true);
const captured = fs.readFileSync(outFile, 'utf-8');
expect(captured.length).toBeGreaterThan(100);
// Coverage-differentiated: both RECOMMENDATION and Completeness: N/10 required.
expect(captured).toMatch(RECOMMENDATION_RE);
expect(captured).toMatch(COMPLETENESS_RE);
}, 300_000);
});
// --- Case 3: plan-eng-review coverage-differentiated per-issue AskUserQuestion ---
describeIfSelected('Plan Format — Eng Coverage Issue', ['plan-eng-review-format-coverage'], () => {
let planDir: string;
let outFile: string;
beforeAll(() => {
planDir = setupPlanDir('skill-e2e-plan-format-eng-cov-', 'plan-eng-review');
outFile = path.join(planDir, 'ask-capture.md');
});
afterAll(() => {
try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
});
testConcurrentIfSelected('plan-eng-review-format-coverage', async () => {
const result = await runSkillTest({
prompt: `Read plan-eng-review/SKILL.md for the review workflow.
Read plan.md — that's the plan to review. This is a standalone plan document, not a codebase — skip any codebase exploration steps.
During your review (Section 3 Test Review is the natural place), generate ONE AskUserQuestion about test coverage depth where the options are clearly coverage-differentiated. For example:
A) Full coverage: happy path + edge cases + error paths (Completeness 10/10)
B) Happy path only (Completeness 7/10)
C) Smoke test (Completeness 3/10)
${captureInstruction(outFile)}
After writing the file with that ONE question, stop. Do not continue the review.`,
workingDirectory: planDir,
maxTurns: 10,
timeout: 240_000,
testName: 'plan-eng-review-format-coverage',
runId,
model: 'claude-opus-4-7',
});
logCost('/plan-eng-review format (coverage)', result);
recordE2E(evalCollector, '/plan-eng-review-format-coverage', 'Plan Format — Eng Coverage Issue', result, {
passed: ['success', 'error_max_turns'].includes(result.exitReason),
});
expect(['success', 'error_max_turns']).toContain(result.exitReason);
expect(fs.existsSync(outFile)).toBe(true);
const captured = fs.readFileSync(outFile, 'utf-8');
expect(captured.length).toBeGreaterThan(100);
// Coverage-differentiated: both RECOMMENDATION and Completeness: N/10 required.
expect(captured).toMatch(RECOMMENDATION_RE);
expect(captured).toMatch(COMPLETENESS_RE);
}, 300_000);
});
// --- Case 4: plan-eng-review kind-differentiated per-issue AskUserQuestion ---
describeIfSelected('Plan Format — Eng Kind Issue', ['plan-eng-review-format-kind'], () => {
let planDir: string;
let outFile: string;
beforeAll(() => {
planDir = setupPlanDir('skill-e2e-plan-format-eng-kind-', 'plan-eng-review');
outFile = path.join(planDir, 'ask-capture.md');
});
afterAll(() => {
try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
});
testConcurrentIfSelected('plan-eng-review-format-kind', async () => {
const result = await runSkillTest({
prompt: `Read plan-eng-review/SKILL.md for the review workflow.
Read plan.md — that's the plan to review. This is a standalone plan document, not a codebase — skip any codebase exploration steps.
During your review (Section 1 Architecture), generate ONE AskUserQuestion about an architectural choice where the options differ in kind, not in coverage. For example, "should we use Redis or Postgres for the cache layer?" — the options are different kinds of systems with different tradeoffs, not more-or-less-complete versions of the same thing.
${captureInstruction(outFile)}
After writing the file with that ONE question, stop. Do not continue the review.`,
workingDirectory: planDir,
maxTurns: 10,
timeout: 240_000,
testName: 'plan-eng-review-format-kind',
runId,
model: 'claude-opus-4-7',
});
logCost('/plan-eng-review format (kind)', result);
recordE2E(evalCollector, '/plan-eng-review-format-kind', 'Plan Format — Eng Kind Issue', result, {
passed: ['success', 'error_max_turns'].includes(result.exitReason),
});
expect(['success', 'error_max_turns']).toContain(result.exitReason);
expect(fs.existsSync(outFile)).toBe(true);
const captured = fs.readFileSync(outFile, 'utf-8');
expect(captured.length).toBeGreaterThan(100);
// Kind-differentiated: RECOMMENDATION required, Completeness: N/10 must NOT appear,
// "options differ in kind" note must appear.
expect(captured).toMatch(RECOMMENDATION_RE);
expect(captured).not.toMatch(COMPLETENESS_RE);
expect(captured).toMatch(KIND_NOTE_RE);
}, 300_000);
});
afterAll(async () => {
await finalizeEvalCollector(evalCollector);
});
+4 -2
View File
@@ -83,8 +83,10 @@ describe('selectTests', () => {
expect(result.selected).toContain('plan-ceo-review-expansion-energy');
expect(result.selected).toContain('autoplan-core');
expect(result.selected).toContain('codex-offered-ceo-review');
expect(result.selected.length).toBe(6);
expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 6);
expect(result.selected).toContain('plan-ceo-review-format-mode');
expect(result.selected).toContain('plan-ceo-review-format-approach');
expect(result.selected.length).toBe(8);
expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 8);
});
test('global touchfile triggers ALL tests', () => {