Merge remote-tracking branch 'origin/main' into garrytan/plan-review-regressions

This commit is contained in:
Garry Tan
2026-04-22 12:29:35 -07:00
81 changed files with 4209 additions and 857 deletions
+41 -16
View File
@@ -269,23 +269,44 @@ If A: Append this section to the end of CLAUDE.md:
## Skill routing
When the user's request matches an available skill, ALWAYS invoke it using the Skill
tool as your FIRST action. Do NOT answer directly, do NOT use other tools first.
The skill has specialized workflows that produce better results than ad-hoc answers.
When the user's request matches an available skill, invoke it via the Skill tool. The
skill has multi-step workflows, checklists, and quality gates that produce better
results than an ad-hoc answer. When in doubt, invoke the skill. A false positive is
cheaper than a false negative.
Key routing rules:
- Product ideas, "is this worth building", brainstorming → invoke office-hours
- Bugs, errors, "why is this broken", 500 errors → invoke investigate
- Ship, deploy, push, create PR → invoke ship
- QA, test the site, find bugs → invoke qa
- Code review, check my diff → invoke review
- Update docs after shipping → invoke document-release
- Weekly retro → invoke retro
- Design system, brand → invoke design-consultation
- Visual audit, design polish → invoke design-review
- Architecture review → invoke plan-eng-review
- Save progress, checkpoint, resume → invoke checkpoint
- Code quality, health check → invoke health
- Product ideas, "is this worth building", brainstorming → invoke /office-hours
- Strategy, scope, "think bigger", "what should we build" → invoke /plan-ceo-review
- Architecture, "does this design make sense" → invoke /plan-eng-review
- Design system, brand, "how should this look" → invoke /design-consultation
- Design review of a plan → invoke /plan-design-review
- Developer experience of a plan → invoke /plan-devex-review
- "Review everything", full review pipeline → invoke /autoplan
- Bugs, errors, "why is this broken", "wtf", "this doesn't work" → invoke /investigate
- Test the site, find bugs, "does this work" → invoke /qa (or /qa-only for report only)
- Code review, check the diff, "look at my changes" → invoke /review
- Visual polish, design audit, "this looks off" → invoke /design-review
- Developer experience audit, try onboarding → invoke /devex-review
- Ship, deploy, create a PR, "send it" → invoke /ship
- Merge + deploy + verify → invoke /land-and-deploy
- Configure deployment → invoke /setup-deploy
- Post-deploy monitoring → invoke /canary
- Update docs after shipping → invoke /document-release
- Weekly retro, "how'd we do" → invoke /retro
- Second opinion, codex review → invoke /codex
- Safety mode, careful mode, lock it down → invoke /careful or /guard
- Restrict edits to a directory → invoke /freeze or /unfreeze
- Upgrade gstack → invoke /gstack-upgrade
- Save progress, "save my work" → invoke /context-save
- Resume, restore, "where was I" → invoke /context-restore
- Security audit, OWASP, "is this secure" → invoke /cso
- Make a PDF, document, publication → invoke /make-pdf
- Launch real browser for QA → invoke /open-gstack-browser
- Import cookies for authenticated testing → invoke /setup-browser-cookies
- Performance regression, page speed, benchmarks → invoke /benchmark
- Review what gstack has learned → invoke /learn
- Tune question sensitivity → invoke /plan-tune
- Code quality dashboard → invoke /health
```
Then commit the change: `git add CLAUDE.md && git commit -m "chore: add gstack skill routing rules to CLAUDE.md"`
@@ -396,6 +417,10 @@ Avoid filler, throat-clearing, generic optimism, founder cosplay, and unsupporte
- Stay curious, not lecturing. "What's interesting here is..." beats "It is important to understand..."
- End with what to do. Give the action.
**Example of the right voice:**
"auth.ts:47 returns undefined when the session cookie expires. Your users hit a white screen. Fix: add a null check and redirect to /login. Two lines. Want me to fix it?"
Not: "I've identified a potential issue in the authentication flow that may cause problems for some users under certain conditions. Let me explain the approach I'd recommend..."
**Final test:** does this sound like a real cross-functional builder who wants to help someone make something people want, ship it, and make it actually work?
## Context Recovery
@@ -2761,7 +2786,7 @@ user via AskUserQuestion rather than destroying non-WIP commits.
git commit -m "$(cat <<'EOF'
chore: bump version and changelog (vX.Y.Z.W)
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
EOF
)"
```
+40 -15
View File
@@ -258,23 +258,44 @@ If A: Append this section to the end of CLAUDE.md:
## Skill routing
When the user's request matches an available skill, ALWAYS invoke it using the Skill
tool as your FIRST action. Do NOT answer directly, do NOT use other tools first.
The skill has specialized workflows that produce better results than ad-hoc answers.
When the user's request matches an available skill, invoke it via the Skill tool. The
skill has multi-step workflows, checklists, and quality gates that produce better
results than an ad-hoc answer. When in doubt, invoke the skill. A false positive is
cheaper than a false negative.
Key routing rules:
- Product ideas, "is this worth building", brainstorming → invoke office-hours
- Bugs, errors, "why is this broken", 500 errors → invoke investigate
- Ship, deploy, push, create PR → invoke ship
- QA, test the site, find bugs → invoke qa
- Code review, check my diff → invoke review
- Update docs after shipping → invoke document-release
- Weekly retro → invoke retro
- Design system, brand → invoke design-consultation
- Visual audit, design polish → invoke design-review
- Architecture review → invoke plan-eng-review
- Save progress, checkpoint, resume → invoke checkpoint
- Code quality, health check → invoke health
- Product ideas, "is this worth building", brainstorming → invoke /office-hours
- Strategy, scope, "think bigger", "what should we build" → invoke /plan-ceo-review
- Architecture, "does this design make sense" → invoke /plan-eng-review
- Design system, brand, "how should this look" → invoke /design-consultation
- Design review of a plan → invoke /plan-design-review
- Developer experience of a plan → invoke /plan-devex-review
- "Review everything", full review pipeline → invoke /autoplan
- Bugs, errors, "why is this broken", "wtf", "this doesn't work" → invoke /investigate
- Test the site, find bugs, "does this work" → invoke /qa (or /qa-only for report only)
- Code review, check the diff, "look at my changes" → invoke /review
- Visual polish, design audit, "this looks off" → invoke /design-review
- Developer experience audit, try onboarding → invoke /devex-review
- Ship, deploy, create a PR, "send it" → invoke /ship
- Merge + deploy + verify → invoke /land-and-deploy
- Configure deployment → invoke /setup-deploy
- Post-deploy monitoring → invoke /canary
- Update docs after shipping → invoke /document-release
- Weekly retro, "how'd we do" → invoke /retro
- Second opinion, codex review → invoke /codex
- Safety mode, careful mode, lock it down → invoke /careful or /guard
- Restrict edits to a directory → invoke /freeze or /unfreeze
- Upgrade gstack → invoke /gstack-upgrade
- Save progress, "save my work" → invoke /context-save
- Resume, restore, "where was I" → invoke /context-restore
- Security audit, OWASP, "is this secure" → invoke /cso
- Make a PDF, document, publication → invoke /make-pdf
- Launch real browser for QA → invoke /open-gstack-browser
- Import cookies for authenticated testing → invoke /setup-browser-cookies
- Performance regression, page speed, benchmarks → invoke /benchmark
- Review what gstack has learned → invoke /learn
- Tune question sensitivity → invoke /plan-tune
- Code quality dashboard → invoke /health
```
Then commit the change: `git add CLAUDE.md && git commit -m "chore: add gstack skill routing rules to CLAUDE.md"`
@@ -385,6 +406,10 @@ Avoid filler, throat-clearing, generic optimism, founder cosplay, and unsupporte
- Stay curious, not lecturing. "What's interesting here is..." beats "It is important to understand..."
- End with what to do. Give the action.
**Example of the right voice:**
"auth.ts:47 returns undefined when the session cookie expires. Your users hit a white screen. Fix: add a null check and redirect to /login. Two lines. Want me to fix it?"
Not: "I've identified a potential issue in the authentication flow that may cause problems for some users under certain conditions. Let me explain the approach I'd recommend..."
**Final test:** does this sound like a real cross-functional builder who wants to help someone make something people want, ship it, and make it actually work?
## Context Recovery
+40 -15
View File
@@ -260,23 +260,44 @@ If A: Append this section to the end of CLAUDE.md:
## Skill routing
When the user's request matches an available skill, ALWAYS invoke it using the Skill
tool as your FIRST action. Do NOT answer directly, do NOT use other tools first.
The skill has specialized workflows that produce better results than ad-hoc answers.
When the user's request matches an available skill, invoke it via the Skill tool. The
skill has multi-step workflows, checklists, and quality gates that produce better
results than an ad-hoc answer. When in doubt, invoke the skill. A false positive is
cheaper than a false negative.
Key routing rules:
- Product ideas, "is this worth building", brainstorming → invoke office-hours
- Bugs, errors, "why is this broken", 500 errors → invoke investigate
- Ship, deploy, push, create PR → invoke ship
- QA, test the site, find bugs → invoke qa
- Code review, check my diff → invoke review
- Update docs after shipping → invoke document-release
- Weekly retro → invoke retro
- Design system, brand → invoke design-consultation
- Visual audit, design polish → invoke design-review
- Architecture review → invoke plan-eng-review
- Save progress, checkpoint, resume → invoke checkpoint
- Code quality, health check → invoke health
- Product ideas, "is this worth building", brainstorming → invoke /office-hours
- Strategy, scope, "think bigger", "what should we build" → invoke /plan-ceo-review
- Architecture, "does this design make sense" → invoke /plan-eng-review
- Design system, brand, "how should this look" → invoke /design-consultation
- Design review of a plan → invoke /plan-design-review
- Developer experience of a plan → invoke /plan-devex-review
- "Review everything", full review pipeline → invoke /autoplan
- Bugs, errors, "why is this broken", "wtf", "this doesn't work" → invoke /investigate
- Test the site, find bugs, "does this work" → invoke /qa (or /qa-only for report only)
- Code review, check the diff, "look at my changes" → invoke /review
- Visual polish, design audit, "this looks off" → invoke /design-review
- Developer experience audit, try onboarding → invoke /devex-review
- Ship, deploy, create a PR, "send it" → invoke /ship
- Merge + deploy + verify → invoke /land-and-deploy
- Configure deployment → invoke /setup-deploy
- Post-deploy monitoring → invoke /canary
- Update docs after shipping → invoke /document-release
- Weekly retro, "how'd we do" → invoke /retro
- Second opinion, codex review → invoke /codex
- Safety mode, careful mode, lock it down → invoke /careful or /guard
- Restrict edits to a directory → invoke /freeze or /unfreeze
- Upgrade gstack → invoke /gstack-upgrade
- Save progress, "save my work" → invoke /context-save
- Resume, restore, "where was I" → invoke /context-restore
- Security audit, OWASP, "is this secure" → invoke /cso
- Make a PDF, document, publication → invoke /make-pdf
- Launch real browser for QA → invoke /open-gstack-browser
- Import cookies for authenticated testing → invoke /setup-browser-cookies
- Performance regression, page speed, benchmarks → invoke /benchmark
- Review what gstack has learned → invoke /learn
- Tune question sensitivity → invoke /plan-tune
- Code quality dashboard → invoke /health
```
Then commit the change: `git add CLAUDE.md && git commit -m "chore: add gstack skill routing rules to CLAUDE.md"`
@@ -387,6 +408,10 @@ Avoid filler, throat-clearing, generic optimism, founder cosplay, and unsupporte
- Stay curious, not lecturing. "What's interesting here is..." beats "It is important to understand..."
- End with what to do. Give the action.
**Example of the right voice:**
"auth.ts:47 returns undefined when the session cookie expires. Your users hit a white screen. Fix: add a null check and redirect to /login. Two lines. Want me to fix it?"
Not: "I've identified a potential issue in the authentication flow that may cause problems for some users under certain conditions. Let me explain the approach I'd recommend..."
**Final test:** does this sound like a real cross-functional builder who wants to help someone make something people want, ship it, and make it actually work?
## Context Recovery
+15 -4
View File
@@ -1361,10 +1361,21 @@ describe('preamble routing injection', () => {
});
test('routing section content includes key routing rules', () => {
expect(shipContent).toContain('invoke office-hours');
expect(shipContent).toContain('invoke investigate');
expect(shipContent).toContain('invoke ship');
expect(shipContent).toContain('invoke qa');
expect(shipContent).toContain('invoke /office-hours');
expect(shipContent).toContain('invoke /investigate');
expect(shipContent).toContain('invoke /ship');
expect(shipContent).toContain('invoke /qa');
});
test('routing section uses renamed checkpoint skills (not stale /checkpoint)', () => {
expect(shipContent).toContain('invoke /context-save');
expect(shipContent).toContain('invoke /context-restore');
expect(shipContent).not.toContain('invoke checkpoint');
});
test('routing section uses soft "when in doubt" policy, not hard "ALWAYS invoke"', () => {
expect(shipContent).toContain('When in doubt, invoke the skill');
expect(shipContent).not.toContain('Do NOT answer directly');
});
});
+13
View File
@@ -213,6 +213,15 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
'journey-retro': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'journey-design-system': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'journey-visual-qa': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
// Opus 4.7 behavior evals — keys match testName: values in the test file.
// Routing sub-tests use template literal `routing-${c.name}` testNames,
// which the touchfile completeness scanner skips; they inherit selection
// from the file-level touchfile entry via GLOBAL_TOUCHFILES.
'fanout-arm-overlay-on':
['model-overlays/claude.md', 'model-overlays/opus-4-7.md', 'scripts/models.ts', 'scripts/resolvers/model-overlay.ts'],
'fanout-arm-overlay-off':
['model-overlays/claude.md', 'model-overlays/opus-4-7.md', 'scripts/models.ts', 'scripts/resolvers/model-overlay.ts'],
};
/**
@@ -385,6 +394,10 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
'journey-retro': 'periodic',
'journey-design-system': 'periodic',
'journey-visual-qa': 'periodic',
// Opus 4.7 overlay evals — periodic (non-deterministic LLM behavior + Opus cost)
'fanout-arm-overlay-on': 'periodic',
'fanout-arm-overlay-off': 'periodic',
};
/**
+345
View File
@@ -0,0 +1,345 @@
/**
* Opus 4.7 behavior evals.
*
* Two cases, both pinned to claude-opus-4-7:
*
 * 1. Fanout rate — the "Fan out explicitly" overlay nudge should make 4.7
* spawn parallel tool calls when the prompt has independent sub-problems.
* A/B: SKILL.md regenerated with `--model opus-4-7` (overlay ON) vs
 * default `--model claude` (overlay OFF). Assert A ≥ B on parallel-call
* count in the first assistant turn.
*
 * 2. Routing precision — the new "when in doubt, invoke the skill" policy
* should route ambiguous dev prompts to the right skill WITHOUT routing
* casual/non-dev prompts. A handful of positive and negative controls.
*
* Both cases require a running Anthropic API key. Gated behind EVALS=1.
* Classify as `periodic` in touchfiles behavior measurement, not gate.
*/
import { describe, test, expect, afterAll } from 'bun:test';
import { runSkillTest } from './helpers/session-runner';
import { EvalCollector } from './helpers/eval-store';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
// Repository root: this test file lives one directory below it.
const ROOT = path.resolve(import.meta.dir, '..');
// Model ID both eval cases are pinned to.
const OPUS_47 = 'claude-opus-4-7';
// Evals are opt-in (EVALS=1): they hit the live API and cost real money.
const evalsEnabled = !!process.env.EVALS;
// When evals are disabled, register the suite but skip every test.
const describeE2E = evalsEnabled ? describe : describe.skip;
// Result sink; null when disabled so call sites can use `evalCollector?.`.
const evalCollector = evalsEnabled ? new EvalCollector('e2e-opus-47') : null;
// Compact run id shared by all cases, e.g. "2026-04-22-1929" (date + HHmm).
const runId = new Date().toISOString().replace(/[:.]/g, '').replace('T', '-').slice(0, 15);
// --- Helpers ---
/** Skills that must exist as individual .claude/skills/{name}/SKILL.md files
 * for Claude Code's auto-discovery to treat them as invokable via Skill tool.
 * Matches the pattern in skill-routing-e2e.test.ts. */
const INSTALLED_SKILLS = [
  'qa', 'qa-only', 'ship', 'review', 'plan-ceo-review', 'plan-eng-review',
  'plan-design-review', 'design-review', 'design-consultation', 'retro',
  'document-release', 'investigate', 'office-hours', 'browse',
];
/**
 * Build a throwaway project root for one eval arm:
 *  - per-skill SKILL.md files under .claude/skills/ (so the Skill tool
 *    discovers them),
 *  - a project CLAUDE.md with explicit routing rules AND (optionally) the
 *    opus-4-7 overlay content inlined so `claude -p` sees it,
 *  - an initialized git repo with one commit.
 *
 * @param suffix         Human-readable tag baked into the temp dir name.
 * @param includeOverlay A/B axis for the fanout test: when true, the
 *   opus-4-7 nudges (Fan out, Literal, etc.) are inlined into CLAUDE.md.
 *   `claude -p` doesn't auto-load SKILL.md content, so CLAUDE.md is the
 *   only way to make the overlay visible to the model in this harness.
 * @returns Absolute path of the scratch root; the caller removes it.
 */
function mkEvalRoot(suffix: string, includeOverlay: boolean): string {
  const tmp = fs.mkdtempSync(path.join(os.tmpdir(), `opus47-${suffix}-`));
  // Regenerate the in-repo SKILL.md files at the arm's model so they carry
  // that model's overlay. For the routing test only discovery matters, so
  // which model generated them is not load-bearing there.
  const result = spawnSync(
    'bun',
    ['run', 'scripts/gen-skill-docs.ts', '--model', includeOverlay ? 'opus-4-7' : 'claude'],
    { cwd: ROOT, stdio: 'pipe', encoding: 'utf-8', timeout: 60_000 },
  );
  if (result.status !== 0) {
    throw new Error(`gen-skill-docs failed: ${result.stderr}`);
  }
  // Install per-skill SKILL.md files for Skill tool discovery.
  const skillsDir = path.join(tmp, '.claude', 'skills');
  for (const skill of INSTALLED_SKILLS) {
    const src = path.join(ROOT, skill, 'SKILL.md');
    if (!fs.existsSync(src)) continue;
    const destDir = path.join(skillsDir, skill);
    fs.mkdirSync(destDir, { recursive: true });
    fs.copyFileSync(src, path.join(destDir, 'SKILL.md'));
  }
  // Extract the opus-4-7 model-overlay content from the checked-in file
  // (minus its inheritance directive) for inlining into CLAUDE.md.
  const overlayText = includeOverlay
    ? fs.readFileSync(path.join(ROOT, 'model-overlays', 'opus-4-7.md'), 'utf-8')
        .replace(/\{\{INHERIT:claude\}\}\s*/, '')
        .trim()
    : '';
  // Project CLAUDE.md. Explicit routing rules so the agent reaches for the
  // Skill tool on matching prompts, plus the optional overlay. The "→"
  // separators match the routing-rule convention used by the generated
  // CLAUDE.md template elsewhere in this repo.
  const routingBlock = `## Skill routing
When the user's request matches an available skill, invoke it via the Skill tool
as your FIRST action. The skill has multi-step workflows, checklists, and quality
gates that produce better results than an ad-hoc answer. When in doubt, invoke.
- Bugs, errors, "why is this broken", "wtf" → invoke investigate
- Ship, deploy, "send it", create a PR → invoke ship
- QA, test the site, "does this work" → invoke qa
- Code review, check my diff → invoke review
- Product ideas, brainstorming, "is this worth building" → invoke office-hours
- Architecture, "does this design make sense" → invoke plan-eng-review
- Design system, visual polish → invoke design-review
- Weekly retro, what did we ship → invoke retro`;
  const claudeMd = includeOverlay
    ? `# Project\n\n${overlayText}\n\n${routingBlock}\n`
    : `# Project\n\n${routingBlock}\n`;
  fs.writeFileSync(path.join(tmp, 'CLAUDE.md'), claudeMd);
  fs.writeFileSync(path.join(tmp, 'package.json'), '{"name":"opus47-eval"}');
  // Best-effort git setup; a failure here is non-fatal for the eval itself.
  const git = (args: string[]) =>
    spawnSync('git', args, { cwd: tmp, stdio: 'pipe', timeout: 5_000 });
  git(['init']);
  git(['config', 'user.email', 't@t.com']);
  git(['config', 'user.name', 'T']);
  git(['add', '.']);
  git(['commit', '-m', 'init']);
  return tmp;
}
/**
 * Count parallel tool calls in the first assistant turn.
 *
 * Proxy for "did the model fan out": the number of tool_use content blocks
 * in the first transcript entry with type 'assistant'. Returns 0 when the
 * transcript has no assistant entry, or when the assistant message carries
 * non-array content (assistant content may be a plain string, which can
 * never contain tool_use blocks).
 */
function firstTurnParallelism(transcript: any[]): number {
  const firstAssistant = transcript.find((e) => e.type === 'assistant');
  if (!firstAssistant) return 0;
  const content = firstAssistant.message?.content ?? [];
  // Guard: `?? []` only covers null/undefined; string content would throw
  // on .filter, so treat anything that isn't a block array as zero calls.
  if (!Array.isArray(content)) return 0;
  return content.filter((c: any) => c.type === 'tool_use').length;
}
/** One routing eval case: a user prompt plus the expected routing outcome. */
interface RoutingCase {
  name: string; // stable id; also embedded in the per-case testName
  prompt: string; // what the "user" says, verbatim
  shouldRoute: boolean; // true = a Skill invocation is expected
  expectedSkill?: string; // when set, the FIRST Skill call must target it
}
/** Small, intentionally chosen routing cases. Positive cases are ambiguous
 * phrasings the user actually says, not template text. Negative cases are
 * casual or off-topic prompts that match routing keywords but shouldn't
 * trigger a skill. */
const ROUTING_CASES: RoutingCase[] = [
  // Positive — should route
  { name: 'pos-wtf-bug', prompt: "wtf is this error coming from auth.ts:47 when the cookie expires?", shouldRoute: true, expectedSkill: 'investigate' },
  { name: 'pos-send-it', prompt: "ok this is good enough, let's send it.", shouldRoute: true, expectedSkill: 'ship' },
  { name: 'pos-does-it-work', prompt: "I just pushed the login flow changes. Test the deployed site and find any bugs.", shouldRoute: true, expectedSkill: 'qa' },
  // Negative — should NOT route: each shares a keyword with a positive case
  // ("wtf", "does this work", "send it") but has no dev-workflow intent.
  { name: 'neg-syntax-q', prompt: "wtf does this Python list comprehension syntax even mean, [x for x in y if z]?", shouldRoute: false },
  { name: 'neg-algo-q', prompt: "does this bubble sort algorithm actually work in O(n log n)?", shouldRoute: false },
  { name: 'neg-slack-send', prompt: "can you help me write the slack message? I want to send it to the team.", shouldRoute: false },
];
// --- Tests ---
describeE2E('Opus 4.7 overlay behavior evals', () => {
afterAll(() => {
  // Flush collected eval results to the store (no-op when evals disabled).
  evalCollector?.finalize();
  // Restore working tree: mkEvalRoot runs `gen-skill-docs` with various
  // --model flags, leaving the in-repo SKILL.md files generated at
  // whichever model ran last. Reset to the default (claude) so the tree
  // matches what would be checked in.
  spawnSync('bun', ['run', 'scripts/gen-skill-docs.ts'], {
    cwd: ROOT,
    stdio: 'pipe',
    timeout: 60_000,
  });
});
test(
  'fanout: overlay ON emits >= parallel calls vs overlay OFF on 3-file investigate task',
  async () => {
    // Arm A inlines the overlay nudges into CLAUDE.md; arm B is baseline.
    const armA = mkEvalRoot('on', true);
    const armB = mkEvalRoot('off', false);
    // Populate three tiny independent files in each arm. The prompt asks
    // the agent to read all three and report. Opus 4.7 (without nudge)
    // tends to serialize; with the nudge it should parallelize.
    for (const dir of [armA, armB]) {
      fs.writeFileSync(path.join(dir, 'alpha.txt'), 'alpha content: 1\n');
      fs.writeFileSync(path.join(dir, 'beta.txt'), 'beta content: 2\n');
      fs.writeFileSync(path.join(dir, 'gamma.txt'), 'gamma content: 3\n');
    }
    const prompt =
      "Read alpha.txt, beta.txt, and gamma.txt in this directory and report what's inside each. These three reads are independent.";
    try {
      // Both arms run concurrently — independent sessions, identical prompt
      // and tool surface; only the CLAUDE.md overlay content differs.
      const [resA, resB] = await Promise.all([
        runSkillTest({
          prompt,
          workingDirectory: armA,
          maxTurns: 5,
          allowedTools: ['Read', 'Bash', 'Glob', 'Grep'],
          timeout: 90_000,
          testName: 'fanout-arm-overlay-on',
          runId,
          model: OPUS_47,
        }),
        runSkillTest({
          prompt,
          workingDirectory: armB,
          maxTurns: 5,
          allowedTools: ['Read', 'Bash', 'Glob', 'Grep'],
          timeout: 90_000,
          testName: 'fanout-arm-overlay-off',
          runId,
          model: OPUS_47,
        }),
      ]);
      // Metric: tool_use count in each arm's first assistant turn.
      const parA = firstTurnParallelism(resA.transcript);
      const parB = firstTurnParallelism(resB.transcript);
      console.log(
        `[opus-4-7 fanout] arm A (overlay ON): ${parA} parallel tool calls in first turn; ` +
        `arm B (overlay OFF): ${parB}`,
      );
      console.log(` cost A=$${resA.costEstimate.estimatedCost.toFixed(2)} B=$${resB.costEstimate.estimatedCost.toFixed(2)}`);
      // Record both arms for the eval dashboard; only arm A carries the
      // pass/fail signal, arm B is the comparison baseline.
      evalCollector?.addTest({
        name: 'fanout-arm-overlay-on',
        suite: 'Opus 4.7 overlay',
        tier: 'e2e',
        passed: parA >= parB,
        duration_ms: resA.duration,
        cost_usd: resA.costEstimate.estimatedCost,
        transcript: resA.transcript,
        output: `parallel=${parA}`,
        turns_used: resA.costEstimate.turnsUsed,
        exit_reason: resA.exitReason,
      });
      evalCollector?.addTest({
        name: 'fanout-arm-overlay-off',
        suite: 'Opus 4.7 overlay',
        tier: 'e2e',
        passed: true, // baseline arm, recorded for comparison
        duration_ms: resB.duration,
        cost_usd: resB.costEstimate.estimatedCost,
        transcript: resB.transcript,
        output: `parallel=${parB}`,
        turns_used: resB.costEstimate.turnsUsed,
        exit_reason: resB.exitReason,
      });
      // Main assertion: overlay arm is at least as parallel as baseline.
      expect(parA, `overlay arm emitted ${parA} parallel calls, baseline ${parB}`).toBeGreaterThanOrEqual(parB);
    } finally {
      fs.rmSync(armA, { recursive: true, force: true });
      fs.rmSync(armB, { recursive: true, force: true });
    }
  },
  // Per-test timeout: two live Opus sessions run in parallel.
  240_000,
);
test(
  'routing precision: positives route, negatives do not',
  async () => {
    // Single SKILL.md tree shared by all cases. We run claude-opus-4-7 with
    // tool access to Skill; measure whether the first tool call is Skill(..)
    // and if so, which skill.
    const root = mkEvalRoot('routing', true);
    try {
      // All cases run concurrently; each is an independent session paired
      // with its RoutingCase via the .then((r) => ({ c, r })) wrapper.
      const results = await Promise.all(
        ROUTING_CASES.map((c) =>
          runSkillTest({
            prompt: c.prompt,
            workingDirectory: root,
            maxTurns: 3,
            allowedTools: ['Skill', 'Read', 'Bash', 'Glob', 'Grep'],
            timeout: 90_000,
            testName: `routing-${c.name}`,
            runId,
            model: OPUS_47,
          }).then((r) => ({ c, r })),
        ),
      );
      // Confusion-matrix tallies across the case set.
      let tp = 0, fn = 0, fp = 0, tn = 0;
      const rows: string[] = [];
      let totalCost = 0;
      for (const { c, r } of results) {
        const skillCalls = r.toolCalls.filter((tc) => tc.tool === 'Skill');
        const routed = skillCalls.length > 0;
        const actualSkill = routed ? skillCalls[0]?.input?.skill : undefined;
        // A positive is correct only if it routed AND hit the expected
        // skill (when one is pinned); a negative is correct iff it did
        // not route at all.
        const correct = c.shouldRoute
          ? routed && (!c.expectedSkill || actualSkill === c.expectedSkill)
          : !routed;
        if (c.shouldRoute && routed) tp++;
        else if (c.shouldRoute && !routed) fn++;
        else if (!c.shouldRoute && routed) fp++;
        else tn++;
        totalCost += r.costEstimate.estimatedCost;
        rows.push(
          ` ${c.name.padEnd(18)} routed=${String(routed).padEnd(5)} skill=${String(actualSkill).padEnd(16)} ` +
          `expected=${c.shouldRoute ? (c.expectedSkill ?? 'any') : '(none)'} ${correct ? 'OK' : 'MISS'}`,
        );
        evalCollector?.addTest({
          name: `routing-${c.name}`,
          suite: 'Opus 4.7 routing',
          tier: 'e2e',
          passed: correct,
          duration_ms: r.duration,
          cost_usd: r.costEstimate.estimatedCost,
          transcript: r.transcript,
          output: `routed=${routed} actual=${actualSkill ?? '(none)'} expected=${c.shouldRoute ? c.expectedSkill ?? 'any' : '(none)'}`,
          turns_used: r.costEstimate.turnsUsed,
          exit_reason: r.exitReason,
        });
      }
      // Aggregate rates; guard the divisions so an all-positive or
      // all-negative case set can't divide by zero.
      const posCount = ROUTING_CASES.filter((c) => c.shouldRoute).length;
      const negCount = ROUTING_CASES.length - posCount;
      const tpRate = posCount > 0 ? tp / posCount : 0;
      const fpRate = negCount > 0 ? fp / negCount : 0;
      console.log(`[opus-4-7 routing] total cost $${totalCost.toFixed(2)}`);
      console.log(rows.join('\n'));
      console.log(
        ` TP=${tp}/${posCount} (${(tpRate * 100).toFixed(0)}%) FN=${fn} ` +
        `FP=${fp}/${negCount} (${(fpRate * 100).toFixed(0)}%) TN=${tn}`,
      );
      // Thresholds from the test plan artifact: TP >= 80%, FP <= 30%.
      // With a small N we loosen slightly: TP >= 66% (2 of 3 positive),
      // FP <= 33% (no more than 1 of 3 negatives).
      expect(tpRate, `true-positive rate ${(tpRate * 100).toFixed(0)}% (need >= 66%)`).toBeGreaterThanOrEqual(2 / 3);
      expect(fpRate, `false-positive rate ${(fpRate * 100).toFixed(0)}% (need <= 33%)`).toBeLessThanOrEqual(1 / 3);
    } finally {
      fs.rmSync(root, { recursive: true, force: true });
    }
  },
  // Per-test timeout: six concurrent live sessions.
  360_000,
);
});
+52 -12
View File
@@ -1576,22 +1576,62 @@ describe('Test failure triage in ship skill', () => {
});
describe('no compiled binaries in git', () => {
// Tracked files enumerated once and reused by both assertions. git ls-files -z
// + split is ~ms; the previous xargs-per-file shell loops blew past 5s on CI.
const trackedFiles: string[] = require('child_process')
.execSync('git ls-files -z', { cwd: ROOT, encoding: 'utf-8' })
.split('\0')
.filter(Boolean);
test('git tracks no Mach-O or ELF binaries', () => {
const result = require('child_process').execSync(
'git ls-files -z | xargs -0 file --mime-type 2>/dev/null | grep -E "application/(x-mach-binary|x-executable|x-pie-executable|x-sharedlib)" || true',
{ cwd: ROOT, encoding: 'utf-8' }
).trim();
const files = result ? result.split('\n').map((l: string) => l.split(':')[0].trim()) : [];
expect(files).toEqual([]);
// Only mode 100755 (executable) files can be binaries we care about. Pre-filter
// via git ls-files -s to avoid running `file` on every text file.
const lsOut: string = require('child_process').execSync('git ls-files -s', {
cwd: ROOT,
encoding: 'utf-8',
});
const executableFiles = lsOut
.split('\n')
.filter(Boolean)
.map((line: string) => {
const parts = line.split(/\s+/);
return { mode: parts[0], file: line.split('\t')[1] };
})
.filter((e: { mode: string; file: string }) => e.mode === '100755')
.map((e: { mode: string; file: string }) => e.file);
if (executableFiles.length === 0) return;
// Batch-invoke `file --mime-type` across all executable files at once.
const result: string = require('child_process')
.execSync(`file --mime-type -- ${executableFiles.map((f: string) => `'${f.replace(/'/g, "'\\''")}'`).join(' ')}`, {
cwd: ROOT,
encoding: 'utf-8',
})
.trim();
const binaries = result
.split('\n')
.filter((l: string) =>
/application\/(x-mach-binary|x-executable|x-pie-executable|x-sharedlib)/.test(l)
)
.map((l: string) => l.split(':')[0].trim());
expect(binaries).toEqual([]);
});
test('git tracks no files larger than 2MB', () => {
const result = require('child_process').execSync(
'git ls-files -z | xargs -0 -I{} sh -c \'size=$(wc -c < "{}" 2>/dev/null | tr -d " "); [ "$size" -gt 2097152 ] 2>/dev/null && echo "{}:${size}"\' || true',
{ cwd: ROOT, encoding: 'utf-8' }
).trim();
const files = result ? result.split('\n').filter(Boolean) : [];
expect(files).toEqual([]);
// Pure fs.statSync — no shell spawn per file.
const MAX_BYTES = 2 * 1024 * 1024;
const oversized = trackedFiles.filter((f: string) => {
const full = path.join(ROOT, f);
try {
return fs.statSync(full).size > MAX_BYTES;
} catch {
return false;
}
});
expect(oversized).toEqual([]);
});
});
+23 -12
View File
@@ -323,17 +323,28 @@ describe('gstack-team-init', () => {
});
describe('setup --team / --no-team / -q', () => {
test('setup -q produces no stdout', () => {
const result = run(`${path.join(ROOT, 'setup')} -q`, { cwd: ROOT });
// -q should suppress informational output (may still have some output from build)
// The key test is that the "Skill naming:" prompt and "gstack ready" messages are suppressed
expect(result.stdout).not.toContain('Skill naming:');
expect(result.stdout).not.toContain('gstack ready');
});
// `./setup` does a full install + build + skill regeneration. On a cold cache
// it routinely takes 60-90s. Give both tests a 3-minute budget so CI doesn't
// report pre-existing timeouts as failures.
test(
'setup -q produces no stdout',
() => {
const result = run(`${path.join(ROOT, 'setup')} -q`, { cwd: ROOT });
// -q should suppress informational output (may still have some output from build)
// The key test is that the "Skill naming:" prompt and "gstack ready" messages are suppressed
expect(result.stdout).not.toContain('Skill naming:');
expect(result.stdout).not.toContain('gstack ready');
},
180_000,
);
test('setup --local prints deprecation warning', () => {
// stderr capture: run via bash redirect so we can capture stderr
const result = run(`bash -c '${path.join(ROOT, 'setup')} --local -q 2>&1'`, { cwd: ROOT });
expect(result.stdout).toContain('deprecated');
});
test(
'setup --local prints deprecation warning',
() => {
// stderr capture: run via bash redirect so we can capture stderr
const result = run(`bash -c '${path.join(ROOT, 'setup')} --local -q 2>&1'`, { cwd: ROOT });
expect(result.stdout).toContain('deprecated');
},
180_000,
);
});