// gstack/test/skill-e2e-opus-47.test.ts

/**
 * Opus 4.7 behavior evals.
 *
 * Two cases, both pinned to claude-opus-4-7:
 *
 * 1. Fanout rate — the "Fan out explicitly" overlay nudge should make 4.7
 *    spawn parallel tool calls when the prompt has independent sub-problems.
 *    A/B: SKILL.md regenerated with `--model opus-4-7` (overlay ON) vs
 *    default `--model claude` (overlay OFF). Assert A ≥ B on parallel-call
 *    count in the first assistant turn.
 *
 * 2. Routing precision — the new "when in doubt, invoke the skill" policy
 *    should route ambiguous dev prompts to the right skill WITHOUT routing
 *    casual/non-dev prompts. A handful of positive and negative controls.
 *
 * Both cases require a valid Anthropic API key. Gated behind EVALS=1.
 * Classify as `periodic` in touchfiles — behavior measurement, not gate.
 */

import { describe, test, expect, afterAll } from 'bun:test';
import { runSkillTest } from './helpers/session-runner';
import { EvalCollector } from './helpers/eval-store';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';

const ROOT = path.resolve(import.meta.dir, '..');
const OPUS_47 = 'claude-opus-4-7';

const evalsEnabled = !!process.env.EVALS;
const describeE2E = evalsEnabled ? describe : describe.skip;
const evalCollector = evalsEnabled ? new EvalCollector('e2e-opus-47') : null;
const runId = new Date().toISOString().replace(/[:.]/g, '').replace('T', '-').slice(0, 15);

// --- Helpers ---

/** Regenerate SKILL.md files at the given model into a scratch root, return that root. */
function regenSkillsAt(model: string, suffix: string): string {
  const tmp = fs.mkdtempSync(path.join(os.tmpdir(), `opus47-${suffix}-`));

  // Bun runtime: run gen-skill-docs in a fresh copy of the repo so we don't
  // pollute the main working tree. We need: SKILL.md.tmpl files, scripts/,
  // model-overlays/, hosts/. Easiest is to run from ROOT and copy outputs.
  const result = spawnSync(
    'bun',
    ['run', 'scripts/gen-skill-docs.ts', '--model', model],
    { cwd: ROOT, stdio: 'pipe', encoding: 'utf-8', timeout: 60_000 },
  );
  if (result.status !== 0) {
    throw new Error(`gen-skill-docs failed for --model ${model}: ${result.stderr}`);
  }

  // Copy the top-level generated SKILL.md into the scratch dir (under
  // .claude/skills/gstack/ which is where Claude looks for project skills).
  const skillDir = path.join(tmp, '.claude', 'skills', 'gstack');
  fs.mkdirSync(skillDir, { recursive: true });
  fs.copyFileSync(path.join(ROOT, 'SKILL.md'), path.join(skillDir, 'SKILL.md'));

  // Minimal project context
  fs.writeFileSync(
    path.join(tmp, 'CLAUDE.md'),
    `# Project\n\nSee .claude/skills/gstack/SKILL.md for skill definitions.\n`,
  );
  fs.writeFileSync(path.join(tmp, 'package.json'), '{"name":"opus47-eval"}');

  // git init so any downstream git-aware logic doesn't blow up
  const git = (args: string[]) =>
    spawnSync('git', args, { cwd: tmp, stdio: 'pipe', timeout: 5_000 });
  git(['init']);
  git(['config', 'user.email', 't@t.com']);
  git(['config', 'user.name', 'T']);
  git(['add', '.']);
  git(['commit', '-m', 'init']);

  return tmp;
}

/** Count parallel tool calls in the first assistant turn. */
function firstTurnParallelism(transcript: any[]): number {
  const firstAssistant = transcript.find((e) => e.type === 'assistant');
  if (!firstAssistant) return 0;
  const content = firstAssistant.message?.content ?? [];
  return content.filter((c: any) => c.type === 'tool_use').length;
}

interface RoutingCase {
  name: string;
  prompt: string;
  shouldRoute: boolean;
  expectedSkill?: string;
}

/** Small, intentionally chosen routing cases. Positive cases are ambiguous
 *  phrasings the user actually says, not template text. Negative cases are
 *  casual or off-topic prompts that match routing keywords but shouldn't
 *  trigger a skill. */
const ROUTING_CASES: RoutingCase[] = [
  // Positive — should route
  { name: 'pos-wtf-bug',    prompt: "wtf is this error coming from auth.ts:47 when the cookie expires?",           shouldRoute: true, expectedSkill: 'investigate' },
  { name: 'pos-send-it',    prompt: "ok this is good enough, let's send it.",                                       shouldRoute: true, expectedSkill: 'ship' },
  { name: 'pos-does-it-work', prompt: "I just pushed the login flow changes. Test the deployed site and find any bugs.",                shouldRoute: true, expectedSkill: 'qa' },
  // Negative — should NOT route
  { name: 'neg-syntax-q',   prompt: "wtf does this Python list comprehension syntax even mean, [x for x in y if z]?", shouldRoute: false },
  { name: 'neg-algo-q',     prompt: "does this bubble sort algorithm actually work in O(n log n)?",                   shouldRoute: false },
  { name: 'neg-slack-send', prompt: "can you help me write the slack message? I want to send it to the team.",       shouldRoute: false },
];

// --- Tests ---

describeE2E('Opus 4.7 overlay behavior evals', () => {
  afterAll(() => {
    evalCollector?.finalize();
  });

  test(
    'fanout: overlay ON emits >= parallel calls vs overlay OFF on 3-file investigate task',
    async () => {
      const armA = regenSkillsAt('opus-4-7', 'on');
      const armB = regenSkillsAt('claude', 'off');

      // Populate three tiny independent files in each arm. The prompt asks
      // the agent to read all three and report. Opus 4.7 (without nudge)
      // tends to serialize; with the nudge it should parallelize.
      for (const dir of [armA, armB]) {
        fs.writeFileSync(path.join(dir, 'alpha.txt'), 'alpha content: 1\n');
        fs.writeFileSync(path.join(dir, 'beta.txt'),  'beta content: 2\n');
        fs.writeFileSync(path.join(dir, 'gamma.txt'), 'gamma content: 3\n');
      }

      const prompt =
        "Read alpha.txt, beta.txt, and gamma.txt in this directory and report what's inside each. These three reads are independent.";

      try {
        const [resA, resB] = await Promise.all([
          runSkillTest({
            prompt,
            workingDirectory: armA,
            maxTurns: 5,
            allowedTools: ['Read', 'Bash', 'Glob', 'Grep'],
            timeout: 90_000,
            testName: 'fanout-arm-overlay-on',
            runId,
            model: OPUS_47,
          }),
          runSkillTest({
            prompt,
            workingDirectory: armB,
            maxTurns: 5,
            allowedTools: ['Read', 'Bash', 'Glob', 'Grep'],
            timeout: 90_000,
            testName: 'fanout-arm-overlay-off',
            runId,
            model: OPUS_47,
          }),
        ]);

        const parA = firstTurnParallelism(resA.transcript);
        const parB = firstTurnParallelism(resB.transcript);

        console.log(
          `[opus-4-7 fanout] arm A (overlay ON): ${parA} parallel tool calls in first turn; ` +
            `arm B (overlay OFF): ${parB}`,
        );
        console.log(`  cost A=$${resA.costEstimate.estimatedCost.toFixed(2)} B=$${resB.costEstimate.estimatedCost.toFixed(2)}`);

        evalCollector?.addTest({
          name: 'fanout-arm-overlay-on',
          suite: 'Opus 4.7 overlay',
          tier: 'e2e',
          passed: parA >= parB,
          duration_ms: resA.duration,
          cost_usd: resA.costEstimate.estimatedCost,
          transcript: resA.transcript,
          output: `parallel=${parA}`,
          turns_used: resA.costEstimate.turnsUsed,
          exit_reason: resA.exitReason,
        });
        evalCollector?.addTest({
          name: 'fanout-arm-overlay-off',
          suite: 'Opus 4.7 overlay',
          tier: 'e2e',
          passed: true, // baseline arm, recorded for comparison
          duration_ms: resB.duration,
          cost_usd: resB.costEstimate.estimatedCost,
          transcript: resB.transcript,
          output: `parallel=${parB}`,
          turns_used: resB.costEstimate.turnsUsed,
          exit_reason: resB.exitReason,
        });

        // Main assertion: overlay arm is at least as parallel as baseline.
        expect(parA, `overlay arm emitted ${parA} parallel calls, baseline ${parB}`).toBeGreaterThanOrEqual(parB);
      } finally {
        fs.rmSync(armA, { recursive: true, force: true });
        fs.rmSync(armB, { recursive: true, force: true });
      }
    },
    240_000,
  );

  test(
    'routing precision: positives route, negatives do not',
    async () => {
      // Single SKILL.md tree shared by all cases. We run claude-opus-4-7 with
      // tool access to Skill; measure whether the first tool call is Skill(..)
      // and if so, which skill.
      const root = regenSkillsAt('opus-4-7', 'routing');

      try {
        const results = await Promise.all(
          ROUTING_CASES.map((c) =>
            runSkillTest({
              prompt: c.prompt,
              workingDirectory: root,
              maxTurns: 3,
              allowedTools: ['Skill', 'Read', 'Bash', 'Glob', 'Grep'],
              timeout: 90_000,
              testName: `routing-${c.name}`,
              runId,
              model: OPUS_47,
            }).then((r) => ({ c, r })),
          ),
        );

        let tp = 0, fn = 0, fp = 0, tn = 0;
        const rows: string[] = [];
        let totalCost = 0;

        for (const { c, r } of results) {
          const skillCalls = r.toolCalls.filter((tc) => tc.tool === 'Skill');
          const routed = skillCalls.length > 0;
          const actualSkill = routed ? skillCalls[0]?.input?.skill : undefined;

          const correct = c.shouldRoute
            ? routed && (!c.expectedSkill || actualSkill === c.expectedSkill)
            : !routed;

          if (c.shouldRoute && routed) tp++;
          else if (c.shouldRoute && !routed) fn++;
          else if (!c.shouldRoute && routed) fp++;
          else tn++;

          totalCost += r.costEstimate.estimatedCost;
          rows.push(
            `  ${c.name.padEnd(18)} routed=${String(routed).padEnd(5)} skill=${String(actualSkill).padEnd(16)} ` +
              `expected=${c.shouldRoute ? (c.expectedSkill ?? 'any') : '(none)'} ${correct ? 'OK' : 'MISS'}`,
          );

          evalCollector?.addTest({
            name: `routing-${c.name}`,
            suite: 'Opus 4.7 routing',
            tier: 'e2e',
            passed: correct,
            duration_ms: r.duration,
            cost_usd: r.costEstimate.estimatedCost,
            transcript: r.transcript,
            output: `routed=${routed} actual=${actualSkill ?? '(none)'} expected=${c.shouldRoute ? c.expectedSkill ?? 'any' : '(none)'}`,
            turns_used: r.costEstimate.turnsUsed,
            exit_reason: r.exitReason,
          });
        }

        const posCount = ROUTING_CASES.filter((c) => c.shouldRoute).length;
        const negCount = ROUTING_CASES.length - posCount;
        const tpRate = posCount > 0 ? tp / posCount : 0;
        const fpRate = negCount > 0 ? fp / negCount : 0;

        console.log(`[opus-4-7 routing] total cost $${totalCost.toFixed(2)}`);
        console.log(rows.join('\n'));
        console.log(
          `  TP=${tp}/${posCount} (${(tpRate * 100).toFixed(0)}%)  FN=${fn}  ` +
            `FP=${fp}/${negCount} (${(fpRate * 100).toFixed(0)}%)  TN=${tn}`,
        );

        // Thresholds from the test plan artifact: TP >= 80%, FP <= 30%.
        // With a small N we loosen slightly: TP >= 66% (2 of 3 positive),
        // FP <= 33% (no more than 1 of 3 negatives).
        expect(tpRate, `true-positive rate ${(tpRate * 100).toFixed(0)}% (need >= 66%)`).toBeGreaterThanOrEqual(2 / 3);
        expect(fpRate, `false-positive rate ${(fpRate * 100).toFixed(0)}% (need <= 33%)`).toBeLessThanOrEqual(1 / 3);
      } finally {
        fs.rmSync(root, { recursive: true, force: true });
      }
    },
    360_000,
  );
});