From 60fce976cbc36a3e63b0f76b22b598238c4dfeb8 Mon Sep 17 00:00:00 2001
From: Garry Tan
Date: Sat, 14 Mar 2026 02:02:28 -0500
Subject: [PATCH] test: add e2e and LLM eval tests for SKILL.md setup block

- 3 Agent SDK e2e tests: happy path, NEEDS_SETUP, non-git-repo
- LLM eval: setup block clarity + actionability >= 4
- New error pattern: 'no such file or directory.*browse'

These tests catch the exact failure mode where agents can't discover the
browse binary via SKILL.md instructions.
---
Review notes (this section is ignored by git am):
* e2e: SKILL.md extraction moved into readSetupBlock(), which throws when the
  "## SETUP" / "## IMPORTANT" headings are missing instead of slicing from -1.
* e2e: tmpdir cleanup moved into finally so a failed assertion no longer
  leaks the directory.
* eval: heading guards added before slicing.
* The blob ids on the "index" lines are carried over from the original patch
  and no longer match the post-image; regenerate with git format-patch.

 test/helpers/session-runner.ts |  1 +
 test/skill-e2e.test.ts         | 99 ++++++++++++++++++++++++++++++++++
 test/skill-llm-eval.test.ts    | 18 ++++++
 3 files changed, 118 insertions(+)

diff --git a/test/helpers/session-runner.ts b/test/helpers/session-runner.ts
index 13e0b7eb..083536fc 100644
--- a/test/helpers/session-runner.ts
+++ b/test/helpers/session-runner.ts
@@ -23,6 +23,7 @@ const BROWSE_ERROR_PATTERNS = [
   /Exit code 1/,
   /ERROR: browse binary not found/,
   /Server failed to start/,
+  /no such file or directory.*browse/i,
 ];
 
 export async function runSkillTest(options: {
diff --git a/test/skill-e2e.test.ts b/test/skill-e2e.test.ts
index d395fe15..478b3415 100644
--- a/test/skill-e2e.test.ts
+++ b/test/skill-e2e.test.ts
@@ -73,6 +73,105 @@ Report what each command returned.`,
     expect(result.exitReason).toBe('success');
   }, 90_000);
 
+  /**
+   * Returns the "## SETUP" section of SKILL.md (up to, not including, "## IMPORTANT").
+   * Throws if either heading is missing or out of order: indexOf() would return -1
+   * and slice() would silently hand the agent an empty or wrong prompt.
+   */
+  const readSetupBlock = (): string => {
+    const root = path.resolve(import.meta.dir, '..');
+    const skillMd = fs.readFileSync(path.join(root, 'SKILL.md'), 'utf-8');
+    const setupStart = skillMd.indexOf('## SETUP');
+    const setupEnd = skillMd.indexOf('## IMPORTANT');
+    if (setupStart === -1 || setupEnd <= setupStart) {
+      throw new Error('SKILL.md: "## SETUP" section followed by "## IMPORTANT" not found');
+    }
+    return skillMd.slice(setupStart, setupEnd);
+  };
+
+  test('agent discovers browse binary via SKILL.md setup block', async () => {
+    const setupBlock = readSetupBlock();
+
+    // Guard: verify we extracted a valid setup block
+    expect(setupBlock).toContain('browse/dist/browse');
+
+    const result = await runSkillTest({
+      prompt: `Follow these instructions to find the browse binary and run a basic command.
+
+${setupBlock}
+
+After finding the binary, run: $B goto ${testServer.url}
+Then run: $B text
+Report whether it worked.`,
+      workingDirectory: tmpDir,
+      maxTurns: 10,
+      timeout: 60_000,
+    });
+
+    expect(result.browseErrors).toHaveLength(0);
+    expect(result.exitReason).toBe('success');
+  }, 90_000);
+
+  test('SKILL.md setup block shows NEEDS_SETUP when binary missing', async () => {
+    const setupBlock = readSetupBlock();
+
+    // Create a tmpdir with no browse binary
+    const emptyDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-empty-'));
+    try {
+      const result = await runSkillTest({
+        prompt: `Follow these instructions exactly. Run the bash code block below and report what it outputs.
+
+${setupBlock}
+
+Report the exact output. Do NOT try to fix or install anything — just report what you see.`,
+        workingDirectory: emptyDir,
+        maxTurns: 5,
+        timeout: 30_000,
+      });
+
+      // Agent should see NEEDS_SETUP (not crash or guess wrong paths)
+      // NOTE(review): the setup block embedded in the prompt presumably contains this
+      // literal; confirm result.messages excludes the prompt and tool-call input.
+      const allText = result.messages
+        .map((m: unknown) => JSON.stringify(m))
+        .join('\n');
+      expect(allText).toContain('NEEDS_SETUP');
+    } finally {
+      // Best-effort clean up; runs even when an assertion above throws
+      try { fs.rmSync(emptyDir, { recursive: true, force: true }); } catch {}
+    }
+  }, 60_000);
+
+  test('SKILL.md setup block works outside git repo', async () => {
+    const setupBlock = readSetupBlock();
+
+    // Create a tmpdir outside any git repo
+    const nonGitDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-nogit-'));
+    try {
+      const result = await runSkillTest({
+        prompt: `Follow these instructions exactly. Run the bash code block below and report what it outputs.
+
+${setupBlock}
+
+Report the exact output — either "READY: " or "NEEDS_SETUP".`,
+        workingDirectory: nonGitDir,
+        maxTurns: 5,
+        timeout: 30_000,
+      });
+
+      // Should either find global binary (READY) or show NEEDS_SETUP — not crash
+      // NOTE(review): the prompt above contains both words, so this only proves
+      // something if result.messages excludes the prompt — confirm.
+      const allText = result.messages
+        .map((m: unknown) => JSON.stringify(m))
+        .join('\n');
+      expect(allText).toMatch(/READY|NEEDS_SETUP/);
+    } finally {
+      // Best-effort clean up; runs even when an assertion above throws
+      try { fs.rmSync(nonGitDir, { recursive: true, force: true }); } catch {}
+    }
+  }, 60_000);
+
   test.todo('/qa quick completes without browse errors');
   test.todo('/ship completes without browse errors');
   test.todo('/review completes without browse errors');
diff --git a/test/skill-llm-eval.test.ts b/test/skill-llm-eval.test.ts
index f978f035..1631a8b1 100644
--- a/test/skill-llm-eval.test.ts
+++ b/test/skill-llm-eval.test.ts
@@ -115,6 +115,24 @@ describeEval('LLM-as-judge quality evals', () => {
     expect(scores.actionability).toBeGreaterThanOrEqual(4);
   }, 30_000);
 
+  test('setup block scores >= 4 on actionability and clarity', async () => {
+    const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
+    const setupStart = content.indexOf('## SETUP');
+    const setupEnd = content.indexOf('## IMPORTANT');
+
+    // Guard: a missing heading makes indexOf() return -1 and slice() silently
+    // yield the wrong text, which the judge would then score.
+    expect(setupStart).toBeGreaterThanOrEqual(0);
+    expect(setupEnd).toBeGreaterThan(setupStart);
+    const section = content.slice(setupStart, setupEnd);
+
+    const scores = await judge('setup/binary discovery instructions', section);
+    console.log('Setup block scores:', JSON.stringify(scores, null, 2));
+
+    expect(scores.actionability).toBeGreaterThanOrEqual(4);
+    expect(scores.clarity).toBeGreaterThanOrEqual(4);
+  }, 30_000);
+
   test('regression check: compare branch vs baseline quality', async () => {
     // This test compares the generated output against the hand-maintained
     // baseline from main. The generated version should score equal or higher.