From 60fce976cbc36a3e63b0f76b22b598238c4dfeb8 Mon Sep 17 00:00:00 2001
From: Garry Tan <garrytan@gmail.com>
Date: Sat, 14 Mar 2026 02:02:28 -0500
Subject: [PATCH] test: add e2e and LLM eval tests for SKILL.md setup block

- 3 Agent SDK e2e tests: happy path, NEEDS_SETUP, non-git-repo
- LLM eval: setup block clarity + actionability >= 4
- New error pattern: 'no such file or directory.*browse'

These tests catch the exact failure mode where agents can't discover
the browse binary via SKILL.md instructions.
---
 test/helpers/session-runner.ts |  1 +
 test/skill-e2e.test.ts         | 89 ++++++++++++++++++++++++++++++++++
 test/skill-llm-eval.test.ts    | 13 +++++
 3 files changed, 103 insertions(+)

diff --git a/test/helpers/session-runner.ts b/test/helpers/session-runner.ts
index 13e0b7eb..083536fc 100644
--- a/test/helpers/session-runner.ts
+++ b/test/helpers/session-runner.ts
@@ -23,6 +23,7 @@ const BROWSE_ERROR_PATTERNS = [
   /Exit code 1/,
   /ERROR: browse binary not found/,
   /Server failed to start/,
+  /no such file or directory.*browse/i,
 ];
 
 export async function runSkillTest(options: {
diff --git a/test/skill-e2e.test.ts b/test/skill-e2e.test.ts
index d395fe15..478b3415 100644
--- a/test/skill-e2e.test.ts
+++ b/test/skill-e2e.test.ts
@@ -73,6 +73,95 @@ Report what each command returned.`,
     expect(result.exitReason).toBe('success');
   }, 90_000);
 
+  test('agent discovers browse binary via SKILL.md setup block', async () => {
+    const ROOT = path.resolve(import.meta.dir, '..');
+    const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
+    const setupStart = skillMd.indexOf('## SETUP');
+    const setupEnd = skillMd.indexOf('## IMPORTANT');
+    const setupBlock = skillMd.slice(setupStart, setupEnd);
+
+    // Guard: verify we extracted a valid setup block
+    expect(setupBlock).toContain('browse/dist/browse');
+
+    const result = await runSkillTest({
+      prompt: `Follow these instructions to find the browse binary and run a basic command.
+
+${setupBlock}
+
+After finding the binary, run: $B goto ${testServer.url}
+Then run: $B text
+Report whether it worked.`,
+      workingDirectory: tmpDir,
+      maxTurns: 10,
+      timeout: 60_000,
+    });
+
+    expect(result.browseErrors).toHaveLength(0);
+    expect(result.exitReason).toBe('success');
+  }, 90_000);
+
+  test('SKILL.md setup block shows NEEDS_SETUP when binary missing', async () => {
+    // Create a tmpdir with no browse binary
+    const emptyDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-empty-'));
+
+    const ROOT = path.resolve(import.meta.dir, '..');
+    const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
+    const setupStart = skillMd.indexOf('## SETUP');
+    const setupEnd = skillMd.indexOf('## IMPORTANT');
+    const setupBlock = skillMd.slice(setupStart, setupEnd);
+
+    const result = await runSkillTest({
+      prompt: `Follow these instructions exactly. Run the bash code block below and report what it outputs.
+
+${setupBlock}
+
+Report the exact output. Do NOT try to fix or install anything — just report what you see.`,
+      workingDirectory: emptyDir,
+      maxTurns: 5,
+      timeout: 30_000,
+    });
+
+    // Agent should see NEEDS_SETUP (not crash or guess wrong paths)
+    const allText = result.messages
+      .map((m: any) => JSON.stringify(m))
+      .join('\n');
+    expect(allText).toContain('NEEDS_SETUP');
+
+    // Clean up
+    try { fs.rmSync(emptyDir, { recursive: true, force: true }); } catch {}
+  }, 60_000);
+
+  test('SKILL.md setup block works outside git repo', async () => {
+    // Create a tmpdir outside any git repo
+    const nonGitDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-nogit-'));
+
+    const ROOT = path.resolve(import.meta.dir, '..');
+    const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
+    const setupStart = skillMd.indexOf('## SETUP');
+    const setupEnd = skillMd.indexOf('## IMPORTANT');
+    const setupBlock = skillMd.slice(setupStart, setupEnd);
+
+    const result = await runSkillTest({
+      prompt: `Follow these instructions exactly. Run the bash code block below and report what it outputs.
+
+${setupBlock}
+
+Report the exact output — either "READY: <path>" or "NEEDS_SETUP".`,
+      workingDirectory: nonGitDir,
+      maxTurns: 5,
+      timeout: 30_000,
+    });
+
+    // Should either find global binary (READY) or show NEEDS_SETUP — not crash
+    const allText = result.messages
+      .map((m: any) => JSON.stringify(m))
+      .join('\n');
+    expect(allText).toMatch(/READY|NEEDS_SETUP/);
+
+    // Clean up
+    try { fs.rmSync(nonGitDir, { recursive: true, force: true }); } catch {}
+  }, 60_000);
+
   test.todo('/qa quick completes without browse errors');
   test.todo('/ship completes without browse errors');
   test.todo('/review completes without browse errors');
diff --git a/test/skill-llm-eval.test.ts b/test/skill-llm-eval.test.ts
index f978f035..1631a8b1 100644
--- a/test/skill-llm-eval.test.ts
+++ b/test/skill-llm-eval.test.ts
@@ -115,6 +115,19 @@ describeEval('LLM-as-judge quality evals', () => {
     expect(scores.actionability).toBeGreaterThanOrEqual(4);
   }, 30_000);
 
+  test('setup block scores >= 4 on actionability and clarity', async () => {
+    const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
+    const setupStart = content.indexOf('## SETUP');
+    const setupEnd = content.indexOf('## IMPORTANT');
+    const section = content.slice(setupStart, setupEnd);
+
+    const scores = await judge('setup/binary discovery instructions', section);
+    console.log('Setup block scores:', JSON.stringify(scores, null, 2));
+
+    expect(scores.actionability).toBeGreaterThanOrEqual(4);
+    expect(scores.clarity).toBeGreaterThanOrEqual(4);
+  }, 30_000);
+
   test('regression check: compare branch vs baseline quality', async () => {
     // This test compares the generated output against the hand-maintained
     // baseline from main. The generated version should score equal or higher.