fix: browse binary discovery broken for agents (v0.3.5) (#44)

* fix: replace find-browse with direct path in SKILL.md setup blocks Agents were skipping the find-browse binary and guessing bin/browse (wrong path). Now the setup block explicitly checks browse/dist/browse with workspace-local priority, global fallback. Also adds || true to update check to prevent misleading exit code 1. Adds {{UPDATE_CHECK}} and {{BROWSE_SETUP}} template placeholders to gen-skill-docs.ts so all skills share a single source of truth. * refactor: convert qa/ and setup-browser-cookies/ to .tmpl templates Replaces hardcoded update check and find-browse blocks with {{UPDATE_CHECK}} and {{BROWSE_SETUP}} placeholders. Both skills are now generated from templates via gen-skill-docs. * test: add e2e and LLM eval tests for SKILL.md setup block - 3 Agent SDK e2e tests: happy path, NEEDS_SETUP, non-git-repo - LLM eval: setup block clarity + actionability >= 4 - New error pattern: 'no such file or directory.*browse' These tests catch the exact failure mode where agents can't discover the browse binary via SKILL.md instructions. * chore: bump version and changelog (v0.3.5) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-01 19:25:10 +02:00 · 2026-03-14 00:24:06 -07:00
parent 6b69c46a27
commit 1717ed2891
15 changed files with 627 additions and 47 deletions
@@ -23,6 +23,7 @@ const BROWSE_ERROR_PATTERNS = [
  /Exit code 1/,
  /ERROR: browse binary not found/,
  /Server failed to start/,
+  /no such file or directory.*browse/i,
 ];

 export async function runSkillTest(options: {
@@ -73,6 +73,95 @@ Report what each command returned.`,
    expect(result.exitReason).toBe('success');
  }, 90_000);

+  test('agent discovers browse binary via SKILL.md setup block', async () => {
+    const ROOT = path.resolve(import.meta.dir, '..');
+    const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
+    const setupStart = skillMd.indexOf('## SETUP');
+    const setupEnd = skillMd.indexOf('## IMPORTANT');
+    const setupBlock = skillMd.slice(setupStart, setupEnd);
+
+    // Guard: verify we extracted a valid setup block
+    expect(setupBlock).toContain('browse/dist/browse');
+
+    const result = await runSkillTest({
+      prompt: `Follow these instructions to find the browse binary and run a basic command.
+
+${setupBlock}
+
+After finding the binary, run: $B goto ${testServer.url}
+Then run: $B text
+Report whether it worked.`,
+      workingDirectory: tmpDir,
+      maxTurns: 10,
+      timeout: 60_000,
+    });
+
+    expect(result.browseErrors).toHaveLength(0);
+    expect(result.exitReason).toBe('success');
+  }, 90_000);
+
+  test('SKILL.md setup block shows NEEDS_SETUP when binary missing', async () => {
+    // Create a tmpdir with no browse binary
+    const emptyDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-empty-'));
+
+    const ROOT = path.resolve(import.meta.dir, '..');
+    const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
+    const setupStart = skillMd.indexOf('## SETUP');
+    const setupEnd = skillMd.indexOf('## IMPORTANT');
+    const setupBlock = skillMd.slice(setupStart, setupEnd);
+
+    const result = await runSkillTest({
+      prompt: `Follow these instructions exactly. Run the bash code block below and report what it outputs.
+
+${setupBlock}
+
+Report the exact output. Do NOT try to fix or install anything — just report what you see.`,
+      workingDirectory: emptyDir,
+      maxTurns: 5,
+      timeout: 30_000,
+    });
+
+    // Agent should see NEEDS_SETUP (not crash or guess wrong paths)
+    const allText = result.messages
+      .map((m: any) => JSON.stringify(m))
+      .join('\n');
+    expect(allText).toContain('NEEDS_SETUP');
+
+    // Clean up
+    try { fs.rmSync(emptyDir, { recursive: true, force: true }); } catch {}
+  }, 60_000);
+
+  test('SKILL.md setup block works outside git repo', async () => {
+    // Create a tmpdir outside any git repo
+    const nonGitDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-nogit-'));
+
+    const ROOT = path.resolve(import.meta.dir, '..');
+    const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
+    const setupStart = skillMd.indexOf('## SETUP');
+    const setupEnd = skillMd.indexOf('## IMPORTANT');
+    const setupBlock = skillMd.slice(setupStart, setupEnd);
+
+    const result = await runSkillTest({
+      prompt: `Follow these instructions exactly. Run the bash code block below and report what it outputs.
+
+${setupBlock}
+
+Report the exact output — either "READY: <path>" or "NEEDS_SETUP".`,
+      workingDirectory: nonGitDir,
+      maxTurns: 5,
+      timeout: 30_000,
+    });
+
+    // Should either find global binary (READY) or show NEEDS_SETUP — not crash
+    const allText = result.messages
+      .map((m: any) => JSON.stringify(m))
+      .join('\n');
+    expect(allText).toMatch(/READY|NEEDS_SETUP/);
+
+    // Clean up
+    try { fs.rmSync(nonGitDir, { recursive: true, force: true }); } catch {}
+  }, 60_000);
+
  test.todo('/qa quick completes without browse errors');
  test.todo('/ship completes without browse errors');
  test.todo('/review completes without browse errors');
@@ -115,6 +115,19 @@ describeEval('LLM-as-judge quality evals', () => {
    expect(scores.actionability).toBeGreaterThanOrEqual(4);
  }, 30_000);

+  test('setup block scores >= 4 on actionability and clarity', async () => {
+    const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
+    const setupStart = content.indexOf('## SETUP');
+    const setupEnd = content.indexOf('## IMPORTANT');
+    const section = content.slice(setupStart, setupEnd);
+
+    const scores = await judge('setup/binary discovery instructions', section);
+    console.log('Setup block scores:', JSON.stringify(scores, null, 2));
+
+    expect(scores.actionability).toBeGreaterThanOrEqual(4);
+    expect(scores.clarity).toBeGreaterThanOrEqual(4);
+  }, 30_000);
+
  test('regression check: compare branch vs baseline quality', async () => {
    // This test compares the generated output against the hand-maintained
    // baseline from main. The generated version should score equal or higher.