fix: browse binary discovery broken for agents (v0.3.5) (#44)

* fix: replace find-browse with direct path in SKILL.md setup blocks

Agents were skipping the find-browse binary and guessing bin/browse
(wrong path). Now the setup block explicitly checks browse/dist/browse
with workspace-local priority, global fallback.

Also adds || true to update check to prevent misleading exit code 1.

Adds {{UPDATE_CHECK}} and {{BROWSE_SETUP}} template placeholders to
gen-skill-docs.ts so all skills share a single source of truth.

* refactor: convert qa/ and setup-browser-cookies/ to .tmpl templates

Replaces hardcoded update check and find-browse blocks with
{{UPDATE_CHECK}} and {{BROWSE_SETUP}} placeholders. Both skills
are now generated from templates via gen-skill-docs.

* test: add e2e and LLM eval tests for SKILL.md setup block

- 3 Agent SDK e2e tests: happy path, NEEDS_SETUP, non-git-repo
- LLM eval: setup block clarity + actionability >= 4
- New error pattern: 'no such file or directory.*browse'

These tests catch the exact failure mode where agents can't discover
the browse binary via SKILL.md instructions.

* chore: bump version and changelog (v0.3.5)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-03-14 00:24:06 -07:00
committed by GitHub
parent 6b69c46a27
commit 1717ed2891
15 changed files with 627 additions and 47 deletions
+1
View File
@@ -23,6 +23,7 @@ const BROWSE_ERROR_PATTERNS = [
/Exit code 1/,
/ERROR: browse binary not found/,
/Server failed to start/,
/no such file or directory.*browse/i,
];
export async function runSkillTest(options: {
+89
View File
@@ -73,6 +73,95 @@ Report what each command returned.`,
expect(result.exitReason).toBe('success');
}, 90_000);
test('agent discovers browse binary via SKILL.md setup block', async () => {
const ROOT = path.resolve(import.meta.dir, '..');
const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
const setupStart = skillMd.indexOf('## SETUP');
const setupEnd = skillMd.indexOf('## IMPORTANT');
const setupBlock = skillMd.slice(setupStart, setupEnd);
// Guard: verify we extracted a valid setup block
expect(setupBlock).toContain('browse/dist/browse');
const result = await runSkillTest({
prompt: `Follow these instructions to find the browse binary and run a basic command.
${setupBlock}
After finding the binary, run: $B goto ${testServer.url}
Then run: $B text
Report whether it worked.`,
workingDirectory: tmpDir,
maxTurns: 10,
timeout: 60_000,
});
expect(result.browseErrors).toHaveLength(0);
expect(result.exitReason).toBe('success');
}, 90_000);
test('SKILL.md setup block shows NEEDS_SETUP when binary missing', async () => {
// Create a tmpdir with no browse binary
const emptyDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-empty-'));
const ROOT = path.resolve(import.meta.dir, '..');
const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
const setupStart = skillMd.indexOf('## SETUP');
const setupEnd = skillMd.indexOf('## IMPORTANT');
const setupBlock = skillMd.slice(setupStart, setupEnd);
const result = await runSkillTest({
prompt: `Follow these instructions exactly. Run the bash code block below and report what it outputs.
${setupBlock}
Report the exact output. Do NOT try to fix or install anything — just report what you see.`,
workingDirectory: emptyDir,
maxTurns: 5,
timeout: 30_000,
});
// Agent should see NEEDS_SETUP (not crash or guess wrong paths)
const allText = result.messages
.map((m: any) => JSON.stringify(m))
.join('\n');
expect(allText).toContain('NEEDS_SETUP');
// Clean up
try { fs.rmSync(emptyDir, { recursive: true, force: true }); } catch {}
}, 60_000);
test('SKILL.md setup block works outside git repo', async () => {
// Create a tmpdir outside any git repo
const nonGitDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-nogit-'));
const ROOT = path.resolve(import.meta.dir, '..');
const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
const setupStart = skillMd.indexOf('## SETUP');
const setupEnd = skillMd.indexOf('## IMPORTANT');
const setupBlock = skillMd.slice(setupStart, setupEnd);
const result = await runSkillTest({
prompt: `Follow these instructions exactly. Run the bash code block below and report what it outputs.
${setupBlock}
Report the exact output — either "READY: <path>" or "NEEDS_SETUP".`,
workingDirectory: nonGitDir,
maxTurns: 5,
timeout: 30_000,
});
// Should either find global binary (READY) or show NEEDS_SETUP — not crash
const allText = result.messages
.map((m: any) => JSON.stringify(m))
.join('\n');
expect(allText).toMatch(/READY|NEEDS_SETUP/);
// Clean up
try { fs.rmSync(nonGitDir, { recursive: true, force: true }); } catch {}
}, 60_000);
test.todo('/qa quick completes without browse errors');
test.todo('/ship completes without browse errors');
test.todo('/review completes without browse errors');
+13
View File
@@ -115,6 +115,19 @@ describeEval('LLM-as-judge quality evals', () => {
expect(scores.actionability).toBeGreaterThanOrEqual(4);
}, 30_000);
test('setup block scores >= 4 on actionability and clarity', async () => {
const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
const setupStart = content.indexOf('## SETUP');
const setupEnd = content.indexOf('## IMPORTANT');
const section = content.slice(setupStart, setupEnd);
const scores = await judge('setup/binary discovery instructions', section);
console.log('Setup block scores:', JSON.stringify(scores, null, 2));
expect(scores.actionability).toBeGreaterThanOrEqual(4);
expect(scores.clarity).toBeGreaterThanOrEqual(4);
}, 30_000);
test('regression check: compare branch vs baseline quality', async () => {
// This test compares the generated output against the hand-maintained
// baseline from main. The generated version should score equal or higher.