mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-01 19:25:10 +02:00
fix: browse binary discovery broken for agents (v0.3.5) (#44)
* fix: replace find-browse with direct path in SKILL.md setup blocks
Agents were skipping the find-browse binary and guessing bin/browse
(wrong path). Now the setup block explicitly checks browse/dist/browse
with workspace-local priority, global fallback.
Also adds || true to update check to prevent misleading exit code 1.
Adds {{UPDATE_CHECK}} and {{BROWSE_SETUP}} template placeholders to
gen-skill-docs.ts so all skills share a single source of truth.
* refactor: convert qa/ and setup-browser-cookies/ to .tmpl templates
Replaces hardcoded update check and find-browse blocks with
{{UPDATE_CHECK}} and {{BROWSE_SETUP}} placeholders. Both skills
are now generated from templates via gen-skill-docs.
* test: add e2e and LLM eval tests for SKILL.md setup block
- 3 Agent SDK e2e tests: happy path, NEEDS_SETUP, non-git-repo
- LLM eval: setup block clarity + actionability >= 4
- New error pattern: 'no such file or directory.*browse'
These tests catch the exact failure mode where agents can't discover
the browse binary via SKILL.md instructions.
* chore: bump version and changelog (v0.3.5)
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---------
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -23,6 +23,7 @@ const BROWSE_ERROR_PATTERNS = [
|
||||
/Exit code 1/,
|
||||
/ERROR: browse binary not found/,
|
||||
/Server failed to start/,
|
||||
/no such file or directory.*browse/i,
|
||||
];
|
||||
|
||||
export async function runSkillTest(options: {
|
||||
|
||||
@@ -73,6 +73,95 @@ Report what each command returned.`,
|
||||
expect(result.exitReason).toBe('success');
|
||||
}, 90_000);
|
||||
|
||||
test('agent discovers browse binary via SKILL.md setup block', async () => {
|
||||
const ROOT = path.resolve(import.meta.dir, '..');
|
||||
const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
|
||||
const setupStart = skillMd.indexOf('## SETUP');
|
||||
const setupEnd = skillMd.indexOf('## IMPORTANT');
|
||||
const setupBlock = skillMd.slice(setupStart, setupEnd);
|
||||
|
||||
// Guard: verify we extracted a valid setup block
|
||||
expect(setupBlock).toContain('browse/dist/browse');
|
||||
|
||||
const result = await runSkillTest({
|
||||
prompt: `Follow these instructions to find the browse binary and run a basic command.
|
||||
|
||||
${setupBlock}
|
||||
|
||||
After finding the binary, run: $B goto ${testServer.url}
|
||||
Then run: $B text
|
||||
Report whether it worked.`,
|
||||
workingDirectory: tmpDir,
|
||||
maxTurns: 10,
|
||||
timeout: 60_000,
|
||||
});
|
||||
|
||||
expect(result.browseErrors).toHaveLength(0);
|
||||
expect(result.exitReason).toBe('success');
|
||||
}, 90_000);
|
||||
|
||||
test('SKILL.md setup block shows NEEDS_SETUP when binary missing', async () => {
|
||||
// Create a tmpdir with no browse binary
|
||||
const emptyDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-empty-'));
|
||||
|
||||
const ROOT = path.resolve(import.meta.dir, '..');
|
||||
const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
|
||||
const setupStart = skillMd.indexOf('## SETUP');
|
||||
const setupEnd = skillMd.indexOf('## IMPORTANT');
|
||||
const setupBlock = skillMd.slice(setupStart, setupEnd);
|
||||
|
||||
const result = await runSkillTest({
|
||||
prompt: `Follow these instructions exactly. Run the bash code block below and report what it outputs.
|
||||
|
||||
${setupBlock}
|
||||
|
||||
Report the exact output. Do NOT try to fix or install anything — just report what you see.`,
|
||||
workingDirectory: emptyDir,
|
||||
maxTurns: 5,
|
||||
timeout: 30_000,
|
||||
});
|
||||
|
||||
// Agent should see NEEDS_SETUP (not crash or guess wrong paths)
|
||||
const allText = result.messages
|
||||
.map((m: any) => JSON.stringify(m))
|
||||
.join('\n');
|
||||
expect(allText).toContain('NEEDS_SETUP');
|
||||
|
||||
// Clean up
|
||||
try { fs.rmSync(emptyDir, { recursive: true, force: true }); } catch {}
|
||||
}, 60_000);
|
||||
|
||||
test('SKILL.md setup block works outside git repo', async () => {
|
||||
// Create a tmpdir outside any git repo
|
||||
const nonGitDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-nogit-'));
|
||||
|
||||
const ROOT = path.resolve(import.meta.dir, '..');
|
||||
const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
|
||||
const setupStart = skillMd.indexOf('## SETUP');
|
||||
const setupEnd = skillMd.indexOf('## IMPORTANT');
|
||||
const setupBlock = skillMd.slice(setupStart, setupEnd);
|
||||
|
||||
const result = await runSkillTest({
|
||||
prompt: `Follow these instructions exactly. Run the bash code block below and report what it outputs.
|
||||
|
||||
${setupBlock}
|
||||
|
||||
Report the exact output — either "READY: <path>" or "NEEDS_SETUP".`,
|
||||
workingDirectory: nonGitDir,
|
||||
maxTurns: 5,
|
||||
timeout: 30_000,
|
||||
});
|
||||
|
||||
// Should either find global binary (READY) or show NEEDS_SETUP — not crash
|
||||
const allText = result.messages
|
||||
.map((m: any) => JSON.stringify(m))
|
||||
.join('\n');
|
||||
expect(allText).toMatch(/READY|NEEDS_SETUP/);
|
||||
|
||||
// Clean up
|
||||
try { fs.rmSync(nonGitDir, { recursive: true, force: true }); } catch {}
|
||||
}, 60_000);
|
||||
|
||||
test.todo('/qa quick completes without browse errors');
|
||||
test.todo('/ship completes without browse errors');
|
||||
test.todo('/review completes without browse errors');
|
||||
|
||||
@@ -115,6 +115,19 @@ describeEval('LLM-as-judge quality evals', () => {
|
||||
expect(scores.actionability).toBeGreaterThanOrEqual(4);
|
||||
}, 30_000);
|
||||
|
||||
test('setup block scores >= 4 on actionability and clarity', async () => {
|
||||
const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
|
||||
const setupStart = content.indexOf('## SETUP');
|
||||
const setupEnd = content.indexOf('## IMPORTANT');
|
||||
const section = content.slice(setupStart, setupEnd);
|
||||
|
||||
const scores = await judge('setup/binary discovery instructions', section);
|
||||
console.log('Setup block scores:', JSON.stringify(scores, null, 2));
|
||||
|
||||
expect(scores.actionability).toBeGreaterThanOrEqual(4);
|
||||
expect(scores.clarity).toBeGreaterThanOrEqual(4);
|
||||
}, 30_000);
|
||||
|
||||
test('regression check: compare branch vs baseline quality', async () => {
|
||||
// This test compares the generated output against the hand-maintained
|
||||
// baseline from main. The generated version should score equal or higher.
|
||||
|
||||
Reference in New Issue
Block a user