mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-06 21:46:40 +02:00
test: add e2e and LLM eval tests for SKILL.md setup block
- 3 Agent SDK e2e tests: happy path, NEEDS_SETUP, non-git-repo
- LLM eval: setup block clarity + actionability >= 4
- New error pattern: 'no such file or directory.*browse'

These tests catch the exact failure mode where agents can't discover the browse binary via SKILL.md instructions.
This commit is contained in:
@@ -73,6 +73,95 @@ Report what each command returned.`,
|
||||
expect(result.exitReason).toBe('success');
|
||||
}, 90_000);
|
||||
|
||||
// Happy path: hand the agent only the SETUP section of SKILL.md and check that
// it can locate the browse binary and run two commands against the test server.
test('agent discovers browse binary via SKILL.md setup block', async () => {
  const ROOT = path.resolve(import.meta.dir, '..');
  const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
  const setupStart = skillMd.indexOf('## SETUP');
  const setupEnd = skillMd.indexOf('## IMPORTANT');

  // Guard: both headings must exist, in order. indexOf returns -1 on a miss,
  // and slice() treats a negative index as an offset from the end, so without
  // these checks a renamed heading would silently yield the wrong text.
  expect(setupStart).toBeGreaterThanOrEqual(0);
  expect(setupEnd).toBeGreaterThan(setupStart);

  const setupBlock = skillMd.slice(setupStart, setupEnd);

  // Guard: verify we extracted a valid setup block
  expect(setupBlock).toContain('browse/dist/browse');

  const result = await runSkillTest({
    prompt: `Follow these instructions to find the browse binary and run a basic command.

${setupBlock}

After finding the binary, run: $B goto ${testServer.url}
Then run: $B text
Report whether it worked.`,
    workingDirectory: tmpDir,
    maxTurns: 10,
    timeout: 60_000,
  });

  // The run must report no browse errors and finish with a 'success' exit reason.
  expect(result.browseErrors).toHaveLength(0);
  expect(result.exitReason).toBe('success');
}, 90_000);
|
||||
|
||||
// Missing-binary path: run the SETUP block from an empty temp directory and
// check that the text NEEDS_SETUP appears somewhere in the agent's messages.
test('SKILL.md setup block shows NEEDS_SETUP when binary missing', async () => {
  // Create a tmpdir with no browse binary
  const emptyDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-empty-'));

  try {
    const ROOT = path.resolve(import.meta.dir, '..');
    const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
    const setupStart = skillMd.indexOf('## SETUP');
    const setupEnd = skillMd.indexOf('## IMPORTANT');

    // Guard: both headings must exist, in order. indexOf returns -1 on a miss,
    // and slice() treats a negative index as an offset from the end, so without
    // these checks a renamed heading would silently yield the wrong text.
    expect(setupStart).toBeGreaterThanOrEqual(0);
    expect(setupEnd).toBeGreaterThan(setupStart);

    const setupBlock = skillMd.slice(setupStart, setupEnd);

    // Guard: verify we extracted a valid setup block
    expect(setupBlock).toContain('browse/dist/browse');

    const result = await runSkillTest({
      prompt: `Follow these instructions exactly. Run the bash code block below and report what it outputs.

${setupBlock}

Report the exact output. Do NOT try to fix or install anything — just report what you see.`,
      workingDirectory: emptyDir,
      maxTurns: 5,
      timeout: 30_000,
    });

    // Agent should see NEEDS_SETUP (not crash or guess wrong paths)
    const allText = result.messages
      .map((m: unknown) => JSON.stringify(m))
      .join('\n');
    expect(allText).toContain('NEEDS_SETUP');
  } finally {
    // Clean up even when an assertion above fails or the run throws.
    // Deliberately best-effort: an error thrown from this finally block would
    // replace the real test failure.
    try { fs.rmSync(emptyDir, { recursive: true, force: true }); } catch {}
  }
}, 60_000);
|
||||
|
||||
// Non-git path: run the SETUP block from a fresh temp directory and check that
// the agent's messages contain either READY or NEEDS_SETUP.
test('SKILL.md setup block works outside git repo', async () => {
  // Create a tmpdir outside any git repo
  const nonGitDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-nogit-'));

  try {
    const ROOT = path.resolve(import.meta.dir, '..');
    const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
    const setupStart = skillMd.indexOf('## SETUP');
    const setupEnd = skillMd.indexOf('## IMPORTANT');

    // Guard: both headings must exist, in order. indexOf returns -1 on a miss,
    // and slice() treats a negative index as an offset from the end, so without
    // these checks a renamed heading would silently yield the wrong text.
    expect(setupStart).toBeGreaterThanOrEqual(0);
    expect(setupEnd).toBeGreaterThan(setupStart);

    const setupBlock = skillMd.slice(setupStart, setupEnd);

    // Guard: verify we extracted a valid setup block
    expect(setupBlock).toContain('browse/dist/browse');

    const result = await runSkillTest({
      prompt: `Follow these instructions exactly. Run the bash code block below and report what it outputs.

${setupBlock}

Report the exact output — either "READY: <path>" or "NEEDS_SETUP".`,
      workingDirectory: nonGitDir,
      maxTurns: 5,
      timeout: 30_000,
    });

    // Should either find global binary (READY) or show NEEDS_SETUP — not crash
    const allText = result.messages
      .map((m: unknown) => JSON.stringify(m))
      .join('\n');
    expect(allText).toMatch(/READY|NEEDS_SETUP/);
  } finally {
    // Clean up even when an assertion above fails or the run throws.
    // Deliberately best-effort: an error thrown from this finally block would
    // replace the real test failure.
    try { fs.rmSync(nonGitDir, { recursive: true, force: true }); } catch {}
  }
}, 60_000);
|
||||
|
||||
// Placeholders for skill-level e2e coverage: each is registered via test.todo with a name only, so no test body exists yet.
test.todo('/qa quick completes without browse errors');
|
||||
test.todo('/ship completes without browse errors');
|
||||
test.todo('/review completes without browse errors');
|
||||
|
||||
Reference in New Issue
Block a user