mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-01 19:25:10 +02:00
Merge remote-tracking branch 'origin/main' into v0.3.6-qa-upgrades
# Conflicts: # test/skill-e2e.test.ts
This commit is contained in:
@@ -33,6 +33,7 @@ const BROWSE_ERROR_PATTERNS = [
|
||||
/Exit code 1/,
|
||||
/ERROR: browse binary not found/,
|
||||
/Server failed to start/,
|
||||
/no such file or directory.*browse/i,
|
||||
];
|
||||
|
||||
export async function runSkillTest(options: {
|
||||
|
||||
+84
-1
@@ -132,6 +132,89 @@ Report what each command returned.`,
|
||||
expect(result.browseErrors).toHaveLength(0);
|
||||
expect(result.exitReason).toBe('success');
|
||||
}, 90_000);
|
||||
|
||||
|
||||
test('agent discovers browse binary via SKILL.md setup block', async () => {
|
||||
const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
|
||||
const setupStart = skillMd.indexOf('## SETUP');
|
||||
const setupEnd = skillMd.indexOf('## IMPORTANT');
|
||||
const setupBlock = skillMd.slice(setupStart, setupEnd);
|
||||
|
||||
// Guard: verify we extracted a valid setup block
|
||||
expect(setupBlock).toContain('browse/dist/browse');
|
||||
|
||||
const result = await runSkillTest({
|
||||
prompt: `Follow these instructions to find the browse binary and run a basic command.
|
||||
|
||||
${setupBlock}
|
||||
|
||||
After finding the binary, run: $B goto ${testServer.url}
|
||||
Then run: $B text
|
||||
Report whether it worked.`,
|
||||
workingDirectory: tmpDir,
|
||||
maxTurns: 10,
|
||||
timeout: 60_000,
|
||||
});
|
||||
|
||||
expect(result.browseErrors).toHaveLength(0);
|
||||
expect(result.exitReason).toBe('success');
|
||||
}, 90_000);
|
||||
|
||||
test('SKILL.md setup block shows NEEDS_SETUP when binary missing', async () => {
|
||||
// Create a tmpdir with no browse binary
|
||||
const emptyDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-empty-'));
|
||||
|
||||
const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
|
||||
const setupStart = skillMd.indexOf('## SETUP');
|
||||
const setupEnd = skillMd.indexOf('## IMPORTANT');
|
||||
const setupBlock = skillMd.slice(setupStart, setupEnd);
|
||||
|
||||
const result = await runSkillTest({
|
||||
prompt: `Follow these instructions exactly. Run the bash code block below and report what it outputs.
|
||||
|
||||
${setupBlock}
|
||||
|
||||
Report the exact output. Do NOT try to fix or install anything — just report what you see.`,
|
||||
workingDirectory: emptyDir,
|
||||
maxTurns: 5,
|
||||
timeout: 30_000,
|
||||
});
|
||||
|
||||
// Agent should see NEEDS_SETUP (not crash or guess wrong paths)
|
||||
const allText = result.output || '';
|
||||
expect(allText).toContain('NEEDS_SETUP');
|
||||
|
||||
// Clean up
|
||||
try { fs.rmSync(emptyDir, { recursive: true, force: true }); } catch {}
|
||||
}, 60_000);
|
||||
|
||||
test('SKILL.md setup block works outside git repo', async () => {
|
||||
// Create a tmpdir outside any git repo
|
||||
const nonGitDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-nogit-'));
|
||||
|
||||
const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
|
||||
const setupStart = skillMd.indexOf('## SETUP');
|
||||
const setupEnd = skillMd.indexOf('## IMPORTANT');
|
||||
const setupBlock = skillMd.slice(setupStart, setupEnd);
|
||||
|
||||
const result = await runSkillTest({
|
||||
prompt: `Follow these instructions exactly. Run the bash code block below and report what it outputs.
|
||||
|
||||
${setupBlock}
|
||||
|
||||
Report the exact output — either "READY: <path>" or "NEEDS_SETUP".`,
|
||||
workingDirectory: nonGitDir,
|
||||
maxTurns: 5,
|
||||
timeout: 30_000,
|
||||
});
|
||||
|
||||
// Should either find global binary (READY) or show NEEDS_SETUP — not crash
|
||||
const allText = result.output || '';
|
||||
expect(allText).toMatch(/READY|NEEDS_SETUP/);
|
||||
|
||||
// Clean up
|
||||
try { fs.rmSync(nonGitDir, { recursive: true, force: true }); } catch {}
|
||||
}, 60_000);
|
||||
});
|
||||
|
||||
// --- B4: QA skill E2E ---
|
||||
@@ -264,7 +347,7 @@ describeOutcome('Planted-bug outcome evals', () => {
|
||||
fs.mkdirSync(path.join(reportDir, 'screenshots'), { recursive: true });
|
||||
const reportPath = path.join(reportDir, 'qa-report.md');
|
||||
|
||||
// Phase 1: Agent SDK runs /qa Standard
|
||||
// Phase 1: runs /qa Standard
|
||||
const result = await runSkillTest({
|
||||
prompt: `You have a browse binary at ${browseBin}. Assign it to B variable like: B="${browseBin}"
|
||||
|
||||
|
||||
@@ -65,6 +65,19 @@ describeEval('LLM-as-judge quality evals', () => {
|
||||
expect(scores.actionability).toBeGreaterThanOrEqual(4);
|
||||
}, 30_000);
|
||||
|
||||
test('setup block scores >= 4 on actionability and clarity', async () => {
|
||||
const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
|
||||
const setupStart = content.indexOf('## SETUP');
|
||||
const setupEnd = content.indexOf('## IMPORTANT');
|
||||
const section = content.slice(setupStart, setupEnd);
|
||||
|
||||
const scores = await judge('setup/binary discovery instructions', section);
|
||||
console.log('Setup block scores:', JSON.stringify(scores, null, 2));
|
||||
|
||||
expect(scores.actionability).toBeGreaterThanOrEqual(4);
|
||||
expect(scores.clarity).toBeGreaterThanOrEqual(4);
|
||||
}, 30_000);
|
||||
|
||||
test('regression check: compare branch vs baseline quality', async () => {
|
||||
// This test compares the generated output against the hand-maintained
|
||||
// baseline from main. The generated version should score equal or higher.
|
||||
|
||||
Reference in New Issue
Block a user