test: add e2e and LLM eval tests for SKILL.md setup block

- 3 Agent SDK e2e tests: happy path, NEEDS_SETUP, non-git-repo
- LLM eval: setup block clarity + actionability >= 4
- New error pattern: 'no such file or directory.*browse'

These tests catch the exact failure mode where agents can't discover the browse binary via SKILL.md instructions.
2026-05-08 06:26:45 +02:00 · 2026-03-14 02:02:28 -05:00
parent 8e1feb7fa2
commit 60fce976cb
3 changed files with 103 additions and 0 deletions
@@ -115,6 +115,19 @@ describeEval('LLM-as-judge quality evals', () => {
    expect(scores.actionability).toBeGreaterThanOrEqual(4);
  }, 30_000);

+  test('setup block scores >= 4 on actionability and clarity', async () => {
+    const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
+    const setupStart = content.indexOf('## SETUP');
+    const setupEnd = content.indexOf('## IMPORTANT');
+    const section = content.slice(setupStart, setupEnd);
+
+    const scores = await judge('setup/binary discovery instructions', section);
+    console.log('Setup block scores:', JSON.stringify(scores, null, 2));
+
+    expect(scores.actionability).toBeGreaterThanOrEqual(4);
+    expect(scores.clarity).toBeGreaterThanOrEqual(4);
+  }, 30_000);
+
  test('regression check: compare branch vs baseline quality', async () => {
    // This test compares the generated output against the hand-maintained
    // baseline from main. The generated version should score equal or higher.