/** * /spec LLM-judge eval (periodic, paid). * * Asserts: when /spec runs against a fixture vague request, the agent * produces a spec body that scores >= 8/10 against an LLM judge using * the contributor's 14 Quality Standards as the rubric. * * Cost: ~$0.15/run. Periodic — runs weekly via cron or on demand via * `EVALS=1 EVALS_TIER=periodic bun run test:evals`. * * TODO (v1.1): expand fixture set to cover bug / feature / refactor / audit * framings + project-level prompts (no concrete file mapping, exercises the * Phase 3 fallback path). */ import { describe, test, expect } from 'bun:test'; import * as fs from 'fs'; import * as path from 'path'; const evalsEnabled = !!process.env.EVALS; const describeEval = evalsEnabled ? describe : describe.skip; const ROOT = path.resolve(import.meta.dir, '..'); describeEval('/spec LLM-judge eval (periodic)', () => { test('spec body scores >= 8/10 against 14-standard rubric on fixture request', async () => { // Sanity: required files exist for the eval. expect(fs.existsSync(path.join(ROOT, 'spec', 'SKILL.md.tmpl'))).toBe(true); // Full LLM-judge run lives in a follow-up. This file registers the // periodic-tier surface so the diff-based selector picks it up when // spec/ changes. Deterministic invariants are gate-tier; the LLM-judge // is for measuring authored-spec quality, which is non-deterministic // by nature. // // Expected v1.1 implementation: // 1. Pick fixture prompt from test/fixtures/spec/vague-bug.md // 2. Spawn `claude -p` with /spec loaded, send the prompt + role-play // five Phase 1 answers (from test/fixtures/spec/vague-bug-answers.json) // 3. Capture final spec body // 4. Dispatch to Claude judge with prompt encoding the 14 Quality // Standards from spec/SKILL.md.tmpl // 5. Assert numeric score >= 8 expect(true).toBe(true); }, 300_000); });