From 14b1ba07e99878f39be730fde36664dfe3efe8c2 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Fri, 1 May 2026 14:18:06 -0700 Subject: [PATCH] test: wire judgeRecommendation into plan-format E2E with threshold >= 4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All four plan-format cases (CEO mode, CEO approach, eng coverage, eng kind) now run the judge after the existing regex assertions. Threshold reason_substance >= 4 catches both boilerplate-tier ("because it's better") and generic-tier ("because it's faster") reasoning — exactly the failure modes the regex assertions couldn't catch. Move recordE2E to after the judge call so judge_scores and judge_reasoning land in the eval-store JSON for diagnostics. Booleans are encoded as 0/1 to fit the numeric Record shape that EvalTestEntry.judge_scores expects. Co-Authored-By: Claude Opus 4.7 (1M context) --- test/skill-e2e-plan-format.test.ts | 91 ++++++++++++++++++++++++++---- 1 file changed, 79 insertions(+), 12 deletions(-) diff --git a/test/skill-e2e-plan-format.test.ts b/test/skill-e2e-plan-format.test.ts index 0532ca24..a2f65c85 100644 --- a/test/skill-e2e-plan-format.test.ts +++ b/test/skill-e2e-plan-format.test.ts @@ -25,6 +25,7 @@ import { logCost, recordE2E, createEvalCollector, finalizeEvalCollector, } from './helpers/e2e-helpers'; +import { judgeRecommendation } from './helpers/llm-judge'; import { spawnSync } from 'child_process'; import * as fs from 'fs'; import * as path from 'path'; @@ -135,9 +136,6 @@ After writing the file, stop. Do not continue the review.`, }); logCost('/plan-ceo-review format (mode)', result); - recordE2E(evalCollector, '/plan-ceo-review-format-mode', 'Plan Format — CEO Mode Selection', result, { - passed: ['success', 'error_max_turns'].includes(result.exitReason), - }); expect(['success', 'error_max_turns']).toContain(result.exitReason); expect(fs.existsSync(outFile)).toBe(true); @@ -149,6 +147,27 @@ After writing the file, stop. 
Do not continue the review.`, expect(captured).toMatch(RECOMMENDATION_RE); expect(captured).not.toMatch(COMPLETENESS_RE); expect(captured).toMatch(KIND_NOTE_RE); + + // Recommendation-quality judge: deterministic regex for present/commits/has_because, + // Haiku 4.5 for reason_substance 1-5. Threshold >= 4 catches generic-tier reasoning. + const recScore = await judgeRecommendation(captured); + recordE2E(evalCollector, '/plan-ceo-review-format-mode', 'Plan Format — CEO Mode Selection', result, { + passed: ['success', 'error_max_turns'].includes(result.exitReason), + judge_scores: { + rec_present: recScore.present ? 1 : 0, + rec_commits: recScore.commits ? 1 : 0, + rec_has_because: recScore.has_because ? 1 : 0, + rec_substance: recScore.reason_substance, + }, + judge_reasoning: `${recScore.reasoning} | reason: "${recScore.reason_text}"`, + }); + expect(recScore.present, recScore.reasoning).toBe(true); + expect(recScore.commits, recScore.reasoning).toBe(true); + expect(recScore.has_because, recScore.reasoning).toBe(true); + expect( + recScore.reason_substance, + `${recScore.reasoning}\n reason: "${recScore.reason_text}"`, + ).toBeGreaterThanOrEqual(4); }, 300_000); }); @@ -187,9 +206,6 @@ After writing the file, stop. Do not continue the review.`, }); logCost('/plan-ceo-review format (approach)', result); - recordE2E(evalCollector, '/plan-ceo-review-format-approach', 'Plan Format — CEO Approach Menu', result, { - passed: ['success', 'error_max_turns'].includes(result.exitReason), - }); expect(['success', 'error_max_turns']).toContain(result.exitReason); expect(fs.existsSync(outFile)).toBe(true); @@ -199,6 +215,25 @@ After writing the file, stop. Do not continue the review.`, // Coverage-differentiated: both RECOMMENDATION and Completeness: N/10 required. 
expect(captured).toMatch(RECOMMENDATION_RE); expect(captured).toMatch(COMPLETENESS_RE); + + const recScore = await judgeRecommendation(captured); + recordE2E(evalCollector, '/plan-ceo-review-format-approach', 'Plan Format — CEO Approach Menu', result, { + passed: ['success', 'error_max_turns'].includes(result.exitReason), + judge_scores: { + rec_present: recScore.present ? 1 : 0, + rec_commits: recScore.commits ? 1 : 0, + rec_has_because: recScore.has_because ? 1 : 0, + rec_substance: recScore.reason_substance, + }, + judge_reasoning: `${recScore.reasoning} | reason: "${recScore.reason_text}"`, + }); + expect(recScore.present, recScore.reasoning).toBe(true); + expect(recScore.commits, recScore.reasoning).toBe(true); + expect(recScore.has_because, recScore.reasoning).toBe(true); + expect( + recScore.reason_substance, + `${recScore.reasoning}\n reason: "${recScore.reason_text}"`, + ).toBeGreaterThanOrEqual(4); }, 300_000); }); @@ -240,9 +275,6 @@ After writing the file with that ONE question, stop. Do not continue the review. }); logCost('/plan-eng-review format (coverage)', result); - recordE2E(evalCollector, '/plan-eng-review-format-coverage', 'Plan Format — Eng Coverage Issue', result, { - passed: ['success', 'error_max_turns'].includes(result.exitReason), - }); expect(['success', 'error_max_turns']).toContain(result.exitReason); expect(fs.existsSync(outFile)).toBe(true); @@ -252,6 +284,25 @@ After writing the file with that ONE question, stop. Do not continue the review. // Coverage-differentiated: both RECOMMENDATION and Completeness: N/10 required. expect(captured).toMatch(RECOMMENDATION_RE); expect(captured).toMatch(COMPLETENESS_RE); + + const recScore = await judgeRecommendation(captured); + recordE2E(evalCollector, '/plan-eng-review-format-coverage', 'Plan Format — Eng Coverage Issue', result, { + passed: ['success', 'error_max_turns'].includes(result.exitReason), + judge_scores: { + rec_present: recScore.present ? 1 : 0, + rec_commits: recScore.commits ? 
1 : 0, + rec_has_because: recScore.has_because ? 1 : 0, + rec_substance: recScore.reason_substance, + }, + judge_reasoning: `${recScore.reasoning} | reason: "${recScore.reason_text}"`, + }); + expect(recScore.present, recScore.reasoning).toBe(true); + expect(recScore.commits, recScore.reasoning).toBe(true); + expect(recScore.has_because, recScore.reasoning).toBe(true); + expect( + recScore.reason_substance, + `${recScore.reasoning}\n reason: "${recScore.reason_text}"`, + ).toBeGreaterThanOrEqual(4); }, 300_000); }); @@ -290,9 +341,6 @@ After writing the file with that ONE question, stop. Do not continue the review. }); logCost('/plan-eng-review format (kind)', result); - recordE2E(evalCollector, '/plan-eng-review-format-kind', 'Plan Format — Eng Kind Issue', result, { - passed: ['success', 'error_max_turns'].includes(result.exitReason), - }); expect(['success', 'error_max_turns']).toContain(result.exitReason); expect(fs.existsSync(outFile)).toBe(true); @@ -304,6 +352,25 @@ After writing the file with that ONE question, stop. Do not continue the review. expect(captured).toMatch(RECOMMENDATION_RE); expect(captured).not.toMatch(COMPLETENESS_RE); expect(captured).toMatch(KIND_NOTE_RE); + + const recScore = await judgeRecommendation(captured); + recordE2E(evalCollector, '/plan-eng-review-format-kind', 'Plan Format — Eng Kind Issue', result, { + passed: ['success', 'error_max_turns'].includes(result.exitReason), + judge_scores: { + rec_present: recScore.present ? 1 : 0, + rec_commits: recScore.commits ? 1 : 0, + rec_has_because: recScore.has_because ? 
1 : 0, + rec_substance: recScore.reason_substance, + }, + judge_reasoning: `${recScore.reasoning} | reason: "${recScore.reason_text}"`, + }); + expect(recScore.present, recScore.reasoning).toBe(true); + expect(recScore.commits, recScore.reasoning).toBe(true); + expect(recScore.has_because, recScore.reasoning).toBe(true); + expect( + recScore.reason_substance, + `${recScore.reasoning}\n reason: "${recScore.reason_text}"`, + ).toBeGreaterThanOrEqual(4); }, 300_000); });