From ed0e00daab9c88896e511f711c002b28e237c0cf Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Fri, 1 May 2026 14:18:35 -0700 Subject: [PATCH] test(touchfiles): register Phase 4 + judge-fixture entries, add llm-judge dep to format tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two new entries: - office-hours-phase4-fork (periodic) — for the silent-auto-decide regression - llm-judge-recommendation (periodic) — for the judge rubric fixture test Plus extend the four plan-{ceo,eng}-review-format-* entries with test/helpers/llm-judge.ts so rubric tweaks invalidate the wired-in tests. Verified by simulation that surgical office-hours/SKILL.md.tmpl changes fire office-hours-auto-mode + office-hours-phase4-fork without over-firing llm-judge-recommendation. Co-Authored-By: Claude Opus 4.7 (1M context) --- test/helpers/touchfiles.ts | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index 37a97f1b..453b93ea 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -105,6 +105,8 @@ export const E2E_TOUCHFILES: Record = { // skills with no prior plan-mode test: 'autoplan-auto-mode': ['autoplan/**', 'plan-ceo-review/**', 'plan-design-review/**', 'plan-eng-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], 'office-hours-auto-mode': ['office-hours/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], + 'office-hours-phase4-fork': ['office-hours/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/question-tuning.ts', 'test/helpers/llm-judge.ts'], + 'llm-judge-recommendation': ['test/helpers/llm-judge.ts', 'test/llm-judge-recommendation.test.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts'], // v1.21+ AUTO_DECIDE preserve eval (periodic). Verifies the Tool resolution // fix doesn't trip the legitimate /plan-tune opt-in path: when the user has // written a never-ask preference, AUQ should still auto-decide rather than @@ -135,10 +137,10 @@ export const E2E_TOUCHFILES: Record = { // AskUserQuestion format regression (RECOMMENDATION + Completeness: N/10) // Fires when either template OR the two preamble resolvers change. - 'plan-ceo-review-format-mode': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'], - 'plan-ceo-review-format-approach': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'], - 'plan-eng-review-format-coverage': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'], - 'plan-eng-review-format-kind': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'], + 'plan-ceo-review-format-mode': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md', 'test/helpers/llm-judge.ts'], + 'plan-ceo-review-format-approach': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md', 'test/helpers/llm-judge.ts'], + 'plan-eng-review-format-coverage': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md', 'test/helpers/llm-judge.ts'], + 'plan-eng-review-format-kind': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md', 'test/helpers/llm-judge.ts'], // v1.7.0.0 Pros/Cons format cadence + format + negative-escape evals. // Dependencies: same as format-mode + the 4 plan-review templates + overlay. @@ -432,6 +434,13 @@ export const E2E_TIERS: Record = { 'plan-eng-review-format-coverage': 'periodic', 'plan-eng-review-format-kind': 'periodic', + // Office-hours Phase 4 silent-auto-decide regression — periodic (Phase 4 + // requires the agent to invent 2-3 architectures, more open-ended than the + // 4 plan-format cases above). Reclassify to gate if it turns out stable. + 'office-hours-phase4-fork': 'periodic', + // judgeRecommendation rubric sanity (fixture-based, ~$0.04/run via Haiku) + 'llm-judge-recommendation': 'periodic', + // v1.7.0.0 Pros/Cons format — cadence + negative-escape evals (all periodic) 'plan-ceo-review-prosons-cadence': 'periodic', 'plan-review-prosons-format': 'periodic',