mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-02 03:35:09 +02:00
test(touchfiles): register Phase 4 + judge-fixture entries, add llm-judge dep to format tests
Two new entries:
- office-hours-phase4-fork (periodic) — for the silent-auto-decide regression
- llm-judge-recommendation (periodic) — for the judge rubric fixture test
Also extend the four plan-{ceo,eng}-review-format-* entries with
test/helpers/llm-judge.ts so that rubric tweaks re-fire the format tests that depend on the judge.
Verified by simulation that surgical office-hours/SKILL.md.tmpl changes fire
office-hours-auto-mode + office-hours-phase4-fork without over-firing
llm-judge-recommendation.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -105,6 +105,8 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
// skills with no prior plan-mode test:
|
||||
'autoplan-auto-mode': ['autoplan/**', 'plan-ceo-review/**', 'plan-design-review/**', 'plan-eng-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
'office-hours-auto-mode': ['office-hours/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
'office-hours-phase4-fork': ['office-hours/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/question-tuning.ts', 'test/helpers/llm-judge.ts'],
|
||||
'llm-judge-recommendation': ['test/helpers/llm-judge.ts', 'test/llm-judge-recommendation.test.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts'],
|
||||
// v1.21+ AUTO_DECIDE preserve eval (periodic). Verifies the Tool resolution
|
||||
// fix doesn't trip the legitimate /plan-tune opt-in path: when the user has
|
||||
// written a never-ask preference, AUQ should still auto-decide rather than
|
||||
@@ -135,10 +137,10 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
|
||||
// AskUserQuestion format regression (RECOMMENDATION + Completeness: N/10)
|
||||
// Fires when the template, the preamble resolvers, the model overlay, or the llm-judge helper changes.
|
||||
'plan-ceo-review-format-mode': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'],
|
||||
'plan-ceo-review-format-approach': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'],
|
||||
'plan-eng-review-format-coverage': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'],
|
||||
'plan-eng-review-format-kind': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'],
|
||||
'plan-ceo-review-format-mode': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md', 'test/helpers/llm-judge.ts'],
|
||||
'plan-ceo-review-format-approach': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md', 'test/helpers/llm-judge.ts'],
|
||||
'plan-eng-review-format-coverage': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md', 'test/helpers/llm-judge.ts'],
|
||||
'plan-eng-review-format-kind': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md', 'test/helpers/llm-judge.ts'],
|
||||
|
||||
// v1.7.0.0 Pros/Cons format cadence + format + negative-escape evals.
|
||||
// Dependencies: same as format-mode + the 4 plan-review templates + overlay.
|
||||
@@ -432,6 +434,13 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
|
||||
'plan-eng-review-format-coverage': 'periodic',
|
||||
'plan-eng-review-format-kind': 'periodic',
|
||||
|
||||
// Office-hours Phase 4 silent-auto-decide regression — periodic (Phase 4
|
||||
// requires the agent to invent 2-3 architectures, more open-ended than the
|
||||
// 4 plan-format cases above). Reclassify to gate if it turns out stable.
|
||||
'office-hours-phase4-fork': 'periodic',
|
||||
// judgeRecommendation rubric sanity check (fixture-based, ~$0.04/run via Haiku)
|
||||
'llm-judge-recommendation': 'periodic',
|
||||
|
||||
// v1.7.0.0 Pros/Cons format — cadence + negative-escape evals (all periodic)
|
||||
'plan-ceo-review-prosons-cadence': 'periodic',
|
||||
'plan-review-prosons-format': 'periodic',
|
||||
|
||||
Reference in New Issue
Block a user