mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-02 11:45:20 +02:00
d06f08938f
Part 3 of 4 (plan: ~/.claude/plans/system-instruction-you-are-working-polymorphic-twilight.md).

Gate-tier (E1, free, runs on every `bun test`):

test/preamble-compose.test.ts — pins the composition order. Asserts that the AskUserQuestion Format section renders BEFORE the Model-Specific Behavioral Patch in tier-≥2 preamble output. Covers claude default, opus-4-7 overlay, tier 2/3, and codex host. Catches any future edit to scripts/resolvers/preamble.ts that silently reverts the order.

test/resolver-ask-user-format.test.ts — pins the Pros/Cons contract. 14 assertions against generateAskUserFormat output: D<N>, ELI10, "Stakes if we pick wrong:", "Recommendation: <choice>", "Pros / cons:", ✅/❌ markers, min 2 pros + 1 con rules, hard-stop escape exact phrase, neutral-posture CT1 rule ((recommended) label preserved for AUTO_DECIDE), Completeness coverage-vs-kind, tool_use mandate (rule 11), self-check list, D-numbering model-level caveat.

test/model-overlay-opus-4-7.test.ts — pins the pacing directive. Asserts that the raw overlay file and the resolved overlay output contain "Pace questions to the skill" and NOT "Batch your questions". Verifies that the INHERIT:claude chain still works (Todo-list, subordination wrapper) and that the Fan out / Effort-match / Literal interpretation nudges are preserved. Also asserts that the claude base overlay does NOT carry the Opus-specific pacing directive (no cross-contamination).

Periodic-tier (E2, Opus-dependent, ~$1-2/run):

test/skill-e2e-plan-prosons.test.ts — 4 cases extending the v1.6.3.0 harness:
1. Format positive — every token present when the plan has a real tradeoff.
2. Hard-stop NEGATIVE — a plan with a genuine tradeoff must NOT dodge to the "No cons — hard-stop choice" escape.
3. Neutral-posture NEGATIVE — a plan where one option dominates must emit the (recommended) label + "because <reason>", and must NOT dodge to "taste call" / "no preference".
4. Hard-stop POSITIVE — a destructive-action plan may legitimately use the hard-stop escape.

test/helpers/touchfiles.ts — entries for all new eval cases. Dependencies: overlay, preamble.ts, generate-ask-user-format.ts, and the 4 plan-review templates. Diff-based selection triggers the evals whenever those files change. Also added entries for 7 expanded-coverage cases (ship, office-hours, investigate, qa, review, design-review, document-release) — test cases will land in follow-up PRs per skill.

Follow-ups noted in the test file header:
- True multi-turn cadence eval (3 findings → 3 distinct asks) — the current harness captures one $OUT_FILE per session; multi-turn capture needs new harness support.
- Expanded-coverage test cases for the 7 non-plan-review skills.

Verified:
- bun test: 349 pass (30 new + 319 baseline), 1 pre-existing security-bench oversize failure on main (unrelated, unchanged).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
73 lines
3.0 KiB
TypeScript
73 lines
3.0 KiB
TypeScript
/**
|
|
* Preamble composition order — gate-tier test.
|
|
*
|
|
* Asserts that the AskUserQuestion Format section renders BEFORE the
|
|
* Model-Specific Behavioral Patch section in tier-≥2 preamble output.
|
|
* This order is load-bearing: Opus 4.7 reads top-to-bottom and absorbs
|
|
* the first pacing directive it hits. v1.6.4.0 regressed plan-review
|
|
* cadence because the overlay rendered first with "Batch your questions"
|
|
* as the ambient default.
|
|
*
|
|
* If someone later reorders `scripts/resolvers/preamble.ts` so Overlay
|
|
* comes before Format, this test catches it before the next model
|
|
* migration can silently re-break the plan-review pacing.
|
|
*/
|
|
import { describe, test, expect } from 'bun:test';
|
|
import type { TemplateContext } from '../scripts/resolvers/types';
|
|
import { HOST_PATHS } from '../scripts/resolvers/types';
|
|
import { generatePreamble } from '../scripts/resolvers/preamble';
|
|
|
|
/**
 * Builds the minimal TemplateContext these tests hand to `generatePreamble`.
 *
 * @param host - Host identifier; also selects the `paths` entry from HOST_PATHS.
 * @param tier - Value stored as `preambleTier`.
 * @param model - Optional model name. When it is undefined or an empty
 *   string, the returned context has no `model` key at all.
 * @returns A context with fixed `skillName` / `tmplPath` placeholders.
 */
function makeCtx(
  host: 'claude' | 'codex',
  tier: 1 | 2 | 3 | 4,
  model?: string,
): TemplateContext {
  const base: TemplateContext = {
    skillName: 'test-skill',
    tmplPath: 'test.tmpl',
    host,
    paths: HOST_PATHS[host],
    preambleTier: tier,
  };

  // Only attach `model` when a truthy value was supplied, so the key is
  // absent (not `undefined`) for callers that omit it.
  if (!model) {
    return base;
  }
  return { ...base, model };
}
|
|
|
|
describe('Preamble composition order', () => {
  const FORMAT_HEADING = '## AskUserQuestion Format';
  const OVERLAY_HEADING = '## Model-Specific Behavioral Patch';

  /**
   * Asserts that both section headings are present in `out` and that the
   * Format heading appears before the overlay heading.
   *
   * The presence checks are required, not cosmetic: `indexOf` returns -1 for
   * a missing heading, so a bare `formatIdx < overlayIdx` comparison passes
   * vacuously when the Format section is absent but the overlay is present.
   *
   * @param out - Rendered preamble text to inspect.
   */
  function expectFormatBeforeOverlay(out: string): void {
    const formatIdx = out.indexOf(FORMAT_HEADING);
    const overlayIdx = out.indexOf(OVERLAY_HEADING);
    expect(formatIdx).toBeGreaterThan(-1);
    expect(overlayIdx).toBeGreaterThan(-1);
    expect(formatIdx).toBeLessThan(overlayIdx);
  }

  test('AskUserQuestion Format renders before Model-Specific Behavioral Patch (tier 2, claude)', () => {
    expectFormatBeforeOverlay(generatePreamble(makeCtx('claude', 2, 'claude')));
  });

  test('AskUserQuestion Format renders before Model-Specific Behavioral Patch (tier 2, opus-4-7)', () => {
    expectFormatBeforeOverlay(generatePreamble(makeCtx('claude', 2, 'opus-4-7')));
  });

  test('AskUserQuestion Format renders before Model-Specific Behavioral Patch (tier 3)', () => {
    expectFormatBeforeOverlay(generatePreamble(makeCtx('claude', 3, 'opus-4-7')));
  });

  test('AskUserQuestion Format renders before Model-Specific Behavioral Patch (codex host)', () => {
    expectFormatBeforeOverlay(generatePreamble(makeCtx('codex', 2, 'opus-4-7')));
  });

  test('tier 1 preamble does NOT include AskUserQuestion Format (but MAY include overlay)', () => {
    // No model is passed here; only the absence of the Format heading is
    // checked, and the overlay heading is deliberately left unconstrained.
    const out = generatePreamble(makeCtx('claude', 1));
    expect(out).not.toContain(FORMAT_HEADING);
  });
});
|