diff --git a/.github/workflows/evals-periodic.yml b/.github/workflows/evals-periodic.yml new file mode 100644 index 00000000..e529dbf4 --- /dev/null +++ b/.github/workflows/evals-periodic.yml @@ -0,0 +1,129 @@ +name: Periodic Evals +on: + schedule: + - cron: '0 6 * * 1' # Monday 6 AM UTC + workflow_dispatch: + +concurrency: + group: evals-periodic + cancel-in-progress: true + +env: + IMAGE: ghcr.io/${{ github.repository }}/ci + EVALS_TIER: periodic + EVALS_ALL: 1 # Ignore diff — run all periodic tests + +jobs: + build-image: + runs-on: ubicloud-standard-2 + permissions: + contents: read + packages: write + outputs: + image-tag: ${{ steps.meta.outputs.tag }} + steps: + - uses: actions/checkout@v4 + + - id: meta + run: echo "tag=${{ env.IMAGE }}:${{ hashFiles('.github/docker/Dockerfile.ci', 'package.json') }}" >> "$GITHUB_OUTPUT" + + - uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Check if image exists + id: check + run: | + if docker manifest inspect ${{ steps.meta.outputs.tag }} > /dev/null 2>&1; then + echo "exists=true" >> "$GITHUB_OUTPUT" + else + echo "exists=false" >> "$GITHUB_OUTPUT" + fi + + - if: steps.check.outputs.exists == 'false' + run: cp package.json .github/docker/ + + - if: steps.check.outputs.exists == 'false' + uses: docker/build-push-action@v6 + with: + context: .github/docker + file: .github/docker/Dockerfile.ci + push: true + tags: | + ${{ steps.meta.outputs.tag }} + ${{ env.IMAGE }}:latest + + evals: + runs-on: ${{ matrix.suite.runner || 'ubicloud-standard-2' }} + needs: build-image + container: + image: ${{ needs.build-image.outputs.image-tag }} + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + options: --user runner + timeout-minutes: 25 + strategy: + fail-fast: false + matrix: + suite: + - name: e2e-plan + file: test/skill-e2e-plan.test.ts + - name: e2e-design + file: test/skill-e2e-design.test.ts + - name: 
e2e-qa-bugs + file: test/skill-e2e-qa-bugs.test.ts + - name: e2e-qa-workflow + file: test/skill-e2e-qa-workflow.test.ts + - name: e2e-review + file: test/skill-e2e-review.test.ts + - name: e2e-workflow + file: test/skill-e2e-workflow.test.ts + - name: e2e-routing + file: test/skill-routing-e2e.test.ts + - name: e2e-codex + file: test/codex-e2e.test.ts + - name: e2e-gemini + file: test/gemini-e2e.test.ts + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Fix bun temp + run: | + mkdir -p /home/runner/.cache/bun + { + echo "BUN_INSTALL_CACHE_DIR=/home/runner/.cache/bun" + echo "BUN_TMPDIR=/home/runner/.cache/bun" + echo "TMPDIR=/home/runner/.cache" + } >> "$GITHUB_ENV" + + - name: Restore deps + run: | + if [ -d /opt/node_modules_cache ] && diff -q /opt/node_modules_cache/.package.json package.json >/dev/null 2>&1; then + ln -s /opt/node_modules_cache node_modules + else + bun install + fi + + - run: bun run build + + - name: Run ${{ matrix.suite.name }} + env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} + EVALS_CONCURRENCY: "40" + PLAYWRIGHT_BROWSERS_PATH: /opt/playwright-browsers + run: EVALS=1 bun test --retry 2 --concurrent --max-concurrency 40 ${{ matrix.suite.file }} + + - name: Upload eval results + if: always() + uses: actions/upload-artifact@v4 + with: + name: eval-periodic-${{ matrix.suite.name }} + path: ~/.gstack-dev/evals/*.json + retention-days: 90 diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index caa6f82c..c5357ce3 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -10,6 +10,7 @@ concurrency: env: IMAGE: ghcr.io/${{ github.repository }}/ci + EVALS_TIER: gate jobs: # Build Docker image with pre-baked toolchain (cached — only rebuilds on Dockerfile/lockfile change) @@ -87,10 +88,8 @@ jobs: file: test/skill-e2e-review.test.ts - name: e2e-workflow file: 
test/skill-e2e-workflow.test.ts - allow_failure: true # /ship + /setup-browser-cookies are env-dependent - name: e2e-routing file: test/skill-routing-e2e.test.ts - allow_failure: true # LLM routing is non-deterministic - name: e2e-codex file: test/codex-e2e.test.ts - name: e2e-gemini diff --git a/CLAUDE.md b/CLAUDE.md index 25673f4c..be24ef3c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -7,6 +7,8 @@ bun install # install dependencies bun test # run free tests (browse + snapshot + skill validation) bun run test:evals # run paid evals: LLM judge + E2E (diff-based, ~$4/run max) bun run test:evals:all # run ALL paid evals regardless of diff +bun run test:gate # run gate-tier tests only (CI default, blocks merge) +bun run test:periodic # run periodic-tier tests only (weekly cron / manual) bun run test:e2e # run E2E tests only (diff-based, ~$3.85/run max) bun run test:e2e:all # run ALL E2E tests regardless of diff bun run eval:select # show which tests would run based on current diff @@ -29,9 +31,17 @@ against the previous run. **Diff-based test selection:** `test:evals` and `test:e2e` auto-select tests based on `git diff` against the base branch. Each test declares its file dependencies in `test/helpers/touchfiles.ts`. Changes to global touchfiles (session-runner, eval-store, -llm-judge, gen-skill-docs) trigger all tests. Use `EVALS_ALL=1` or the `:all` script +touchfiles.ts itself) trigger all tests. Use `EVALS_ALL=1` or the `:all` script variants to force all tests. Run `eval:select` to preview which tests would run. +**Two-tier system:** Tests are classified as `gate` or `periodic` in `E2E_TIERS` +(in `test/helpers/touchfiles.ts`). CI runs only gate tests (`EVALS_TIER=gate`); +periodic tests run weekly via cron or manually. Use `EVALS_TIER=gate` or +`EVALS_TIER=periodic` to filter. When adding new E2E tests, classify them: +1. Safety guardrail or deterministic functional test? -> `gate` +2. Quality benchmark, Opus model test, or non-deterministic? -> `periodic` +3. 
Requires external service (Codex, Gemini)? -> `periodic` + ## Testing ```bash diff --git a/package.json b/package.json index ebcd27bb..dd629887 100644 --- a/package.json +++ b/package.json @@ -17,7 +17,8 @@ "test:evals:all": "EVALS=1 EVALS_ALL=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-llm-eval.test.ts test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts", "test:e2e": "EVALS=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts", "test:e2e:all": "EVALS=1 EVALS_ALL=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts", - "test:e2e:fast": "EVALS=1 EVALS_FAST=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts", + "test:gate": "EVALS=1 EVALS_TIER=gate bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-llm-eval.test.ts test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts", + "test:periodic": "EVALS=1 EVALS_TIER=periodic EVALS_ALL=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts", "test:codex": "EVALS=1 bun test test/codex-e2e.test.ts", "test:codex:all": "EVALS=1 EVALS_ALL=1 bun test test/codex-e2e.test.ts", "test:gemini": "EVALS=1 bun test test/gemini-e2e.test.ts", diff --git a/test/helpers/e2e-helpers.ts b/test/helpers/e2e-helpers.ts index 406639ed..4615307c 100644 --- a/test/helpers/e2e-helpers.ts +++ b/test/helpers/e2e-helpers.ts @@ -9,7 +9,7 @@ import { describe, test, beforeAll, afterAll } from 'bun:test'; import type { 
SkillTestResult } from './session-runner'; import { EvalCollector, judgePassed } from './eval-store'; import type { EvalTestEntry } from './eval-store'; -import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES } from './touchfiles'; +import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, E2E_TIERS, GLOBAL_TOUCHFILES } from './touchfiles'; import { WorktreeManager } from '../../lib/worktree'; import type { HarvestResult } from '../../lib/worktree'; import { spawnSync } from 'child_process'; @@ -32,13 +32,6 @@ export const evalsEnabled = !!process.env.EVALS; // Set EVALS_ALL=1 to force all tests. Set EVALS_BASE to override base branch. export let selectedTests: string[] | null = null; // null = run all -// EVALS_FAST: skip the 8 slowest tests (all Opus quality tests) for quick feedback -const FAST_EXCLUDED_TESTS = [ - 'plan-ceo-review-selective', 'plan-ceo-review', 'retro', 'retro-base-branch', - 'design-consultation-core', 'design-consultation-existing', - 'qa-fix-loop', 'design-review-fix', -]; - if (evalsEnabled && !process.env.EVALS_ALL) { const baseBranch = process.env.EVALS_BASE || detectBaseBranch(ROOT) @@ -57,15 +50,26 @@ if (evalsEnabled && !process.env.EVALS_ALL) { // If changedFiles is empty (e.g., on main branch), selectedTests stays null → run all } -// Apply EVALS_FAST filter after diff-based selection -if (evalsEnabled && process.env.EVALS_FAST) { +// EVALS_TIER: filter tests by tier after diff-based selection. 
+// 'gate' = gate tests only (CI default — blocks merge) +// 'periodic' = periodic tests only (weekly cron / manual) +// not set = run all selected tests (local dev default, backward compat) +// Any other value throws, so a typo fails loudly instead of silently selecting zero tests. +if (evalsEnabled && process.env.EVALS_TIER) { + const tier = process.env.EVALS_TIER; + if (tier !== 'gate' && tier !== 'periodic') { + throw new Error(`Invalid EVALS_TIER='${tier}'. Valid: gate, periodic`); + } + const tierTests = Object.entries(E2E_TIERS) + .filter(([, t]) => t === tier) + .map(([name]) => name); + if (selectedTests === null) { - // Run all minus excluded - selectedTests = Object.keys(E2E_TOUCHFILES).filter(t => !FAST_EXCLUDED_TESTS.includes(t)); + selectedTests = tierTests; } else { - selectedTests = selectedTests.filter(t => !FAST_EXCLUDED_TESTS.includes(t)); + selectedTests = selectedTests.filter(t => tierTests.includes(t)); } - process.stderr.write(`EVALS_FAST: excluded ${FAST_EXCLUDED_TESTS.length} slow tests, running ${selectedTests.length}\n\n`); + process.stderr.write(`EVALS_TIER=${tier}: ${selectedTests.length} tests\n\n`); } export const describeE2E = evalsEnabled ? describe : describe.skip; diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index d0d232a5..7c0be060 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -32,25 +32,25 @@ export function matchGlob(file: string, pattern: string): boolean { * Each test lists the file patterns that, if changed, require the test to run. 
*/ export const E2E_TOUCHFILES: Record<string, string[]> = { - // Browse core - 'browse-basic': ['browse/src/**'], - 'browse-snapshot': ['browse/src/**'], + // Browse core (+ test-server dependency) + 'browse-basic': ['browse/src/**', 'browse/test/test-server.ts'], + 'browse-snapshot': ['browse/src/**', 'browse/test/test-server.ts'], - // SKILL.md setup + preamble (depend on ROOT SKILL.md only) - 'skillmd-setup-discovery': ['SKILL.md', 'SKILL.md.tmpl'], - 'skillmd-no-local-binary': ['SKILL.md', 'SKILL.md.tmpl'], - 'skillmd-outside-git': ['SKILL.md', 'SKILL.md.tmpl'], + // SKILL.md setup + preamble (depend on ROOT SKILL.md + gen-skill-docs) + 'skillmd-setup-discovery': ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'], + 'skillmd-no-local-binary': ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'], + 'skillmd-outside-git': ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'], 'contributor-mode': ['SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'], - 'session-awareness': ['SKILL.md', 'SKILL.md.tmpl'], + 'session-awareness': ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'], - // QA - 'qa-quick': ['qa/**', 'browse/src/**'], - 'qa-b6-static': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval.html', 'test/fixtures/qa-eval-ground-truth.json'], - 'qa-b7-spa': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval-spa.html', 'test/fixtures/qa-eval-spa-ground-truth.json'], - 'qa-b8-checkout': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval-checkout.html', 'test/fixtures/qa-eval-checkout-ground-truth.json'], + // QA (+ test-server dependency) + 'qa-quick': ['qa/**', 'browse/src/**', 'browse/test/test-server.ts'], + 'qa-b6-static': ['qa/**', 'browse/src/**', 'browse/test/test-server.ts', 'test/helpers/llm-judge.ts', 'browse/test/fixtures/qa-eval.html', 'test/fixtures/qa-eval-ground-truth.json'], + 'qa-b7-spa': ['qa/**', 'browse/src/**', 'browse/test/test-server.ts', 'test/helpers/llm-judge.ts', 'browse/test/fixtures/qa-eval-spa.html', 
'test/fixtures/qa-eval-spa-ground-truth.json'], + 'qa-b8-checkout': ['qa/**', 'browse/src/**', 'browse/test/test-server.ts', 'test/helpers/llm-judge.ts', 'browse/test/fixtures/qa-eval-checkout.html', 'test/fixtures/qa-eval-checkout-ground-truth.json'], 'qa-only-no-fix': ['qa-only/**', 'qa/templates/**'], - 'qa-fix-loop': ['qa/**', 'browse/src/**'], + 'qa-fix-loop': ['qa/**', 'browse/src/**', 'browse/test/test-server.ts'], 'qa-bootstrap': ['qa/**', 'ship/**'], // Review @@ -94,13 +94,13 @@ export const E2E_TOUCHFILES: Record = { // Codex (Claude E2E — tests /codex skill via Claude) 'codex-review': ['codex/**'], - // Codex E2E (tests skills via Codex CLI) - 'codex-discover-skill': ['codex/**', '.agents/skills/**', 'test/helpers/codex-session-runner.ts'], - 'codex-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'codex/**', 'test/helpers/codex-session-runner.ts'], + // Codex E2E (tests skills via Codex CLI + worktree) + 'codex-discover-skill': ['codex/**', '.agents/skills/**', 'test/helpers/codex-session-runner.ts', 'lib/worktree.ts'], + 'codex-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'codex/**', 'test/helpers/codex-session-runner.ts', 'lib/worktree.ts'], - // Gemini E2E (tests skills via Gemini CLI) - 'gemini-discover-skill': ['.agents/skills/**', 'test/helpers/gemini-session-runner.ts'], - 'gemini-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'test/helpers/gemini-session-runner.ts'], + // Gemini E2E (tests skills via Gemini CLI + worktree) + 'gemini-discover-skill': ['.agents/skills/**', 'test/helpers/gemini-session-runner.ts', 'lib/worktree.ts'], + 'gemini-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'test/helpers/gemini-session-runner.ts', 'lib/worktree.ts'], // Coverage audit (shared fixture) + triage @@ -110,7 +110,7 @@ export const E2E_TOUCHFILES: Record = { 'ship-triage': ['ship/**', 'bin/gstack-repo-mode'], // Design - 'design-consultation-core': ['design-consultation/**', 
'scripts/gen-skill-docs.ts'], + 'design-consultation-core': ['design-consultation/**', 'scripts/gen-skill-docs.ts', 'test/helpers/llm-judge.ts'], 'design-consultation-existing': ['design-consultation/**', 'scripts/gen-skill-docs.ts'], 'design-consultation-research': ['design-consultation/**', 'scripts/gen-skill-docs.ts'], 'design-consultation-preview': ['design-consultation/**', 'scripts/gen-skill-docs.ts'], @@ -144,6 +144,117 @@ export const E2E_TOUCHFILES: Record<string, string[]> = { 'journey-visual-qa': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'], }; +/** + * E2E test tiers — 'gate' blocks PRs, 'periodic' runs weekly/on-demand. + * Must have exactly the same keys as E2E_TOUCHFILES. + */ +export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = { + // Browse core — gate (if browse breaks, everything breaks) + 'browse-basic': 'gate', + 'browse-snapshot': 'gate', + + // SKILL.md setup — gate (if setup breaks, no skill works) + 'skillmd-setup-discovery': 'gate', + 'skillmd-no-local-binary': 'gate', + 'skillmd-outside-git': 'gate', + 'contributor-mode': 'gate', + 'session-awareness': 'gate', + + // QA — gate for functional, periodic for quality/benchmarks + 'qa-quick': 'gate', + 'qa-b6-static': 'periodic', + 'qa-b7-spa': 'periodic', + 'qa-b8-checkout': 'periodic', + 'qa-only-no-fix': 'gate', // CRITICAL guardrail: Edit tool forbidden + 'qa-fix-loop': 'periodic', + 'qa-bootstrap': 'gate', + + // Review — gate for functional/guardrails, periodic for quality + 'review-sql-injection': 'gate', // Security guardrail + 'review-enum-completeness': 'gate', + 'review-base-branch': 'gate', + 'review-design-lite': 'periodic', // 4/7 threshold is subjective + 'review-coverage-audit': 'gate', + + // Office Hours + 'office-hours-spec-review': 'gate', + + // Plan reviews — gate for cheap functional, periodic for Opus quality + 'plan-ceo-review': 'periodic', + 'plan-ceo-review-selective': 'periodic', + 'plan-ceo-review-benefits': 'gate', + 'plan-eng-review': 'periodic', + 'plan-eng-review-artifact': 
'periodic', + 'plan-eng-coverage-audit': 'gate', + + // Ship — gate (end-to-end ship path) + 'ship-base-branch': 'gate', + 'ship-local-workflow': 'gate', + 'ship-coverage-audit': 'gate', + 'ship-triage': 'gate', + + // Setup browser cookies + 'setup-cookies-detect': 'gate', + + // Retro — gate for cheap branch detection, periodic for full Opus retro + 'retro': 'periodic', + 'retro-base-branch': 'gate', + + // Global discover + 'global-discover': 'gate', + + // CSO — gate for security guardrails, periodic for quality + 'cso-full-audit': 'gate', // Hardcoded secrets detection + 'cso-diff-mode': 'gate', + 'cso-infra-scope': 'periodic', + + // Document-release — gate (CHANGELOG guardrail) + 'document-release': 'gate', + + // Codex — periodic (Opus, requires codex CLI) + 'codex-review': 'periodic', + + // Multi-AI — periodic (require external CLIs) + 'codex-discover-skill': 'periodic', + 'codex-review-findings': 'periodic', + 'gemini-discover-skill': 'periodic', + 'gemini-review-findings': 'periodic', + + // Design — gate for cheap functional, periodic for Opus/quality + 'design-consultation-core': 'periodic', + 'design-consultation-existing': 'periodic', + 'design-consultation-research': 'gate', + 'design-consultation-preview': 'gate', + 'plan-design-review-plan-mode': 'periodic', + 'plan-design-review-no-ui-scope': 'gate', + 'design-review-fix': 'periodic', + + // gstack-upgrade + 'gstack-upgrade-happy-path': 'gate', + + // Deploy skills + 'land-and-deploy-workflow': 'gate', + 'canary-workflow': 'gate', + 'benchmark-workflow': 'gate', + 'setup-deploy-workflow': 'gate', + + // Autoplan — periodic (not yet implemented) + 'autoplan-core': 'periodic', + + // Skill routing — periodic (LLM routing is non-deterministic) + 'journey-ideation': 'periodic', + 'journey-plan-eng': 'periodic', + 'journey-think-bigger': 'periodic', + 'journey-debug': 'periodic', + 'journey-qa': 'periodic', + 'journey-code-review': 'periodic', + 'journey-ship': 'periodic', + 'journey-docs': 
'periodic', + 'journey-retro': 'periodic', + 'journey-design-system': 'periodic', + 'journey-visual-qa': 'periodic', +}; + /** * LLM-judge test touchfiles — keyed by test description string. */ @@ -190,17 +301,15 @@ export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = { /** * Changes to any of these files trigger ALL tests (both E2E and LLM-judge). + * + * Keep this list minimal — only files that genuinely affect every test. + * Scoped dependencies (gen-skill-docs, llm-judge, test-server, worktree, + * codex/gemini session runners) belong in individual test entries instead. */ export const GLOBAL_TOUCHFILES = [ - 'test/helpers/session-runner.ts', - 'test/helpers/codex-session-runner.ts', - 'test/helpers/gemini-session-runner.ts', - 'test/helpers/eval-store.ts', - 'test/helpers/llm-judge.ts', - 'scripts/gen-skill-docs.ts', - 'test/helpers/touchfiles.ts', - 'browse/test/test-server.ts', - 'lib/worktree.ts', + 'test/helpers/session-runner.ts', // All E2E tests use this runner + 'test/helpers/eval-store.ts', // All E2E tests store results here + 'test/helpers/touchfiles.ts', // Self-referential — a wrong classification here is dangerous ]; // --- Base branch detection --- diff --git a/test/touchfiles.test.ts b/test/touchfiles.test.ts index 0e24b124..81304c41 100644 --- a/test/touchfiles.test.ts +++ b/test/touchfiles.test.ts @@ -13,6 +13,7 @@ import { selectTests, detectBaseBranch, E2E_TOUCHFILES, + E2E_TIERS, LLM_JUDGE_TOUCHFILES, GLOBAL_TOUCHFILES, } from './helpers/touchfiles'; @@ -91,10 +92,19 @@ describe('selectTests', () => { expect(result.reason).toContain('global'); }); - test('gen-skill-docs.ts is a global touchfile', () => { + test('gen-skill-docs.ts is a scoped touchfile, not global', () => { const result = selectTests(['scripts/gen-skill-docs.ts'], E2E_TOUCHFILES); - expect(result.selected.length).toBe(Object.keys(E2E_TOUCHFILES).length); - expect(result.reason).toContain('global'); + // Should select tests that list gen-skill-docs.ts in their touchfiles, not ALL tests + 
expect(result.selected.length).toBeGreaterThan(0); + expect(result.selected.length).toBeLessThan(Object.keys(E2E_TOUCHFILES).length); + expect(result.reason).toBe('diff'); + // Should include tests that depend on gen-skill-docs.ts + expect(result.selected).toContain('skillmd-setup-discovery'); + expect(result.selected).toContain('contributor-mode'); + expect(result.selected).toContain('journey-ideation'); + // Should NOT include tests that don't depend on it + expect(result.selected).not.toContain('retro'); + expect(result.selected).not.toContain('cso-full-audit'); }); test('unrelated file selects nothing', () => { @@ -143,7 +153,7 @@ describe('selectTests', () => { }); test('global touchfiles work for LLM-judge tests too', () => { - const result = selectTests(['scripts/gen-skill-docs.ts'], LLM_JUDGE_TOUCHFILES); + const result = selectTests(['test/helpers/session-runner.ts'], LLM_JUDGE_TOUCHFILES); expect(result.selected.length).toBe(Object.keys(LLM_JUDGE_TOUCHFILES).length); }); }); @@ -233,6 +243,36 @@ describe('TOUCHFILES completeness', () => { } }); + test('E2E_TIERS covers exactly the same tests as E2E_TOUCHFILES', () => { + const touchfileKeys = new Set(Object.keys(E2E_TOUCHFILES)); + const tierKeys = new Set(Object.keys(E2E_TIERS)); + + const missingFromTiers = [...touchfileKeys].filter(k => !tierKeys.has(k)); + const extraInTiers = [...tierKeys].filter(k => !touchfileKeys.has(k)); + + if (missingFromTiers.length > 0) { + throw new Error( + `E2E tests missing TIER entries: ${missingFromTiers.join(', ')}\n` + + `Add these to E2E_TIERS in test/helpers/touchfiles.ts`, + ); + } + if (extraInTiers.length > 0) { + throw new Error( + `E2E_TIERS has extra entries not in E2E_TOUCHFILES: ${extraInTiers.join(', ')}\n` + + `Remove these from E2E_TIERS or add to E2E_TOUCHFILES`, + ); + } + }); + + test('E2E_TIERS only contains valid tier values', () => { + const validTiers = ['gate', 'periodic']; + for (const [name, tier] of Object.entries(E2E_TIERS)) { + if 
(!validTiers.includes(tier)) { + throw new Error(`E2E_TIERS['${name}'] has invalid tier '${tier}'. Valid: ${validTiers.join(', ')}`); + } + } + }); + test('every LLM-judge test has a TOUCHFILES entry', () => { const llmContent = fs.readFileSync( path.join(ROOT, 'test', 'skill-llm-eval.test.ts'),