mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-05 05:05:08 +02:00
feat: granular touchfiles + 2-tier E2E test system (gate/periodic)
- Shrink GLOBAL_TOUCHFILES from 9 to 3 (only truly global deps) - Move scoped deps (gen-skill-docs, llm-judge, test-server, worktree, codex/gemini session runners) into individual test entries - Add E2E_TIERS map classifying each test as gate or periodic - Replace EVALS_FAST with EVALS_TIER env var (gate/periodic) - Add tier validation test (E2E_TIERS keys must match E2E_TOUCHFILES) - CI runs only gate tests; periodic tests run weekly via cron - Add evals-periodic.yml workflow (Monday 6 AM UTC + manual) - Remove allow_failure flags (gate tests should be reliable) - Add test:gate and test:periodic scripts, remove test:e2e:fast
This commit is contained in:
@@ -0,0 +1,129 @@
|
||||
name: Periodic Evals
|
||||
on:
|
||||
schedule:
|
||||
- cron: '0 6 * * 1' # Monday 6 AM UTC
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: evals-periodic
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
IMAGE: ghcr.io/${{ github.repository }}/ci
|
||||
EVALS_TIER: periodic
|
||||
EVALS_ALL: 1 # Ignore diff — run all periodic tests
|
||||
|
||||
jobs:
|
||||
build-image:
|
||||
runs-on: ubicloud-standard-2
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write
|
||||
outputs:
|
||||
image-tag: ${{ steps.meta.outputs.tag }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- id: meta
|
||||
run: echo "tag=${{ env.IMAGE }}:${{ hashFiles('.github/docker/Dockerfile.ci', 'package.json') }}" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Check if image exists
|
||||
id: check
|
||||
run: |
|
||||
if docker manifest inspect ${{ steps.meta.outputs.tag }} > /dev/null 2>&1; then
|
||||
echo "exists=true" >> "$GITHUB_OUTPUT"
|
||||
else
|
||||
echo "exists=false" >> "$GITHUB_OUTPUT"
|
||||
fi
|
||||
|
||||
- if: steps.check.outputs.exists == 'false'
|
||||
run: cp package.json .github/docker/
|
||||
|
||||
- if: steps.check.outputs.exists == 'false'
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: .github/docker
|
||||
file: .github/docker/Dockerfile.ci
|
||||
push: true
|
||||
tags: |
|
||||
${{ steps.meta.outputs.tag }}
|
||||
${{ env.IMAGE }}:latest
|
||||
|
||||
evals:
|
||||
runs-on: ${{ matrix.suite.runner || 'ubicloud-standard-2' }}
|
||||
needs: build-image
|
||||
container:
|
||||
image: ${{ needs.build-image.outputs.image-tag }}
|
||||
credentials:
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
options: --user runner
|
||||
timeout-minutes: 25
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
suite:
|
||||
- name: e2e-plan
|
||||
file: test/skill-e2e-plan.test.ts
|
||||
- name: e2e-design
|
||||
file: test/skill-e2e-design.test.ts
|
||||
- name: e2e-qa-bugs
|
||||
file: test/skill-e2e-qa-bugs.test.ts
|
||||
- name: e2e-qa-workflow
|
||||
file: test/skill-e2e-qa-workflow.test.ts
|
||||
- name: e2e-review
|
||||
file: test/skill-e2e-review.test.ts
|
||||
- name: e2e-workflow
|
||||
file: test/skill-e2e-workflow.test.ts
|
||||
- name: e2e-routing
|
||||
file: test/skill-routing-e2e.test.ts
|
||||
- name: e2e-codex
|
||||
file: test/codex-e2e.test.ts
|
||||
- name: e2e-gemini
|
||||
file: test/gemini-e2e.test.ts
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Fix bun temp
|
||||
run: |
|
||||
mkdir -p /home/runner/.cache/bun
|
||||
{
|
||||
echo "BUN_INSTALL_CACHE_DIR=/home/runner/.cache/bun"
|
||||
echo "BUN_TMPDIR=/home/runner/.cache/bun"
|
||||
echo "TMPDIR=/home/runner/.cache"
|
||||
} >> "$GITHUB_ENV"
|
||||
|
||||
- name: Restore deps
|
||||
run: |
|
||||
if [ -d /opt/node_modules_cache ] && diff -q /opt/node_modules_cache/.package.json package.json >/dev/null 2>&1; then
|
||||
ln -s /opt/node_modules_cache node_modules
|
||||
else
|
||||
bun install
|
||||
fi
|
||||
|
||||
- run: bun run build
|
||||
|
||||
- name: Run ${{ matrix.suite.name }}
|
||||
env:
|
||||
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
||||
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||
GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
|
||||
EVALS_CONCURRENCY: "40"
|
||||
PLAYWRIGHT_BROWSERS_PATH: /opt/playwright-browsers
|
||||
run: EVALS=1 bun test --retry 2 --concurrent --max-concurrency 40 ${{ matrix.suite.file }}
|
||||
|
||||
- name: Upload eval results
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: eval-periodic-${{ matrix.suite.name }}
|
||||
path: ~/.gstack-dev/evals/*.json
|
||||
retention-days: 90
|
||||
@@ -10,6 +10,7 @@ concurrency:
|
||||
|
||||
env:
|
||||
IMAGE: ghcr.io/${{ github.repository }}/ci
|
||||
EVALS_TIER: gate
|
||||
|
||||
jobs:
|
||||
# Build Docker image with pre-baked toolchain (cached — only rebuilds on Dockerfile/lockfile change)
|
||||
@@ -87,10 +88,8 @@ jobs:
|
||||
file: test/skill-e2e-review.test.ts
|
||||
- name: e2e-workflow
|
||||
file: test/skill-e2e-workflow.test.ts
|
||||
allow_failure: true # /ship + /setup-browser-cookies are env-dependent
|
||||
- name: e2e-routing
|
||||
file: test/skill-routing-e2e.test.ts
|
||||
allow_failure: true # LLM routing is non-deterministic
|
||||
- name: e2e-codex
|
||||
file: test/codex-e2e.test.ts
|
||||
- name: e2e-gemini
|
||||
|
||||
@@ -7,6 +7,8 @@ bun install # install dependencies
|
||||
bun test # run free tests (browse + snapshot + skill validation)
|
||||
bun run test:evals # run paid evals: LLM judge + E2E (diff-based, ~$4/run max)
|
||||
bun run test:evals:all # run ALL paid evals regardless of diff
|
||||
bun run test:gate # run gate-tier tests only (CI default, blocks merge)
|
||||
bun run test:periodic # run periodic-tier tests only (weekly cron / manual)
|
||||
bun run test:e2e # run E2E tests only (diff-based, ~$3.85/run max)
|
||||
bun run test:e2e:all # run ALL E2E tests regardless of diff
|
||||
bun run eval:select # show which tests would run based on current diff
|
||||
@@ -29,9 +31,17 @@ against the previous run.
|
||||
**Diff-based test selection:** `test:evals` and `test:e2e` auto-select tests based
|
||||
on `git diff` against the base branch. Each test declares its file dependencies in
|
||||
`test/helpers/touchfiles.ts`. Changes to global touchfiles (session-runner, eval-store,
|
||||
llm-judge, gen-skill-docs) trigger all tests. Use `EVALS_ALL=1` or the `:all` script
|
||||
touchfiles.ts itself) trigger all tests. Use `EVALS_ALL=1` or the `:all` script
|
||||
variants to force all tests. Run `eval:select` to preview which tests would run.
|
||||
|
||||
**Two-tier system:** Tests are classified as `gate` or `periodic` in `E2E_TIERS`
|
||||
(in `test/helpers/touchfiles.ts`). CI runs only gate tests (`EVALS_TIER=gate`);
|
||||
periodic tests run weekly via cron or manually. Use `EVALS_TIER=gate` or
|
||||
`EVALS_TIER=periodic` to filter. When adding new E2E tests, classify them:
|
||||
1. Safety guardrail or deterministic functional test? -> `gate`
|
||||
2. Quality benchmark, Opus model test, or non-deterministic? -> `periodic`
|
||||
3. Requires external service (Codex, Gemini)? -> `periodic`
|
||||
|
||||
## Testing
|
||||
|
||||
```bash
|
||||
|
||||
+2
-1
@@ -17,7 +17,8 @@
|
||||
"test:evals:all": "EVALS=1 EVALS_ALL=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-llm-eval.test.ts test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts",
|
||||
"test:e2e": "EVALS=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts",
|
||||
"test:e2e:all": "EVALS=1 EVALS_ALL=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts",
|
||||
"test:e2e:fast": "EVALS=1 EVALS_FAST=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts",
|
||||
"test:gate": "EVALS=1 EVALS_TIER=gate bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-llm-eval.test.ts test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts",
|
||||
"test:periodic": "EVALS=1 EVALS_TIER=periodic EVALS_ALL=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts",
|
||||
"test:codex": "EVALS=1 bun test test/codex-e2e.test.ts",
|
||||
"test:codex:all": "EVALS=1 EVALS_ALL=1 bun test test/codex-e2e.test.ts",
|
||||
"test:gemini": "EVALS=1 bun test test/gemini-e2e.test.ts",
|
||||
|
||||
+14
-14
@@ -9,7 +9,7 @@ import { describe, test, beforeAll, afterAll } from 'bun:test';
|
||||
import type { SkillTestResult } from './session-runner';
|
||||
import { EvalCollector, judgePassed } from './eval-store';
|
||||
import type { EvalTestEntry } from './eval-store';
|
||||
import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES } from './touchfiles';
|
||||
import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, E2E_TIERS, GLOBAL_TOUCHFILES } from './touchfiles';
|
||||
import { WorktreeManager } from '../../lib/worktree';
|
||||
import type { HarvestResult } from '../../lib/worktree';
|
||||
import { spawnSync } from 'child_process';
|
||||
@@ -32,13 +32,6 @@ export const evalsEnabled = !!process.env.EVALS;
|
||||
// Set EVALS_ALL=1 to force all tests. Set EVALS_BASE to override base branch.
|
||||
export let selectedTests: string[] | null = null; // null = run all
|
||||
|
||||
// EVALS_FAST: skip the 8 slowest tests (all Opus quality tests) for quick feedback
|
||||
const FAST_EXCLUDED_TESTS = [
|
||||
'plan-ceo-review-selective', 'plan-ceo-review', 'retro', 'retro-base-branch',
|
||||
'design-consultation-core', 'design-consultation-existing',
|
||||
'qa-fix-loop', 'design-review-fix',
|
||||
];
|
||||
|
||||
if (evalsEnabled && !process.env.EVALS_ALL) {
|
||||
const baseBranch = process.env.EVALS_BASE
|
||||
|| detectBaseBranch(ROOT)
|
||||
@@ -57,15 +50,22 @@ if (evalsEnabled && !process.env.EVALS_ALL) {
|
||||
// If changedFiles is empty (e.g., on main branch), selectedTests stays null → run all
|
||||
}
|
||||
|
||||
// Apply EVALS_FAST filter after diff-based selection
|
||||
if (evalsEnabled && process.env.EVALS_FAST) {
|
||||
// EVALS_TIER: filter tests by tier after diff-based selection.
|
||||
// 'gate' = gate tests only (CI default — blocks merge)
|
||||
// 'periodic' = periodic tests only (weekly cron / manual)
|
||||
// not set = run all selected tests (local dev default, backward compat)
|
||||
if (evalsEnabled && process.env.EVALS_TIER) {
|
||||
const tier = process.env.EVALS_TIER as 'gate' | 'periodic';
|
||||
const tierTests = Object.entries(E2E_TIERS)
|
||||
.filter(([, t]) => t === tier)
|
||||
.map(([name]) => name);
|
||||
|
||||
if (selectedTests === null) {
|
||||
// Run all minus excluded
|
||||
selectedTests = Object.keys(E2E_TOUCHFILES).filter(t => !FAST_EXCLUDED_TESTS.includes(t));
|
||||
selectedTests = tierTests;
|
||||
} else {
|
||||
selectedTests = selectedTests.filter(t => !FAST_EXCLUDED_TESTS.includes(t));
|
||||
selectedTests = selectedTests.filter(t => tierTests.includes(t));
|
||||
}
|
||||
process.stderr.write(`EVALS_FAST: excluded ${FAST_EXCLUDED_TESTS.length} slow tests, running ${selectedTests.length}\n\n`);
|
||||
process.stderr.write(`EVALS_TIER=${tier}: ${selectedTests.length} tests\n\n`);
|
||||
}
|
||||
|
||||
export const describeE2E = evalsEnabled ? describe : describe.skip;
|
||||
|
||||
+139
-30
@@ -32,25 +32,25 @@ export function matchGlob(file: string, pattern: string): boolean {
|
||||
* Each test lists the file patterns that, if changed, require the test to run.
|
||||
*/
|
||||
export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
// Browse core
|
||||
'browse-basic': ['browse/src/**'],
|
||||
'browse-snapshot': ['browse/src/**'],
|
||||
// Browse core (+ test-server dependency)
|
||||
'browse-basic': ['browse/src/**', 'browse/test/test-server.ts'],
|
||||
'browse-snapshot': ['browse/src/**', 'browse/test/test-server.ts'],
|
||||
|
||||
// SKILL.md setup + preamble (depend on ROOT SKILL.md only)
|
||||
'skillmd-setup-discovery': ['SKILL.md', 'SKILL.md.tmpl'],
|
||||
'skillmd-no-local-binary': ['SKILL.md', 'SKILL.md.tmpl'],
|
||||
'skillmd-outside-git': ['SKILL.md', 'SKILL.md.tmpl'],
|
||||
// SKILL.md setup + preamble (depend on ROOT SKILL.md + gen-skill-docs)
|
||||
'skillmd-setup-discovery': ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
'skillmd-no-local-binary': ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
'skillmd-outside-git': ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
|
||||
'contributor-mode': ['SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
'session-awareness': ['SKILL.md', 'SKILL.md.tmpl'],
|
||||
'session-awareness': ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
|
||||
// QA
|
||||
'qa-quick': ['qa/**', 'browse/src/**'],
|
||||
'qa-b6-static': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval.html', 'test/fixtures/qa-eval-ground-truth.json'],
|
||||
'qa-b7-spa': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval-spa.html', 'test/fixtures/qa-eval-spa-ground-truth.json'],
|
||||
'qa-b8-checkout': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval-checkout.html', 'test/fixtures/qa-eval-checkout-ground-truth.json'],
|
||||
// QA (+ test-server dependency)
|
||||
'qa-quick': ['qa/**', 'browse/src/**', 'browse/test/test-server.ts'],
|
||||
'qa-b6-static': ['qa/**', 'browse/src/**', 'browse/test/test-server.ts', 'test/helpers/llm-judge.ts', 'browse/test/fixtures/qa-eval.html', 'test/fixtures/qa-eval-ground-truth.json'],
|
||||
'qa-b7-spa': ['qa/**', 'browse/src/**', 'browse/test/test-server.ts', 'test/helpers/llm-judge.ts', 'browse/test/fixtures/qa-eval-spa.html', 'test/fixtures/qa-eval-spa-ground-truth.json'],
|
||||
'qa-b8-checkout': ['qa/**', 'browse/src/**', 'browse/test/test-server.ts', 'test/helpers/llm-judge.ts', 'browse/test/fixtures/qa-eval-checkout.html', 'test/fixtures/qa-eval-checkout-ground-truth.json'],
|
||||
'qa-only-no-fix': ['qa-only/**', 'qa/templates/**'],
|
||||
'qa-fix-loop': ['qa/**', 'browse/src/**'],
|
||||
'qa-fix-loop': ['qa/**', 'browse/src/**', 'browse/test/test-server.ts'],
|
||||
'qa-bootstrap': ['qa/**', 'ship/**'],
|
||||
|
||||
// Review
|
||||
@@ -94,13 +94,13 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
// Codex (Claude E2E — tests /codex skill via Claude)
|
||||
'codex-review': ['codex/**'],
|
||||
|
||||
// Codex E2E (tests skills via Codex CLI)
|
||||
'codex-discover-skill': ['codex/**', '.agents/skills/**', 'test/helpers/codex-session-runner.ts'],
|
||||
'codex-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'codex/**', 'test/helpers/codex-session-runner.ts'],
|
||||
// Codex E2E (tests skills via Codex CLI + worktree)
|
||||
'codex-discover-skill': ['codex/**', '.agents/skills/**', 'test/helpers/codex-session-runner.ts', 'lib/worktree.ts'],
|
||||
'codex-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'codex/**', 'test/helpers/codex-session-runner.ts', 'lib/worktree.ts'],
|
||||
|
||||
// Gemini E2E (tests skills via Gemini CLI)
|
||||
'gemini-discover-skill': ['.agents/skills/**', 'test/helpers/gemini-session-runner.ts'],
|
||||
'gemini-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'test/helpers/gemini-session-runner.ts'],
|
||||
// Gemini E2E (tests skills via Gemini CLI + worktree)
|
||||
'gemini-discover-skill': ['.agents/skills/**', 'test/helpers/gemini-session-runner.ts', 'lib/worktree.ts'],
|
||||
'gemini-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'test/helpers/gemini-session-runner.ts', 'lib/worktree.ts'],
|
||||
|
||||
|
||||
// Coverage audit (shared fixture) + triage
|
||||
@@ -110,7 +110,7 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
'ship-triage': ['ship/**', 'bin/gstack-repo-mode'],
|
||||
|
||||
// Design
|
||||
'design-consultation-core': ['design-consultation/**', 'scripts/gen-skill-docs.ts'],
|
||||
'design-consultation-core': ['design-consultation/**', 'scripts/gen-skill-docs.ts', 'test/helpers/llm-judge.ts'],
|
||||
'design-consultation-existing': ['design-consultation/**', 'scripts/gen-skill-docs.ts'],
|
||||
'design-consultation-research': ['design-consultation/**', 'scripts/gen-skill-docs.ts'],
|
||||
'design-consultation-preview': ['design-consultation/**', 'scripts/gen-skill-docs.ts'],
|
||||
@@ -144,6 +144,117 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
'journey-visual-qa': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
};
|
||||
|
||||
/**
|
||||
* E2E test tiers — 'gate' blocks PRs, 'periodic' runs weekly/on-demand.
|
||||
* Must have exactly the same keys as E2E_TOUCHFILES.
|
||||
*/
|
||||
export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
|
||||
// Browse core — gate (if browse breaks, everything breaks)
|
||||
'browse-basic': 'gate',
|
||||
'browse-snapshot': 'gate',
|
||||
|
||||
// SKILL.md setup — gate (if setup breaks, no skill works)
|
||||
'skillmd-setup-discovery': 'gate',
|
||||
'skillmd-no-local-binary': 'gate',
|
||||
'skillmd-outside-git': 'gate',
|
||||
'contributor-mode': 'gate',
|
||||
'session-awareness': 'gate',
|
||||
|
||||
// QA — gate for functional, periodic for quality/benchmarks
|
||||
'qa-quick': 'gate',
|
||||
'qa-b6-static': 'periodic',
|
||||
'qa-b7-spa': 'periodic',
|
||||
'qa-b8-checkout': 'periodic',
|
||||
'qa-only-no-fix': 'gate', // CRITICAL guardrail: Edit tool forbidden
|
||||
'qa-fix-loop': 'periodic',
|
||||
'qa-bootstrap': 'gate',
|
||||
|
||||
// Review — gate for functional/guardrails, periodic for quality
|
||||
'review-sql-injection': 'gate', // Security guardrail
|
||||
'review-enum-completeness': 'gate',
|
||||
'review-base-branch': 'gate',
|
||||
'review-design-lite': 'periodic', // 4/7 threshold is subjective
|
||||
'review-coverage-audit': 'gate',
|
||||
|
||||
// Office Hours
|
||||
'office-hours-spec-review': 'gate',
|
||||
|
||||
// Plan reviews — gate for cheap functional, periodic for Opus quality
|
||||
'plan-ceo-review': 'periodic',
|
||||
'plan-ceo-review-selective': 'periodic',
|
||||
'plan-ceo-review-benefits': 'gate',
|
||||
'plan-eng-review': 'periodic',
|
||||
'plan-eng-review-artifact': 'periodic',
|
||||
'plan-eng-coverage-audit': 'gate',
|
||||
|
||||
// Ship — gate (end-to-end ship path)
|
||||
'ship-base-branch': 'gate',
|
||||
'ship-local-workflow': 'gate',
|
||||
'ship-coverage-audit': 'gate',
|
||||
'ship-triage': 'gate',
|
||||
|
||||
// Setup browser cookies
|
||||
'setup-cookies-detect': 'gate',
|
||||
|
||||
// Retro — gate for cheap branch detection, periodic for full Opus retro
|
||||
'retro': 'periodic',
|
||||
'retro-base-branch': 'gate',
|
||||
|
||||
// Global discover
|
||||
'global-discover': 'gate',
|
||||
|
||||
// CSO — gate for security guardrails, periodic for quality
|
||||
'cso-full-audit': 'gate', // Hardcoded secrets detection
|
||||
'cso-diff-mode': 'gate',
|
||||
'cso-infra-scope': 'periodic',
|
||||
|
||||
// Document-release — gate (CHANGELOG guardrail)
|
||||
'document-release': 'gate',
|
||||
|
||||
// Codex — periodic (Opus, requires codex CLI)
|
||||
'codex-review': 'periodic',
|
||||
|
||||
// Multi-AI — periodic (require external CLIs)
|
||||
'codex-discover-skill': 'periodic',
|
||||
'codex-review-findings': 'periodic',
|
||||
'gemini-discover-skill': 'periodic',
|
||||
'gemini-review-findings': 'periodic',
|
||||
|
||||
// Design — gate for cheap functional, periodic for Opus/quality
|
||||
'design-consultation-core': 'periodic',
|
||||
'design-consultation-existing': 'periodic',
|
||||
'design-consultation-research': 'gate',
|
||||
'design-consultation-preview': 'gate',
|
||||
'plan-design-review-plan-mode': 'periodic',
|
||||
'plan-design-review-no-ui-scope': 'gate',
|
||||
'design-review-fix': 'periodic',
|
||||
|
||||
// gstack-upgrade
|
||||
'gstack-upgrade-happy-path': 'gate',
|
||||
|
||||
// Deploy skills
|
||||
'land-and-deploy-workflow': 'gate',
|
||||
'canary-workflow': 'gate',
|
||||
'benchmark-workflow': 'gate',
|
||||
'setup-deploy-workflow': 'gate',
|
||||
|
||||
// Autoplan — periodic (not yet implemented)
|
||||
'autoplan-core': 'periodic',
|
||||
|
||||
// Skill routing — periodic (LLM routing is non-deterministic)
|
||||
'journey-ideation': 'periodic',
|
||||
'journey-plan-eng': 'periodic',
|
||||
'journey-think-bigger': 'periodic',
|
||||
'journey-debug': 'periodic',
|
||||
'journey-qa': 'periodic',
|
||||
'journey-code-review': 'periodic',
|
||||
'journey-ship': 'periodic',
|
||||
'journey-docs': 'periodic',
|
||||
'journey-retro': 'periodic',
|
||||
'journey-design-system': 'periodic',
|
||||
'journey-visual-qa': 'periodic',
|
||||
};
|
||||
|
||||
/**
|
||||
* LLM-judge test touchfiles — keyed by test description string.
|
||||
*/
|
||||
@@ -190,17 +301,15 @@ export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
|
||||
|
||||
/**
|
||||
* Changes to any of these files trigger ALL tests (both E2E and LLM-judge).
|
||||
*
|
||||
* Keep this list minimal — only files that genuinely affect every test.
|
||||
* Scoped dependencies (gen-skill-docs, llm-judge, test-server, worktree,
|
||||
* codex/gemini session runners) belong in individual test entries instead.
|
||||
*/
|
||||
export const GLOBAL_TOUCHFILES = [
|
||||
'test/helpers/session-runner.ts',
|
||||
'test/helpers/codex-session-runner.ts',
|
||||
'test/helpers/gemini-session-runner.ts',
|
||||
'test/helpers/eval-store.ts',
|
||||
'test/helpers/llm-judge.ts',
|
||||
'scripts/gen-skill-docs.ts',
|
||||
'test/helpers/touchfiles.ts',
|
||||
'browse/test/test-server.ts',
|
||||
'lib/worktree.ts',
|
||||
'test/helpers/session-runner.ts', // All E2E tests use this runner
|
||||
'test/helpers/eval-store.ts', // All E2E tests store results here
|
||||
'test/helpers/touchfiles.ts', // Self-referential — reclassifying wrong is dangerous
|
||||
];
|
||||
|
||||
// --- Base branch detection ---
|
||||
|
||||
+44
-4
@@ -13,6 +13,7 @@ import {
|
||||
selectTests,
|
||||
detectBaseBranch,
|
||||
E2E_TOUCHFILES,
|
||||
E2E_TIERS,
|
||||
LLM_JUDGE_TOUCHFILES,
|
||||
GLOBAL_TOUCHFILES,
|
||||
} from './helpers/touchfiles';
|
||||
@@ -91,10 +92,19 @@ describe('selectTests', () => {
|
||||
expect(result.reason).toContain('global');
|
||||
});
|
||||
|
||||
test('gen-skill-docs.ts is a global touchfile', () => {
|
||||
test('gen-skill-docs.ts is a scoped touchfile, not global', () => {
|
||||
const result = selectTests(['scripts/gen-skill-docs.ts'], E2E_TOUCHFILES);
|
||||
expect(result.selected.length).toBe(Object.keys(E2E_TOUCHFILES).length);
|
||||
expect(result.reason).toContain('global');
|
||||
// Should select tests that list gen-skill-docs.ts in their touchfiles, not ALL tests
|
||||
expect(result.selected.length).toBeGreaterThan(0);
|
||||
expect(result.selected.length).toBeLessThan(Object.keys(E2E_TOUCHFILES).length);
|
||||
expect(result.reason).toBe('diff');
|
||||
// Should include tests that depend on gen-skill-docs.ts
|
||||
expect(result.selected).toContain('skillmd-setup-discovery');
|
||||
expect(result.selected).toContain('contributor-mode');
|
||||
expect(result.selected).toContain('journey-ideation');
|
||||
// Should NOT include tests that don't depend on it
|
||||
expect(result.selected).not.toContain('retro');
|
||||
expect(result.selected).not.toContain('cso-full-audit');
|
||||
});
|
||||
|
||||
test('unrelated file selects nothing', () => {
|
||||
@@ -143,7 +153,7 @@ describe('selectTests', () => {
|
||||
});
|
||||
|
||||
test('global touchfiles work for LLM-judge tests too', () => {
|
||||
const result = selectTests(['scripts/gen-skill-docs.ts'], LLM_JUDGE_TOUCHFILES);
|
||||
const result = selectTests(['test/helpers/session-runner.ts'], LLM_JUDGE_TOUCHFILES);
|
||||
expect(result.selected.length).toBe(Object.keys(LLM_JUDGE_TOUCHFILES).length);
|
||||
});
|
||||
});
|
||||
@@ -233,6 +243,36 @@ describe('TOUCHFILES completeness', () => {
|
||||
}
|
||||
});
|
||||
|
||||
test('E2E_TIERS covers exactly the same tests as E2E_TOUCHFILES', () => {
|
||||
const touchfileKeys = new Set(Object.keys(E2E_TOUCHFILES));
|
||||
const tierKeys = new Set(Object.keys(E2E_TIERS));
|
||||
|
||||
const missingFromTiers = [...touchfileKeys].filter(k => !tierKeys.has(k));
|
||||
const extraInTiers = [...tierKeys].filter(k => !touchfileKeys.has(k));
|
||||
|
||||
if (missingFromTiers.length > 0) {
|
||||
throw new Error(
|
||||
`E2E tests missing TIER entries: ${missingFromTiers.join(', ')}\n` +
|
||||
`Add these to E2E_TIERS in test/helpers/touchfiles.ts`,
|
||||
);
|
||||
}
|
||||
if (extraInTiers.length > 0) {
|
||||
throw new Error(
|
||||
`E2E_TIERS has extra entries not in E2E_TOUCHFILES: ${extraInTiers.join(', ')}\n` +
|
||||
`Remove these from E2E_TIERS or add to E2E_TOUCHFILES`,
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
test('E2E_TIERS only contains valid tier values', () => {
|
||||
const validTiers = ['gate', 'periodic'];
|
||||
for (const [name, tier] of Object.entries(E2E_TIERS)) {
|
||||
if (!validTiers.includes(tier)) {
|
||||
throw new Error(`E2E_TIERS['${name}'] has invalid tier '${tier}'. Valid: ${validTiers.join(', ')}`);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
test('every LLM-judge test has a TOUCHFILES entry', () => {
|
||||
const llmContent = fs.readFileSync(
|
||||
path.join(ROOT, 'test', 'skill-llm-eval.test.ts'),
|
||||
|
||||
Reference in New Issue
Block a user