mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-02 11:45:20 +02:00
chore: merge main and resolve conflicts
Main took 0.11.15.0 for E2E test coverage. Bumped our entry to 0.11.16.0.
This commit is contained in:
+14
-1
@@ -1,6 +1,6 @@
|
||||
# Changelog
|
||||
|
||||
## [0.11.15.0] - 2026-03-24 — Ship With Teeth
|
||||
## [0.11.16.0] - 2026-03-24 — Ship With Teeth
|
||||
|
||||
`/ship` and `/review` now actually enforce the quality gates they've been talking about. Coverage audit becomes a real gate (not just a diagram), plan completion gets verified against the diff, and verification steps from your plan run automatically.
|
||||
|
||||
@@ -15,6 +15,19 @@
|
||||
- **Ship metrics logging.** Coverage %, plan completion ratio, and verification results are logged to review JSONL for /retro to track trends.
|
||||
- **Plan completion in /retro.** Weekly retros now show plan completion rates across shipped branches.
|
||||
|
||||
## [0.11.15.0] - 2026-03-24 — E2E Test Coverage for Plan Reviews & Codex
|
||||
|
||||
### Added
|
||||
|
||||
- **E2E tests verify plan review reports appear at the bottom of plans.** The `/plan-eng-review` review report is now tested end-to-end — if it stops writing `## GSTACK REVIEW REPORT` to the plan file, the test catches it.
|
||||
- **E2E tests verify Codex is offered in every plan skill.** Four new lightweight tests confirm that `/office-hours`, `/plan-ceo-review`, `/plan-design-review`, and `/plan-eng-review` all check for Codex availability, prompt the user, and handle the fallback when Codex is unavailable.
|
||||
|
||||
### For contributors
|
||||
|
||||
- New E2E tests in `test/skill-e2e-plan.test.ts`: `plan-review-report`, `codex-offered-eng-review`, `codex-offered-ceo-review`, `codex-offered-office-hours`, `codex-offered-design-review`
|
||||
- Updated touchfile mappings and selection count assertions
|
||||
- Added `touchfiles` to the documented global touchfile list in CLAUDE.md
|
||||
|
||||
## [0.11.14.0] - 2026-03-24 — Windows Browse Fix
|
||||
|
||||
### Fixed
|
||||
|
||||
@@ -29,7 +29,7 @@ against the previous run.
|
||||
**Diff-based test selection:** `test:evals` and `test:e2e` auto-select tests based
|
||||
on `git diff` against the base branch. Each test declares its file dependencies in
|
||||
`test/helpers/touchfiles.ts`. Changes to global touchfiles (session-runner, eval-store,
|
||||
llm-judge, gen-skill-docs) trigger all tests. Use `EVALS_ALL=1` or the `:all` script
|
||||
llm-judge, gen-skill-docs, touchfiles) trigger all tests. Use `EVALS_ALL=1` or the `:all` script
|
||||
variants to force all tests. Run `eval:select` to preview which tests would run.
|
||||
|
||||
## Testing
|
||||
|
||||
@@ -68,6 +68,13 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
'plan-ceo-review-benefits': ['plan-ceo-review/**', 'scripts/gen-skill-docs.ts'],
|
||||
'plan-eng-review': ['plan-eng-review/**'],
|
||||
'plan-eng-review-artifact': ['plan-eng-review/**'],
|
||||
'plan-review-report': ['plan-eng-review/**', 'scripts/gen-skill-docs.ts'],
|
||||
|
||||
// Codex offering verification
|
||||
'codex-offered-office-hours': ['office-hours/**', 'scripts/gen-skill-docs.ts'],
|
||||
'codex-offered-ceo-review': ['plan-ceo-review/**', 'scripts/gen-skill-docs.ts'],
|
||||
'codex-offered-design-review': ['plan-design-review/**', 'scripts/gen-skill-docs.ts'],
|
||||
'codex-offered-eng-review': ['plan-eng-review/**', 'scripts/gen-skill-docs.ts'],
|
||||
|
||||
// Ship
|
||||
'ship-base-branch': ['ship/**', 'bin/gstack-repo-mode'],
|
||||
|
||||
@@ -535,6 +535,199 @@ Write your summary to ${benefitsDir}/benefits-summary.md`,
|
||||
}, 180_000);
|
||||
});
|
||||
|
||||
// --- Plan Review Report E2E ---
|
||||
// Verifies that plan-eng-review writes a "## GSTACK REVIEW REPORT" section
|
||||
// to the bottom of the plan file (the living review status footer).
|
||||
|
||||
describeIfSelected('Plan Review Report E2E', ['plan-review-report'], () => {
|
||||
let planDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
planDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-review-report-'));
|
||||
const run = (cmd: string, args: string[]) =>
|
||||
spawnSync(cmd, args, { cwd: planDir, stdio: 'pipe', timeout: 5000 });
|
||||
|
||||
run('git', ['init', '-b', 'main']);
|
||||
run('git', ['config', 'user.email', 'test@test.com']);
|
||||
run('git', ['config', 'user.name', 'Test']);
|
||||
|
||||
fs.writeFileSync(path.join(planDir, 'plan.md'), `# Plan: Add Notifications System
|
||||
|
||||
## Context
|
||||
We're building a real-time notification system for our SaaS app.
|
||||
|
||||
## Changes
|
||||
1. WebSocket server for push notifications
|
||||
2. Notification preferences API
|
||||
3. Email digest fallback for offline users
|
||||
4. PostgreSQL table for notification storage
|
||||
|
||||
## Architecture
|
||||
- WebSocket: Socket.io on Express
|
||||
- Queue: Bull + Redis for email digests
|
||||
- Storage: PostgreSQL notifications table
|
||||
- Frontend: React toast component
|
||||
|
||||
## Open questions
|
||||
- Retry policy for failed WebSocket delivery?
|
||||
- Max notifications stored per user?
|
||||
`);
|
||||
|
||||
run('git', ['add', '.']);
|
||||
run('git', ['commit', '-m', 'add plan']);
|
||||
|
||||
// Copy plan-eng-review skill
|
||||
fs.mkdirSync(path.join(planDir, 'plan-eng-review'), { recursive: true });
|
||||
fs.copyFileSync(
|
||||
path.join(ROOT, 'plan-eng-review', 'SKILL.md'),
|
||||
path.join(planDir, 'plan-eng-review', 'SKILL.md'),
|
||||
);
|
||||
});
|
||||
|
||||
afterAll(() => {
|
||||
try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/plan-eng-review writes GSTACK REVIEW REPORT to plan file', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read plan-eng-review/SKILL.md for the review workflow.
|
||||
|
||||
Read plan.md — that's the plan to review. This is a standalone plan document, not a codebase — skip any codebase exploration steps.
|
||||
|
||||
Proceed directly to the full review. Skip any AskUserQuestion calls — this is non-interactive.
|
||||
Skip the preamble bash block, lake intro, telemetry, and contributor mode sections.
|
||||
|
||||
CRITICAL REQUIREMENT: plan.md IS the plan file for this review session. After completing your review, you MUST write a "## GSTACK REVIEW REPORT" section to the END of plan.md, exactly as described in the "Plan File Review Report" section of SKILL.md. If gstack-review-read is not available or returns NO_REVIEWS, write the placeholder table with all four review rows (CEO, Codex, Eng, Design). Use the Edit tool to append to plan.md — do NOT overwrite the existing plan content.
|
||||
|
||||
This review report at the bottom of the plan is the MOST IMPORTANT deliverable of this test.`,
|
||||
workingDirectory: planDir,
|
||||
maxTurns: 20,
|
||||
timeout: 360_000,
|
||||
testName: 'plan-review-report',
|
||||
runId,
|
||||
model: 'claude-opus-4-6',
|
||||
});
|
||||
|
||||
logCost('/plan-eng-review report', result);
|
||||
recordE2E(evalCollector, '/plan-review-report', 'Plan Review Report E2E', result, {
|
||||
passed: ['success', 'error_max_turns'].includes(result.exitReason),
|
||||
});
|
||||
expect(['success', 'error_max_turns']).toContain(result.exitReason);
|
||||
|
||||
// Verify the review report was written to the plan file
|
||||
const planContent = fs.readFileSync(path.join(planDir, 'plan.md'), 'utf-8');
|
||||
|
||||
// Original plan content should still be present
|
||||
expect(planContent).toContain('# Plan: Add Notifications System');
|
||||
expect(planContent).toContain('WebSocket');
|
||||
|
||||
// Review report section must exist
|
||||
expect(planContent).toContain('## GSTACK REVIEW REPORT');
|
||||
|
||||
// Report should be at the bottom of the file
|
||||
const reportIndex = planContent.lastIndexOf('## GSTACK REVIEW REPORT');
|
||||
const afterReport = planContent.slice(reportIndex);
|
||||
|
||||
// Should contain the review table with standard rows
|
||||
expect(afterReport).toMatch(/\|\s*Review\s*\|/);
|
||||
expect(afterReport).toContain('CEO Review');
|
||||
expect(afterReport).toContain('Eng Review');
|
||||
expect(afterReport).toContain('Design Review');
|
||||
|
||||
console.log('Plan review report found at bottom of plan.md');
|
||||
}, 420_000);
|
||||
});
|
||||
|
||||
// --- Codex Offering E2E ---
|
||||
// Verifies that Codex is properly offered (with availability check, user prompt,
|
||||
// and fallback) in office-hours, plan-ceo-review, plan-design-review, plan-eng-review.
|
||||
|
||||
describeIfSelected('Codex Offering E2E', [
|
||||
'codex-offered-office-hours', 'codex-offered-ceo-review',
|
||||
'codex-offered-design-review', 'codex-offered-eng-review',
|
||||
], () => {
|
||||
let testDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-codex-offer-'));
|
||||
const run = (cmd: string, args: string[]) =>
|
||||
spawnSync(cmd, args, { cwd: testDir, stdio: 'pipe', timeout: 5000 });
|
||||
|
||||
run('git', ['init', '-b', 'main']);
|
||||
run('git', ['config', 'user.email', 'test@test.com']);
|
||||
run('git', ['config', 'user.name', 'Test']);
|
||||
fs.writeFileSync(path.join(testDir, 'README.md'), '# Test Project\n');
|
||||
run('git', ['add', '.']);
|
||||
run('git', ['commit', '-m', 'init']);
|
||||
|
||||
// Copy all 4 SKILL.md files
|
||||
for (const skill of ['office-hours', 'plan-ceo-review', 'plan-design-review', 'plan-eng-review']) {
|
||||
fs.mkdirSync(path.join(testDir, skill), { recursive: true });
|
||||
fs.copyFileSync(
|
||||
path.join(ROOT, skill, 'SKILL.md'),
|
||||
path.join(testDir, skill, 'SKILL.md'),
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
afterAll(() => {
|
||||
try { fs.rmSync(testDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
async function checkCodexOffering(skill: string, testName: string, featureName: string) {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read ${skill}/SKILL.md. Search for ALL sections related to "codex", "outside voice", or "second opinion".
|
||||
|
||||
Summarize the Codex/${featureName} integration — answer these specific questions:
|
||||
1. How is Codex availability checked? (what exact bash command?)
|
||||
2. How is the user prompted? (via AskUserQuestion? what are the options?)
|
||||
3. What happens when Codex is NOT available? (fallback to subagent? skip entirely?)
|
||||
4. Is this step blocking (gates the workflow) or optional (can be skipped)?
|
||||
5. What prompt/context is sent to Codex?
|
||||
|
||||
Write your summary to ${testDir}/${testName}-summary.md`,
|
||||
workingDirectory: testDir,
|
||||
maxTurns: 8,
|
||||
timeout: 120_000,
|
||||
testName,
|
||||
runId,
|
||||
});
|
||||
|
||||
logCost(`/${skill} codex offering`, result);
|
||||
recordE2E(evalCollector, `/${testName}`, 'Codex Offering E2E', result);
|
||||
expect(result.exitReason).toBe('success');
|
||||
|
||||
const summaryPath = path.join(testDir, `${testName}-summary.md`);
|
||||
expect(fs.existsSync(summaryPath)).toBe(true);
|
||||
|
||||
const summary = fs.readFileSync(summaryPath, 'utf-8').toLowerCase();
|
||||
// All skills should have codex availability check
|
||||
expect(summary).toMatch(/which codex/);
|
||||
// All skills should have fallback behavior
|
||||
expect(summary).toMatch(/fallback|subagent|unavailable|not available|skip/);
|
||||
// All skills should show it's optional/non-blocking
|
||||
expect(summary).toMatch(/optional|non.?blocking|skip|not.*required/);
|
||||
|
||||
console.log(`${skill}: Codex offering verified`);
|
||||
}
|
||||
|
||||
testConcurrentIfSelected('codex-offered-office-hours', async () => {
|
||||
await checkCodexOffering('office-hours', 'codex-offered-office-hours', 'second opinion');
|
||||
}, 180_000);
|
||||
|
||||
testConcurrentIfSelected('codex-offered-ceo-review', async () => {
|
||||
await checkCodexOffering('plan-ceo-review', 'codex-offered-ceo-review', 'outside voice');
|
||||
}, 180_000);
|
||||
|
||||
testConcurrentIfSelected('codex-offered-design-review', async () => {
|
||||
await checkCodexOffering('plan-design-review', 'codex-offered-design-review', 'design outside voices');
|
||||
}, 180_000);
|
||||
|
||||
testConcurrentIfSelected('codex-offered-eng-review', async () => {
|
||||
await checkCodexOffering('plan-eng-review', 'codex-offered-eng-review', 'outside voice');
|
||||
}, 180_000);
|
||||
});
|
||||
|
||||
// Module-level afterAll — finalize eval collector after all tests complete
|
||||
afterAll(async () => {
|
||||
await finalizeEvalCollector(evalCollector);
|
||||
|
||||
@@ -80,8 +80,9 @@ describe('selectTests', () => {
|
||||
expect(result.selected).toContain('plan-ceo-review-selective');
|
||||
expect(result.selected).toContain('plan-ceo-review-benefits');
|
||||
expect(result.selected).toContain('autoplan-core');
|
||||
expect(result.selected.length).toBe(4);
|
||||
expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 4);
|
||||
expect(result.selected).toContain('codex-offered-ceo-review');
|
||||
expect(result.selected.length).toBe(5);
|
||||
expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 5);
|
||||
});
|
||||
|
||||
test('global touchfile triggers ALL tests', () => {
|
||||
|
||||
Reference in New Issue
Block a user