Merge remote-tracking branch 'origin/main' into garrytan/test-coverage-catalog

# Conflicts:
#	scripts/gen-skill-docs.ts
#	test/gen-skill-docs.test.ts
This commit is contained in:
Garry Tan
2026-03-20 16:18:28 -07:00
65 changed files with 3277 additions and 499 deletions
+173
View File
@@ -0,0 +1,173 @@
/**
* Gemini CLI E2E tests — verify skills work when invoked by Gemini CLI.
*
* Spawns `gemini -p` with stream-json output in the repo root (where
* .agents/skills/ already exists), parses JSONL events, and validates
* structured results. Follows the same pattern as codex-e2e.test.ts.
*
* Prerequisites:
* - `gemini` binary installed (npm install -g @google/gemini-cli)
* - Gemini authenticated via ~/.gemini/ config or GEMINI_API_KEY env var
* - EVALS=1 env var set (same gate as Claude E2E tests)
*
* Skips gracefully when prerequisites are not met.
*/
import { describe, test, expect, afterAll } from 'bun:test';
import { runGeminiSkill } from './helpers/gemini-session-runner';
import type { GeminiResult } from './helpers/gemini-session-runner';
import { EvalCollector } from './helpers/eval-store';
import { selectTests, detectBaseBranch, getChangedFiles, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
import * as path from 'path';
const ROOT = path.resolve(import.meta.dir, '..');
// --- Prerequisites check ---
// Probe for the `gemini` executable once at module load. A non-zero exit
// from `which` and a spawn that throws both count as "not installed".
const GEMINI_AVAILABLE = (() => {
  let found = false;
  try {
    const { exitCode } = Bun.spawnSync(['which', 'gemini']);
    found = exitCode === 0;
  } catch {
    found = false;
  }
  return found;
})();
// Any non-empty EVALS value turns the suite on.
const evalsEnabled = Boolean(process.env.EVALS);
// Both prerequisites must hold; otherwise every test in this file is skipped.
const SKIP = !(GEMINI_AVAILABLE && evalsEnabled);
const describeGemini = SKIP ? describe.skip : describe;
// Explain the skip only when evals were requested but the binary is missing.
// A missing EVALS stays silent, same as the Claude E2E tests.
if (evalsEnabled && !GEMINI_AVAILABLE) {
  process.stderr.write('\nGemini E2E: SKIPPED — gemini binary not found (install: npm i -g @google/gemini-cli)\n');
}
// --- Diff-based test selection ---
// Touchfile globs per Gemini E2E test name (same shape as the Codex E2E map).
// Both tests also depend on the session runner helper itself.
const GEMINI_E2E_TOUCHFILES: Record<string, string[]> = (() => {
  const runnerHelper = 'test/helpers/gemini-session-runner.ts';
  return {
    'gemini-discover-skill': ['.agents/skills/**', runnerHelper],
    'gemini-review-findings': ['review/**', '.agents/skills/gstack-review/**', runnerHelper],
  };
})();
// Names of the tests chosen by diff-based selection; null means "run everything".
let selectedTests: string[] | null = null;
if (evalsEnabled && !process.env.EVALS_ALL) {
  // Base branch precedence: explicit EVALS_BASE, then auto-detection, then 'main'.
  const baseBranch = process.env.EVALS_BASE || detectBaseBranch(ROOT) || 'main';
  const changedFiles = getChangedFiles(baseBranch, ROOT);
  // An empty change list (e.g., on main branch) leaves selectedTests null -> run all.
  if (changedFiles.length > 0) {
    const { selected, skipped, reason } = selectTests(changedFiles, GEMINI_E2E_TOUCHFILES, GLOBAL_TOUCHFILES);
    selectedTests = selected;
    const totalTests = Object.keys(GEMINI_E2E_TOUCHFILES).length;
    process.stderr.write(`\nGemini E2E selection (${reason}): ${selected.length}/${totalTests} tests\n`);
    if (skipped.length > 0) {
      process.stderr.write(` Skipped: ${skipped.join(', ')}\n`);
    }
    process.stderr.write('\n');
  }
}
/**
 * Register `fn` under `testName`, as a skipped test when diff-based
 * selection is active and did not pick this name.
 */
function testIfSelected(testName: string, fn: () => Promise<void>, timeout: number) {
  if (selectedTests !== null && !selectedTests.includes(testName)) {
    test.skip(testName, fn, timeout);
    return;
  }
  test(testName, fn, timeout);
}
// --- Eval result collector ---
// No collector is created when evals are disabled or the suite is skipped.
const evalCollector = !evalsEnabled || SKIP ? null : new EvalCollector('e2e-gemini');
/**
 * Append one Gemini E2E outcome to the eval collector.
 * Does nothing when no collector was created.
 */
function recordGeminiE2E(name: string, result: GeminiResult, passed: boolean) {
  if (!evalCollector) return;

  const { exitCode, durationMs, toolCalls } = result;
  const exitReason = exitCode === 0 ? 'success' : `exit_code_${exitCode}`;

  evalCollector.addTest({
    name,
    suite: 'gemini-e2e',
    tier: 'e2e',
    passed,
    duration_ms: durationMs,
    cost_usd: 0, // Gemini doesn't report cost in USD; tokens are tracked
    output: result.output?.slice(0, 2000), // keep the stored output bounded
    turns_used: toolCalls.length, // approximate: tool calls as turns
    exit_reason: exitReason,
  });
}
/** Log tokens, tool-call count and rounded wall-clock seconds for one Gemini E2E run. */
function logGeminiCost(label: string, result: GeminiResult) {
  const { durationMs, tokens, toolCalls } = result;
  const seconds = Math.round(durationMs / 1000);
  console.log(`${label}: ${tokens} tokens, ${toolCalls.length} tool calls, ${seconds}s`);
}
// Once every test has run, finalize the collected eval results (if a collector exists).
afterAll(async () => {
  await evalCollector?.finalize();
});
// --- Tests ---
describeGemini('Gemini E2E', () => {
  testIfSelected('gemini-discover-skill', async () => {
    // Run Gemini in the repo root where .agents/skills/ exists
    const result = await runGeminiSkill({
      prompt: 'List any skills or instructions you have available. Just list the names.',
      timeoutMs: 60_000,
      cwd: ROOT,
    });
    logGeminiCost('gemini-discover-skill', result);

    // The output should reference skills in some form
    const outputLower = result.output.toLowerCase();
    const mentionsSkills = ['review', 'gstack', 'skill'].some((kw) => outputLower.includes(kw));

    // Record the verdict from EVERY condition asserted below, so the eval
    // store never shows "passed" for a run whose test then fails.
    const passed = result.exitCode === 0 && result.output.length > 0 && mentionsSkills;
    recordGeminiE2E('gemini-discover-skill', result, passed);

    expect(result.exitCode).toBe(0);
    expect(result.output.length).toBeGreaterThan(0);
    expect(mentionsSkills).toBe(true);
  }, 120_000);

  testIfSelected('gemini-review-findings', async () => {
    // Run gstack-review skill via Gemini on this repo
    const result = await runGeminiSkill({
      prompt: 'Run the gstack-review skill on this repository. Review the current branch diff and report your findings.',
      timeoutMs: 540_000,
      cwd: ROOT,
    });
    logGeminiCost('gemini-review-findings', result);

    // Review output should contain some review-like content.
    // ('no issues' needs no entry of its own: it is already matched by 'issue'.)
    const output = result.output;
    const outputLower = output.toLowerCase();
    const reviewKeywords = ['finding', 'issue', 'review', 'change', 'diff', 'clean', 'p1', 'p2'];
    const hasReviewContent = reviewKeywords.some((kw) => outputLower.includes(kw));

    // As above: the recorded verdict must agree with the assertions that follow.
    const passed = result.exitCode === 0 && output.length > 50 && hasReviewContent;
    recordGeminiE2E('gemini-review-findings', result, passed);

    expect(result.exitCode).toBe(0);
    expect(output.length).toBeGreaterThan(50);
    expect(hasReviewContent).toBe(true);
  }, 600_000);
});
+105 -1
View File
@@ -520,6 +520,98 @@ describe('TEST_COVERAGE_AUDIT placeholders', () => {
});
});
// --- {{SPEC_REVIEW_LOOP}} resolver tests ---
describe('SPEC_REVIEW_LOOP resolver', () => {
  // The generated office-hours skill is read once and shared by every check.
  const skillPath = path.join(ROOT, 'office-hours', 'SKILL.md');
  const content = fs.readFileSync(skillPath, 'utf-8');

  test('contains all 5 review dimensions', () => {
    const dimensions = ['Completeness', 'Consistency', 'Clarity', 'Scope', 'Feasibility'];
    dimensions.forEach((dim) => {
      expect(content).toContain(dim);
    });
  });

  // Remaining checks: test name -> literal substring or pattern the skill must contain.
  const checks: Array<[string, string | RegExp]> = [
    ['references Agent tool for subagent dispatch', /Agent.*tool/i],
    ['specifies max 3 iterations', /3.*iteration|maximum.*3/i],
    ['includes quality score', 'quality score'],
    ['includes metrics path', 'spec-review.jsonl'],
    ['includes convergence guard', /[Cc]onvergence/],
    ['includes graceful failure handling', /skip.*review|unavailable/i],
  ];
  for (const [name, needle] of checks) {
    test(name, () => {
      if (typeof needle === 'string') {
        expect(content).toContain(needle);
      } else {
        expect(content).toMatch(needle);
      }
    });
  }
});
// --- {{DESIGN_SKETCH}} resolver tests ---
describe('DESIGN_SKETCH resolver', () => {
  const skillPath = path.join(ROOT, 'office-hours', 'SKILL.md');
  const content = fs.readFileSync(skillPath, 'utf-8');

  // Test name -> literal substring or pattern the generated skill must contain.
  const checks: Array<[string, string | RegExp]> = [
    ['references DESIGN.md for design system constraints', 'DESIGN.md'],
    ['contains wireframe or sketch terminology', /wireframe|sketch/i],
    ['references browse binary for rendering', '$B goto'],
    ['references screenshot capture', '$B screenshot'],
    ['specifies rough aesthetic', /[Rr]ough|hand-drawn/],
    ['includes skip conditions', /no UI component|skip/i],
  ];
  for (const [name, needle] of checks) {
    test(name, () => {
      if (typeof needle === 'string') {
        expect(content).toContain(needle);
      } else {
        expect(content).toMatch(needle);
      }
    });
  }
});
// --- {{BENEFITS_FROM}} resolver tests ---
describe('BENEFITS_FROM resolver', () => {
  /** Read `<ROOT>/<skillDir>/SKILL.md` as UTF-8 text. */
  const readSkill = (skillDir: string) =>
    fs.readFileSync(path.join(ROOT, skillDir, 'SKILL.md'), 'utf-8');

  const ceoContent = readSkill('plan-ceo-review');
  const engContent = readSkill('plan-eng-review');

  test('plan-ceo-review contains prerequisite skill offer', () => {
    for (const needle of ['Prerequisite Skill Offer', '/office-hours']) {
      expect(ceoContent).toContain(needle);
    }
  });

  test('plan-eng-review contains prerequisite skill offer', () => {
    for (const needle of ['Prerequisite Skill Offer', '/office-hours']) {
      expect(engContent).toContain(needle);
    }
  });

  test('offer includes graceful decline', () => {
    expect(ceoContent).toContain('No worries');
  });

  test('skills without benefits-from do NOT have prerequisite offer', () => {
    // qa is read lazily, inside the test, as in the other checks' negative case.
    expect(readSkill('qa')).not.toContain('Prerequisite Skill Offer');
  });
});
// ─── Codex Generation Tests ─────────────────────────────────
describe('Codex generation (--host codex)', () => {
@@ -596,6 +688,16 @@ describe('Codex generation (--host codex)', () => {
expect(fs.existsSync(path.join(AGENTS_DIR, 'gstack-codex'))).toBe(false);
});
test('Codex review step stripped from Codex-host ship and review', () => {
  // Neither Codex-host skill may carry the Codex review step text.
  for (const skill of ['gstack-ship', 'gstack-review']) {
    const skillContent = fs.readFileSync(path.join(AGENTS_DIR, skill, 'SKILL.md'), 'utf-8');
    for (const forbidden of ['codex review --base', 'Investigate and fix']) {
      expect(skillContent).not.toContain(forbidden);
    }
  }
});
test('--host codex --dry-run freshness', () => {
const result = Bun.spawnSync(['bun', 'run', 'scripts/gen-skill-docs.ts', '--host', 'codex', '--dry-run'], {
cwd: ROOT,
@@ -846,7 +948,8 @@ describe('telemetry', () => {
test('generated SKILL.md contains telemetry opt-in prompt', () => {
const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
expect(content).toContain('.telemetry-prompted');
expect(content).toContain('anonymous usage data');
expect(content).toContain('Help gstack get better');
expect(content).toContain('gstack-config set telemetry community');
expect(content).toContain('gstack-config set telemetry anonymous');
expect(content).toContain('gstack-config set telemetry off');
});
@@ -859,6 +962,7 @@ describe('telemetry', () => {
expect(content).toContain('_TEL_DUR');
expect(content).toContain('SKILL_NAME');
expect(content).toContain('OUTCOME');
expect(content).toContain('PLAN MODE EXCEPTION');
});
test('generated SKILL.md contains pending marker handling', () => {
+104
View File
@@ -0,0 +1,104 @@
import { describe, test, expect } from 'bun:test';
import { parseGeminiJSONL } from './gemini-session-runner';
// Fixture: actual Gemini CLI stream-json output with tool use.
// Written as objects and serialized so each event is easy to read; key order
// is preserved by JSON.stringify, so the resulting lines are unchanged.
const FIXTURE_LINES = [
  { type: 'init', timestamp: '2026-03-20T15:14:46.455Z', session_id: 'test-session-123', model: 'auto-gemini-3' },
  { type: 'message', timestamp: '2026-03-20T15:14:46.456Z', role: 'user', content: 'list the files' },
  { type: 'message', timestamp: '2026-03-20T15:14:49.650Z', role: 'assistant', content: 'I will list the files.', delta: true },
  { type: 'tool_use', timestamp: '2026-03-20T15:14:49.690Z', tool_name: 'run_shell_command', tool_id: 'cmd_1', parameters: { command: 'ls' } },
  { type: 'tool_result', timestamp: '2026-03-20T15:14:49.931Z', tool_id: 'cmd_1', status: 'success', output: 'file1.ts\nfile2.ts' },
  { type: 'message', timestamp: '2026-03-20T15:14:51.945Z', role: 'assistant', content: 'Here are the files.', delta: true },
  {
    type: 'result',
    timestamp: '2026-03-20T15:14:52.030Z',
    status: 'success',
    stats: { total_tokens: 27147, input_tokens: 26928, output_tokens: 87, cached: 0, duration_ms: 5575, tool_calls: 1 },
  },
].map((event) => JSON.stringify(event));
describe('parseGeminiJSONL', () => {
  test('extracts session ID from init event', () => {
    const { sessionId } = parseGeminiJSONL(FIXTURE_LINES);
    expect(sessionId).toBe('test-session-123');
  });

  test('concatenates assistant message deltas into output', () => {
    const { output } = parseGeminiJSONL(FIXTURE_LINES);
    expect(output).toBe('I will list the files.Here are the files.');
  });

  test('ignores user messages', () => {
    const { output } = parseGeminiJSONL([
      '{"type":"message","role":"user","content":"this should be ignored"}',
      '{"type":"message","role":"assistant","content":"this should be kept","delta":true}',
    ]);
    expect(output).toBe('this should be kept');
  });

  test('extracts tool names from tool_use events', () => {
    const { toolCalls } = parseGeminiJSONL(FIXTURE_LINES);
    expect(toolCalls).toHaveLength(1);
    expect(toolCalls[0]).toBe('run_shell_command');
  });

  test('extracts total tokens from result stats', () => {
    const { tokens } = parseGeminiJSONL(FIXTURE_LINES);
    expect(tokens).toBe(27147);
  });

  test('skips malformed lines without throwing', () => {
    // Two unparseable lines are interleaved with three valid events.
    const result = parseGeminiJSONL([
      '{"type":"init","session_id":"ok"}',
      'this is not json',
      '{"type":"message","role":"assistant","content":"hello","delta":true}',
      '{incomplete json',
      '{"type":"result","status":"success","stats":{"total_tokens":100}}',
    ]);
    expect(result.sessionId).toBe('ok');
    expect(result.output).toBe('hello');
    expect(result.tokens).toBe(100);
  });

  test('skips empty and whitespace-only lines', () => {
    const result = parseGeminiJSONL([
      '',
      ' ',
      '{"type":"init","session_id":"s1"}',
      '\t',
      '{"type":"result","status":"success","stats":{"total_tokens":50}}',
    ]);
    expect(result.sessionId).toBe('s1');
    expect(result.tokens).toBe(50);
  });

  test('handles empty input', () => {
    const result = parseGeminiJSONL([]);
    expect(result.output).toBe('');
    expect(result.toolCalls).toHaveLength(0);
    expect(result.tokens).toBe(0);
    expect(result.sessionId).toBeNull();
  });

  test('handles missing fields gracefully', () => {
    const result = parseGeminiJSONL([
      '{"type":"init"}', // no session_id
      '{"type":"message","role":"assistant"}', // no content
      '{"type":"tool_use"}', // no tool_name
      '{"type":"result","status":"success"}', // no stats
    ]);
    expect(result.sessionId).toBeNull();
    expect(result.output).toBe('');
    expect(result.toolCalls).toHaveLength(0);
    expect(result.tokens).toBe(0);
  });

  test('handles multiple tool_use events', () => {
    // Order is kept and repeated tool names are not de-duplicated.
    const { toolCalls } = parseGeminiJSONL([
      '{"type":"tool_use","tool_name":"run_shell_command","tool_id":"cmd_1","parameters":{"command":"ls"}}',
      '{"type":"tool_use","tool_name":"read_file","tool_id":"cmd_2","parameters":{"path":"foo.ts"}}',
      '{"type":"tool_use","tool_name":"run_shell_command","tool_id":"cmd_3","parameters":{"command":"cat bar.ts"}}',
    ]);
    expect(toolCalls).toEqual(['run_shell_command', 'read_file', 'run_shell_command']);
  });
});
+201
View File
@@ -0,0 +1,201 @@
/**
* Gemini CLI subprocess runner for skill E2E testing.
*
* Spawns `gemini -p` as an independent process, parses its stream-json
* output, and returns structured results. Follows the same pattern as
* codex-session-runner.ts but adapted for the Gemini CLI.
*
* Key differences from Codex session-runner:
* - Uses `gemini -p` instead of `codex exec`
* - Output is NDJSON with event types: init, message, tool_use, tool_result, result
* - Uses `--output-format stream-json --yolo` instead of `--json -s read-only`
* - No temp HOME needed — Gemini discovers skills from `.agents/skills/` in cwd
* - Message events are streamed with `delta: true` — must concatenate
*/
import * as path from 'path';
// --- Interfaces ---
/** Outcome of a single `gemini -p` run as returned by the runner. */
export interface GeminiResult {
  /** Full assistant message text (concatenated deltas). */
  output: string;
  /** Tool names from tool_use events. */
  toolCalls: string[];
  /** Total tokens used. */
  tokens: number;
  /** Process exit code. */
  exitCode: number;
  /** Wall clock time in milliseconds. */
  durationMs: number;
  /** Session ID from the init event, or null when none was seen. */
  sessionId: string | null;
  /** Raw JSONL lines, kept for debugging. */
  rawLines: string[];
}
// --- JSONL parser ---
/** Structured data extracted from a Gemini stream-json transcript. */
export interface ParsedGeminiJSONL {
  /** Assistant message text, all chunks concatenated in arrival order. */
  output: string;
  /** Tool names from `tool_use` events, in order, duplicates kept. */
  toolCalls: string[];
  /** `stats.total_tokens` of the last `result` event, or 0 when unavailable. */
  tokens: number;
  /** `session_id` of the `init` event, or null when none was seen. */
  sessionId: string | null;
}

/**
 * Parse an array of JSONL lines from `gemini -p --output-format stream-json`.
 * Pure function — no I/O, no side effects.
 *
 * Handles these Gemini event types:
 * - init → extract session_id
 * - message (role=assistant) → concatenate content into output
 * - tool_use → extract tool_name
 * - result → extract token usage from stats
 * Every other event type (e.g. tool_result) is ignored.
 *
 * Blank lines, lines that are not valid JSON, and JSON values that are not
 * objects are skipped. Fields are only accepted when they have the expected
 * primitive type (string for session_id / content / tool_name, finite number
 * for total_tokens), so malformed events cannot leak non-string or
 * non-numeric values into the typed result.
 *
 * @param lines Raw stdout lines, one JSON event per line.
 * @returns The extracted output text, tool names, token total and session id.
 */
export function parseGeminiJSONL(lines: string[]): ParsedGeminiJSONL {
  const outputParts: string[] = [];
  const toolCalls: string[] = [];
  let tokens = 0;
  let sessionId: string | null = null;

  for (const line of lines) {
    if (!line.trim()) continue;

    let event: unknown;
    try {
      event = JSON.parse(line);
    } catch {
      continue; // skip malformed lines
    }
    // Valid JSON that is not an object (null, number, string, ...) carries no event.
    if (typeof event !== 'object' || event === null) continue;
    const obj = event as Record<string, unknown>;

    switch (obj.type) {
      case 'init':
        if (typeof obj.session_id === 'string' && obj.session_id) {
          sessionId = obj.session_id;
        }
        break;
      case 'message':
        if (obj.role === 'assistant' && typeof obj.content === 'string' && obj.content) {
          outputParts.push(obj.content);
        }
        break;
      case 'tool_use':
        if (typeof obj.tool_name === 'string' && obj.tool_name) {
          toolCalls.push(obj.tool_name);
        }
        break;
      case 'result': {
        // A result event always overwrites the total; missing stats reset it to 0.
        const stats = obj.stats;
        const total =
          typeof stats === 'object' && stats !== null
            ? (stats as Record<string, unknown>).total_tokens
            : undefined;
        tokens = typeof total === 'number' && Number.isFinite(total) ? total : 0;
        break;
      }
      default:
        break;
    }
  }

  return {
    output: outputParts.join(''),
    toolCalls,
    tokens,
    sessionId,
  };
}
// --- Main runner ---
/**
 * Run a prompt via `gemini -p` and return structured results.
 *
 * Spawns `gemini -p <prompt> --output-format stream-json --yolo`, collects
 * the non-blank stdout lines, echoes tool/message progress to stderr while
 * streaming, and parses the collected lines with parseGeminiJSONL.
 *
 * @param opts.prompt    What to ask Gemini.
 * @param opts.timeoutMs Kill the process after this many ms (default 300000).
 * @param opts.cwd       Working directory for the process (default process.cwd()).
 * @returns The parsed transcript plus exit code, wall-clock duration and raw
 *          lines. `exitCode` is 124 when the timeout fired. When the
 *          `which gemini` probe fails, nothing is spawned and the result has
 *          `exitCode: -1` and output `SKIP: gemini binary not found`.
 */
export async function runGeminiSkill(opts: {
  prompt: string; // What to ask Gemini
  timeoutMs?: number; // Default 300000 (5 min)
  cwd?: string; // Working directory (where .agents/skills/ lives)
}): Promise<GeminiResult> {
  const {
    prompt,
    timeoutMs = 300_000,
    cwd,
  } = opts;
  const startTime = Date.now();

  // Check if gemini binary exists. A probe that cannot even be spawned is
  // treated the same as "not found" instead of crashing the caller.
  let geminiFound = false;
  try {
    geminiFound = Bun.spawnSync(['which', 'gemini']).exitCode === 0;
  } catch {
    geminiFound = false;
  }
  if (!geminiFound) {
    return {
      output: 'SKIP: gemini binary not found',
      toolCalls: [],
      tokens: 0,
      exitCode: -1,
      durationMs: Date.now() - startTime,
      sessionId: null,
      rawLines: [],
    };
  }

  // Build gemini command
  const args = ['-p', prompt, '--output-format', 'stream-json', '--yolo'];

  // Spawn gemini — uses real HOME for auth, cwd for skill discovery
  const proc = Bun.spawn(['gemini', ...args], {
    cwd: cwd || process.cwd(),
    stdout: 'pipe',
    stderr: 'pipe',
  });

  // Race against timeout
  let timedOut = false;
  const timeoutId = setTimeout(() => {
    timedOut = true;
    proc.kill();
  }, timeoutMs);

  try {
    // Stream and collect JSONL from stdout; stderr is drained concurrently.
    const collectedLines: string[] = [];
    const stderrPromise = new Response(proc.stderr).text();
    const reader = proc.stdout.getReader();
    const decoder = new TextDecoder();
    let buf = '';

    try {
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        buf += decoder.decode(value, { stream: true });
        const lines = buf.split('\n');
        // The last piece may be an incomplete line; keep it for the next chunk.
        buf = lines.pop() || '';
        for (const line of lines) {
          if (!line.trim()) continue;
          collectedLines.push(line);
          // Real-time progress to stderr
          try {
            const event = JSON.parse(line);
            if (event.type === 'tool_use' && event.tool_name) {
              const elapsed = Math.round((Date.now() - startTime) / 1000);
              process.stderr.write(` [gemini ${elapsed}s] tool: ${event.tool_name}\n`);
            } else if (event.type === 'message' && event.role === 'assistant' && event.content) {
              const elapsed = Math.round((Date.now() - startTime) / 1000);
              process.stderr.write(` [gemini ${elapsed}s] message: ${event.content.slice(0, 100)}\n`);
            }
          } catch { /* skip — parseGeminiJSONL will handle it later */ }
        }
      }
      // End of stream: flush any bytes the decoder is still holding back.
      buf += decoder.decode();
    } catch { /* stream read error — fall through to exit code handling */ }

    // Flush remaining buffer
    if (buf.trim()) {
      collectedLines.push(buf);
    }

    const stderr = await stderrPromise;
    const exitCode = await proc.exited;
    const durationMs = Date.now() - startTime;

    // Parse all collected JSONL lines
    const parsed = parseGeminiJSONL(collectedLines);

    // Log stderr if non-empty (may contain auth errors, etc.)
    if (stderr.trim()) {
      process.stderr.write(` [gemini stderr] ${stderr.trim().slice(0, 200)}\n`);
    }

    return {
      output: parsed.output,
      toolCalls: parsed.toolCalls,
      tokens: parsed.tokens,
      exitCode: timedOut ? 124 : exitCode,
      durationMs,
      sessionId: parsed.sessionId,
      rawLines: collectedLines,
    };
  } finally {
    // Always disarm the kill timer, including when draining stderr or
    // awaiting the exit code throws.
    clearTimeout(timeoutId);
  }
}
+13
View File
@@ -57,9 +57,13 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
'review-base-branch': ['review/**'],
'review-design-lite': ['review/**', 'test/fixtures/review-eval-design-slop.*'],
// Office Hours
'office-hours-spec-review': ['office-hours/**', 'scripts/gen-skill-docs.ts'],
// Plan reviews
'plan-ceo-review': ['plan-ceo-review/**'],
'plan-ceo-review-selective': ['plan-ceo-review/**'],
'plan-ceo-review-benefits': ['plan-ceo-review/**', 'scripts/gen-skill-docs.ts'],
'plan-eng-review': ['plan-eng-review/**'],
'plan-eng-review-artifact': ['plan-eng-review/**'],
@@ -80,6 +84,10 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
'codex-discover-skill': ['codex/**', '.agents/skills/**', 'test/helpers/codex-session-runner.ts'],
'codex-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'codex/**', 'test/helpers/codex-session-runner.ts'],
// Gemini E2E (tests skills via Gemini CLI)
'gemini-discover-skill': ['.agents/skills/**', 'test/helpers/gemini-session-runner.ts'],
'gemini-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'test/helpers/gemini-session-runner.ts'],
// QA bootstrap
'qa-bootstrap': ['qa/**', 'browse/src/**', 'ship/**'],
@@ -141,6 +149,10 @@ export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
'design-review/SKILL.md fix loop': ['design-review/SKILL.md', 'design-review/SKILL.md.tmpl'],
'design-consultation/SKILL.md research': ['design-consultation/SKILL.md', 'design-consultation/SKILL.md.tmpl'],
// Office Hours
'office-hours/SKILL.md spec review': ['office-hours/SKILL.md', 'office-hours/SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'office-hours/SKILL.md design sketch': ['office-hours/SKILL.md', 'office-hours/SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
// Other skills
'retro/SKILL.md instructions': ['retro/SKILL.md', 'retro/SKILL.md.tmpl'],
'qa-only/SKILL.md workflow': ['qa-only/SKILL.md', 'qa-only/SKILL.md.tmpl'],
@@ -153,6 +165,7 @@ export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
export const GLOBAL_TOUCHFILES = [
'test/helpers/session-runner.ts',
'test/helpers/codex-session-runner.ts',
'test/helpers/gemini-session-runner.ts',
'test/helpers/eval-store.ts',
'test/helpers/llm-judge.ts',
'scripts/gen-skill-docs.ts',
+122
View File
@@ -2928,6 +2928,128 @@ Write the full output (including the GATE verdict) to ${codexDir}/codex-output.m
}, 360_000);
});
// --- Office Hours Spec Review E2E ---
describeIfSelected('Office Hours Spec Review E2E', ['office-hours-spec-review'], () => {
  let ohDir: string;

  beforeAll(() => {
    // Scratch git repo with a single commit, plus a copy of the office-hours skill.
    ohDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-oh-spec-'));
    const git = (...gitArgs: string[]) =>
      spawnSync('git', gitArgs, { cwd: ohDir, stdio: 'pipe', timeout: 5000 });

    git('init', '-b', 'main');
    git('config', 'user.email', 'test@test.com');
    git('config', 'user.name', 'Test');
    fs.writeFileSync(path.join(ohDir, 'README.md'), '# Test Project\n');
    git('add', '.');
    git('commit', '-m', 'init');

    // Copy office-hours skill
    const skillDir = path.join(ohDir, 'office-hours');
    fs.mkdirSync(skillDir, { recursive: true });
    fs.copyFileSync(path.join(ROOT, 'office-hours', 'SKILL.md'), path.join(skillDir, 'SKILL.md'));
  });

  afterAll(() => {
    // Best-effort cleanup; a failure to remove the temp dir is ignored.
    try {
      fs.rmSync(ohDir, { recursive: true, force: true });
    } catch {}
  });

  test('/office-hours SKILL.md contains spec review loop', async () => {
    const result = await runSkillTest({
      prompt: `Read office-hours/SKILL.md. I want to understand the spec review loop.
Summarize what the "Spec Review Loop" section does specifically:
1. How many dimensions does the reviewer check?
2. What tool is used to dispatch the reviewer?
3. What's the maximum number of iterations?
4. What metrics are tracked?
Write your summary to ${ohDir}/spec-review-summary.md`,
      workingDirectory: ohDir,
      maxTurns: 8,
      timeout: 120_000,
      testName: 'office-hours-spec-review',
      runId,
    });
    logCost('/office-hours spec review', result);
    recordE2E('/office-hours-spec-review', 'Office Hours Spec Review E2E', result);
    expect(result.exitReason).toBe('success');

    // The content checks only apply when the agent actually wrote the summary file.
    const summaryPath = path.join(ohDir, 'spec-review-summary.md');
    if (!fs.existsSync(summaryPath)) return;

    const summary = fs.readFileSync(summaryPath, 'utf-8').toLowerCase();
    // Verify the agent understood the key concepts
    expect(summary).toMatch(/5.*dimension|dimension.*5|completeness|consistency|clarity|scope|feasibility/);
    expect(summary).toMatch(/agent|subagent/);
    expect(summary).toMatch(/3.*iteration|iteration.*3|maximum.*3/);
  }, 180_000);
});
// --- Plan CEO Review Benefits-From E2E ---
describeIfSelected('Plan CEO Review Benefits-From E2E', ['plan-ceo-review-benefits'], () => {
  let benefitsDir: string;

  beforeAll(() => {
    // Scratch git repo with a single commit, plus a copy of the plan-ceo-review skill.
    benefitsDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-benefits-'));
    const git = (...gitArgs: string[]) =>
      spawnSync('git', gitArgs, { cwd: benefitsDir, stdio: 'pipe', timeout: 5000 });

    git('init', '-b', 'main');
    git('config', 'user.email', 'test@test.com');
    git('config', 'user.name', 'Test');
    fs.writeFileSync(path.join(benefitsDir, 'README.md'), '# Test Project\n');
    git('add', '.');
    git('commit', '-m', 'init');

    // Copy plan-ceo-review skill
    const skillDir = path.join(benefitsDir, 'plan-ceo-review');
    fs.mkdirSync(skillDir, { recursive: true });
    fs.copyFileSync(path.join(ROOT, 'plan-ceo-review', 'SKILL.md'), path.join(skillDir, 'SKILL.md'));
  });

  afterAll(() => {
    // Best-effort cleanup; a failure to remove the temp dir is ignored.
    try {
      fs.rmSync(benefitsDir, { recursive: true, force: true });
    } catch {}
  });

  test('/plan-ceo-review SKILL.md contains prerequisite skill offer', async () => {
    const result = await runSkillTest({
      prompt: `Read plan-ceo-review/SKILL.md. Search for sections about "Prerequisite" or "office-hours" or "design doc found".
Summarize what happens when no design doc is found specifically:
1. Is /office-hours offered as a prerequisite?
2. What options does the user get?
3. Is there a mid-session detection for when the user seems lost?
Write your summary to ${benefitsDir}/benefits-summary.md`,
      workingDirectory: benefitsDir,
      maxTurns: 8,
      timeout: 120_000,
      testName: 'plan-ceo-review-benefits',
      runId,
    });
    logCost('/plan-ceo-review benefits-from', result);
    recordE2E('/plan-ceo-review-benefits', 'Plan CEO Review Benefits-From E2E', result);
    expect(result.exitReason).toBe('success');

    // The content checks only apply when the agent actually wrote the summary file.
    const summaryPath = path.join(benefitsDir, 'benefits-summary.md');
    if (!fs.existsSync(summaryPath)) return;

    const summary = fs.readFileSync(summaryPath, 'utf-8').toLowerCase();
    // Verify the agent understood the skill chaining
    expect(summary).toMatch(/office.hours/);
    expect(summary).toMatch(/design doc|no design/i);
  }, 180_000);
});
// Module-level afterAll — finalize eval collector after all tests complete
afterAll(async () => {
if (evalCollector) {
+91 -4
View File
@@ -644,6 +644,59 @@ describe('office-hours skill structure', () => {
test('contains builder operating principles', () => {
expect(content).toContain('Delight is the currency');
});
// Spec Review Loop (Phase 5.5) and Visual Sketch (Phase 4.5):
// test name -> every literal substring / pattern the skill text must contain.
const specAndSketchChecks: Array<[string, Array<string | RegExp>]> = [
  // Spec Review Loop (Phase 5.5)
  ['contains spec review loop', ['Spec Review Loop']],
  ['contains adversarial review dimensions', ['Completeness', 'Consistency', 'Clarity', 'Scope', 'Feasibility']],
  ['contains subagent dispatch instruction', [/Agent.*tool|subagent/i]],
  ['contains max 3 iterations', [/3.*iteration|maximum.*3/i]],
  ['contains quality score', ['quality score']],
  ['contains spec review metrics path', ['spec-review.jsonl']],
  ['contains convergence guard', [/convergence/i]],
  // Visual Sketch (Phase 4.5)
  ['contains visual sketch section', ['Visual Sketch']],
  ['contains wireframe generation', [/wireframe|sketch/i]],
  ['contains DESIGN.md awareness', ['DESIGN.md']],
  ['contains browse rendering', ['$B goto', '$B screenshot']],
  ['contains rough aesthetic instruction', [/rough|hand-drawn/i]],
];
for (const [name, needles] of specAndSketchChecks) {
  test(name, () => {
    for (const needle of needles) {
      if (typeof needle === 'string') {
        expect(content).toContain(needle);
      } else {
        expect(content).toMatch(needle);
      }
    }
  });
}
});
describe('investigate skill structure', () => {
@@ -856,6 +909,22 @@ describe('CEO review mode validation', () => {
expect(content).toContain('HOLD SCOPE');
expect(content).toContain('REDUCTION');
});
// Skill chaining (benefits-from)
test('contains prerequisite skill offer for office-hours', () => {
  for (const needle of ['Prerequisite Skill Offer', '/office-hours']) {
    expect(content).toContain(needle);
  }
});

test('contains mid-session detection', () => {
  const lostUserPattern = /still figuring out|seems lost/i;
  expect(content).toContain('Mid-session detection');
  expect(content).toMatch(lostUserPattern);
});

// Spec review on CEO plans
test('contains spec review loop for CEO plan documents', () => {
  const heading = 'Spec Review Loop';
  expect(content).toContain(heading);
});
});
// --- gstack-slug helper ---
@@ -1187,18 +1256,36 @@ describe('Codex skill', () => {
expect(content).toContain('mktemp');
});
test('codex integration in /review offers second opinion', () => {
test('codex integration in /review has config-driven review step', () => {
const content = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
expect(content).toContain('Codex second opinion');
expect(content).toContain('Codex review');
expect(content).toContain('codex_reviews');
expect(content).toContain('codex review');
expect(content).toContain('adversarial');
expect(content).toContain('xhigh');
expect(content).toContain('Investigate and fix');
expect(content).toContain('CROSS-MODEL');
});
test('codex integration in /ship offers review gate', () => {
test('codex integration in /ship has config-driven review step', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('Codex');
expect(content).toContain('Codex review');
expect(content).toContain('codex_reviews');
expect(content).toContain('codex review');
expect(content).toContain('codex-review');
expect(content).toContain('xhigh');
expect(content).toContain('Investigate and fix');
});
test('codex-host ship/review do NOT contain codex review step', () => {
  /** Read a generated Codex-host skill from `.agents/skills/<name>/SKILL.md`. */
  const readAgentSkill = (name: string) =>
    fs.readFileSync(path.join(ROOT, '.agents', 'skills', name, 'SKILL.md'), 'utf-8');

  // Skill name -> strings that must be absent from its generated SKILL.md.
  const forbiddenBySkill: Array<[string, string[]]> = [
    ['gstack-ship', ['codex review --base', 'Investigate and fix']],
    ['gstack-review', ['codex review --base', 'codex_reviews', 'Investigate and fix']],
  ];
  for (const [skill, forbidden] of forbiddenBySkill) {
    const skillContent = readAgentSkill(skill);
    for (const needle of forbidden) {
      expect(skillContent).not.toContain(needle);
    }
  }
});
test('codex integration in /plan-eng-review offers plan critique', () => {
+3 -2
View File
@@ -78,8 +78,9 @@ describe('selectTests', () => {
const result = selectTests(['plan-ceo-review/SKILL.md'], E2E_TOUCHFILES);
expect(result.selected).toContain('plan-ceo-review');
expect(result.selected).toContain('plan-ceo-review-selective');
expect(result.selected.length).toBe(2);
expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 2);
expect(result.selected).toContain('plan-ceo-review-benefits');
expect(result.selected.length).toBe(3);
expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 3);
});
test('global touchfile triggers ALL tests', () => {