diff --git a/package.json b/package.json index 3cb6288a..2bf4a238 100644 --- a/package.json +++ b/package.json @@ -12,11 +12,13 @@ "gen:skill-docs": "bun run scripts/gen-skill-docs.ts", "dev": "bun run browse/src/cli.ts", "server": "bun run browse/src/server.ts", - "test": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts --ignore test/skill-llm-eval.test.ts --ignore test/skill-routing-e2e.test.ts", - "test:evals": "EVALS=1 bun test test/skill-llm-eval.test.ts test/skill-e2e.test.ts test/skill-routing-e2e.test.ts", - "test:evals:all": "EVALS=1 EVALS_ALL=1 bun test test/skill-llm-eval.test.ts test/skill-e2e.test.ts test/skill-routing-e2e.test.ts", - "test:e2e": "EVALS=1 bun test test/skill-e2e.test.ts test/skill-routing-e2e.test.ts", - "test:e2e:all": "EVALS=1 EVALS_ALL=1 bun test test/skill-e2e.test.ts test/skill-routing-e2e.test.ts", + "test": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts --ignore test/skill-llm-eval.test.ts --ignore test/skill-routing-e2e.test.ts --ignore test/codex-e2e.test.ts", + "test:evals": "EVALS=1 bun test test/skill-llm-eval.test.ts test/skill-e2e.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts", + "test:evals:all": "EVALS=1 EVALS_ALL=1 bun test test/skill-llm-eval.test.ts test/skill-e2e.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts", + "test:e2e": "EVALS=1 bun test test/skill-e2e.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts", + "test:e2e:all": "EVALS=1 EVALS_ALL=1 bun test test/skill-e2e.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts", + "test:codex": "EVALS=1 bun test test/codex-e2e.test.ts", + "test:codex:all": "EVALS=1 EVALS_ALL=1 bun test test/codex-e2e.test.ts", "skill:check": "bun run scripts/skill-check.ts", "dev:skill": "bun run scripts/dev-skill.ts", "start": "bun run browse/src/server.ts", diff --git a/test/codex-e2e.test.ts b/test/codex-e2e.test.ts new file mode 100644 index 00000000..91fc7abc --- /dev/null +++ b/test/codex-e2e.test.ts @@ 
/**
 * Codex CLI E2E tests — verify skills work when invoked by Codex.
 *
 * Spawns `codex exec` with skills installed in a temp HOME, parses JSONL
 * output, and validates structured results. Follows the same pattern as
 * skill-e2e.test.ts but adapted for Codex CLI.
 *
 * Prerequisites:
 * - `codex` binary installed (npm install -g @openai/codex)
 * - OPENAI_API_KEY env var set
 * - EVALS=1 env var set (same gate as Claude E2E tests)
 *
 * Skips gracefully when prerequisites are not met.
 */

import { describe, test, expect, afterAll } from 'bun:test';
import { runCodexSkill, parseCodexJSONL, installSkillToTempHome } from './helpers/codex-session-runner';
import type { CodexResult } from './helpers/codex-session-runner';
import { EvalCollector } from './helpers/eval-store';
import type { EvalTestEntry } from './helpers/eval-store';
import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';

const ROOT = path.resolve(import.meta.dir, '..');

// --- Prerequisites check ---

/** True when a `codex` executable is resolvable on PATH (checked once at module load). */
const CODEX_AVAILABLE = (() => {
  try {
    const result = Bun.spawnSync(['which', 'codex']);
    return result.exitCode === 0;
  } catch {
    return false;
  }
})();

const HAS_API_KEY = !!process.env.OPENAI_API_KEY;
const evalsEnabled = !!process.env.EVALS;

// Skip all tests if codex is not available, API key is not set, or EVALS is not set
const SKIP = !CODEX_AVAILABLE || !HAS_API_KEY || !evalsEnabled;

const describeCodex = SKIP ? describe.skip : describe;

// Log why we're skipping (helpful for debugging CI)
if (!evalsEnabled) {
  // Silent — same as Claude E2E tests, EVALS=1 required
} else if (!CODEX_AVAILABLE) {
  process.stderr.write('\nCodex E2E: SKIPPED — codex binary not found (install: npm i -g @openai/codex)\n');
} else if (!HAS_API_KEY) {
  process.stderr.write('\nCodex E2E: SKIPPED — OPENAI_API_KEY not set\n');
}

// --- Diff-based test selection ---

// Codex E2E touchfiles — keyed by test name, same pattern as E2E_TOUCHFILES
const CODEX_E2E_TOUCHFILES: Record<string, string[]> = {
  'codex-discover-skill': ['codex/**', '.agents/skills/**', 'test/helpers/codex-session-runner.ts'],
  'codex-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'codex/**', 'test/helpers/codex-session-runner.ts'],
};

let selectedTests: string[] | null = null; // null = run all

// Only compute a selection when the suite will actually run: if SKIP is set,
// every test is skipped anyway, so diffing and the selection log are just noise.
if (!SKIP && !process.env.EVALS_ALL) {
  const baseBranch = process.env.EVALS_BASE
    || detectBaseBranch(ROOT)
    || 'main';
  const changedFiles = getChangedFiles(baseBranch, ROOT);

  if (changedFiles.length > 0) {
    const selection = selectTests(changedFiles, CODEX_E2E_TOUCHFILES, GLOBAL_TOUCHFILES);
    selectedTests = selection.selected;
    process.stderr.write(`\nCodex E2E selection (${selection.reason}): ${selection.selected.length}/${Object.keys(CODEX_E2E_TOUCHFILES).length} tests\n`);
    if (selection.skipped.length > 0) {
      process.stderr.write(` Skipped: ${selection.skipped.join(', ')}\n`);
    }
    process.stderr.write('\n');
  }
  // If changedFiles is empty (e.g., on main branch), selectedTests stays null -> run all
}

/**
 * Register a test, skipping it when diff-based selection excluded it.
 *
 * @param testName Test name; also the key looked up in `selectedTests`.
 * @param fn       Async test body.
 * @param timeout  Per-test timeout in milliseconds, passed to bun's `test`.
 */
function testIfSelected(testName: string, fn: () => Promise<void>, timeout: number) {
  const shouldRun = selectedTests === null || selectedTests.includes(testName);
  (shouldRun ? test : test.skip)(testName, fn, timeout);
}

// --- Eval result collector ---

// SKIP already implies "EVALS not set", so it is the only gate needed here.
const evalCollector = SKIP ? null : new EvalCollector('e2e-codex');

/**
 * DRY helper to record a Codex E2E test result into the eval collector.
 * No-op when the collector was not created (suite skipped).
 *
 * @param name   Test name.
 * @param result Result returned by `runCodexSkill`.
 * @param passed Whether every check the test makes holds for `result`.
 */
function recordCodexE2E(name: string, result: CodexResult, passed: boolean) {
  evalCollector?.addTest({
    name,
    suite: 'codex-e2e',
    tier: 'e2e',
    passed,
    duration_ms: result.durationMs,
    cost_usd: 0, // Codex doesn't report cost in the same way; tokens are tracked
    output: result.output?.slice(0, 2000),
    turns_used: result.toolCalls.length, // approximate: tool calls as turns
    exit_reason: result.exitCode === 0 ? 'success' : `exit_code_${result.exitCode}`,
  });
}

/** Print a one-line usage summary (tokens, tool calls, wall time) after a Codex E2E test. */
function logCodexCost(label: string, result: CodexResult) {
  const durationSec = Math.round(result.durationMs / 1000);
  console.log(`${label}: ${result.tokens} tokens, ${result.toolCalls.length} tool calls, ${durationSec}s`);
}

/** True when `haystack` contains at least one of `needles` (plain substring match). */
function includesAny(haystack: string, needles: readonly string[]): boolean {
  return needles.some((needle) => haystack.includes(needle));
}

// Finalize eval results on exit
afterAll(async () => {
  if (evalCollector) {
    await evalCollector.finalize();
  }
});

// --- Tests ---

describeCodex('Codex E2E', () => {

  testIfSelected('codex-discover-skill', async () => {
    // Install gstack-review skill to a temp HOME and ask Codex to list skills
    const skillDir = path.join(ROOT, '.agents', 'skills', 'gstack-review');

    const result = await runCodexSkill({
      skillDir,
      prompt: 'List any skills or instructions you have available. Just list the names.',
      timeoutMs: 60_000,
      cwd: ROOT,
      skillName: 'gstack-review',
    });

    logCodexCost('codex-discover-skill', result);

    // The output should reference the skill name in some form
    const mentionsSkill = includesAny(result.output.toLowerCase(), ['review', 'gstack', 'skill']);

    // Record the same verdict the assertions below enforce, so the eval store
    // never reports a pass for a run that then fails the content check.
    const passed = result.exitCode === 0 && result.output.length > 0 && mentionsSkill;
    recordCodexE2E('codex-discover-skill', result, passed);

    expect(result.exitCode).toBe(0);
    expect(result.output.length).toBeGreaterThan(0);
    expect(mentionsSkill).toBe(true);
  }, 120_000);

  testIfSelected('codex-review-findings', async () => {
    // Install gstack-review skill and ask Codex to review the current repo
    const skillDir = path.join(ROOT, '.agents', 'skills', 'gstack-review');

    const result = await runCodexSkill({
      skillDir,
      prompt: 'Run the gstack-review skill on this repository. Review the current branch diff and report your findings.',
      timeoutMs: 300_000,
      cwd: ROOT,
      skillName: 'gstack-review',
    });

    logCodexCost('codex-review-findings', result);

    // Review output should contain some review-like content.
    // ('no issues' needs no entry of its own: it is matched by 'issue'.)
    const output = result.output;
    const hasReviewContent = includesAny(output.toLowerCase(), [
      'finding',
      'issue',
      'review',
      'change',
      'diff',
      'clean',
      'p1',
      'p2',
    ]);

    // Recorded verdict mirrors every assertion below (see codex-discover-skill).
    const passed = result.exitCode === 0 && output.length > 50 && hasReviewContent;
    recordCodexE2E('codex-review-findings', result, passed);

    expect(result.exitCode).toBe(0);
    expect(output.length).toBeGreaterThan(50);
    expect(hasReviewContent).toBe(true);
  }, 360_000);
});
/**
 * Codex CLI subprocess runner for skill E2E testing.
 *
 * Spawns `codex exec` as a completely independent process, parses its JSONL
 * output, and returns structured results. Follows the same pattern as
 * session-runner.ts but adapted for the Codex CLI.
 *
 * Key differences from Claude session-runner:
 * - Uses `codex exec` instead of `claude -p`
 * - Output is JSONL with different event types (item.completed, turn.completed, thread.started)
 * - Uses `--json` flag instead of `--output-format stream-json`
 * - Needs temp HOME with skill installed at ~/.codex/skills/{skillName}/SKILL.md
 */

import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';

// --- Interfaces ---

export interface CodexResult {
  output: string;           // Full agent message text
  reasoning: string[];      // [codex thinking] blocks
  toolCalls: string[];      // [codex ran] commands
  tokens: number;           // Total tokens used
  exitCode: number;         // Process exit code
  durationMs: number;       // Wall clock time
  sessionId: string | null; // Thread ID for session continuity
  rawLines: string[];       // Raw JSONL lines for debugging
}

// --- JSONL parser (ported from Python in codex/SKILL.md.tmpl) ---

export interface ParsedCodexJSONL {
  output: string;
  reasoning: string[];
  toolCalls: string[];
  tokens: number;
  sessionId: string | null;
}

/** Narrow a parsed JSON value to a plain (non-null, non-array) object. */
function isJsonObject(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value);
}

/** Return `value` when it is a non-empty string, otherwise `null`. */
function asNonEmptyString(value: unknown): string | null {
  return typeof value === 'string' && value.length > 0 ? value : null;
}

/** Return `value` when it is a finite number, otherwise 0 (so bad counts never poison the sum). */
function asTokenCount(value: unknown): number {
  return typeof value === 'number' && Number.isFinite(value) ? value : 0;
}

/**
 * Parse an array of JSONL lines from `codex exec --json` into structured data.
 * Pure function — no I/O, no side effects.
 *
 * Handles these Codex event types:
 * - thread.started → extract thread_id (session ID)
 * - item.completed → extract reasoning, agent_message, command_execution
 * - turn.completed → extract token usage (input_tokens + output_tokens, summed over turns)
 *
 * Blank lines, lines that are not valid JSON, JSON values that are not objects,
 * and unknown event types are skipped. Fields of the wrong type (e.g. a numeric
 * `text`, a string token count) are ignored rather than coerced, so the result
 * always matches its declared types.
 *
 * @param lines Raw JSONL lines, one JSON document per entry.
 * @returns Agent messages joined with '\n', reasoning texts, executed commands,
 *          total tokens, and the last thread ID seen (or null).
 */
export function parseCodexJSONL(lines: string[]): ParsedCodexJSONL {
  const outputParts: string[] = [];
  const reasoning: string[] = [];
  const toolCalls: string[] = [];
  let tokens = 0;
  let sessionId: string | null = null;

  for (const line of lines) {
    if (!line.trim()) continue;

    let event: unknown;
    try {
      event = JSON.parse(line);
    } catch {
      continue; // skip malformed lines
    }
    if (!isJsonObject(event)) continue;

    switch (event.type) {
      case 'thread.started': {
        const threadId = asNonEmptyString(event.thread_id);
        if (threadId !== null) sessionId = threadId;
        break;
      }

      case 'item.completed': {
        const item = event.item;
        if (!isJsonObject(item)) break;

        if (item.type === 'command_execution') {
          const command = asNonEmptyString(item.command);
          if (command !== null) toolCalls.push(command);
          break;
        }

        const text = asNonEmptyString(item.text);
        if (text === null) break;
        if (item.type === 'reasoning') {
          reasoning.push(text);
        } else if (item.type === 'agent_message') {
          outputParts.push(text);
        }
        break;
      }

      case 'turn.completed': {
        const usage = isJsonObject(event.usage) ? event.usage : {};
        tokens += asTokenCount(usage.input_tokens) + asTokenCount(usage.output_tokens);
        break;
      }

      default:
        break; // event types we do not consume
    }
  }

  return {
    output: outputParts.join('\n'),
    reasoning,
    toolCalls,
    tokens,
    sessionId,
  };
}
// --- Skill installation helper ---

/**
 * Install a SKILL.md into a temp HOME directory for Codex to discover.
 * Creates ~/.codex/skills/{skillName}/SKILL.md in the temp HOME.
 *
 * If `skillDir` has no SKILL.md the destination directory is still created but
 * nothing is copied; a warning is written to stderr so a run without the skill
 * is not mistaken for a real skill test.
 *
 * @param skillDir  Directory expected to contain SKILL.md.
 * @param skillName Directory name to install under `.codex/skills/`.
 * @param tempHome  Existing directory to use as HOME; a fresh temp dir is created when omitted.
 * @returns The temp HOME path. Caller is responsible for cleanup.
 */
export function installSkillToTempHome(
  skillDir: string,
  skillName: string,
  tempHome?: string,
): string {
  const home = tempHome || fs.mkdtempSync(path.join(os.tmpdir(), 'codex-e2e-'));
  const destDir = path.join(home, '.codex', 'skills', skillName);
  fs.mkdirSync(destDir, { recursive: true });

  const srcSkill = path.join(skillDir, 'SKILL.md');
  if (fs.existsSync(srcSkill)) {
    fs.copyFileSync(srcSkill, path.join(destDir, 'SKILL.md'));
  } else {
    process.stderr.write(` [codex] warning: ${srcSkill} not found, skill not installed\n`);
  }

  return home;
}

// --- Main runner ---

/** Write a one-line progress note to stderr for a command or agent-message event; ignore anything else. */
function reportCodexProgress(line: string, startTime: number): void {
  let event: unknown;
  try {
    event = JSON.parse(line);
  } catch {
    return; // skip — parseCodexJSONL will handle it later
  }
  if (typeof event !== 'object' || event === null) return;

  const { type, item } = event as { type?: unknown; item?: unknown };
  if (type !== 'item.completed' || typeof item !== 'object' || item === null) return;

  const { type: itemType, command, text } = item as { type?: unknown; command?: unknown; text?: unknown };
  const elapsed = Math.round((Date.now() - startTime) / 1000);
  if (itemType === 'command_execution' && typeof command === 'string' && command) {
    process.stderr.write(` [codex ${elapsed}s] ran: ${command.slice(0, 100)}\n`);
  } else if (itemType === 'agent_message' && typeof text === 'string' && text) {
    process.stderr.write(` [codex ${elapsed}s] message: ${text.slice(0, 100)}\n`);
  }
}

/**
 * Run a Codex skill via `codex exec` and return structured results.
 *
 * Spawns codex in a temp HOME with the skill installed, streams its stdout as
 * JSONL (echoing progress to stderr), parses the collected lines with
 * `parseCodexJSONL`, and returns a CodexResult. The temp HOME is always removed
 * and the timeout timer always cleared, even if the run throws.
 *
 * Special results:
 * - codex binary not found → `exitCode: -1`, `output: 'SKIP: codex binary not found'`
 * - run exceeded `timeoutMs` → process is killed and `exitCode` is reported as 124
 *
 * @param opts.skillDir  Path to skill directory containing SKILL.md.
 * @param opts.prompt    What to ask Codex to do with the skill.
 * @param opts.timeoutMs Kill the process after this many ms (default 300000 = 5 min).
 * @param opts.cwd       Working directory for codex (default: `skillDir`).
 * @param opts.skillName Skill name for installation (default: basename of `skillDir`).
 * @param opts.sandbox   Value passed to `-s` (default: 'read-only').
 */
export async function runCodexSkill(opts: {
  skillDir: string;   // Path to skill directory containing SKILL.md
  prompt: string;     // What to ask Codex to do with the skill
  timeoutMs?: number; // Default 300000 (5 min)
  cwd?: string;       // Working directory
  skillName?: string; // Skill name for installation (default: dirname)
  sandbox?: string;   // Sandbox mode (default: 'read-only')
}): Promise<CodexResult> {
  const {
    skillDir,
    prompt,
    timeoutMs = 300_000,
    cwd,
    skillName,
    sandbox = 'read-only',
  } = opts;

  const startTime = Date.now();
  const name = skillName || path.basename(skillDir) || 'gstack';

  // Check if codex binary exists
  const whichResult = Bun.spawnSync(['which', 'codex']);
  if (whichResult.exitCode !== 0) {
    return {
      output: 'SKIP: codex binary not found',
      reasoning: [],
      toolCalls: [],
      tokens: 0,
      exitCode: -1,
      durationMs: Date.now() - startTime,
      sessionId: null,
      rawLines: [],
    };
  }

  // Set up temp HOME with skill installed
  const tempHome = fs.mkdtempSync(path.join(os.tmpdir(), 'codex-e2e-'));
  let timeoutId: ReturnType<typeof setTimeout> | undefined;

  try {
    installSkillToTempHome(skillDir, name, tempHome);

    // Build codex exec command
    const args = ['exec', prompt, '--json', '-s', sandbox];

    // Spawn codex with temp HOME so it discovers our installed skill
    const proc = Bun.spawn(['codex', ...args], {
      cwd: cwd || skillDir,
      stdout: 'pipe',
      stderr: 'pipe',
      env: {
        ...process.env,
        HOME: tempHome,
        // NOTE(review): Codex is documented to honor CODEX_HOME over ~/.codex; pin it
        // to the temp install so an inherited value cannot bypass the temp HOME — confirm.
        CODEX_HOME: path.join(tempHome, '.codex'),
      },
    });

    // Race against timeout
    let timedOut = false;
    timeoutId = setTimeout(() => {
      timedOut = true;
      proc.kill();
    }, timeoutMs);

    // Stream and collect JSONL from stdout; drain stderr concurrently
    const collectedLines: string[] = [];
    const stderrPromise = new Response(proc.stderr).text();

    const reader = proc.stdout.getReader();
    const decoder = new TextDecoder();
    let buf = '';

    try {
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        buf += decoder.decode(value, { stream: true });
        const lines = buf.split('\n');
        buf = lines.pop() ?? ''; // last piece may be an incomplete line
        for (const line of lines) {
          if (!line.trim()) continue;
          collectedLines.push(line);
          reportCodexProgress(line, startTime); // real-time progress to stderr
        }
      }
      buf += decoder.decode(); // flush bytes the streaming decoder was still holding
    } catch (err: unknown) {
      // Stream read error — report it, then fall through to exit code handling
      const reason = err instanceof Error ? err.message : String(err);
      process.stderr.write(` [codex] stdout read error: ${reason}\n`);
    }

    // Flush remaining buffer
    if (buf.trim()) {
      collectedLines.push(buf);
    }

    const stderr = await stderrPromise;
    const exitCode = await proc.exited;

    const durationMs = Date.now() - startTime;

    // Parse all collected JSONL lines
    const parsed = parseCodexJSONL(collectedLines);

    // Log stderr if non-empty (may contain auth errors, etc.)
    if (stderr.trim()) {
      process.stderr.write(` [codex stderr] ${stderr.trim().slice(0, 200)}\n`);
    }

    return {
      output: parsed.output,
      reasoning: parsed.reasoning,
      toolCalls: parsed.toolCalls,
      tokens: parsed.tokens,
      exitCode: timedOut ? 124 : exitCode,
      durationMs,
      sessionId: parsed.sessionId,
      rawLines: collectedLines,
    };
  } finally {
    // Never leave the kill timer pending: on an error path it would keep the
    // process alive and later signal a child that has already exited.
    if (timeoutId !== undefined) clearTimeout(timeoutId);
    // Clean up temp HOME
    try { fs.rmSync(tempHome, { recursive: true, force: true }); } catch { /* non-fatal */ }
  }
}