Merge branch 'main' into garrytan/team-supabase-store

Resolved 4 conflicts:
- scripts/gen-skill-docs.ts: kept ARTIFACT_SETUP + added main's new
  resolvers (SPEC_REVIEW_LOOP, DESIGN_SKETCH, BENEFITS_FROM,
  CODEX_REVIEW_STEP). Updated codex review-log to use new paths.
- ship/SKILL.md.tmpl: adopted {{CODEX_REVIEW_STEP}} macro from main
- test/skill-e2e.test.ts: added main's new E2E tests (office-hours
  spec review, plan-ceo benefits-from) + kept our E2E isolation cleanup

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-03-21 09:29:05 -07:00
96 changed files with 17868 additions and 342 deletions
+282
View File
@@ -0,0 +1,282 @@
/**
* Codex CLI subprocess runner for skill E2E testing.
*
* Spawns `codex exec` as a completely independent process, parses its JSONL
* output, and returns structured results. Follows the same pattern as
* session-runner.ts but adapted for the Codex CLI.
*
* Key differences from Claude session-runner:
* - Uses `codex exec` instead of `claude -p`
* - Output is JSONL with different event types (item.completed, turn.completed, thread.started)
* - Uses `--json` flag instead of `--output-format stream-json`
* - Needs temp HOME with skill installed at ~/.codex/skills/{skillName}/SKILL.md
*/
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
// --- Interfaces ---
export interface CodexResult {
output: string; // Full agent message text
reasoning: string[]; // [codex thinking] blocks
toolCalls: string[]; // [codex ran] commands
tokens: number; // Total tokens used
exitCode: number; // Process exit code
durationMs: number; // Wall clock time
sessionId: string | null; // Thread ID for session continuity
rawLines: string[]; // Raw JSONL lines for debugging
}
// --- JSONL parser (ported from Python in codex/SKILL.md.tmpl) ---
export interface ParsedCodexJSONL {
output: string;
reasoning: string[];
toolCalls: string[];
tokens: number;
sessionId: string | null;
}
/**
* Parse an array of JSONL lines from `codex exec --json` into structured data.
* Pure function — no I/O, no side effects.
*
* Handles these Codex event types:
* - thread.started → extract thread_id (session ID)
* - item.completed → extract reasoning, agent_message, command_execution
* - turn.completed → extract token usage
*/
export function parseCodexJSONL(lines: string[]): ParsedCodexJSONL {
const outputParts: string[] = [];
const reasoning: string[] = [];
const toolCalls: string[] = [];
let tokens = 0;
let sessionId: string | null = null;
for (const line of lines) {
if (!line.trim()) continue;
try {
const obj = JSON.parse(line);
const t = obj.type || '';
if (t === 'thread.started') {
const tid = obj.thread_id || '';
if (tid) sessionId = tid;
} else if (t === 'item.completed' && obj.item) {
const item = obj.item;
const itype = item.type || '';
const text = item.text || '';
if (itype === 'reasoning' && text) {
reasoning.push(text);
} else if (itype === 'agent_message' && text) {
outputParts.push(text);
} else if (itype === 'command_execution') {
const cmd = item.command || '';
if (cmd) toolCalls.push(cmd);
}
} else if (t === 'turn.completed') {
const usage = obj.usage || {};
const turnTokens = (usage.input_tokens || 0) + (usage.output_tokens || 0);
tokens += turnTokens;
}
} catch { /* skip malformed lines */ }
}
return {
output: outputParts.join('\n'),
reasoning,
toolCalls,
tokens,
sessionId,
};
}
// --- Skill installation helper ---
/**
* Install a SKILL.md into a temp HOME directory for Codex to discover.
* Creates ~/.codex/skills/{skillName}/SKILL.md in the temp HOME.
*
* Returns the temp HOME path. Caller is responsible for cleanup.
*/
export function installSkillToTempHome(
skillDir: string,
skillName: string,
tempHome?: string,
): string {
const home = tempHome || fs.mkdtempSync(path.join(os.tmpdir(), 'codex-e2e-'));
const destDir = path.join(home, '.codex', 'skills', skillName);
fs.mkdirSync(destDir, { recursive: true });
const srcSkill = path.join(skillDir, 'SKILL.md');
if (fs.existsSync(srcSkill)) {
fs.copyFileSync(srcSkill, path.join(destDir, 'SKILL.md'));
}
return home;
}
// --- Main runner ---
/**
* Run a Codex skill via `codex exec` and return structured results.
*
* Spawns codex in a temp HOME with the skill installed, parses JSONL output,
* and returns a CodexResult. Skips gracefully if codex binary is not found.
*/
export async function runCodexSkill(opts: {
skillDir: string; // Path to skill directory containing SKILL.md
prompt: string; // What to ask Codex to do with the skill
timeoutMs?: number; // Default 300000 (5 min)
cwd?: string; // Working directory
skillName?: string; // Skill name for installation (default: dirname)
sandbox?: string; // Sandbox mode (default: 'read-only')
}): Promise<CodexResult> {
const {
skillDir,
prompt,
timeoutMs = 300_000,
cwd,
skillName,
sandbox = 'read-only',
} = opts;
const startTime = Date.now();
const name = skillName || path.basename(skillDir) || 'gstack';
// Check if codex binary exists
const whichResult = Bun.spawnSync(['which', 'codex']);
if (whichResult.exitCode !== 0) {
return {
output: 'SKIP: codex binary not found',
reasoning: [],
toolCalls: [],
tokens: 0,
exitCode: -1,
durationMs: Date.now() - startTime,
sessionId: null,
rawLines: [],
};
}
// Set up temp HOME with skill installed
const tempHome = fs.mkdtempSync(path.join(os.tmpdir(), 'codex-e2e-'));
const realHome = os.homedir();
try {
installSkillToTempHome(skillDir, name, tempHome);
// Symlink real Codex auth config so codex can authenticate from temp HOME.
// Codex stores auth in ~/.codex/ — we need the config but not the skills
// (we install our own test skills above).
const realCodexConfig = path.join(realHome, '.codex');
const tempCodexDir = path.join(tempHome, '.codex');
if (fs.existsSync(realCodexConfig)) {
// Copy auth-related files from real ~/.codex/ into temp ~/.codex/
// (skills/ is already set up by installSkillToTempHome)
const entries = fs.readdirSync(realCodexConfig);
for (const entry of entries) {
if (entry === 'skills') continue; // don't clobber our test skills
const src = path.join(realCodexConfig, entry);
const dst = path.join(tempCodexDir, entry);
if (!fs.existsSync(dst)) {
fs.cpSync(src, dst, { recursive: true });
}
}
}
// Build codex exec command
const args = ['exec', prompt, '--json', '-s', sandbox];
// Spawn codex with temp HOME so it discovers our installed skill
const proc = Bun.spawn(['codex', ...args], {
cwd: cwd || skillDir,
stdout: 'pipe',
stderr: 'pipe',
env: {
...process.env,
HOME: tempHome,
},
});
// Race against timeout
let timedOut = false;
const timeoutId = setTimeout(() => {
timedOut = true;
proc.kill();
}, timeoutMs);
// Stream and collect JSONL from stdout
const collectedLines: string[] = [];
const stderrPromise = new Response(proc.stderr).text();
const reader = proc.stdout.getReader();
const decoder = new TextDecoder();
let buf = '';
try {
while (true) {
const { done, value } = await reader.read();
if (done) break;
buf += decoder.decode(value, { stream: true });
const lines = buf.split('\n');
buf = lines.pop() || '';
for (const line of lines) {
if (!line.trim()) continue;
collectedLines.push(line);
// Real-time progress to stderr
try {
const event = JSON.parse(line);
if (event.type === 'item.completed' && event.item) {
const item = event.item;
if (item.type === 'command_execution' && item.command) {
const elapsed = Math.round((Date.now() - startTime) / 1000);
process.stderr.write(` [codex ${elapsed}s] ran: ${item.command.slice(0, 100)}\n`);
} else if (item.type === 'agent_message' && item.text) {
const elapsed = Math.round((Date.now() - startTime) / 1000);
process.stderr.write(` [codex ${elapsed}s] message: ${item.text.slice(0, 100)}\n`);
}
}
} catch { /* skip — parseCodexJSONL will handle it later */ }
}
}
} catch { /* stream read error — fall through to exit code handling */ }
// Flush remaining buffer
if (buf.trim()) {
collectedLines.push(buf);
}
const stderr = await stderrPromise;
const exitCode = await proc.exited;
clearTimeout(timeoutId);
const durationMs = Date.now() - startTime;
// Parse all collected JSONL lines
const parsed = parseCodexJSONL(collectedLines);
// Log stderr if non-empty (may contain auth errors, etc.)
if (stderr.trim()) {
process.stderr.write(` [codex stderr] ${stderr.trim().slice(0, 200)}\n`);
}
return {
output: parsed.output,
reasoning: parsed.reasoning,
toolCalls: parsed.toolCalls,
tokens: parsed.tokens,
exitCode: timedOut ? 124 : exitCode,
durationMs,
sessionId: parsed.sessionId,
rawLines: collectedLines,
};
} finally {
// Clean up temp HOME
try { fs.rmSync(tempHome, { recursive: true, force: true }); } catch { /* non-fatal */ }
}
}
+104
View File
@@ -0,0 +1,104 @@
import { describe, test, expect } from 'bun:test';
import { parseGeminiJSONL } from './gemini-session-runner';
// Fixture: actual Gemini CLI stream-json output with tool use
const FIXTURE_LINES = [
'{"type":"init","timestamp":"2026-03-20T15:14:46.455Z","session_id":"test-session-123","model":"auto-gemini-3"}',
'{"type":"message","timestamp":"2026-03-20T15:14:46.456Z","role":"user","content":"list the files"}',
'{"type":"message","timestamp":"2026-03-20T15:14:49.650Z","role":"assistant","content":"I will list the files.","delta":true}',
'{"type":"tool_use","timestamp":"2026-03-20T15:14:49.690Z","tool_name":"run_shell_command","tool_id":"cmd_1","parameters":{"command":"ls"}}',
'{"type":"tool_result","timestamp":"2026-03-20T15:14:49.931Z","tool_id":"cmd_1","status":"success","output":"file1.ts\\nfile2.ts"}',
'{"type":"message","timestamp":"2026-03-20T15:14:51.945Z","role":"assistant","content":"Here are the files.","delta":true}',
'{"type":"result","timestamp":"2026-03-20T15:14:52.030Z","status":"success","stats":{"total_tokens":27147,"input_tokens":26928,"output_tokens":87,"cached":0,"duration_ms":5575,"tool_calls":1}}',
];
describe('parseGeminiJSONL', () => {
test('extracts session ID from init event', () => {
const parsed = parseGeminiJSONL(FIXTURE_LINES);
expect(parsed.sessionId).toBe('test-session-123');
});
test('concatenates assistant message deltas into output', () => {
const parsed = parseGeminiJSONL(FIXTURE_LINES);
expect(parsed.output).toBe('I will list the files.Here are the files.');
});
test('ignores user messages', () => {
const lines = [
'{"type":"message","role":"user","content":"this should be ignored"}',
'{"type":"message","role":"assistant","content":"this should be kept","delta":true}',
];
const parsed = parseGeminiJSONL(lines);
expect(parsed.output).toBe('this should be kept');
});
test('extracts tool names from tool_use events', () => {
const parsed = parseGeminiJSONL(FIXTURE_LINES);
expect(parsed.toolCalls).toHaveLength(1);
expect(parsed.toolCalls[0]).toBe('run_shell_command');
});
test('extracts total tokens from result stats', () => {
const parsed = parseGeminiJSONL(FIXTURE_LINES);
expect(parsed.tokens).toBe(27147);
});
test('skips malformed lines without throwing', () => {
const lines = [
'{"type":"init","session_id":"ok"}',
'this is not json',
'{"type":"message","role":"assistant","content":"hello","delta":true}',
'{incomplete json',
'{"type":"result","status":"success","stats":{"total_tokens":100}}',
];
const parsed = parseGeminiJSONL(lines);
expect(parsed.sessionId).toBe('ok');
expect(parsed.output).toBe('hello');
expect(parsed.tokens).toBe(100);
});
test('skips empty and whitespace-only lines', () => {
const lines = [
'',
' ',
'{"type":"init","session_id":"s1"}',
'\t',
'{"type":"result","status":"success","stats":{"total_tokens":50}}',
];
const parsed = parseGeminiJSONL(lines);
expect(parsed.sessionId).toBe('s1');
expect(parsed.tokens).toBe(50);
});
test('handles empty input', () => {
const parsed = parseGeminiJSONL([]);
expect(parsed.output).toBe('');
expect(parsed.toolCalls).toHaveLength(0);
expect(parsed.tokens).toBe(0);
expect(parsed.sessionId).toBeNull();
});
test('handles missing fields gracefully', () => {
const lines = [
'{"type":"init"}', // no session_id
'{"type":"message","role":"assistant"}', // no content
'{"type":"tool_use"}', // no tool_name
'{"type":"result","status":"success"}', // no stats
];
const parsed = parseGeminiJSONL(lines);
expect(parsed.sessionId).toBeNull();
expect(parsed.output).toBe('');
expect(parsed.toolCalls).toHaveLength(0);
expect(parsed.tokens).toBe(0);
});
test('handles multiple tool_use events', () => {
const lines = [
'{"type":"tool_use","tool_name":"run_shell_command","tool_id":"cmd_1","parameters":{"command":"ls"}}',
'{"type":"tool_use","tool_name":"read_file","tool_id":"cmd_2","parameters":{"path":"foo.ts"}}',
'{"type":"tool_use","tool_name":"run_shell_command","tool_id":"cmd_3","parameters":{"command":"cat bar.ts"}}',
];
const parsed = parseGeminiJSONL(lines);
expect(parsed.toolCalls).toEqual(['run_shell_command', 'read_file', 'run_shell_command']);
});
});
+201
View File
@@ -0,0 +1,201 @@
/**
* Gemini CLI subprocess runner for skill E2E testing.
*
* Spawns `gemini -p` as an independent process, parses its stream-json
* output, and returns structured results. Follows the same pattern as
* codex-session-runner.ts but adapted for the Gemini CLI.
*
* Key differences from Codex session-runner:
* - Uses `gemini -p` instead of `codex exec`
* - Output is NDJSON with event types: init, message, tool_use, tool_result, result
* - Uses `--output-format stream-json --yolo` instead of `--json -s read-only`
* - No temp HOME needed — Gemini discovers skills from `.agents/skills/` in cwd
* - Message events are streamed with `delta: true` — must concatenate
*/
import * as path from 'path';
// --- Interfaces ---
export interface GeminiResult {
output: string; // Full assistant message text (concatenated deltas)
toolCalls: string[]; // Tool names from tool_use events
tokens: number; // Total tokens used
exitCode: number; // Process exit code
durationMs: number; // Wall clock time
sessionId: string | null; // Session ID from init event
rawLines: string[]; // Raw JSONL lines for debugging
}
// --- JSONL parser ---
export interface ParsedGeminiJSONL {
output: string;
toolCalls: string[];
tokens: number;
sessionId: string | null;
}
/**
* Parse an array of JSONL lines from `gemini -p --output-format stream-json`.
* Pure function — no I/O, no side effects.
*
* Handles these Gemini event types:
* - init → extract session_id
* - message (role=assistant, delta=true) → concatenate content into output
* - tool_use → extract tool_name
* - tool_result → logged but not extracted
* - result → extract token usage from stats
*/
export function parseGeminiJSONL(lines: string[]): ParsedGeminiJSONL {
const outputParts: string[] = [];
const toolCalls: string[] = [];
let tokens = 0;
let sessionId: string | null = null;
for (const line of lines) {
if (!line.trim()) continue;
try {
const obj = JSON.parse(line);
const t = obj.type || '';
if (t === 'init') {
const sid = obj.session_id || '';
if (sid) sessionId = sid;
} else if (t === 'message') {
if (obj.role === 'assistant' && obj.content) {
outputParts.push(obj.content);
}
} else if (t === 'tool_use') {
const name = obj.tool_name || '';
if (name) toolCalls.push(name);
} else if (t === 'result') {
const stats = obj.stats || {};
tokens = (stats.total_tokens || 0);
}
} catch { /* skip malformed lines */ }
}
return {
output: outputParts.join(''),
toolCalls,
tokens,
sessionId,
};
}
// --- Main runner ---
/**
* Run a prompt via `gemini -p` and return structured results.
*
* Spawns gemini with stream-json output, parses JSONL events,
* and returns a GeminiResult. Skips gracefully if gemini binary is not found.
*/
export async function runGeminiSkill(opts: {
prompt: string; // What to ask Gemini
timeoutMs?: number; // Default 300000 (5 min)
cwd?: string; // Working directory (where .agents/skills/ lives)
}): Promise<GeminiResult> {
const {
prompt,
timeoutMs = 300_000,
cwd,
} = opts;
const startTime = Date.now();
// Check if gemini binary exists
const whichResult = Bun.spawnSync(['which', 'gemini']);
if (whichResult.exitCode !== 0) {
return {
output: 'SKIP: gemini binary not found',
toolCalls: [],
tokens: 0,
exitCode: -1,
durationMs: Date.now() - startTime,
sessionId: null,
rawLines: [],
};
}
// Build gemini command
const args = ['-p', prompt, '--output-format', 'stream-json', '--yolo'];
// Spawn gemini — uses real HOME for auth, cwd for skill discovery
const proc = Bun.spawn(['gemini', ...args], {
cwd: cwd || process.cwd(),
stdout: 'pipe',
stderr: 'pipe',
});
// Race against timeout
let timedOut = false;
const timeoutId = setTimeout(() => {
timedOut = true;
proc.kill();
}, timeoutMs);
// Stream and collect JSONL from stdout
const collectedLines: string[] = [];
const stderrPromise = new Response(proc.stderr).text();
const reader = proc.stdout.getReader();
const decoder = new TextDecoder();
let buf = '';
try {
while (true) {
const { done, value } = await reader.read();
if (done) break;
buf += decoder.decode(value, { stream: true });
const lines = buf.split('\n');
buf = lines.pop() || '';
for (const line of lines) {
if (!line.trim()) continue;
collectedLines.push(line);
// Real-time progress to stderr
try {
const event = JSON.parse(line);
if (event.type === 'tool_use' && event.tool_name) {
const elapsed = Math.round((Date.now() - startTime) / 1000);
process.stderr.write(` [gemini ${elapsed}s] tool: ${event.tool_name}\n`);
} else if (event.type === 'message' && event.role === 'assistant' && event.content) {
const elapsed = Math.round((Date.now() - startTime) / 1000);
process.stderr.write(` [gemini ${elapsed}s] message: ${event.content.slice(0, 100)}\n`);
}
} catch { /* skip — parseGeminiJSONL will handle it later */ }
}
}
} catch { /* stream read error — fall through to exit code handling */ }
// Flush remaining buffer
if (buf.trim()) {
collectedLines.push(buf);
}
const stderr = await stderrPromise;
const exitCode = await proc.exited;
clearTimeout(timeoutId);
const durationMs = Date.now() - startTime;
// Parse all collected JSONL lines
const parsed = parseGeminiJSONL(collectedLines);
// Log stderr if non-empty (may contain auth errors, etc.)
if (stderr.trim()) {
process.stderr.write(` [gemini stderr] ${stderr.trim().slice(0, 200)}\n`);
}
return {
output: parsed.output,
toolCalls: parsed.toolCalls,
tokens: parsed.tokens,
exitCode: timedOut ? 124 : exitCode,
durationMs,
sessionId: parsed.sessionId,
rawLines: collectedLines,
};
}
+19 -1
View File
@@ -57,9 +57,13 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
'review-base-branch': ['review/**'],
'review-design-lite': ['review/**', 'test/fixtures/review-eval-design-slop.*'],
// Office Hours
'office-hours-spec-review': ['office-hours/**', 'scripts/gen-skill-docs.ts'],
// Plan reviews
'plan-ceo-review': ['plan-ceo-review/**'],
'plan-ceo-review-selective': ['plan-ceo-review/**'],
'plan-ceo-review-benefits': ['plan-ceo-review/**', 'scripts/gen-skill-docs.ts'],
'plan-eng-review': ['plan-eng-review/**'],
'plan-eng-review-artifact': ['plan-eng-review/**'],
@@ -73,9 +77,17 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
// Document-release
'document-release': ['document-release/**'],
// Codex
// Codex (Claude E2E — tests /codex skill via Claude)
'codex-review': ['codex/**'],
// Codex E2E (tests skills via Codex CLI)
'codex-discover-skill': ['codex/**', '.agents/skills/**', 'test/helpers/codex-session-runner.ts'],
'codex-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'codex/**', 'test/helpers/codex-session-runner.ts'],
// Gemini E2E (tests skills via Gemini CLI)
'gemini-discover-skill': ['.agents/skills/**', 'test/helpers/gemini-session-runner.ts'],
'gemini-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'test/helpers/gemini-session-runner.ts'],
// QA bootstrap
'qa-bootstrap': ['qa/**', 'browse/src/**', 'ship/**'],
@@ -136,6 +148,10 @@ export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
'design-review/SKILL.md fix loop': ['design-review/SKILL.md', 'design-review/SKILL.md.tmpl'],
'design-consultation/SKILL.md research': ['design-consultation/SKILL.md', 'design-consultation/SKILL.md.tmpl'],
// Office Hours
'office-hours/SKILL.md spec review': ['office-hours/SKILL.md', 'office-hours/SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'office-hours/SKILL.md design sketch': ['office-hours/SKILL.md', 'office-hours/SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
// Other skills
'retro/SKILL.md instructions': ['retro/SKILL.md', 'retro/SKILL.md.tmpl'],
'qa-only/SKILL.md workflow': ['qa-only/SKILL.md', 'qa-only/SKILL.md.tmpl'],
@@ -147,6 +163,8 @@ export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
*/
export const GLOBAL_TOUCHFILES = [
'test/helpers/session-runner.ts',
'test/helpers/codex-session-runner.ts',
'test/helpers/gemini-session-runner.ts',
'test/helpers/eval-store.ts',
'test/helpers/llm-judge.ts',
'scripts/gen-skill-docs.ts',