From 6a6b2b076641dfdbec23d1e763ba1d62532ef035 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Fri, 20 Mar 2026 08:30:09 -0700 Subject: [PATCH 1/8] feat: Gemini CLI E2E tests (v0.9.2.0) (#252) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: add Gemini CLI session runner + JSONL parser Subprocess wrapper for `gemini -p --output-format stream-json --yolo` that spawns the Gemini CLI and parses NDJSON events (init, message, tool_use, tool_result, result) into a structured GeminiResult. Includes 10 unit tests for parseGeminiJSONL covering happy path, malformed input, empty input, missing fields, and multi-tool scenarios. Co-Authored-By: Claude Opus 4.6 (1M context) * feat: add Gemini CLI E2E tests Two E2E tests (gemini-discover-skill, gemini-review-findings) that verify gstack skills work when invoked by the Gemini CLI. Follows the same pattern as codex-e2e.test.ts — gated by EVALS=1 + binary availability, diff-based selection via touchfiles, eval persistence. 
- Add test/gemini-e2e.test.ts - Add Gemini entries to E2E_TOUCHFILES and GLOBAL_TOUCHFILES - Add test:gemini and test:gemini:all scripts to package.json - Add gemini-e2e.test.ts to test:evals, test:e2e, and ignore list Co-Authored-By: Claude Opus 4.6 (1M context) * chore: bump version and changelog (v0.9.2.0) Co-Authored-By: Claude Opus 4.6 --------- Co-authored-by: Claude Opus 4.6 (1M context) --- CHANGELOG.md | 8 + VERSION | 2 +- package.json | 12 +- test/gemini-e2e.test.ts | 173 ++++++++++++++++++ test/helpers/gemini-session-runner.test.ts | 104 +++++++++++ test/helpers/gemini-session-runner.ts | 201 +++++++++++++++++++++ test/helpers/touchfiles.ts | 5 + 7 files changed, 499 insertions(+), 6 deletions(-) create mode 100644 test/gemini-e2e.test.ts create mode 100644 test/helpers/gemini-session-runner.test.ts create mode 100644 test/helpers/gemini-session-runner.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index e0259c60..9e47e135 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # Changelog +## [0.9.2.0] - 2026-03-20 — Gemini CLI E2E Tests + +### Added + +- **Gemini CLI is now tested end-to-end.** Two E2E tests verify that gstack skills work when invoked by Google's Gemini CLI (`gemini -p`). The `gemini-discover-skill` test confirms skill discovery from `.agents/skills/`, and `gemini-review-findings` runs a full code review via gstack-review. Both parse Gemini's stream-json NDJSON output and track token usage. +- **Gemini JSONL parser with 10 unit tests.** `parseGeminiJSONL` handles all Gemini event types (init, message, tool_use, tool_result, result) with defensive parsing for malformed input. The parser is a pure function, independently testable without spawning the CLI. +- **`bun run test:gemini`** and **`bun run test:gemini:all`** scripts for running Gemini E2E tests independently. Gemini tests are also included in `test:evals` and `test:e2e` aggregate scripts. 
+ ## [0.9.1.0] - 2026-03-20 — Adversarial Spec Review + Skill Chaining ### Added diff --git a/VERSION b/VERSION index cf94a424..594150e3 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.9.1.0 +0.9.2.0 diff --git a/package.json b/package.json index 2bf4a238..ba18c08a 100644 --- a/package.json +++ b/package.json @@ -12,13 +12,15 @@ "gen:skill-docs": "bun run scripts/gen-skill-docs.ts", "dev": "bun run browse/src/cli.ts", "server": "bun run browse/src/server.ts", - "test": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts --ignore test/skill-llm-eval.test.ts --ignore test/skill-routing-e2e.test.ts --ignore test/codex-e2e.test.ts", - "test:evals": "EVALS=1 bun test test/skill-llm-eval.test.ts test/skill-e2e.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts", - "test:evals:all": "EVALS=1 EVALS_ALL=1 bun test test/skill-llm-eval.test.ts test/skill-e2e.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts", - "test:e2e": "EVALS=1 bun test test/skill-e2e.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts", - "test:e2e:all": "EVALS=1 EVALS_ALL=1 bun test test/skill-e2e.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts", + "test": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts --ignore test/skill-llm-eval.test.ts --ignore test/skill-routing-e2e.test.ts --ignore test/codex-e2e.test.ts --ignore test/gemini-e2e.test.ts", + "test:evals": "EVALS=1 bun test test/skill-llm-eval.test.ts test/skill-e2e.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts", + "test:evals:all": "EVALS=1 EVALS_ALL=1 bun test test/skill-llm-eval.test.ts test/skill-e2e.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts", + "test:e2e": "EVALS=1 bun test test/skill-e2e.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts", + "test:e2e:all": "EVALS=1 EVALS_ALL=1 bun test test/skill-e2e.test.ts test/skill-routing-e2e.test.ts 
test/codex-e2e.test.ts test/gemini-e2e.test.ts", "test:codex": "EVALS=1 bun test test/codex-e2e.test.ts", "test:codex:all": "EVALS=1 EVALS_ALL=1 bun test test/codex-e2e.test.ts", + "test:gemini": "EVALS=1 bun test test/gemini-e2e.test.ts", + "test:gemini:all": "EVALS=1 EVALS_ALL=1 bun test test/gemini-e2e.test.ts", "skill:check": "bun run scripts/skill-check.ts", "dev:skill": "bun run scripts/dev-skill.ts", "start": "bun run browse/src/server.ts", diff --git a/test/gemini-e2e.test.ts b/test/gemini-e2e.test.ts new file mode 100644 index 00000000..bd69919f --- /dev/null +++ b/test/gemini-e2e.test.ts @@ -0,0 +1,173 @@ +/** + * Gemini CLI E2E tests — verify skills work when invoked by Gemini CLI. + * + * Spawns `gemini -p` with stream-json output in the repo root (where + * .agents/skills/ already exists), parses JSONL events, and validates + * structured results. Follows the same pattern as codex-e2e.test.ts. + * + * Prerequisites: + * - `gemini` binary installed (npm install -g @google/gemini-cli) + * - Gemini authenticated via ~/.gemini/ config or GEMINI_API_KEY env var + * - EVALS=1 env var set (same gate as Claude E2E tests) + * + * Skips gracefully when prerequisites are not met. + */ + +import { describe, test, expect, afterAll } from 'bun:test'; +import { runGeminiSkill } from './helpers/gemini-session-runner'; +import type { GeminiResult } from './helpers/gemini-session-runner'; +import { EvalCollector } from './helpers/eval-store'; +import { selectTests, detectBaseBranch, getChangedFiles, GLOBAL_TOUCHFILES } from './helpers/touchfiles'; +import * as path from 'path'; + +const ROOT = path.resolve(import.meta.dir, '..'); + +// --- Prerequisites check --- + +const GEMINI_AVAILABLE = (() => { + try { + const result = Bun.spawnSync(['which', 'gemini']); + return result.exitCode === 0; + } catch { return false; } +})(); + +const evalsEnabled = !!process.env.EVALS; + +// Skip all tests if gemini is not available or EVALS is not set. 
+const SKIP = !GEMINI_AVAILABLE || !evalsEnabled; + +const describeGemini = SKIP ? describe.skip : describe; + +// Log why we're skipping (helpful for debugging CI) +if (!evalsEnabled) { + // Silent — same as Claude E2E tests, EVALS=1 required +} else if (!GEMINI_AVAILABLE) { + process.stderr.write('\nGemini E2E: SKIPPED — gemini binary not found (install: npm i -g @google/gemini-cli)\n'); +} + +// --- Diff-based test selection --- + +// Gemini E2E touchfiles — keyed by test name, same pattern as Codex E2E +const GEMINI_E2E_TOUCHFILES: Record<string, string[]> = { + 'gemini-discover-skill': ['.agents/skills/**', 'test/helpers/gemini-session-runner.ts'], + 'gemini-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'test/helpers/gemini-session-runner.ts'], +}; + +let selectedTests: string[] | null = null; // null = run all + +if (evalsEnabled && !process.env.EVALS_ALL) { + const baseBranch = process.env.EVALS_BASE + || detectBaseBranch(ROOT) + || 'main'; + const changedFiles = getChangedFiles(baseBranch, ROOT); + + if (changedFiles.length > 0) { + const selection = selectTests(changedFiles, GEMINI_E2E_TOUCHFILES, GLOBAL_TOUCHFILES); + selectedTests = selection.selected; + process.stderr.write(`\nGemini E2E selection (${selection.reason}): ${selection.selected.length}/${Object.keys(GEMINI_E2E_TOUCHFILES).length} tests\n`); + if (selection.skipped.length > 0) { + process.stderr.write(` Skipped: ${selection.skipped.join(', ')}\n`); + } + process.stderr.write('\n'); + } + // If changedFiles is empty (e.g., on main branch), selectedTests stays null -> run all +} + +/** Skip an individual test if not selected by diff-based selection. */ +function testIfSelected(testName: string, fn: () => Promise<void>, timeout: number) { + const shouldRun = selectedTests === null || selectedTests.includes(testName); + (shouldRun ? test : test.skip)(testName, fn, timeout); +} + +// --- Eval result collector --- + +const evalCollector = evalsEnabled && !SKIP ? 
new EvalCollector('e2e-gemini') : null; + +/** DRY helper to record a Gemini E2E test result into the eval collector. */ +function recordGeminiE2E(name: string, result: GeminiResult, passed: boolean) { + evalCollector?.addTest({ + name, + suite: 'gemini-e2e', + tier: 'e2e', + passed, + duration_ms: result.durationMs, + cost_usd: 0, // Gemini doesn't report cost in USD; tokens are tracked + output: result.output?.slice(0, 2000), + turns_used: result.toolCalls.length, // approximate: tool calls as turns + exit_reason: result.exitCode === 0 ? 'success' : `exit_code_${result.exitCode}`, + }); +} + +/** Print cost summary after a Gemini E2E test. */ +function logGeminiCost(label: string, result: GeminiResult) { + const durationSec = Math.round(result.durationMs / 1000); + console.log(`${label}: ${result.tokens} tokens, ${result.toolCalls.length} tool calls, ${durationSec}s`); +} + +// Finalize eval results on exit +afterAll(async () => { + if (evalCollector) { + await evalCollector.finalize(); + } +}); + +// --- Tests --- + +describeGemini('Gemini E2E', () => { + + testIfSelected('gemini-discover-skill', async () => { + // Run Gemini in the repo root where .agents/skills/ exists + const result = await runGeminiSkill({ + prompt: 'List any skills or instructions you have available. 
Just list the names.', + timeoutMs: 60_000, + cwd: ROOT, + }); + + logGeminiCost('gemini-discover-skill', result); + + // Gemini should have produced some output + const passed = result.exitCode === 0 && result.output.length > 0; + recordGeminiE2E('gemini-discover-skill', result, passed); + + expect(result.exitCode).toBe(0); + expect(result.output.length).toBeGreaterThan(0); + // The output should reference skills in some form + const outputLower = result.output.toLowerCase(); + expect( + outputLower.includes('review') || outputLower.includes('gstack') || outputLower.includes('skill'), + ).toBe(true); + }, 120_000); + + testIfSelected('gemini-review-findings', async () => { + // Run gstack-review skill via Gemini on this repo + const result = await runGeminiSkill({ + prompt: 'Run the gstack-review skill on this repository. Review the current branch diff and report your findings.', + timeoutMs: 540_000, + cwd: ROOT, + }); + + logGeminiCost('gemini-review-findings', result); + + // Should produce structured review-like output + const output = result.output; + const passed = result.exitCode === 0 && output.length > 50; + recordGeminiE2E('gemini-review-findings', result, passed); + + expect(result.exitCode).toBe(0); + expect(output.length).toBeGreaterThan(50); + + // Review output should contain some review-like content + const outputLower = output.toLowerCase(); + const hasReviewContent = + outputLower.includes('finding') || + outputLower.includes('issue') || + outputLower.includes('review') || + outputLower.includes('change') || + outputLower.includes('diff') || + outputLower.includes('clean') || + outputLower.includes('no issues') || + outputLower.includes('p1') || + outputLower.includes('p2'); + expect(hasReviewContent).toBe(true); + }, 600_000); +}); diff --git a/test/helpers/gemini-session-runner.test.ts b/test/helpers/gemini-session-runner.test.ts new file mode 100644 index 00000000..1bb9a393 --- /dev/null +++ b/test/helpers/gemini-session-runner.test.ts @@ -0,0 
+1,104 @@ +import { describe, test, expect } from 'bun:test'; +import { parseGeminiJSONL } from './gemini-session-runner'; + +// Fixture: actual Gemini CLI stream-json output with tool use +const FIXTURE_LINES = [ + '{"type":"init","timestamp":"2026-03-20T15:14:46.455Z","session_id":"test-session-123","model":"auto-gemini-3"}', + '{"type":"message","timestamp":"2026-03-20T15:14:46.456Z","role":"user","content":"list the files"}', + '{"type":"message","timestamp":"2026-03-20T15:14:49.650Z","role":"assistant","content":"I will list the files.","delta":true}', + '{"type":"tool_use","timestamp":"2026-03-20T15:14:49.690Z","tool_name":"run_shell_command","tool_id":"cmd_1","parameters":{"command":"ls"}}', + '{"type":"tool_result","timestamp":"2026-03-20T15:14:49.931Z","tool_id":"cmd_1","status":"success","output":"file1.ts\\nfile2.ts"}', + '{"type":"message","timestamp":"2026-03-20T15:14:51.945Z","role":"assistant","content":"Here are the files.","delta":true}', + '{"type":"result","timestamp":"2026-03-20T15:14:52.030Z","status":"success","stats":{"total_tokens":27147,"input_tokens":26928,"output_tokens":87,"cached":0,"duration_ms":5575,"tool_calls":1}}', +]; + +describe('parseGeminiJSONL', () => { + test('extracts session ID from init event', () => { + const parsed = parseGeminiJSONL(FIXTURE_LINES); + expect(parsed.sessionId).toBe('test-session-123'); + }); + + test('concatenates assistant message deltas into output', () => { + const parsed = parseGeminiJSONL(FIXTURE_LINES); + expect(parsed.output).toBe('I will list the files.Here are the files.'); + }); + + test('ignores user messages', () => { + const lines = [ + '{"type":"message","role":"user","content":"this should be ignored"}', + '{"type":"message","role":"assistant","content":"this should be kept","delta":true}', + ]; + const parsed = parseGeminiJSONL(lines); + expect(parsed.output).toBe('this should be kept'); + }); + + test('extracts tool names from tool_use events', () => { + const parsed = 
parseGeminiJSONL(FIXTURE_LINES); + expect(parsed.toolCalls).toHaveLength(1); + expect(parsed.toolCalls[0]).toBe('run_shell_command'); + }); + + test('extracts total tokens from result stats', () => { + const parsed = parseGeminiJSONL(FIXTURE_LINES); + expect(parsed.tokens).toBe(27147); + }); + + test('skips malformed lines without throwing', () => { + const lines = [ + '{"type":"init","session_id":"ok"}', + 'this is not json', + '{"type":"message","role":"assistant","content":"hello","delta":true}', + '{incomplete json', + '{"type":"result","status":"success","stats":{"total_tokens":100}}', + ]; + const parsed = parseGeminiJSONL(lines); + expect(parsed.sessionId).toBe('ok'); + expect(parsed.output).toBe('hello'); + expect(parsed.tokens).toBe(100); + }); + + test('skips empty and whitespace-only lines', () => { + const lines = [ + '', + ' ', + '{"type":"init","session_id":"s1"}', + '\t', + '{"type":"result","status":"success","stats":{"total_tokens":50}}', + ]; + const parsed = parseGeminiJSONL(lines); + expect(parsed.sessionId).toBe('s1'); + expect(parsed.tokens).toBe(50); + }); + + test('handles empty input', () => { + const parsed = parseGeminiJSONL([]); + expect(parsed.output).toBe(''); + expect(parsed.toolCalls).toHaveLength(0); + expect(parsed.tokens).toBe(0); + expect(parsed.sessionId).toBeNull(); + }); + + test('handles missing fields gracefully', () => { + const lines = [ + '{"type":"init"}', // no session_id + '{"type":"message","role":"assistant"}', // no content + '{"type":"tool_use"}', // no tool_name + '{"type":"result","status":"success"}', // no stats + ]; + const parsed = parseGeminiJSONL(lines); + expect(parsed.sessionId).toBeNull(); + expect(parsed.output).toBe(''); + expect(parsed.toolCalls).toHaveLength(0); + expect(parsed.tokens).toBe(0); + }); + + test('handles multiple tool_use events', () => { + const lines = [ + '{"type":"tool_use","tool_name":"run_shell_command","tool_id":"cmd_1","parameters":{"command":"ls"}}', + 
'{"type":"tool_use","tool_name":"read_file","tool_id":"cmd_2","parameters":{"path":"foo.ts"}}', + '{"type":"tool_use","tool_name":"run_shell_command","tool_id":"cmd_3","parameters":{"command":"cat bar.ts"}}', + ]; + const parsed = parseGeminiJSONL(lines); + expect(parsed.toolCalls).toEqual(['run_shell_command', 'read_file', 'run_shell_command']); + }); +}); diff --git a/test/helpers/gemini-session-runner.ts b/test/helpers/gemini-session-runner.ts new file mode 100644 index 00000000..06393c38 --- /dev/null +++ b/test/helpers/gemini-session-runner.ts @@ -0,0 +1,201 @@ +/** + * Gemini CLI subprocess runner for skill E2E testing. + * + * Spawns `gemini -p` as an independent process, parses its stream-json + * output, and returns structured results. Follows the same pattern as + * codex-session-runner.ts but adapted for the Gemini CLI. + * + * Key differences from Codex session-runner: + * - Uses `gemini -p` instead of `codex exec` + * - Output is NDJSON with event types: init, message, tool_use, tool_result, result + * - Uses `--output-format stream-json --yolo` instead of `--json -s read-only` + * - No temp HOME needed — Gemini discovers skills from `.agents/skills/` in cwd + * - Message events are streamed with `delta: true` — must concatenate + */ + +import * as path from 'path'; + +// --- Interfaces --- + +export interface GeminiResult { + output: string; // Full assistant message text (concatenated deltas) + toolCalls: string[]; // Tool names from tool_use events + tokens: number; // Total tokens used + exitCode: number; // Process exit code + durationMs: number; // Wall clock time + sessionId: string | null; // Session ID from init event + rawLines: string[]; // Raw JSONL lines for debugging +} + +// --- JSONL parser --- + +export interface ParsedGeminiJSONL { + output: string; + toolCalls: string[]; + tokens: number; + sessionId: string | null; +} + +/** + * Parse an array of JSONL lines from `gemini -p --output-format stream-json`. 
+ * Pure function — no I/O, no side effects. + * + * Handles these Gemini event types: + * - init → extract session_id + * - message (role=assistant, delta=true) → concatenate content into output + * - tool_use → extract tool_name + * - tool_result → ignored (nothing is extracted) + * - result → extract token usage from stats + */ +export function parseGeminiJSONL(lines: string[]): ParsedGeminiJSONL { + const outputParts: string[] = []; + const toolCalls: string[] = []; + let tokens = 0; + let sessionId: string | null = null; + + for (const line of lines) { + if (!line.trim()) continue; + try { + const obj = JSON.parse(line); + const t = obj.type || ''; + + if (t === 'init') { + const sid = obj.session_id || ''; + if (sid) sessionId = sid; + } else if (t === 'message') { + if (obj.role === 'assistant' && obj.content) { + outputParts.push(obj.content); + } + } else if (t === 'tool_use') { + const name = obj.tool_name || ''; + if (name) toolCalls.push(name); + } else if (t === 'result') { + const stats = obj.stats || {}; + tokens = (stats.total_tokens || 0); + } + } catch { /* skip malformed lines */ } + } + + return { + output: outputParts.join(''), + toolCalls, + tokens, + sessionId, + }; +} + +// --- Main runner --- + +/** + * Run a prompt via `gemini -p` and return structured results. + * + * Spawns gemini with stream-json output, parses JSONL events, + * and returns a GeminiResult. If the gemini binary is not found, returns exitCode -1 with a "SKIP: ..." output instead of throwing. 
+ */ +export async function runGeminiSkill(opts: { + prompt: string; // What to ask Gemini + timeoutMs?: number; // Default 300000 (5 min) + cwd?: string; // Working directory (where .agents/skills/ lives) +}): Promise<GeminiResult> { + const { + prompt, + timeoutMs = 300_000, + cwd, + } = opts; + + const startTime = Date.now(); + + // Check if gemini binary exists + const whichResult = Bun.spawnSync(['which', 'gemini']); + if (whichResult.exitCode !== 0) { + return { + output: 'SKIP: gemini binary not found', + toolCalls: [], + tokens: 0, + exitCode: -1, + durationMs: Date.now() - startTime, + sessionId: null, + rawLines: [], + }; + } + + // Build gemini command + const args = ['-p', prompt, '--output-format', 'stream-json', '--yolo']; + + // Spawn gemini — uses real HOME for auth, cwd for skill discovery + const proc = Bun.spawn(['gemini', ...args], { + cwd: cwd || process.cwd(), + stdout: 'pipe', + stderr: 'pipe', + }); + + // Race against timeout + let timedOut = false; + const timeoutId = setTimeout(() => { + timedOut = true; + proc.kill(); + }, timeoutMs); + + // Stream and collect JSONL from stdout + const collectedLines: string[] = []; + const stderrPromise = new Response(proc.stderr).text(); + + const reader = proc.stdout.getReader(); + const decoder = new TextDecoder(); + let buf = ''; + + try { + while (true) { + const { done, value } = await reader.read(); + if (done) break; + buf += decoder.decode(value, { stream: true }); + const lines = buf.split('\n'); + buf = lines.pop() || ''; + for (const line of lines) { + if (!line.trim()) continue; + collectedLines.push(line); + + // Real-time progress to stderr + try { + const event = JSON.parse(line); + if (event.type === 'tool_use' && event.tool_name) { + const elapsed = Math.round((Date.now() - startTime) / 1000); + process.stderr.write(` [gemini ${elapsed}s] tool: ${event.tool_name}\n`); + } else if (event.type === 'message' && event.role === 'assistant' && event.content) { + const elapsed = Math.round((Date.now() - 
startTime) / 1000); + process.stderr.write(` [gemini ${elapsed}s] message: ${event.content.slice(0, 100)}\n`); + } + } catch { /* skip — parseGeminiJSONL will handle it later */ } + } + } + } catch { /* stream read error — fall through to exit code handling */ } + + // Flush remaining buffer + if (buf.trim()) { + collectedLines.push(buf); + } + + const stderr = await stderrPromise; + const exitCode = await proc.exited; + clearTimeout(timeoutId); + + const durationMs = Date.now() - startTime; + + // Parse all collected JSONL lines + const parsed = parseGeminiJSONL(collectedLines); + + // Log stderr if non-empty (may contain auth errors, etc.) + if (stderr.trim()) { + process.stderr.write(` [gemini stderr] ${stderr.trim().slice(0, 200)}\n`); + } + + return { + output: parsed.output, + toolCalls: parsed.toolCalls, + tokens: parsed.tokens, + exitCode: timedOut ? 124 : exitCode, + durationMs, + sessionId: parsed.sessionId, + rawLines: collectedLines, + }; +} diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index c516a3b5..1246a413 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -84,6 +84,10 @@ export const E2E_TOUCHFILES: Record = { 'codex-discover-skill': ['codex/**', '.agents/skills/**', 'test/helpers/codex-session-runner.ts'], 'codex-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'codex/**', 'test/helpers/codex-session-runner.ts'], + // Gemini E2E (tests skills via Gemini CLI) + 'gemini-discover-skill': ['.agents/skills/**', 'test/helpers/gemini-session-runner.ts'], + 'gemini-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'test/helpers/gemini-session-runner.ts'], + // QA bootstrap 'qa-bootstrap': ['qa/**', 'browse/src/**', 'ship/**'], @@ -160,6 +164,7 @@ export const LLM_JUDGE_TOUCHFILES: Record = { export const GLOBAL_TOUCHFILES = [ 'test/helpers/session-runner.ts', 'test/helpers/codex-session-runner.ts', + 'test/helpers/gemini-session-runner.ts', 'test/helpers/eval-store.ts', 
'test/helpers/llm-judge.ts', 'scripts/gen-skill-docs.ts', From d7c732b282845214a5a0ab436f059c8767039795 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Fri, 20 Mar 2026 12:22:11 -0700 Subject: [PATCH 2/8] =?UTF-8?q?fix:=20Windows=20support=20=E2=80=94=20Node?= =?UTF-8?q?.js=20server=20fallback=20for=20Playwright=20(#255)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: Windows support — Node.js server fallback for Playwright Setup hangs on Windows 11 because Bun's child_process can't handle Playwright's --remote-debugging-pipe (fd 3/4 pipe handles). Fall back to Node.js on Windows for both the setup verification and server runtime. macOS/Linux completely unaffected — all Windows code behind IS_WINDOWS / process.platform === 'win32' guards. Based on community PR #194 by @sozairali. Fixed sed -i portability (perl -pi -e) in build-node-server.sh for macOS compatibility. Co-Authored-By: Claude Opus 4.6 (1M context) * fix: cross-platform path handling for Windows compatibility Replace hardcoded '/tmp' and 'dir + "/"' path checks with platform-aware constants from new platform.ts module. On macOS/Linux this evaluates identically ('/tmp', '/'); on Windows it uses os.tmpdir() and path.sep. Zero behavior change on Unix. 
Co-Authored-By: Claude Opus 4.6 (1M context) * test: add tests for Windows polyfill, platform constants, and Node server resolution Co-Authored-By: Claude Opus 4.6 (1M context) * docs: Windows support in README + CHANGELOG (v0.9.1.1) Co-Authored-By: Claude Opus 4.6 (1M context) * chore: bump version and changelog (v0.9.3.0) Co-Authored-By: Claude Opus 4.6 --------- Co-authored-by: Claude Opus 4.6 (1M context) --- .agents/skills/gstack-browse/SKILL.md | 2 +- .agents/skills/gstack/SKILL.md | 2 +- CHANGELOG.md | 12 +++ README.md | 4 +- SKILL.md | 2 +- VERSION | 2 +- browse/SKILL.md | 2 +- browse/scripts/build-node-server.sh | 48 ++++++++++++ browse/src/bun-polyfill.cjs | 109 ++++++++++++++++++++++++++ browse/src/cli.ts | 42 +++++++++- browse/src/meta-commands.ts | 11 +-- browse/src/platform.ts | 17 ++++ browse/src/read-commands.ts | 5 +- browse/src/snapshot.ts | 9 ++- browse/src/write-commands.ts | 5 +- browse/test/bun-polyfill.test.ts | 72 +++++++++++++++++ browse/test/config.test.ts | 30 +++++++ browse/test/platform.test.ts | 37 +++++++++ package.json | 2 +- setup | 46 +++++++++-- 20 files changed, 430 insertions(+), 29 deletions(-) create mode 100755 browse/scripts/build-node-server.sh create mode 100644 browse/src/bun-polyfill.cjs create mode 100644 browse/src/platform.ts create mode 100644 browse/test/bun-polyfill.test.ts create mode 100644 browse/test/platform.test.ts diff --git a/.agents/skills/gstack-browse/SKILL.md b/.agents/skills/gstack-browse/SKILL.md index 6f634f12..db405e47 100644 --- a/.agents/skills/gstack-browse/SKILL.md +++ b/.agents/skills/gstack-browse/SKILL.md @@ -358,7 +358,7 @@ The snapshot is your primary tool for understanding and interacting with pages. 
-s --selector Scope to CSS selector -D --diff Unified diff against previous snapshot (first call stores baseline) -a --annotate Annotated screenshot with red overlay boxes and ref labels --o --output Output path for annotated screenshot (default: /tmp/browse-annotated.png) +-o --output Output path for annotated screenshot (default: /browse-annotated.png) -C --cursor-interactive Cursor-interactive elements (@c refs — divs with pointer, onclick) ``` diff --git a/.agents/skills/gstack/SKILL.md b/.agents/skills/gstack/SKILL.md index 3b4f93b5..4bb9ba17 100644 --- a/.agents/skills/gstack/SKILL.md +++ b/.agents/skills/gstack/SKILL.md @@ -486,7 +486,7 @@ The snapshot is your primary tool for understanding and interacting with pages. -s --selector Scope to CSS selector -D --diff Unified diff against previous snapshot (first call stores baseline) -a --annotate Annotated screenshot with red overlay boxes and ref labels --o --output Output path for annotated screenshot (default: /tmp/browse-annotated.png) +-o --output Output path for annotated screenshot (default: /browse-annotated.png) -C --cursor-interactive Cursor-interactive elements (@c refs — divs with pointer, onclick) ``` diff --git a/CHANGELOG.md b/CHANGELOG.md index 9e47e135..b4e8261c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,17 @@ # Changelog +## [0.9.3.0] - 2026-03-20 — Windows Support + +### Fixed + +- **gstack now works on Windows 11.** Setup no longer hangs when verifying Playwright, and the browse server automatically falls back to Node.js to work around a Bun pipe-handling bug on Windows ([bun#4253](https://github.com/oven-sh/bun/issues/4253)). Just make sure Node.js is installed alongside Bun. macOS and Linux are completely unaffected. +- **Path handling works on Windows.** All hardcoded `/tmp` paths and Unix-style path separators now use platform-aware equivalents via a new `platform.ts` module. Path traversal protection works correctly with Windows backslash separators. 
+ +### Added + +- **Bun API polyfill for Node.js.** When the browse server runs under Node.js on Windows, a compatibility layer provides `Bun.serve()`, `Bun.spawn()`, `Bun.spawnSync()`, and `Bun.sleep()` equivalents. Fully tested. +- **Node server build script.** `browse/scripts/build-node-server.sh` transpiles the server for Node.js, stubs `bun:sqlite`, and injects the polyfill — all automated during `bun run build`. + ## [0.9.2.0] - 2026-03-20 — Gemini CLI E2E Tests ### Added diff --git a/README.md b/README.md index b7ddb7d1..07047797 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ Expect first useful run in under 5 minutes on any repo with tests already set up ## Install — takes 30 seconds -**Requirements:** [Claude Code](https://docs.anthropic.com/en/docs/claude-code), [Git](https://git-scm.com/), [Bun](https://bun.sh/) v1.0+ +**Requirements:** [Claude Code](https://docs.anthropic.com/en/docs/claude-code), [Git](https://git-scm.com/), [Bun](https://bun.sh/) v1.0+, [Node.js](https://nodejs.org/) (Windows only) ### Step 1: Install on your machine @@ -238,6 +238,8 @@ Data is stored in [Supabase](https://supabase.com) (open source Firebase alterna **Stale install?** Run `/gstack-upgrade` — or set `auto_upgrade: true` in `~/.gstack/config.yaml` +**Windows users:** gstack works on Windows 11 via Git Bash or WSL. Node.js is required in addition to Bun — Bun has a known bug with Playwright's pipe transport on Windows ([bun#4253](https://github.com/oven-sh/bun/issues/4253)). The browse server automatically falls back to Node.js. Make sure both `bun` and `node` are on your PATH. + **Claude says it can't see the skills?** Make sure your project's `CLAUDE.md` has a gstack section. Add this: ``` diff --git a/SKILL.md b/SKILL.md index fe66b618..46b7a558 100644 --- a/SKILL.md +++ b/SKILL.md @@ -492,7 +492,7 @@ The snapshot is your primary tool for understanding and interacting with pages. 
-s --selector Scope to CSS selector -D --diff Unified diff against previous snapshot (first call stores baseline) -a --annotate Annotated screenshot with red overlay boxes and ref labels --o --output Output path for annotated screenshot (default: /tmp/browse-annotated.png) +-o --output Output path for annotated screenshot (default: /browse-annotated.png) -C --cursor-interactive Cursor-interactive elements (@c refs — divs with pointer, onclick) ``` diff --git a/VERSION b/VERSION index 594150e3..947d2886 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.9.2.0 +0.9.3.0 diff --git a/browse/SKILL.md b/browse/SKILL.md index 8782ccbf..2acf60b0 100644 --- a/browse/SKILL.md +++ b/browse/SKILL.md @@ -364,7 +364,7 @@ The snapshot is your primary tool for understanding and interacting with pages. -s --selector Scope to CSS selector -D --diff Unified diff against previous snapshot (first call stores baseline) -a --annotate Annotated screenshot with red overlay boxes and ref labels --o --output Output path for annotated screenshot (default: /tmp/browse-annotated.png) +-o --output Output path for annotated screenshot (default: /browse-annotated.png) -C --cursor-interactive Cursor-interactive elements (@c refs — divs with pointer, onclick) ``` diff --git a/browse/scripts/build-node-server.sh b/browse/scripts/build-node-server.sh new file mode 100755 index 00000000..539e391c --- /dev/null +++ b/browse/scripts/build-node-server.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash +# Build a Node.js-compatible server bundle for Windows. +# +# On Windows, Bun can't launch or connect to Playwright's Chromium +# (oven-sh/bun#4253, #9911). This script produces a server bundle +# that runs under Node.js with Bun API polyfills. + +set -e + +GSTACK_DIR="$(cd "$(dirname "$0")/../.." && pwd)" +SRC_DIR="$GSTACK_DIR/browse/src" +DIST_DIR="$GSTACK_DIR/browse/dist" + +echo "Building Node-compatible server bundle..." 
+ +# Step 1: Transpile server.ts to a single .mjs bundle (externalize runtime deps) +bun build "$SRC_DIR/server.ts" \ + --target=node \ + --outfile "$DIST_DIR/server-node.mjs" \ + --external playwright \ + --external playwright-core \ + --external diff \ + --external "bun:sqlite" + +# Step 2: Post-process +# Replace import.meta.dir with a resolvable reference +perl -pi -e 's/import\.meta\.dir/__browseNodeSrcDir/g' "$DIST_DIR/server-node.mjs" +# Stub out bun:sqlite (macOS-only cookie import, not needed on Windows) +perl -pi -e 's|import { Database } from "bun:sqlite";|const Database = null; // bun:sqlite stubbed on Node|g' "$DIST_DIR/server-node.mjs" + +# Step 3: Create the final file with polyfill header injected after the first line +{ + head -1 "$DIST_DIR/server-node.mjs" + echo '// ── Windows Node.js compatibility (auto-generated) ──' + echo 'import { fileURLToPath as _ftp } from "node:url";' + echo 'import { dirname as _dn } from "node:path";' + echo 'const __browseNodeSrcDir = _dn(_dn(_ftp(import.meta.url))) + "/src";' + echo '{ const _r = createRequire(import.meta.url); _r("./bun-polyfill.cjs"); }' + echo '// ── end compatibility ──' + tail -n +2 "$DIST_DIR/server-node.mjs" +} > "$DIST_DIR/server-node.tmp.mjs" + +mv "$DIST_DIR/server-node.tmp.mjs" "$DIST_DIR/server-node.mjs" + +# Step 4: Copy polyfill to dist/ +cp "$SRC_DIR/bun-polyfill.cjs" "$DIST_DIR/bun-polyfill.cjs" + +echo "Node server bundle ready: $DIST_DIR/server-node.mjs" diff --git a/browse/src/bun-polyfill.cjs b/browse/src/bun-polyfill.cjs new file mode 100644 index 00000000..e0ada11b --- /dev/null +++ b/browse/src/bun-polyfill.cjs @@ -0,0 +1,109 @@ +/** + * Bun API polyfill for Node.js — Windows compatibility layer. + * + * On Windows, Bun can't launch or connect to Playwright's Chromium + * (oven-sh/bun#4253, #9911). The browse server falls back to running + * under Node.js with this polyfill providing Bun API equivalents. + * + * Loaded via --require before the transpiled server bundle. 
+ */ + +'use strict'; + +const http = require('http'); +const { spawnSync, spawn } = require('child_process'); + +globalThis.Bun = { + serve(options) { + const { port, hostname = '127.0.0.1', fetch } = options; + + const server = http.createServer(async (nodeReq, nodeRes) => { + try { + const url = `http://${hostname}:${port}${nodeReq.url}`; + const headers = new Headers(); + for (const [key, val] of Object.entries(nodeReq.headers)) { + if (val) headers.set(key, Array.isArray(val) ? val[0] : val); + } + + let body = null; + if (nodeReq.method !== 'GET' && nodeReq.method !== 'HEAD') { + body = await new Promise((resolve) => { + const chunks = []; + nodeReq.on('data', (chunk) => chunks.push(chunk)); + nodeReq.on('end', () => resolve(Buffer.concat(chunks))); + }); + } + + const webReq = new Request(url, { + method: nodeReq.method, + headers, + body, + }); + + const webRes = await fetch(webReq); + + nodeRes.statusCode = webRes.status; + webRes.headers.forEach((val, key) => { + nodeRes.setHeader(key, val); + }); + + const resBody = await webRes.arrayBuffer(); + nodeRes.end(Buffer.from(resBody)); + } catch (err) { + nodeRes.statusCode = 500; + nodeRes.end(JSON.stringify({ error: err.message })); + } + }); + + server.listen(port, hostname); + + return { + stop() { server.close(); }, + port, + hostname, + }; + }, + + spawnSync(cmd, options = {}) { + const [command, ...args] = cmd; + const result = spawnSync(command, args, { + stdio: [ + options.stdin || 'pipe', + options.stdout === 'pipe' ? 'pipe' : 'ignore', + options.stderr === 'pipe' ? 
'pipe' : 'ignore', + ], + timeout: options.timeout, + env: options.env, + cwd: options.cwd, + }); + + return { + exitCode: result.status, + stdout: result.stdout || Buffer.from(''), + stderr: result.stderr || Buffer.from(''), + }; + }, + + spawn(cmd, options = {}) { + const [command, ...args] = cmd; + const stdio = options.stdio || ['pipe', 'pipe', 'pipe']; + const proc = spawn(command, args, { + stdio, + env: options.env, + cwd: options.cwd, + }); + + return { + pid: proc.pid, + stdout: proc.stdout, + stderr: proc.stderr, + stdin: proc.stdin, + unref() { proc.unref(); }, + kill(signal) { proc.kill(signal); }, + }; + }, + + sleep(ms) { + return new Promise((resolve) => setTimeout(resolve, ms)); + }, +}; diff --git a/browse/src/cli.ts b/browse/src/cli.ts index 7d6eacdf..830b2e7c 100644 --- a/browse/src/cli.ts +++ b/browse/src/cli.ts @@ -14,7 +14,8 @@ import * as path from 'path'; import { resolveConfig, ensureStateDir, readVersionHash } from './config'; const config = resolveConfig(); -const MAX_START_WAIT = 8000; // 8 seconds to start +const IS_WINDOWS = process.platform === 'win32'; +const MAX_START_WAIT = IS_WINDOWS ? 15000 : 8000; // Node+Chromium takes longer on Windows export function resolveServerScript( env: Record = process.env, @@ -26,7 +27,9 @@ export function resolveServerScript( } // Dev mode: cli.ts runs directly from browse/src - if (metaDir.startsWith('/') && !metaDir.includes('$bunfs')) { + // On macOS/Linux, import.meta.dir starts with / + // On Windows, it starts with a drive letter (e.g., C:\...) + if (!metaDir.includes('$bunfs')) { const direct = path.resolve(metaDir, 'server.ts'); if (fs.existsSync(direct)) { return direct; @@ -48,6 +51,31 @@ export function resolveServerScript( const SERVER_SCRIPT = resolveServerScript(); +/** + * On Windows, resolve the Node.js-compatible server bundle. + * Falls back to null if not found (server will use Bun instead). 
+ */ +export function resolveNodeServerScript( + metaDir: string = import.meta.dir, + execPath: string = process.execPath +): string | null { + // Dev mode + if (!metaDir.includes('$bunfs')) { + const distScript = path.resolve(metaDir, '..', 'dist', 'server-node.mjs'); + if (fs.existsSync(distScript)) return distScript; + } + + // Compiled binary: browse/dist/browse → browse/dist/server-node.mjs + if (execPath) { + const adjacent = path.resolve(path.dirname(execPath), 'server-node.mjs'); + if (fs.existsSync(adjacent)) return adjacent; + } + + return null; +} + +const NODE_SERVER_SCRIPT = IS_WINDOWS ? resolveNodeServerScript() : null; + interface ServerState { pid: number; port: number; @@ -139,8 +167,14 @@ async function startServer(): Promise { // Clean up stale state file try { fs.unlinkSync(config.stateFile); } catch {} - // Start server as detached background process - const proc = Bun.spawn(['bun', 'run', SERVER_SCRIPT], { + // Start server as detached background process. + // On Windows, Bun can't launch/connect to Playwright's Chromium (oven-sh/bun#4253, #9911). + // Fall back to running the server under Node.js with Bun API polyfills. + const useNode = IS_WINDOWS && NODE_SERVER_SCRIPT; + const serverCmd = useNode + ? 
['node', NODE_SERVER_SCRIPT] + : ['bun', 'run', SERVER_SCRIPT]; + const proc = Bun.spawn(serverCmd, { stdio: ['ignore', 'pipe', 'pipe'], env: { ...process.env, BROWSE_STATE_FILE: config.stateFile }, }); diff --git a/browse/src/meta-commands.ts b/browse/src/meta-commands.ts index 049ed69a..f1ebdea8 100644 --- a/browse/src/meta-commands.ts +++ b/browse/src/meta-commands.ts @@ -10,13 +10,14 @@ import { validateNavigationUrl } from './url-validation'; import * as Diff from 'diff'; import * as fs from 'fs'; import * as path from 'path'; +import { TEMP_DIR, isPathWithin } from './platform'; // Security: Path validation to prevent path traversal attacks -const SAFE_DIRECTORIES = ['/tmp', process.cwd()]; +const SAFE_DIRECTORIES = [TEMP_DIR, process.cwd()]; export function validateOutputPath(filePath: string): void { const resolved = path.resolve(filePath); - const isSafe = SAFE_DIRECTORIES.some(dir => resolved === dir || resolved.startsWith(dir + '/')); + const isSafe = SAFE_DIRECTORIES.some(dir => isPathWithin(resolved, dir)); if (!isSafe) { throw new Error(`Path must be within: ${SAFE_DIRECTORIES.join(', ')}`); } @@ -88,7 +89,7 @@ export async function handleMetaCommand( case 'screenshot': { // Parse priority: flags (--viewport, --clip) → selector (@ref, CSS) → output path const page = bm.getPage(); - let outputPath = '/tmp/browse-screenshot.png'; + let outputPath = `${TEMP_DIR}/browse-screenshot.png`; let clipRect: { x: number; y: number; width: number; height: number } | undefined; let targetSelector: string | undefined; let viewportOnly = false; @@ -147,7 +148,7 @@ export async function handleMetaCommand( case 'pdf': { const page = bm.getPage(); - const pdfPath = args[0] || '/tmp/browse-page.pdf'; + const pdfPath = args[0] || `${TEMP_DIR}/browse-page.pdf`; validateOutputPath(pdfPath); await page.pdf({ path: pdfPath, format: 'A4' }); return `PDF saved: ${pdfPath}`; @@ -155,7 +156,7 @@ export async function handleMetaCommand( case 'responsive': { const page = 
bm.getPage(); - const prefix = args[0] || '/tmp/browse-responsive'; + const prefix = args[0] || `${TEMP_DIR}/browse-responsive`; validateOutputPath(prefix); const viewports = [ { name: 'mobile', width: 375, height: 812 }, diff --git a/browse/src/platform.ts b/browse/src/platform.ts new file mode 100644 index 00000000..c022b1d6 --- /dev/null +++ b/browse/src/platform.ts @@ -0,0 +1,17 @@ +/** + * Cross-platform constants for gstack browse. + * + * On macOS/Linux: TEMP_DIR = '/tmp', path.sep = '/' — identical to hardcoded values. + * On Windows: TEMP_DIR = os.tmpdir(), path.sep = '\\' — correct Windows behavior. + */ + +import * as os from 'os'; +import * as path from 'path'; + +export const IS_WINDOWS = process.platform === 'win32'; +export const TEMP_DIR = IS_WINDOWS ? os.tmpdir() : '/tmp'; + +/** Check if resolvedPath is within dir, using platform-aware separators. */ +export function isPathWithin(resolvedPath: string, dir: string): boolean { + return resolvedPath === dir || resolvedPath.startsWith(dir + path.sep); +} diff --git a/browse/src/read-commands.ts b/browse/src/read-commands.ts index e9823325..fad4e78c 100644 --- a/browse/src/read-commands.ts +++ b/browse/src/read-commands.ts @@ -10,6 +10,7 @@ import { consoleBuffer, networkBuffer, dialogBuffer } from './buffers'; import type { Page } from 'playwright'; import * as fs from 'fs'; import * as path from 'path'; +import { TEMP_DIR, isPathWithin } from './platform'; /** Detect await keyword, ignoring comments. Accepted risk: await in string literals triggers wrapping (harmless). 
*/ function hasAwait(code: string): boolean { @@ -36,12 +37,12 @@ function wrapForEvaluate(code: string): string { } // Security: Path validation to prevent path traversal attacks -const SAFE_DIRECTORIES = ['/tmp', process.cwd()]; +const SAFE_DIRECTORIES = [TEMP_DIR, process.cwd()]; export function validateReadPath(filePath: string): void { if (path.isAbsolute(filePath)) { const resolved = path.resolve(filePath); - const isSafe = SAFE_DIRECTORIES.some(dir => resolved === dir || resolved.startsWith(dir + '/')); + const isSafe = SAFE_DIRECTORIES.some(dir => isPathWithin(resolved, dir)); if (!isSafe) { throw new Error(`Absolute path must be within: ${SAFE_DIRECTORIES.join(', ')}`); } diff --git a/browse/src/snapshot.ts b/browse/src/snapshot.ts index db1dfc7c..24380bad 100644 --- a/browse/src/snapshot.ts +++ b/browse/src/snapshot.ts @@ -20,6 +20,7 @@ import type { Page, Locator } from 'playwright'; import type { BrowserManager, RefEntry } from './browser-manager'; import * as Diff from 'diff'; +import { TEMP_DIR, isPathWithin } from './platform'; // Roles considered "interactive" for the -i flag const INTERACTIVE_ROLES = new Set([ @@ -61,7 +62,7 @@ export const SNAPSHOT_FLAGS: Array<{ { short: '-s', long: '--selector', description: 'Scope to CSS selector', takesValue: true, valueHint: '', optionKey: 'selector' }, { short: '-D', long: '--diff', description: 'Unified diff against previous snapshot (first call stores baseline)', optionKey: 'diff' }, { short: '-a', long: '--annotate', description: 'Annotated screenshot with red overlay boxes and ref labels', optionKey: 'annotate' }, - { short: '-o', long: '--output', description: 'Output path for annotated screenshot (default: /tmp/browse-annotated.png)', takesValue: true, valueHint: '', optionKey: 'outputPath' }, + { short: '-o', long: '--output', description: 'Output path for annotated screenshot (default: /browse-annotated.png)', takesValue: true, valueHint: '', optionKey: 'outputPath' }, { short: '-C', long: 
'--cursor-interactive', description: 'Cursor-interactive elements (@c refs — divs with pointer, onclick)', optionKey: 'cursorInteractive' }, ]; @@ -308,11 +309,11 @@ export async function handleSnapshot( // ─── Annotated screenshot (-a) ──────────────────────────── if (opts.annotate) { - const screenshotPath = opts.outputPath || '/tmp/browse-annotated.png'; + const screenshotPath = opts.outputPath || `${TEMP_DIR}/browse-annotated.png`; // Validate output path (consistent with screenshot/pdf/responsive) const resolvedPath = require('path').resolve(screenshotPath); - const safeDirs = ['/tmp', process.cwd()]; - if (!safeDirs.some((dir: string) => resolvedPath === dir || resolvedPath.startsWith(dir + '/'))) { + const safeDirs = [TEMP_DIR, process.cwd()]; + if (!safeDirs.some((dir: string) => isPathWithin(resolvedPath, dir))) { throw new Error(`Path must be within: ${safeDirs.join(', ')}`); } try { diff --git a/browse/src/write-commands.ts b/browse/src/write-commands.ts index 26a46a4b..1bf37eb5 100644 --- a/browse/src/write-commands.ts +++ b/browse/src/write-commands.ts @@ -10,6 +10,7 @@ import { findInstalledBrowsers, importCookies } from './cookie-import-browser'; import { validateNavigationUrl } from './url-validation'; import * as fs from 'fs'; import * as path from 'path'; +import { TEMP_DIR, isPathWithin } from './platform'; export async function handleWriteCommand( command: string, @@ -277,9 +278,9 @@ export async function handleWriteCommand( if (!filePath) throw new Error('Usage: browse cookie-import '); // Path validation — prevent reading arbitrary files if (path.isAbsolute(filePath)) { - const safeDirs = ['/tmp', process.cwd()]; + const safeDirs = [TEMP_DIR, process.cwd()]; const resolved = path.resolve(filePath); - if (!safeDirs.some(dir => resolved === dir || resolved.startsWith(dir + '/'))) { + if (!safeDirs.some(dir => isPathWithin(resolved, dir))) { throw new Error(`Path must be within: ${safeDirs.join(', ')}`); } } diff --git 
a/browse/test/bun-polyfill.test.ts b/browse/test/bun-polyfill.test.ts new file mode 100644 index 00000000..7ca25dfa --- /dev/null +++ b/browse/test/bun-polyfill.test.ts @@ -0,0 +1,72 @@ +import { describe, test, expect, afterAll } from 'bun:test'; +import * as path from 'path'; + +// Load the polyfill into a fresh object (don't clobber globalThis.Bun) +const polyfillPath = path.resolve(import.meta.dir, '../src/bun-polyfill.cjs'); + +describe('bun-polyfill', () => { + // We test the polyfill by requiring it in a subprocess under Node.js + // since it's designed for Node, not Bun. + + test('Bun.sleep resolves after delay', async () => { + const result = Bun.spawnSync(['node', '-e', ` + require('${polyfillPath}'); + (async () => { + const start = Date.now(); + await Bun.sleep(50); + const elapsed = Date.now() - start; + console.log(elapsed >= 40 ? 'OK' : 'TOO_FAST'); + })(); + `], { stdout: 'pipe', stderr: 'pipe' }); + expect(result.stdout.toString().trim()).toBe('OK'); + expect(result.exitCode).toBe(0); + }); + + test('Bun.spawnSync runs a command and returns stdout', () => { + const result = Bun.spawnSync(['node', '-e', ` + require('${polyfillPath}'); + const r = Bun.spawnSync(['echo', 'hello'], { stdout: 'pipe' }); + console.log(r.stdout.toString().trim()); + console.log('exit:' + r.exitCode); + `], { stdout: 'pipe', stderr: 'pipe' }); + const lines = result.stdout.toString().trim().split('\n'); + expect(lines[0]).toBe('hello'); + expect(lines[1]).toBe('exit:0'); + }); + + test('Bun.spawn launches a process with pid', async () => { + const result = Bun.spawnSync(['node', '-e', ` + require('${polyfillPath}'); + const p = Bun.spawn(['echo', 'test'], { stdio: ['pipe', 'pipe', 'pipe'] }); + console.log(typeof p.pid === 'number' ? 'HAS_PID' : 'NO_PID'); + console.log(typeof p.kill === 'function' ? 'HAS_KILL' : 'NO_KILL'); + console.log(typeof p.unref === 'function' ? 
'HAS_UNREF' : 'NO_UNREF'); + `], { stdout: 'pipe', stderr: 'pipe' }); + const lines = result.stdout.toString().trim().split('\n'); + expect(lines[0]).toBe('HAS_PID'); + expect(lines[1]).toBe('HAS_KILL'); + expect(lines[2]).toBe('HAS_UNREF'); + }); + + test('Bun.serve creates an HTTP server that responds', async () => { + const result = Bun.spawnSync(['node', '-e', ` + require('${polyfillPath}'); + const server = Bun.serve({ + port: 0, // Note: polyfill uses port directly, so we pick one + hostname: '127.0.0.1', + fetch(req) { + return new Response(JSON.stringify({ ok: true }), { + headers: { 'Content-Type': 'application/json' }, + }); + }, + }); + // The polyfill doesn't support port 0, so we test the object shape + console.log(typeof server.stop === 'function' ? 'HAS_STOP' : 'NO_STOP'); + console.log(typeof server.port === 'number' ? 'HAS_PORT' : 'NO_PORT'); + server.stop(); + `], { stdout: 'pipe', stderr: 'pipe' }); + const lines = result.stdout.toString().trim().split('\n'); + expect(lines[0]).toBe('HAS_STOP'); + expect(lines[1]).toBe('HAS_PORT'); + }); +}); diff --git a/browse/test/config.test.ts b/browse/test/config.test.ts index 12892ce4..0cbe47fa 100644 --- a/browse/test/config.test.ts +++ b/browse/test/config.test.ts @@ -197,6 +197,36 @@ describe('resolveServerScript', () => { }); }); +describe('resolveNodeServerScript', () => { + const { resolveNodeServerScript } = require('../src/cli'); + + test('finds server-node.mjs in dist from dev mode', () => { + const srcDir = path.resolve(__dirname, '../src'); + const distFile = path.resolve(srcDir, '..', 'dist', 'server-node.mjs'); + const fs = require('fs'); + // Only test if the file exists (it may not be built yet) + if (fs.existsSync(distFile)) { + const result = resolveNodeServerScript(srcDir, ''); + expect(result).toBe(distFile); + } + }); + + test('returns null when server-node.mjs does not exist', () => { + const result = resolveNodeServerScript('/nonexistent/$bunfs', '/nonexistent/browse'); + 
expect(result).toBeNull(); + }); + + test('finds server-node.mjs adjacent to compiled binary', () => { + const distDir = path.resolve(__dirname, '../dist'); + const distFile = path.join(distDir, 'server-node.mjs'); + const fs = require('fs'); + if (fs.existsSync(distFile)) { + const result = resolveNodeServerScript('/$bunfs/something', path.join(distDir, 'browse')); + expect(result).toBe(distFile); + } + }); +}); + describe('version mismatch detection', () => { test('detects when versions differ', () => { const stateVersion = 'abc123'; diff --git a/browse/test/platform.test.ts b/browse/test/platform.test.ts new file mode 100644 index 00000000..fb6c64b9 --- /dev/null +++ b/browse/test/platform.test.ts @@ -0,0 +1,37 @@ +import { describe, test, expect } from 'bun:test'; +import { TEMP_DIR, isPathWithin, IS_WINDOWS } from '../src/platform'; + +describe('platform constants', () => { + test('TEMP_DIR is /tmp on non-Windows', () => { + if (!IS_WINDOWS) { + expect(TEMP_DIR).toBe('/tmp'); + } + }); + + test('IS_WINDOWS reflects process.platform', () => { + expect(IS_WINDOWS).toBe(process.platform === 'win32'); + }); +}); + +describe('isPathWithin', () => { + test('path inside directory returns true', () => { + expect(isPathWithin('/tmp/foo', '/tmp')).toBe(true); + }); + + test('path outside directory returns false', () => { + expect(isPathWithin('/etc/foo', '/tmp')).toBe(false); + }); + + test('exact match returns true', () => { + expect(isPathWithin('/tmp', '/tmp')).toBe(true); + }); + + test('partial prefix does not match (path traversal)', () => { + // /tmp-evil should NOT match /tmp + expect(isPathWithin('/tmp-evil/foo', '/tmp')).toBe(false); + }); + + test('nested path returns true', () => { + expect(isPathWithin('/tmp/a/b/c', '/tmp')).toBe(true); + }); +}); diff --git a/package.json b/package.json index ba18c08a..3001c764 100644 --- a/package.json +++ b/package.json @@ -8,7 +8,7 @@ "browse": "./browse/dist/browse" }, "scripts": { - "build": "bun run gen:skill-docs && 
bun run gen:skill-docs --host codex && bun build --compile browse/src/cli.ts --outfile browse/dist/browse && bun build --compile browse/src/find-browse.ts --outfile browse/dist/find-browse && git rev-parse HEAD > browse/dist/.version && rm -f .*.bun-build || true", + "build": "bun run gen:skill-docs && bun run gen:skill-docs --host codex && bun build --compile browse/src/cli.ts --outfile browse/dist/browse && bun build --compile browse/src/find-browse.ts --outfile browse/dist/find-browse && bash browse/scripts/build-node-server.sh && git rev-parse HEAD > browse/dist/.version && rm -f .*.bun-build || true", "gen:skill-docs": "bun run scripts/gen-skill-docs.ts", "dev": "bun run browse/src/cli.ts", "server": "bun run browse/src/server.ts", diff --git a/setup b/setup index cf3e5050..09d2282f 100755 --- a/setup +++ b/setup @@ -12,6 +12,11 @@ GSTACK_DIR="$(cd "$(dirname "$0")" && pwd)" SKILLS_DIR="$(dirname "$GSTACK_DIR")" BROWSE_BIN="$GSTACK_DIR/browse/dist/browse" +IS_WINDOWS=0 +case "$(uname -s)" in + MINGW*|MSYS*|CYGWIN*|Windows_NT) IS_WINDOWS=1 ;; +esac + # ─── Parse --host flag ───────────────────────────────────────── HOST="claude" while [ $# -gt 0 ]; do @@ -44,10 +49,19 @@ elif [ "$HOST" = "codex" ]; then fi ensure_playwright_browser() { - ( - cd "$GSTACK_DIR" - bun --eval 'import { chromium } from "playwright"; const browser = await chromium.launch(); await browser.close();' - ) >/dev/null 2>&1 + if [ "$IS_WINDOWS" -eq 1 ]; then + # On Windows, Bun can't launch Chromium due to broken pipe handling + # (oven-sh/bun#4253). Use Node.js to verify Chromium works instead. + ( + cd "$GSTACK_DIR" + node -e "const { chromium } = require('playwright'); (async () => { const b = await chromium.launch(); await b.close(); })()" 2>/dev/null + ) + else + ( + cd "$GSTACK_DIR" + bun --eval 'import { chromium } from "playwright"; const browser = await chromium.launch(); await browser.close();' + ) >/dev/null 2>&1 + fi } # 1. 
Build browse binary if needed (smart rebuild: stale sources, package.json, lock) @@ -87,10 +101,32 @@ if ! ensure_playwright_browser; then cd "$GSTACK_DIR" bunx playwright install chromium ) + + if [ "$IS_WINDOWS" -eq 1 ]; then + # On Windows, Node.js launches Chromium (not Bun — see oven-sh/bun#4253). + # Ensure playwright is importable by Node from the gstack directory. + if ! command -v node >/dev/null 2>&1; then + echo "gstack setup failed: Node.js is required on Windows (Bun cannot launch Chromium due to a pipe bug)" >&2 + echo " Install Node.js: https://nodejs.org/" >&2 + exit 1 + fi + echo "Windows detected — verifying Node.js can load Playwright..." + ( + cd "$GSTACK_DIR" + # Bun's node_modules already has playwright; verify Node can require it + node -e "require('playwright')" 2>/dev/null || npm install --no-save playwright + ) + fi fi if ! ensure_playwright_browser; then - echo "gstack setup failed: Playwright Chromium could not be launched" >&2 + if [ "$IS_WINDOWS" -eq 1 ]; then + echo "gstack setup failed: Playwright Chromium could not be launched via Node.js" >&2 + echo " This is a known issue with Bun on Windows (oven-sh/bun#4253)." >&2 + echo " Ensure Node.js is installed and 'node -e \"require('playwright')\"' works." >&2 + else + echo "gstack setup failed: Playwright Chromium could not be launched" >&2 + fi exit 1 fi From 9811ed37bf05a6cfa85ce3a1b00d8db5edfa44ca Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Fri, 20 Mar 2026 13:47:50 -0700 Subject: [PATCH 3/8] feat: default codex reviews in /ship and /review (v0.9.4.0) (#256) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: default codex reviews in /ship and /review with xhigh reasoning Codex code reviews are now opt-in-once-then-always-on via a one-time adoption prompt. When enabled, both review + adversarial run automatically on every /ship and /review — no more choosing between them. 
Key changes: - New {{CODEX_REVIEW_STEP}} resolver centralizes Codex review logic (DRY) - Three-state config: enabled/not-set/disabled via gstack-config - P1 findings default to "Investigate and fix" instead of "Ship anyway" - All reasoning bumped to xhigh (review, adversarial, consult) - Codex review step stripped from codex-host variants (no self-invocation) - Ship "Never ask" rule updated to accurately list quality-gate stops - Error handling for auth, timeout, empty response (all non-blocking) Co-Authored-By: Claude Opus 4.6 (1M context) * fix: update touchfiles test for plan-ceo-review-benefits dependency The merge from main added plan-ceo-review-benefits to E2E_TOUCHFILES, which means plan-ceo-review/SKILL.md now selects 3 tests, not 2. Co-Authored-By: Claude Opus 4.6 * feat: default codex reviews in /ship and /review (v0.9.4.0) Codex code reviews now run automatically — both review + adversarial challenge — with a one-time opt-in prompt for new users. All modes use xhigh reasoning. Codex-host builds strip the step to prevent recursion. Fixes from Codex review: TMPERR properly defined, stderr captured for both review and adversarial, error handling before log persist, commit hash included in review log for staleness tracking. 
Co-Authored-By: Claude Opus 4.6 --------- Co-authored-by: Claude Opus 4.6 (1M context) --- .../skills/gstack-plan-ceo-review/SKILL.md | 2 +- .../skills/gstack-plan-design-review/SKILL.md | 2 +- .../skills/gstack-plan-eng-review/SKILL.md | 2 +- .agents/skills/gstack-review/SKILL.md | 47 ------ .agents/skills/gstack-ship/SKILL.md | 40 +----- CHANGELOG.md | 13 ++ VERSION | 2 +- codex/SKILL.md | 13 +- codex/SKILL.md.tmpl | 13 +- plan-ceo-review/SKILL.md | 2 +- plan-design-review/SKILL.md | 2 +- plan-eng-review/SKILL.md | 2 +- review/SKILL.md | 128 +++++++++++++---- review/SKILL.md.tmpl | 49 +------ scripts/gen-skill-docs.ts | 136 +++++++++++++++++- ship/SKILL.md | 119 ++++++++++++--- ship/SKILL.md.tmpl | 40 +----- test/gen-skill-docs.test.ts | 10 ++ test/skill-validation.test.ts | 26 +++- test/touchfiles.test.ts | 5 +- 20 files changed, 405 insertions(+), 248 deletions(-) diff --git a/.agents/skills/gstack-plan-ceo-review/SKILL.md b/.agents/skills/gstack-plan-ceo-review/SKILL.md index 6d078f3a..9e450566 100644 --- a/.agents/skills/gstack-plan-ceo-review/SKILL.md +++ b/.agents/skills/gstack-plan-ceo-review/SKILL.md @@ -993,7 +993,7 @@ Parse the output. Find the most recent entry for each skill (plan-ceo-review, pl - **Eng Review (required by default):** The only review that gates shipping. Covers architecture, code quality, tests, performance. Can be disabled globally with \`gstack-config set skip_eng_review true\` (the "don't bother me" setting). - **CEO Review (optional):** Use your judgment. Recommend it for big product/business changes, new user-facing features, or scope decisions. Skip for bug fixes, refactors, infra, and cleanup. - **Design Review (optional):** Use your judgment. Recommend it for UI/UX changes. Skip for backend-only, infra, or prompt-only changes. -- **Codex Review (optional):** Independent second opinion from OpenAI Codex CLI. Shows pass/fail gate. Recommend for critical code changes where a second AI perspective adds value. 
Skip when Codex CLI is not installed. +- **Codex Review (enabled by default when Codex CLI is installed):** Independent review + adversarial challenge from OpenAI Codex CLI. Shows pass/fail gate. Runs automatically when enabled — configure with \`gstack-config set codex_reviews enabled|disabled\`. **Verdict logic:** - **CLEARED**: Eng Review has >= 1 entry within 7 days with status "clean" (or \`skip_eng_review\` is \`true\`) diff --git a/.agents/skills/gstack-plan-design-review/SKILL.md b/.agents/skills/gstack-plan-design-review/SKILL.md index 92a5dc56..e431d72c 100644 --- a/.agents/skills/gstack-plan-design-review/SKILL.md +++ b/.agents/skills/gstack-plan-design-review/SKILL.md @@ -528,7 +528,7 @@ Parse the output. Find the most recent entry for each skill (plan-ceo-review, pl - **Eng Review (required by default):** The only review that gates shipping. Covers architecture, code quality, tests, performance. Can be disabled globally with \`gstack-config set skip_eng_review true\` (the "don't bother me" setting). - **CEO Review (optional):** Use your judgment. Recommend it for big product/business changes, new user-facing features, or scope decisions. Skip for bug fixes, refactors, infra, and cleanup. - **Design Review (optional):** Use your judgment. Recommend it for UI/UX changes. Skip for backend-only, infra, or prompt-only changes. -- **Codex Review (optional):** Independent second opinion from OpenAI Codex CLI. Shows pass/fail gate. Recommend for critical code changes where a second AI perspective adds value. Skip when Codex CLI is not installed. +- **Codex Review (enabled by default when Codex CLI is installed):** Independent review + adversarial challenge from OpenAI Codex CLI. Shows pass/fail gate. Runs automatically when enabled — configure with \`gstack-config set codex_reviews enabled|disabled\`. 
**Verdict logic:** - **CLEARED**: Eng Review has >= 1 entry within 7 days with status "clean" (or \`skip_eng_review\` is \`true\`) diff --git a/.agents/skills/gstack-plan-eng-review/SKILL.md b/.agents/skills/gstack-plan-eng-review/SKILL.md index d4cff7cd..01233079 100644 --- a/.agents/skills/gstack-plan-eng-review/SKILL.md +++ b/.agents/skills/gstack-plan-eng-review/SKILL.md @@ -517,7 +517,7 @@ Parse the output. Find the most recent entry for each skill (plan-ceo-review, pl - **Eng Review (required by default):** The only review that gates shipping. Covers architecture, code quality, tests, performance. Can be disabled globally with \`gstack-config set skip_eng_review true\` (the "don't bother me" setting). - **CEO Review (optional):** Use your judgment. Recommend it for big product/business changes, new user-facing features, or scope decisions. Skip for bug fixes, refactors, infra, and cleanup. - **Design Review (optional):** Use your judgment. Recommend it for UI/UX changes. Skip for backend-only, infra, or prompt-only changes. -- **Codex Review (optional):** Independent second opinion from OpenAI Codex CLI. Shows pass/fail gate. Recommend for critical code changes where a second AI perspective adds value. Skip when Codex CLI is not installed. +- **Codex Review (enabled by default when Codex CLI is installed):** Independent review + adversarial challenge from OpenAI Codex CLI. Shows pass/fail gate. Runs automatically when enabled — configure with \`gstack-config set codex_reviews enabled|disabled\`. **Verdict logic:** - **CLEARED**: Eng Review has >= 1 entry within 7 days with status "clean" (or \`skip_eng_review\` is \`true\`) diff --git a/.agents/skills/gstack-review/SKILL.md b/.agents/skills/gstack-review/SKILL.md index 0253dcac..ab00c53e 100644 --- a/.agents/skills/gstack-review/SKILL.md +++ b/.agents/skills/gstack-review/SKILL.md @@ -474,54 +474,7 @@ If no documentation files exist, skip this step silently. 
--- -## Step 5.7: Codex second opinion (optional) -After completing the review, check if the Codex CLI is available: - -```bash -which codex 2>/dev/null && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE" -``` - -If Codex is available, use AskUserQuestion: - -``` -Review complete. Want an independent second opinion from Codex (OpenAI)? - -A) Run Codex code review — independent diff review with pass/fail gate -B) Run Codex adversarial challenge — try to find ways this code will fail in production -C) Both — review first, then adversarial challenge -D) Skip — no Codex review needed -``` - -If the user chooses A, B, or C: - -**For code review (A or C):** Run `codex review --base ` with a 5-minute timeout. -Present the full output verbatim under a `CODEX SAYS (code review):` header. -Check the output for `[P1]` markers — if found, note `GATE: FAIL`, otherwise `GATE: PASS`. -After presenting, compare Codex's findings with your own review findings from Steps 4-5 -and output a CROSS-MODEL ANALYSIS showing what both found, what only Codex found, -and what only Claude found. - -**For adversarial challenge (B or C):** Run: -```bash -codex exec "Review the changes on this branch against the base branch. Run git diff origin/ to see the diff. Your job is to find ways this code will fail in production. Think like an attacker and a chaos engineer. Find edge cases, race conditions, security holes, failure modes. Be adversarial." -s read-only -``` -Present the full output verbatim under a `CODEX SAYS (adversarial challenge):` header. - -**Only if a code review ran (user chose A or C):** Persist the Codex review result to the review log: -```bash -~/.codex/skills/gstack/bin/gstack-review-log '{"skill":"codex-review","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","status":"STATUS","gate":"GATE"}' -``` - -Substitute: STATUS ("clean" if PASS, "issues_found" if FAIL), GATE ("pass" or "fail"). 
- -**Do NOT persist a codex-review entry when only the adversarial challenge (B) ran** — -there is no gate verdict to record, and a false entry would make the Review Readiness -Dashboard believe a code review happened when it didn't. - -If Codex is not available, skip this step silently. - ---- ## Important Rules diff --git a/.agents/skills/gstack-ship/SKILL.md b/.agents/skills/gstack-ship/SKILL.md index da20fd26..24a770be 100644 --- a/.agents/skills/gstack-ship/SKILL.md +++ b/.agents/skills/gstack-ship/SKILL.md @@ -295,7 +295,7 @@ Parse the output. Find the most recent entry for each skill (plan-ceo-review, pl - **Eng Review (required by default):** The only review that gates shipping. Covers architecture, code quality, tests, performance. Can be disabled globally with \`gstack-config set skip_eng_review true\` (the "don't bother me" setting). - **CEO Review (optional):** Use your judgment. Recommend it for big product/business changes, new user-facing features, or scope decisions. Skip for bug fixes, refactors, infra, and cleanup. - **Design Review (optional):** Use your judgment. Recommend it for UI/UX changes. Skip for backend-only, infra, or prompt-only changes. -- **Codex Review (optional):** Independent second opinion from OpenAI Codex CLI. Shows pass/fail gate. Recommend for critical code changes where a second AI perspective adds value. Skip when Codex CLI is not installed. +- **Codex Review (enabled by default when Codex CLI is installed):** Independent review + adversarial challenge from OpenAI Codex CLI. Shows pass/fail gate. Runs automatically when enabled — configure with \`gstack-config set codex_reviews enabled|disabled\`. 
**Verdict logic:** - **CLEARED**: Eng Review has >= 1 entry within 7 days with status "clean" (or \`skip_eng_review\` is \`true\`) @@ -837,43 +837,7 @@ For each classified comment: --- -## Step 3.8: Codex second opinion (optional) -Check if the Codex CLI is available: - -```bash -which codex 2>/dev/null && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE" -``` - -If Codex is available, use AskUserQuestion: - -``` -Pre-landing review complete. Want an independent Codex (OpenAI) review before shipping? - -A) Run Codex code review — independent diff review with pass/fail gate -B) Run Codex adversarial challenge — try to break this code -C) Skip — ship without Codex review -``` - -If the user chooses A or B: - -**For code review (A):** Run `codex review --base ` with a 5-minute timeout. -Present the full output verbatim under a `CODEX SAYS:` header. Check for `[P1]` markers -to determine pass/fail gate. Persist the result: - -```bash -~/.codex/skills/gstack/bin/gstack-review-log '{"skill":"codex-review","timestamp":"TIMESTAMP","status":"STATUS","gate":"GATE"}' -``` - -If GATE is FAIL, use AskUserQuestion: "Codex found critical issues. Ship anyway?" -If the user says no, stop. If yes, continue to Step 4. - -**For adversarial (B):** Run codex exec with the adversarial prompt (see /codex skill). -Present findings. This is informational — does not block shipping. - -If Codex is not available, skip silently. Continue to Step 4. - ---- ## Step 4: Version bump (auto-decide) @@ -1114,7 +1078,7 @@ doc updates — the user runs `/ship` and documentation stays current without a - **Never skip tests.** If tests fail, stop. - **Never skip the pre-landing review.** If checklist.md is unreadable, stop. - **Never force push.** Use regular `git push` only. -- **Never ask for confirmation** except for MINOR/MAJOR version bumps and pre-landing review ASK items (batched into at most one AskUserQuestion). +- **Never ask for trivial confirmations** (e.g., "ready to push?", "create PR?"). 
DO stop for: version bumps (MINOR/MAJOR), pre-landing review findings (ASK items), Codex critical findings ([P1]), and the one-time Codex adoption prompt. - **Always use the 4-digit version format** from the VERSION file. - **Date format in CHANGELOG:** `YYYY-MM-DD` - **Split commits for bisectability** — each commit = one logical change. diff --git a/CHANGELOG.md b/CHANGELOG.md index b4e8261c..2fa0a844 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,18 @@ # Changelog +## [0.9.4.0] - 2026-03-20 — Codex Reviews On By Default + +### Changed + +- **Codex code reviews now run automatically in `/ship` and `/review`.** No more "want a second opinion?" prompt every time — Codex reviews both your code (with a pass/fail gate) and runs an adversarial challenge by default. First-time users get a one-time opt-in prompt; after that, it's hands-free. Configure with `gstack-config set codex_reviews enabled|disabled`. +- **All Codex operations use maximum reasoning power.** Review, adversarial, and consult modes all use `xhigh` reasoning effort — when an AI is reviewing your code, you want it thinking as hard as possible. +- **Codex review errors can't corrupt the dashboard.** Auth failures, timeouts, and empty responses are now detected before logging results, so the Review Readiness Dashboard never shows a false "passed" entry. Adversarial stderr is captured separately. +- **Codex review log includes commit hash.** Staleness detection now works correctly for Codex reviews, matching the same commit-tracking behavior as eng/CEO/design reviews. + +### Fixed + +- **Codex-for-Codex recursion prevented.** When gstack runs inside Codex CLI (`.agents/skills/`), the Codex review step is completely stripped — no accidental infinite loops. 
+ ## [0.9.3.0] - 2026-03-20 — Windows Support ### Fixed diff --git a/VERSION b/VERSION index 947d2886..3544d2f0 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.9.3.0 +0.9.4.0 diff --git a/codex/SKILL.md b/codex/SKILL.md index 19a341dd..77705f7e 100644 --- a/codex/SKILL.md +++ b/codex/SKILL.md @@ -300,13 +300,13 @@ TMPERR=$(mktemp /tmp/codex-err-XXXXXX.txt) 2. Run the review (5-minute timeout): ```bash -codex review --base -c 'model_reasoning_effort="high"' --enable web_search_cached 2>"$TMPERR" +codex review --base -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR" ``` Use `timeout: 300000` on the Bash call. If the user provided custom instructions (e.g., `/codex review focus on security`), pass them as the prompt argument: ```bash -codex review "focus on security" --base -c 'model_reasoning_effort="high"' --enable web_search_cached 2>"$TMPERR" +codex review "focus on security" --base -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR" ``` 3. Capture the output. Then parse cost from stderr: @@ -461,7 +461,7 @@ THE PLAN: For a **new session:** ```bash -codex exec "" -s read-only -c 'model_reasoning_effort="high"' --enable web_search_cached --json 2>"$TMPERR" | python3 -c " +codex exec "" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>"$TMPERR" | python3 -c " import sys, json for line in sys.stdin: line = line.strip() @@ -494,7 +494,7 @@ for line in sys.stdin: For a **resumed session** (user chose "Continue"): ```bash -codex exec resume "" -s read-only -c 'model_reasoning_effort="high"' --enable web_search_cached --json 2>"$TMPERR" | python3 -c " +codex exec resume "" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>"$TMPERR" | python3 -c " " ``` @@ -530,10 +530,7 @@ Session saved — run /codex again to continue this conversation. agentic coding model). This means as OpenAI ships newer models, /codex automatically uses them. 
If the user wants a specific model, pass `-m` through to codex. -**Reasoning effort** varies by mode — use the right level for each task: -- **Review mode:** `high` — thorough but not slow. Diff review benefits from depth but doesn't need maximum compute. -- **Challenge (adversarial) mode:** `xhigh` — maximum reasoning power. When trying to break code, you want the model thinking as hard as possible. -- **Consult mode:** `high` — good balance of depth and speed for conversations. +**Reasoning effort:** All modes use `xhigh` — maximum reasoning power. When reviewing code, breaking code, or consulting on architecture, you want the model thinking as hard as possible. **Web search:** All codex commands use `--enable web_search_cached` so Codex can look up docs and APIs during review. This is OpenAI's cached index — fast, no extra cost. diff --git a/codex/SKILL.md.tmpl b/codex/SKILL.md.tmpl index f2da49ad..30b603ee 100644 --- a/codex/SKILL.md.tmpl +++ b/codex/SKILL.md.tmpl @@ -79,13 +79,13 @@ TMPERR=$(mktemp /tmp/codex-err-XXXXXX.txt) 2. Run the review (5-minute timeout): ```bash -codex review --base -c 'model_reasoning_effort="high"' --enable web_search_cached 2>"$TMPERR" +codex review --base -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR" ``` Use `timeout: 300000` on the Bash call. If the user provided custom instructions (e.g., `/codex review focus on security`), pass them as the prompt argument: ```bash -codex review "focus on security" --base -c 'model_reasoning_effort="high"' --enable web_search_cached 2>"$TMPERR" +codex review "focus on security" --base -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR" ``` 3. Capture the output. 
Then parse cost from stderr: @@ -240,7 +240,7 @@ THE PLAN: For a **new session:** ```bash -codex exec "" -s read-only -c 'model_reasoning_effort="high"' --enable web_search_cached --json 2>"$TMPERR" | python3 -c " +codex exec "" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>"$TMPERR" | python3 -c " import sys, json for line in sys.stdin: line = line.strip() @@ -273,7 +273,7 @@ for line in sys.stdin: For a **resumed session** (user chose "Continue"): ```bash -codex exec resume "" -s read-only -c 'model_reasoning_effort="high"' --enable web_search_cached --json 2>"$TMPERR" | python3 -c " +codex exec resume "" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>"$TMPERR" | python3 -c " " ``` @@ -309,10 +309,7 @@ Session saved — run /codex again to continue this conversation. agentic coding model). This means as OpenAI ships newer models, /codex automatically uses them. If the user wants a specific model, pass `-m` through to codex. -**Reasoning effort** varies by mode — use the right level for each task: -- **Review mode:** `high` — thorough but not slow. Diff review benefits from depth but doesn't need maximum compute. -- **Challenge (adversarial) mode:** `xhigh` — maximum reasoning power. When trying to break code, you want the model thinking as hard as possible. -- **Consult mode:** `high` — good balance of depth and speed for conversations. +**Reasoning effort:** All modes use `xhigh` — maximum reasoning power. When reviewing code, breaking code, or consulting on architecture, you want the model thinking as hard as possible. **Web search:** All codex commands use `--enable web_search_cached` so Codex can look up docs and APIs during review. This is OpenAI's cached index — fast, no extra cost. diff --git a/plan-ceo-review/SKILL.md b/plan-ceo-review/SKILL.md index 44fc4013..fdf95164 100644 --- a/plan-ceo-review/SKILL.md +++ b/plan-ceo-review/SKILL.md @@ -1001,7 +1001,7 @@ Parse the output. 
Find the most recent entry for each skill (plan-ceo-review, pl - **Eng Review (required by default):** The only review that gates shipping. Covers architecture, code quality, tests, performance. Can be disabled globally with \`gstack-config set skip_eng_review true\` (the "don't bother me" setting). - **CEO Review (optional):** Use your judgment. Recommend it for big product/business changes, new user-facing features, or scope decisions. Skip for bug fixes, refactors, infra, and cleanup. - **Design Review (optional):** Use your judgment. Recommend it for UI/UX changes. Skip for backend-only, infra, or prompt-only changes. -- **Codex Review (optional):** Independent second opinion from OpenAI Codex CLI. Shows pass/fail gate. Recommend for critical code changes where a second AI perspective adds value. Skip when Codex CLI is not installed. +- **Codex Review (enabled by default when Codex CLI is installed):** Independent review + adversarial challenge from OpenAI Codex CLI. Shows pass/fail gate. Runs automatically when enabled — configure with \`gstack-config set codex_reviews enabled|disabled\`. **Verdict logic:** - **CLEARED**: Eng Review has >= 1 entry within 7 days with status "clean" (or \`skip_eng_review\` is \`true\`) diff --git a/plan-design-review/SKILL.md b/plan-design-review/SKILL.md index 6abd6948..1483e6e8 100644 --- a/plan-design-review/SKILL.md +++ b/plan-design-review/SKILL.md @@ -536,7 +536,7 @@ Parse the output. Find the most recent entry for each skill (plan-ceo-review, pl - **Eng Review (required by default):** The only review that gates shipping. Covers architecture, code quality, tests, performance. Can be disabled globally with \`gstack-config set skip_eng_review true\` (the "don't bother me" setting). - **CEO Review (optional):** Use your judgment. Recommend it for big product/business changes, new user-facing features, or scope decisions. Skip for bug fixes, refactors, infra, and cleanup. - **Design Review (optional):** Use your judgment. 
Recommend it for UI/UX changes. Skip for backend-only, infra, or prompt-only changes. -- **Codex Review (optional):** Independent second opinion from OpenAI Codex CLI. Shows pass/fail gate. Recommend for critical code changes where a second AI perspective adds value. Skip when Codex CLI is not installed. +- **Codex Review (enabled by default when Codex CLI is installed):** Independent review + adversarial challenge from OpenAI Codex CLI. Shows pass/fail gate. Runs automatically when enabled — configure with \`gstack-config set codex_reviews enabled|disabled\`. **Verdict logic:** - **CLEARED**: Eng Review has >= 1 entry within 7 days with status "clean" (or \`skip_eng_review\` is \`true\`) diff --git a/plan-eng-review/SKILL.md b/plan-eng-review/SKILL.md index 078a2875..63fda40b 100644 --- a/plan-eng-review/SKILL.md +++ b/plan-eng-review/SKILL.md @@ -526,7 +526,7 @@ Parse the output. Find the most recent entry for each skill (plan-ceo-review, pl - **Eng Review (required by default):** The only review that gates shipping. Covers architecture, code quality, tests, performance. Can be disabled globally with \`gstack-config set skip_eng_review true\` (the "don't bother me" setting). - **CEO Review (optional):** Use your judgment. Recommend it for big product/business changes, new user-facing features, or scope decisions. Skip for bug fixes, refactors, infra, and cleanup. - **Design Review (optional):** Use your judgment. Recommend it for UI/UX changes. Skip for backend-only, infra, or prompt-only changes. -- **Codex Review (optional):** Independent second opinion from OpenAI Codex CLI. Shows pass/fail gate. Recommend for critical code changes where a second AI perspective adds value. Skip when Codex CLI is not installed. +- **Codex Review (enabled by default when Codex CLI is installed):** Independent review + adversarial challenge from OpenAI Codex CLI. Shows pass/fail gate. 
Runs automatically when enabled — configure with \`gstack-config set codex_reviews enabled|disabled\`. **Verdict logic:** - **CLEARED**: Eng Review has >= 1 entry within 7 days with status "clean" (or \`skip_eng_review\` is \`true\`) diff --git a/review/SKILL.md b/review/SKILL.md index 01373875..4a646f6e 100644 --- a/review/SKILL.md +++ b/review/SKILL.md @@ -483,52 +483,128 @@ If no documentation files exist, skip this step silently. --- -## Step 5.7: Codex second opinion (optional) +## Step 5.7: Codex review -After completing the review, check if the Codex CLI is available: +Check if the Codex CLI is available and read the user's Codex review preference: ```bash which codex 2>/dev/null && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE" +CODEX_REVIEWS_CFG=$(~/.claude/skills/gstack/bin/gstack-config get codex_reviews 2>/dev/null || true) +echo "CODEX_REVIEWS: ${CODEX_REVIEWS_CFG:-not_set}" ``` -If Codex is available, use AskUserQuestion: +If `CODEX_NOT_AVAILABLE`: skip this step silently. Continue to the next step. + +If `CODEX_REVIEWS` is `disabled`: skip this step silently. Continue to the next step. + +If `CODEX_REVIEWS` is `enabled`: run both code review and adversarial challenge automatically (no prompt). Jump to the "Run Codex" section below. + +If `CODEX_REVIEWS` is `not_set`: use AskUserQuestion to offer the one-time adoption prompt: ``` -Review complete. Want an independent second opinion from Codex (OpenAI)? +GStack recommends enabling Codex code reviews — Codex is the super smart quiet engineer friend who will save your butt. 
-A) Run Codex code review — independent diff review with pass/fail gate -B) Run Codex adversarial challenge — try to find ways this code will fail in production -C) Both — review first, then adversarial challenge -D) Skip — no Codex review needed +A) Enable for all future runs (recommended, default) +B) Try it for now, ask me again later +C) No thanks, don't ask me again ``` -If the user chooses A, B, or C: - -**For code review (A or C):** Run `codex review --base ` with a 5-minute timeout. -Present the full output verbatim under a `CODEX SAYS (code review):` header. -Check the output for `[P1]` markers — if found, note `GATE: FAIL`, otherwise `GATE: PASS`. -After presenting, compare Codex's findings with your own review findings from Steps 4-5 -and output a CROSS-MODEL ANALYSIS showing what both found, what only Codex found, -and what only Claude found. - -**For adversarial challenge (B or C):** Run: +If the user chooses A: persist the setting and run both: ```bash -codex exec "Review the changes on this branch against the base branch. Run git diff origin/ to see the diff. Your job is to find ways this code will fail in production. Think like an attacker and a chaos engineer. Find edge cases, race conditions, security holes, failure modes. Be adversarial." -s read-only +~/.claude/skills/gstack/bin/gstack-config set codex_reviews enabled ``` -Present the full output verbatim under a `CODEX SAYS (adversarial challenge):` header. -**Only if a code review ran (user chose A or C):** Persist the Codex review result to the review log: +If the user chooses B: run both this time but do not persist any setting. + +If the user chooses C: persist the opt-out and skip: ```bash -~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"codex-review","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","status":"STATUS","gate":"GATE"}' +~/.claude/skills/gstack/bin/gstack-config set codex_reviews disabled +``` +Then skip this step. Continue to the next step. 
+ +### Run Codex + +Always run **both** code review and adversarial challenge. Use a 5-minute timeout (`timeout: 300000`) on each Bash call. + +First, create a temp file for stderr capture: +```bash +TMPERR=$(mktemp /tmp/codex-review-XXXXXXXX) +``` + +**Code review:** Run: +```bash +codex review --base -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR" +``` + +After the command completes, read stderr for cost/error info: +```bash +cat "$TMPERR" +``` + +Present the full output verbatim under a `CODEX SAYS (code review):` header: + +``` +CODEX SAYS (code review): +════════════════════════════════════════════════════════════ + +════════════════════════════════════════════════════════════ +GATE: PASS Tokens: N | Est. cost: ~$X.XX +``` + +Check the output for `[P1]` markers. If found: `GATE: FAIL`. If no `[P1]`: `GATE: PASS`. + +**If GATE is FAIL:** use AskUserQuestion: + +``` +Codex found N critical issues in the diff. + +A) Investigate and fix now (recommended) +B) Ship anyway — these issues may cause production problems +``` + +If the user chooses A: read the Codex findings carefully and work to address them. Then re-run `codex review` to verify the gate is now PASS. + +If the user chooses B: continue to the next step. + +### Error handling (code review) + +Before persisting the gate result, check for errors. All errors are non-blocking — Codex is a quality enhancement, not a prerequisite. Check `$TMPERR` output (already read above) for error indicators: + +- **Auth failure:** If stderr contains "auth", "login", "unauthorized", or "API key", tell the user: "Codex authentication failed. Run \`codex login\` in your terminal to authenticate via ChatGPT." Do NOT persist a review log entry. Continue to the adversarial step (it will likely fail too, but try anyway). +- **Timeout:** If the Bash call times out (5 min), tell the user: "Codex timed out after 5 minutes. The diff may be too large or the API may be slow." Do NOT persist a review log entry. 
Skip to cleanup. +- **Empty response:** If codex returned no stdout output, tell the user: "Codex returned no response. Stderr: <paste relevant error>." Do NOT persist a review log entry. Skip to cleanup. + +**Only if codex produced a real review (non-empty stdout):** Persist the code review result: +```bash +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"codex-review","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","status":"STATUS","gate":"GATE","commit":"'"$(git rev-parse --short HEAD)"'"}' ``` Substitute: STATUS ("clean" if PASS, "issues_found" if FAIL), GATE ("pass" or "fail"). -**Do NOT persist a codex-review entry when only the adversarial challenge (B) ran** — -there is no gate verdict to record, and a false entry would make the Review Readiness -Dashboard believe a code review happened when it didn't. +**Adversarial challenge:** Run: +```bash +TMPERR_ADV=$(mktemp /tmp/codex-adv-XXXXXXXX) +codex exec "Review the changes on this branch against the base branch. Run git diff origin/<base> to see the diff. Your job is to find ways this code will fail in production. Think like an attacker and a chaos engineer. Find edge cases, race conditions, security holes, resource leaks, failure modes, and silent data corruption paths. Be adversarial. Be thorough. No compliments — just the problems." -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_ADV" +``` -If Codex is not available, skip this step silently. +After the command completes, read adversarial stderr: +```bash +cat "$TMPERR_ADV" +``` + +Present the full output verbatim under a `CODEX SAYS (adversarial challenge):` header. This is informational — it never blocks shipping. If the adversarial command timed out or returned no output, note this to the user and continue.
+ +**Cross-model analysis:** After both Codex outputs are presented, compare Codex's findings with your own review findings from the earlier review steps and output: + +``` +CROSS-MODEL ANALYSIS: + Both found: [findings that overlap between Claude and Codex] + Only Codex found: [findings unique to Codex] + Only Claude found: [findings unique to Claude's review] + Agreement rate: X% (N/M total unique findings overlap) +``` + +**Cleanup:** Run `rm -f "$TMPERR" "$TMPERR_ADV"` after processing. --- diff --git a/review/SKILL.md.tmpl b/review/SKILL.md.tmpl index bab95d91..34a25018 100644 --- a/review/SKILL.md.tmpl +++ b/review/SKILL.md.tmpl @@ -231,54 +231,7 @@ If no documentation files exist, skip this step silently. --- -## Step 5.7: Codex second opinion (optional) - -After completing the review, check if the Codex CLI is available: - -```bash -which codex 2>/dev/null && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE" -``` - -If Codex is available, use AskUserQuestion: - -``` -Review complete. Want an independent second opinion from Codex (OpenAI)? - -A) Run Codex code review — independent diff review with pass/fail gate -B) Run Codex adversarial challenge — try to find ways this code will fail in production -C) Both — review first, then adversarial challenge -D) Skip — no Codex review needed -``` - -If the user chooses A, B, or C: - -**For code review (A or C):** Run `codex review --base ` with a 5-minute timeout. -Present the full output verbatim under a `CODEX SAYS (code review):` header. -Check the output for `[P1]` markers — if found, note `GATE: FAIL`, otherwise `GATE: PASS`. -After presenting, compare Codex's findings with your own review findings from Steps 4-5 -and output a CROSS-MODEL ANALYSIS showing what both found, what only Codex found, -and what only Claude found. - -**For adversarial challenge (B or C):** Run: -```bash -codex exec "Review the changes on this branch against the base branch. Run git diff origin/ to see the diff. 
Your job is to find ways this code will fail in production. Think like an attacker and a chaos engineer. Find edge cases, race conditions, security holes, failure modes. Be adversarial." -s read-only -``` -Present the full output verbatim under a `CODEX SAYS (adversarial challenge):` header. - -**Only if a code review ran (user chose A or C):** Persist the Codex review result to the review log: -```bash -~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"codex-review","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","status":"STATUS","gate":"GATE"}' -``` - -Substitute: STATUS ("clean" if PASS, "issues_found" if FAIL), GATE ("pass" or "fail"). - -**Do NOT persist a codex-review entry when only the adversarial challenge (B) ran** — -there is no gate verdict to record, and a false entry would make the Review Readiness -Dashboard believe a code review happened when it didn't. - -If Codex is not available, skip this step silently. - ---- +{{CODEX_REVIEW_STEP}} ## Important Rules diff --git a/scripts/gen-skill-docs.ts b/scripts/gen-skill-docs.ts index 53e8834f..8bb16bf9 100644 --- a/scripts/gen-skill-docs.ts +++ b/scripts/gen-skill-docs.ts @@ -1092,7 +1092,7 @@ Parse the output. Find the most recent entry for each skill (plan-ceo-review, pl - **Eng Review (required by default):** The only review that gates shipping. Covers architecture, code quality, tests, performance. Can be disabled globally with \\\`gstack-config set skip_eng_review true\\\` (the "don't bother me" setting). - **CEO Review (optional):** Use your judgment. Recommend it for big product/business changes, new user-facing features, or scope decisions. Skip for bug fixes, refactors, infra, and cleanup. - **Design Review (optional):** Use your judgment. Recommend it for UI/UX changes. Skip for backend-only, infra, or prompt-only changes. -- **Codex Review (optional):** Independent second opinion from OpenAI Codex CLI. Shows pass/fail gate. 
Recommend for critical code changes where a second AI perspective adds value. Skip when Codex CLI is not installed. +- **Codex Review (enabled by default when Codex CLI is installed):** Independent review + adversarial challenge from OpenAI Codex CLI. Shows pass/fail gate. Runs automatically when enabled — configure with \\\`gstack-config set codex_reviews enabled|disabled\\\`. **Verdict logic:** - **CLEARED**: Eng Review has >= 1 entry within 7 days with status "clean" (or \\\`skip_eng_review\\\` is \\\`true\\\`) @@ -1412,6 +1412,139 @@ The screenshot file at \`/tmp/gstack-sketch.png\` can be referenced by downstrea (\`/plan-design-review\`, \`/design-review\`) to see what was originally envisioned.`; } +function generateCodexReviewStep(ctx: TemplateContext): string { + // Codex host: strip entirely — Codex should never invoke itself + if (ctx.host === 'codex') return ''; + + const isShip = ctx.skillName === 'ship'; + const stepNum = isShip ? '3.8' : '5.7'; + + return `## Step ${stepNum}: Codex review + +Check if the Codex CLI is available and read the user's Codex review preference: + +\`\`\`bash +which codex 2>/dev/null && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE" +CODEX_REVIEWS_CFG=$(~/.claude/skills/gstack/bin/gstack-config get codex_reviews 2>/dev/null || true) +echo "CODEX_REVIEWS: \${CODEX_REVIEWS_CFG:-not_set}" +\`\`\` + +If \`CODEX_NOT_AVAILABLE\`: skip this step silently. Continue to the next step. + +If \`CODEX_REVIEWS\` is \`disabled\`: skip this step silently. Continue to the next step. + +If \`CODEX_REVIEWS\` is \`enabled\`: run both code review and adversarial challenge automatically (no prompt). Jump to the "Run Codex" section below. + +If \`CODEX_REVIEWS\` is \`not_set\`: use AskUserQuestion to offer the one-time adoption prompt: + +\`\`\` +GStack recommends enabling Codex code reviews — Codex is the super smart quiet engineer friend who will save your butt. 
+ +A) Enable for all future runs (recommended, default) +B) Try it for now, ask me again later +C) No thanks, don't ask me again +\`\`\` + +If the user chooses A: persist the setting and run both: +\`\`\`bash +~/.claude/skills/gstack/bin/gstack-config set codex_reviews enabled +\`\`\` + +If the user chooses B: run both this time but do not persist any setting. + +If the user chooses C: persist the opt-out and skip: +\`\`\`bash +~/.claude/skills/gstack/bin/gstack-config set codex_reviews disabled +\`\`\` +Then skip this step. Continue to the next step. + +### Run Codex + +Always run **both** code review and adversarial challenge. Use a 5-minute timeout (\`timeout: 300000\`) on each Bash call. + +First, create a temp file for stderr capture: +\`\`\`bash +TMPERR=$(mktemp /tmp/codex-review-XXXXXXXX) +\`\`\` + +**Code review:** Run: +\`\`\`bash +codex review --base -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR" +\`\`\` + +After the command completes, read stderr for cost/error info: +\`\`\`bash +cat "$TMPERR" +\`\`\` + +Present the full output verbatim under a \`CODEX SAYS (code review):\` header: + +\`\`\` +CODEX SAYS (code review): +════════════════════════════════════════════════════════════ + +════════════════════════════════════════════════════════════ +GATE: PASS Tokens: N | Est. cost: ~$X.XX +\`\`\` + +Check the output for \`[P1]\` markers. If found: \`GATE: FAIL\`. If no \`[P1]\`: \`GATE: PASS\`. + +**If GATE is FAIL:** use AskUserQuestion: + +\`\`\` +Codex found N critical issues in the diff. + +A) Investigate and fix now (recommended) +B) Ship anyway — these issues may cause production problems +\`\`\` + +If the user chooses A: read the Codex findings carefully and work to address them${isShip ? '. After fixing, re-run tests (Step 3) since code has changed' : ''}. Then re-run \`codex review\` to verify the gate is now PASS. + +If the user chooses B: continue to the next step. 
+ +### Error handling (code review) + +Before persisting the gate result, check for errors. All errors are non-blocking — Codex is a quality enhancement, not a prerequisite. Check \`$TMPERR\` output (already read above) for error indicators: + +- **Auth failure:** If stderr contains "auth", "login", "unauthorized", or "API key", tell the user: "Codex authentication failed. Run \\\`codex login\\\` in your terminal to authenticate via ChatGPT." Do NOT persist a review log entry. Continue to the adversarial step (it will likely fail too, but try anyway). +- **Timeout:** If the Bash call times out (5 min), tell the user: "Codex timed out after 5 minutes. The diff may be too large or the API may be slow." Do NOT persist a review log entry. Skip to cleanup. +- **Empty response:** If codex returned no stdout output, tell the user: "Codex returned no response. Stderr: <paste relevant error>." Do NOT persist a review log entry. Skip to cleanup. + +**Only if codex produced a real review (non-empty stdout):** Persist the code review result: +\`\`\`bash +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"codex-review","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","status":"STATUS","gate":"GATE","commit":"'"$(git rev-parse --short HEAD)"'"}' +\`\`\` + +Substitute: STATUS ("clean" if PASS, "issues_found" if FAIL), GATE ("pass" or "fail"). + +**Adversarial challenge:** Run: +\`\`\`bash +TMPERR_ADV=$(mktemp /tmp/codex-adv-XXXXXXXX) +codex exec "Review the changes on this branch against the base branch. Run git diff origin/<base> to see the diff. Your job is to find ways this code will fail in production. Think like an attacker and a chaos engineer. Find edge cases, race conditions, security holes, resource leaks, failure modes, and silent data corruption paths. Be adversarial. Be thorough. No compliments — just the problems."
-s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_ADV" +\`\`\` + +After the command completes, read adversarial stderr: +\`\`\`bash +cat "$TMPERR_ADV" +\`\`\` + +Present the full output verbatim under a \`CODEX SAYS (adversarial challenge):\` header. This is informational — it never blocks shipping. If the adversarial command timed out or returned no output, note this to the user and continue. +${!isShip ? ` +**Cross-model analysis:** After both Codex outputs are presented, compare Codex's findings with your own review findings from the earlier review steps and output: + +\`\`\` +CROSS-MODEL ANALYSIS: + Both found: [findings that overlap between Claude and Codex] + Only Codex found: [findings unique to Codex] + Only Claude found: [findings unique to Claude's review] + Agreement rate: X% (N/M total unique findings overlap) +\`\`\` +` : ''} +**Cleanup:** Run \`rm -f "$TMPERR" "$TMPERR_ADV"\` after processing. + +---`; +} + const RESOLVERS: Record<string, (ctx: TemplateContext) => string> = { COMMAND_REFERENCE: generateCommandReference, SNAPSHOT_FLAGS: generateSnapshotFlags, @@ -1426,6 +1559,7 @@ const RESOLVERS: Record<string, (ctx: TemplateContext) => string> = { SPEC_REVIEW_LOOP: generateSpecReviewLoop, DESIGN_SKETCH: generateDesignSketch, BENEFITS_FROM: generateBenefitsFrom, + CODEX_REVIEW_STEP: generateCodexReviewStep, }; // ─── Codex Helpers ─────────────────────────────────────────── diff --git a/ship/SKILL.md b/ship/SKILL.md index a4eb4d80..232a23e0 100644 --- a/ship/SKILL.md +++ b/ship/SKILL.md @@ -305,7 +305,7 @@ Parse the output. Find the most recent entry for each skill (plan-ceo-review, pl - **Eng Review (required by default):** The only review that gates shipping. Covers architecture, code quality, tests, performance. Can be disabled globally with \`gstack-config set skip_eng_review true\` (the "don't bother me" setting). - **CEO Review (optional):** Use your judgment. Recommend it for big product/business changes, new user-facing features, or scope decisions.
Skip for bug fixes, refactors, infra, and cleanup. - **Design Review (optional):** Use your judgment. Recommend it for UI/UX changes. Skip for backend-only, infra, or prompt-only changes. -- **Codex Review (optional):** Independent second opinion from OpenAI Codex CLI. Shows pass/fail gate. Recommend for critical code changes where a second AI perspective adds value. Skip when Codex CLI is not installed. +- **Codex Review (enabled by default when Codex CLI is installed):** Independent review + adversarial challenge from OpenAI Codex CLI. Shows pass/fail gate. Runs automatically when enabled — configure with \`gstack-config set codex_reviews enabled|disabled\`. **Verdict logic:** - **CLEARED**: Eng Review has >= 1 entry within 7 days with status "clean" (or \`skip_eng_review\` is \`true\`) @@ -847,41 +847,118 @@ For each classified comment: --- -## Step 3.8: Codex second opinion (optional) +## Step 3.8: Codex review -Check if the Codex CLI is available: +Check if the Codex CLI is available and read the user's Codex review preference: ```bash which codex 2>/dev/null && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE" +CODEX_REVIEWS_CFG=$(~/.claude/skills/gstack/bin/gstack-config get codex_reviews 2>/dev/null || true) +echo "CODEX_REVIEWS: ${CODEX_REVIEWS_CFG:-not_set}" ``` -If Codex is available, use AskUserQuestion: +If `CODEX_NOT_AVAILABLE`: skip this step silently. Continue to the next step. + +If `CODEX_REVIEWS` is `disabled`: skip this step silently. Continue to the next step. + +If `CODEX_REVIEWS` is `enabled`: run both code review and adversarial challenge automatically (no prompt). Jump to the "Run Codex" section below. + +If `CODEX_REVIEWS` is `not_set`: use AskUserQuestion to offer the one-time adoption prompt: ``` -Pre-landing review complete. Want an independent Codex (OpenAI) review before shipping? +GStack recommends enabling Codex code reviews — Codex is the super smart quiet engineer friend who will save your butt. 
-A) Run Codex code review — independent diff review with pass/fail gate -B) Run Codex adversarial challenge — try to break this code -C) Skip — ship without Codex review +A) Enable for all future runs (recommended, default) +B) Try it for now, ask me again later +C) No thanks, don't ask me again ``` -If the user chooses A or B: - -**For code review (A):** Run `codex review --base ` with a 5-minute timeout. -Present the full output verbatim under a `CODEX SAYS:` header. Check for `[P1]` markers -to determine pass/fail gate. Persist the result: - +If the user chooses A: persist the setting and run both: ```bash -~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"codex-review","timestamp":"TIMESTAMP","status":"STATUS","gate":"GATE"}' +~/.claude/skills/gstack/bin/gstack-config set codex_reviews enabled ``` -If GATE is FAIL, use AskUserQuestion: "Codex found critical issues. Ship anyway?" -If the user says no, stop. If yes, continue to Step 4. +If the user chooses B: run both this time but do not persist any setting. -**For adversarial (B):** Run codex exec with the adversarial prompt (see /codex skill). -Present findings. This is informational — does not block shipping. +If the user chooses C: persist the opt-out and skip: +```bash +~/.claude/skills/gstack/bin/gstack-config set codex_reviews disabled +``` +Then skip this step. Continue to the next step. -If Codex is not available, skip silently. Continue to Step 4. +### Run Codex + +Always run **both** code review and adversarial challenge. Use a 5-minute timeout (`timeout: 300000`) on each Bash call. 
+ +First, create a temp file for stderr capture: +```bash +TMPERR=$(mktemp /tmp/codex-review-XXXXXXXX) +``` + +**Code review:** Run: +```bash +codex review --base <base> -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR" +``` + +After the command completes, read stderr for cost/error info: +```bash +cat "$TMPERR" +``` + +Present the full output verbatim under a `CODEX SAYS (code review):` header: + +``` +CODEX SAYS (code review): +════════════════════════════════════════════════════════════ +<full codex output, verbatim — do not truncate or summarize> +════════════════════════════════════════════════════════════ +GATE: PASS Tokens: N | Est. cost: ~$X.XX +``` + +Check the output for `[P1]` markers. If found: `GATE: FAIL`. If no `[P1]`: `GATE: PASS`. + +**If GATE is FAIL:** use AskUserQuestion: + +``` +Codex found N critical issues in the diff. + +A) Investigate and fix now (recommended) +B) Ship anyway — these issues may cause production problems +``` + +If the user chooses A: read the Codex findings carefully and work to address them. After fixing, re-run tests (Step 3) since code has changed. Then re-run `codex review` to verify the gate is now PASS. + +If the user chooses B: continue to the next step. + +### Error handling (code review) + +Before persisting the gate result, check for errors. All errors are non-blocking — Codex is a quality enhancement, not a prerequisite. Check `$TMPERR` output (already read above) for error indicators: + +- **Auth failure:** If stderr contains "auth", "login", "unauthorized", or "API key", tell the user: "Codex authentication failed. Run \`codex login\` in your terminal to authenticate via ChatGPT." Do NOT persist a review log entry. Continue to the adversarial step (it will likely fail too, but try anyway). +- **Timeout:** If the Bash call times out (5 min), tell the user: "Codex timed out after 5 minutes. The diff may be too large or the API may be slow." Do NOT persist a review log entry. Skip to cleanup.
+- **Empty response:** If codex returned no stdout output, tell the user: "Codex returned no response. Stderr: <paste relevant error>." Do NOT persist a review log entry. Skip to cleanup. + +**Only if codex produced a real review (non-empty stdout):** Persist the code review result: +```bash +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"codex-review","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","status":"STATUS","gate":"GATE","commit":"'"$(git rev-parse --short HEAD)"'"}' +``` + +Substitute: STATUS ("clean" if PASS, "issues_found" if FAIL), GATE ("pass" or "fail"). + +**Adversarial challenge:** Run: +```bash +TMPERR_ADV=$(mktemp /tmp/codex-adv-XXXXXXXX) +codex exec "Review the changes on this branch against the base branch. Run git diff origin/<base> to see the diff. Your job is to find ways this code will fail in production. Think like an attacker and a chaos engineer. Find edge cases, race conditions, security holes, resource leaks, failure modes, and silent data corruption paths. Be adversarial. Be thorough. No compliments — just the problems." -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_ADV" +``` + +After the command completes, read adversarial stderr: +```bash +cat "$TMPERR_ADV" +``` + +Present the full output verbatim under a `CODEX SAYS (adversarial challenge):` header. This is informational — it never blocks shipping. If the adversarial command timed out or returned no output, note this to the user and continue. + +**Cleanup:** Run `rm -f "$TMPERR" "$TMPERR_ADV"` after processing. --- @@ -1124,7 +1201,7 @@ doc updates — the user runs `/ship` and documentation stays current without a - **Never skip tests.** If tests fail, stop. - **Never skip the pre-landing review.** If checklist.md is unreadable, stop. - **Never force push.** Use regular `git push` only. -- **Never ask for confirmation** except for MINOR/MAJOR version bumps and pre-landing review ASK items (batched into at most one AskUserQuestion).
+- **Never ask for trivial confirmations** (e.g., "ready to push?", "create PR?"). DO stop for: version bumps (MINOR/MAJOR), pre-landing review findings (ASK items), Codex critical findings ([P1]), and the one-time Codex adoption prompt. - **Always use the 4-digit version format** from the VERSION file. - **Date format in CHANGELOG:** `YYYY-MM-DD` - **Split commits for bisectability** — each commit = one logical change. diff --git a/ship/SKILL.md.tmpl b/ship/SKILL.md.tmpl index 22dff7d0..6b441870 100644 --- a/ship/SKILL.md.tmpl +++ b/ship/SKILL.md.tmpl @@ -403,43 +403,7 @@ For each classified comment: --- -## Step 3.8: Codex second opinion (optional) - -Check if the Codex CLI is available: - -```bash -which codex 2>/dev/null && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE" -``` - -If Codex is available, use AskUserQuestion: - -``` -Pre-landing review complete. Want an independent Codex (OpenAI) review before shipping? - -A) Run Codex code review — independent diff review with pass/fail gate -B) Run Codex adversarial challenge — try to break this code -C) Skip — ship without Codex review -``` - -If the user chooses A or B: - -**For code review (A):** Run `codex review --base ` with a 5-minute timeout. -Present the full output verbatim under a `CODEX SAYS:` header. Check for `[P1]` markers -to determine pass/fail gate. Persist the result: - -```bash -~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"codex-review","timestamp":"TIMESTAMP","status":"STATUS","gate":"GATE"}' -``` - -If GATE is FAIL, use AskUserQuestion: "Codex found critical issues. Ship anyway?" -If the user says no, stop. If yes, continue to Step 4. - -**For adversarial (B):** Run codex exec with the adversarial prompt (see /codex skill). -Present findings. This is informational — does not block shipping. - -If Codex is not available, skip silently. Continue to Step 4. 
- ---- +{{CODEX_REVIEW_STEP}} ## Step 4: Version bump (auto-decide) @@ -680,7 +644,7 @@ doc updates — the user runs `/ship` and documentation stays current without a - **Never skip tests.** If tests fail, stop. - **Never skip the pre-landing review.** If checklist.md is unreadable, stop. - **Never force push.** Use regular `git push` only. -- **Never ask for confirmation** except for MINOR/MAJOR version bumps and pre-landing review ASK items (batched into at most one AskUserQuestion). +- **Never ask for trivial confirmations** (e.g., "ready to push?", "create PR?"). DO stop for: version bumps (MINOR/MAJOR), pre-landing review findings (ASK items), Codex critical findings ([P1]), and the one-time Codex adoption prompt. - **Always use the 4-digit version format** from the VERSION file. - **Date format in CHANGELOG:** `YYYY-MM-DD` - **Split commits for bisectability** — each commit = one logical change. diff --git a/test/gen-skill-docs.test.ts b/test/gen-skill-docs.test.ts index 68d84465..64b39118 100644 --- a/test/gen-skill-docs.test.ts +++ b/test/gen-skill-docs.test.ts @@ -584,6 +584,16 @@ describe('Codex generation (--host codex)', () => { expect(fs.existsSync(path.join(AGENTS_DIR, 'gstack-codex'))).toBe(false); }); + test('Codex review step stripped from Codex-host ship and review', () => { + const shipContent = fs.readFileSync(path.join(AGENTS_DIR, 'gstack-ship', 'SKILL.md'), 'utf-8'); + expect(shipContent).not.toContain('codex review --base'); + expect(shipContent).not.toContain('Investigate and fix'); + + const reviewContent = fs.readFileSync(path.join(AGENTS_DIR, 'gstack-review', 'SKILL.md'), 'utf-8'); + expect(reviewContent).not.toContain('codex review --base'); + expect(reviewContent).not.toContain('Investigate and fix'); + }); + test('--host codex --dry-run freshness', () => { const result = Bun.spawnSync(['bun', 'run', 'scripts/gen-skill-docs.ts', '--host', 'codex', '--dry-run'], { cwd: ROOT, diff --git a/test/skill-validation.test.ts 
b/test/skill-validation.test.ts index f4405a25..6300803d 100644 --- a/test/skill-validation.test.ts +++ b/test/skill-validation.test.ts @@ -1256,18 +1256,36 @@ describe('Codex skill', () => { expect(content).toContain('mktemp'); }); - test('codex integration in /review offers second opinion', () => { + test('codex integration in /review has config-driven review step', () => { const content = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8'); - expect(content).toContain('Codex second opinion'); + expect(content).toContain('Codex review'); + expect(content).toContain('codex_reviews'); expect(content).toContain('codex review'); expect(content).toContain('adversarial'); + expect(content).toContain('xhigh'); + expect(content).toContain('Investigate and fix'); + expect(content).toContain('CROSS-MODEL'); }); - test('codex integration in /ship offers review gate', () => { + test('codex integration in /ship has config-driven review step', () => { const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8'); - expect(content).toContain('Codex'); + expect(content).toContain('Codex review'); + expect(content).toContain('codex_reviews'); expect(content).toContain('codex review'); expect(content).toContain('codex-review'); + expect(content).toContain('xhigh'); + expect(content).toContain('Investigate and fix'); + }); + + test('codex-host ship/review do NOT contain codex review step', () => { + const shipContent = fs.readFileSync(path.join(ROOT, '.agents', 'skills', 'gstack-ship', 'SKILL.md'), 'utf-8'); + expect(shipContent).not.toContain('codex review --base'); + expect(shipContent).not.toContain('Investigate and fix'); + + const reviewContent = fs.readFileSync(path.join(ROOT, '.agents', 'skills', 'gstack-review', 'SKILL.md'), 'utf-8'); + expect(reviewContent).not.toContain('codex review --base'); + expect(reviewContent).not.toContain('codex_reviews'); + expect(reviewContent).not.toContain('Investigate and fix'); }); test('codex integration in 
/plan-eng-review offers plan critique', () => { diff --git a/test/touchfiles.test.ts b/test/touchfiles.test.ts index d89d533d..11dedb1c 100644 --- a/test/touchfiles.test.ts +++ b/test/touchfiles.test.ts @@ -78,8 +78,9 @@ describe('selectTests', () => { const result = selectTests(['plan-ceo-review/SKILL.md'], E2E_TOUCHFILES); expect(result.selected).toContain('plan-ceo-review'); expect(result.selected).toContain('plan-ceo-review-selective'); - expect(result.selected.length).toBe(2); - expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 2); + expect(result.selected).toContain('plan-ceo-review-benefits'); + expect(result.selected.length).toBe(3); + expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 3); }); test('global touchfile triggers ALL tests', () => { From 1f4b6fd7a2a349dfc6f04d158b8b7778b5b74232 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Fri, 20 Mar 2026 19:42:28 -0700 Subject: [PATCH 4/8] fix: remove PR size nagging from /retro (v0.9.4.1) (#264) The retro template no longer flags XL PRs as problems or recommends splitting them. PR size distribution is still reported as neutral data. Co-authored-by: Claude Opus 4.6 (1M context) --- CHANGELOG.md | 6 ++++++ VERSION | 2 +- retro/SKILL.md | 7 +++---- retro/SKILL.md.tmpl | 7 +++---- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2fa0a844..69070ecd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## [0.9.4.1] - 2026-03-20 + +### Changed + +- **`/retro` no longer nags about PR size.** The retro still reports PR size distribution (Small/Medium/Large/XL) as neutral data, but no longer flags XL PRs as problems or recommends splitting them. AI reviews don't fatigue — the unit of work is the feature, not the diff. 
+ ## [0.9.4.0] - 2026-03-20 — Codex Reviews On By Default ### Changed diff --git a/VERSION b/VERSION index 3544d2f0..199bf2ae 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.9.4.0 +0.9.4.1 diff --git a/retro/SKILL.md b/retro/SKILL.md index bc1f5f32..ff4f7283 100644 --- a/retro/SKILL.md +++ b/retro/SKILL.md @@ -435,7 +435,7 @@ From commit diffs, estimate PR sizes and bucket them: - **Small** (<100 LOC) - **Medium** (100-500 LOC) - **Large** (500-1500 LOC) -- **XL** (1500+ LOC) — flag these with file counts +- **XL** (1500+ LOC) ### Step 8: Focus Score + Ship of the Week @@ -627,14 +627,13 @@ Narrative interpreting what the team-wide patterns mean: Narrative covering: - Commit type mix and what it reveals -- PR size discipline (are PRs staying small?) +- PR size distribution and what it reveals about shipping cadence - Fix-chain detection (sequences of fix commits on the same subsystem) - Version bump discipline ### Code Quality Signals - Test LOC ratio trend - Hotspot analysis (are the same files churning?) -- Any XL PRs that should have been split - Greptile signal ratio and trend (if history exists): "Greptile: X% signal (Y valid catches, Z false positives)" ### Test Health @@ -673,7 +672,7 @@ For each teammate (sorted by commits descending), write a section: - "Fixed the N+1 query that was causing 2s load times on the dashboard" - **Opportunity for growth**: 1 specific, constructive suggestion. Frame as investment, not criticism. 
Examples: - "Test coverage on the payment module is at 8% — worth investing in before the next feature lands on top of it" - - "3 of the 5 PRs were 800+ LOC — breaking these up would catch issues earlier and make review easier" + - "Most commits land in a single burst — spacing work across the day could reduce context-switching fatigue" - "All commits land between 1-4am — sustainable pace matters for code quality long-term" **AI collaboration note:** If many commits have `Co-Authored-By` AI trailers (e.g., Claude, Copilot), note the AI-assisted commit percentage as a team metric. Frame it neutrally — "N% of commits were AI-assisted" — without judgment. diff --git a/retro/SKILL.md.tmpl b/retro/SKILL.md.tmpl index a918e24a..a81b12c9 100644 --- a/retro/SKILL.md.tmpl +++ b/retro/SKILL.md.tmpl @@ -231,7 +231,7 @@ From commit diffs, estimate PR sizes and bucket them: - **Small** (<100 LOC) - **Medium** (100-500 LOC) - **Large** (500-1500 LOC) -- **XL** (1500+ LOC) — flag these with file counts +- **XL** (1500+ LOC) ### Step 8: Focus Score + Ship of the Week @@ -423,14 +423,13 @@ Narrative interpreting what the team-wide patterns mean: Narrative covering: - Commit type mix and what it reveals -- PR size discipline (are PRs staying small?) +- PR size distribution and what it reveals about shipping cadence - Fix-chain detection (sequences of fix commits on the same subsystem) - Version bump discipline ### Code Quality Signals - Test LOC ratio trend - Hotspot analysis (are the same files churning?) -- Any XL PRs that should have been split - Greptile signal ratio and trend (if history exists): "Greptile: X% signal (Y valid catches, Z false positives)" ### Test Health @@ -469,7 +468,7 @@ For each teammate (sorted by commits descending), write a section: - "Fixed the N+1 query that was causing 2s load times on the dashboard" - **Opportunity for growth**: 1 specific, constructive suggestion. Frame as investment, not criticism. 
Examples: - "Test coverage on the payment module is at 8% — worth investing in before the next feature lands on top of it" - - "3 of the 5 PRs were 800+ LOC — breaking these up would catch issues earlier and make review easier" + - "Most commits land in a single burst — spacing work across the day could reduce context-switching fatigue" - "All commits land between 1-4am — sustainable pace matters for code quality long-term" **AI collaboration note:** If many commits have `Co-Authored-By` AI trailers (e.g., Claude, Copilot), note the AI-assisted commit percentage as a team metric. Frame it neutrally — "N% of commits were AI-assisted" — without judgment. From 709bed9f4d7d419ef4f806f8b3e91fa53f6c0945 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sat, 21 Mar 2026 09:54:27 -0700 Subject: [PATCH 5/8] feat: CEO review handoff context for /office-hours chaining (v0.9.5.0) (#288) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: CEO review saves handoff context for /office-hours chaining When /plan-ceo-review suggests running /office-hours, it now saves a handoff note with system audit findings and discussion context. On re-invocation, the note is auto-discovered and used to avoid redundant questions. Addresses Codex review feedback: no step tracking or resume logic — just context as additional input (same pattern as design docs). * fix: remove PR size nagging from /retro codex variant Sync the Codex SKILL.md variant with the retro template changes from v0.9.4.1 — removes "flag these with file counts" for XL PRs and reframes PR size discussion as neutral data. 
* chore: bump version and changelog (v0.9.5.0) Co-Authored-By: Claude Opus 4.6 --------- Co-authored-by: Claude Opus 4.6 --- .../skills/gstack-plan-ceo-review/SKILL.md | 67 +++++++++++++++++++ .agents/skills/gstack-retro/SKILL.md | 7 +- CHANGELOG.md | 6 ++ VERSION | 2 +- plan-ceo-review/SKILL.md | 67 +++++++++++++++++++ plan-ceo-review/SKILL.md.tmpl | 67 +++++++++++++++++++ 6 files changed, 211 insertions(+), 5 deletions(-) diff --git a/.agents/skills/gstack-plan-ceo-review/SKILL.md b/.agents/skills/gstack-plan-ceo-review/SKILL.md index 9e450566..b43b8e0a 100644 --- a/.agents/skills/gstack-plan-ceo-review/SKILL.md +++ b/.agents/skills/gstack-plan-ceo-review/SKILL.md @@ -324,6 +324,21 @@ DESIGN=$(ls -t ~/.gstack/projects/$SLUG/*-$BRANCH-design-*.md 2>/dev/null | head ``` If a design doc exists (from `/office-hours`), read it. Use it as the source of truth for the problem statement, constraints, and chosen approach. If it has a `Supersedes:` field, note that this is a revised design. +**Handoff note check** (reuses $SLUG and $BRANCH from the design doc check above): +```bash +HANDOFF=$(ls -t ~/.gstack/projects/$SLUG/*-$BRANCH-ceo-handoff-*.md 2>/dev/null | head -1) +[ -n "$HANDOFF" ] && echo "HANDOFF_FOUND: $HANDOFF" || echo "NO_HANDOFF" +``` +If this block runs in a separate shell from the design doc check, recompute $SLUG and $BRANCH first using the same commands from that block. +If a handoff note is found: read it. This contains system audit findings and discussion +from a prior CEO review session that paused so the user could run `/office-hours`. Use it +as additional context alongside the design doc. The handoff note helps you avoid re-asking +questions the user already answered. Do NOT skip any steps — run the full review, but use +the handoff note to inform your analysis and avoid redundant questions. + +Tell the user: "Found a handoff note from your prior CEO review session. I'll use that +context to pick up where we left off." 
+ ## Prerequisite Skill Offer When the design doc check above prints "No design doc found," offer the prerequisite @@ -343,6 +358,39 @@ Options: If they skip: "No worries — standard review. If you ever want sharper input, try /office-hours first next time." Then proceed normally. Do not re-offer later in the session. +**Handoff note save (BENEFITS_FROM):** If the user chose A (run /office-hours first), +save a handoff context note before they leave. Reuse $SLUG and $BRANCH from the +design doc check block above (they use the same `remote-slug || basename` fallback +that handles repos without an origin remote). Then run: +```bash +mkdir -p ~/.gstack/projects/$SLUG +USER=$(whoami) +DATETIME=$(date +%Y%m%d-%H%M%S) +``` +Write to `~/.gstack/projects/$SLUG/$USER-$BRANCH-ceo-handoff-$DATETIME.md`: +```markdown +# CEO Review Handoff Note + +Generated by /plan-ceo-review on {date} +Branch: {branch} +Repo: {owner/repo} + +## Why I paused +User chose to run /office-hours first (no design doc found). + +## System Audit Summary +{Summarize what the system audit found — recent git history, diff scope, +CLAUDE.md key points, TODOS.md relevant items, known pain points} + +## Discussion So Far +{Empty — handoff happened before Step 0. Frontend/UI scope detection has not +run yet — it will be assessed when the review resumes.} +``` + +Tell the user: "Context saved. Run /office-hours in another window. When you come back +and invoke /plan-ceo-review, I'll pick up the context automatically — including the +design doc /office-hours produces." + **Mid-session detection:** During Step 0A (Premise Challenge), if the user can't articulate the problem, keeps changing the problem statement, answers with "I'm not sure," or is clearly exploring rather than reviewing — offer `/office-hours`: @@ -355,6 +403,15 @@ sure," or is clearly exploring rather than reviewing — offer `/office-hours`: Options: A) Yes, run /office-hours first. B) No, keep going. 
If they keep going, proceed normally — no guilt, no re-asking. +**Handoff note save (mid-session):** If the user chose A (run /office-hours first from +mid-session detection), save a handoff context note with the same format above, but +include any Step 0A progress in the "Discussion So Far" section — premises discussed, +problem framing attempts, user answers so far. Use the same bash block to generate the +file path. + +Tell the user: "Context saved with your discussion so far. Run /office-hours, then +come back to /plan-ceo-review." + When reading TODOS.md, specifically: * Note any TODOs this plan touches, blocks, or unlocks * Check if deferred work from prior reviews relates to this plan @@ -942,6 +999,16 @@ List every ASCII diagram in files this plan touches. Still accurate? ### Unresolved Decisions If any AskUserQuestion goes unanswered, note it here. Never silently default. +## Handoff Note Cleanup + +After producing the Completion Summary, clean up any handoff notes for this branch — +the review is complete and the context is no longer needed. + +```bash +source <(~/.codex/skills/gstack/bin/gstack-slug 2>/dev/null) +rm -f ~/.gstack/projects/$SLUG/*-$BRANCH-ceo-handoff-*.md 2>/dev/null || true +``` + ## Review Log After producing the Completion Summary above, persist the review result. diff --git a/.agents/skills/gstack-retro/SKILL.md b/.agents/skills/gstack-retro/SKILL.md index 2a30b527..5d17bf2b 100644 --- a/.agents/skills/gstack-retro/SKILL.md +++ b/.agents/skills/gstack-retro/SKILL.md @@ -428,7 +428,7 @@ From commit diffs, estimate PR sizes and bucket them: - **Small** (<100 LOC) - **Medium** (100-500 LOC) - **Large** (500-1500 LOC) -- **XL** (1500+ LOC) — flag these with file counts +- **XL** (1500+ LOC) ### Step 8: Focus Score + Ship of the Week @@ -620,14 +620,13 @@ Narrative interpreting what the team-wide patterns mean: Narrative covering: - Commit type mix and what it reveals -- PR size discipline (are PRs staying small?) 
+- PR size distribution and what it reveals about shipping cadence - Fix-chain detection (sequences of fix commits on the same subsystem) - Version bump discipline ### Code Quality Signals - Test LOC ratio trend - Hotspot analysis (are the same files churning?) -- Any XL PRs that should have been split - Greptile signal ratio and trend (if history exists): "Greptile: X% signal (Y valid catches, Z false positives)" ### Test Health @@ -666,7 +665,7 @@ For each teammate (sorted by commits descending), write a section: - "Fixed the N+1 query that was causing 2s load times on the dashboard" - **Opportunity for growth**: 1 specific, constructive suggestion. Frame as investment, not criticism. Examples: - "Test coverage on the payment module is at 8% — worth investing in before the next feature lands on top of it" - - "3 of the 5 PRs were 800+ LOC — breaking these up would catch issues earlier and make review easier" + - "Most commits land in a single burst — spacing work across the day could reduce context-switching fatigue" - "All commits land between 1-4am — sustainable pace matters for code quality long-term" **AI collaboration note:** If many commits have `Co-Authored-By` AI trailers (e.g., Claude, Copilot), note the AI-assisted commit percentage as a team metric. Frame it neutrally — "N% of commits were AI-assisted" — without judgment. diff --git a/CHANGELOG.md b/CHANGELOG.md index 69070ecd..7a5aec9a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## [0.9.5.0] - 2026-03-21 — CEO Review ↔ Office Hours Chaining + +### Added + +- **CEO review saves context when handing off to `/office-hours`.** When `/plan-ceo-review` suggests running `/office-hours` first, it now saves a handoff note with your system audit findings and any discussion so far. When you come back and re-invoke `/plan-ceo-review`, it picks up that context automatically — no more starting from scratch. 
+ ## [0.9.4.1] - 2026-03-20 ### Changed diff --git a/VERSION b/VERSION index 199bf2ae..719a2339 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.9.4.1 +0.9.5.0 diff --git a/plan-ceo-review/SKILL.md b/plan-ceo-review/SKILL.md index fdf95164..be25485a 100644 --- a/plan-ceo-review/SKILL.md +++ b/plan-ceo-review/SKILL.md @@ -332,6 +332,21 @@ DESIGN=$(ls -t ~/.gstack/projects/$SLUG/*-$BRANCH-design-*.md 2>/dev/null | head ``` If a design doc exists (from `/office-hours`), read it. Use it as the source of truth for the problem statement, constraints, and chosen approach. If it has a `Supersedes:` field, note that this is a revised design. +**Handoff note check** (reuses $SLUG and $BRANCH from the design doc check above): +```bash +HANDOFF=$(ls -t ~/.gstack/projects/$SLUG/*-$BRANCH-ceo-handoff-*.md 2>/dev/null | head -1) +[ -n "$HANDOFF" ] && echo "HANDOFF_FOUND: $HANDOFF" || echo "NO_HANDOFF" +``` +If this block runs in a separate shell from the design doc check, recompute $SLUG and $BRANCH first using the same commands from that block. +If a handoff note is found: read it. This contains system audit findings and discussion +from a prior CEO review session that paused so the user could run `/office-hours`. Use it +as additional context alongside the design doc. The handoff note helps you avoid re-asking +questions the user already answered. Do NOT skip any steps — run the full review, but use +the handoff note to inform your analysis and avoid redundant questions. + +Tell the user: "Found a handoff note from your prior CEO review session. I'll use that +context to pick up where we left off." + ## Prerequisite Skill Offer When the design doc check above prints "No design doc found," offer the prerequisite @@ -351,6 +366,39 @@ Options: If they skip: "No worries — standard review. If you ever want sharper input, try /office-hours first next time." Then proceed normally. Do not re-offer later in the session. 
+**Handoff note save (BENEFITS_FROM):** If the user chose A (run /office-hours first), +save a handoff context note before they leave. Reuse $SLUG and $BRANCH from the +design doc check block above (they use the same `remote-slug || basename` fallback +that handles repos without an origin remote). Then run: +```bash +mkdir -p ~/.gstack/projects/$SLUG +USER=$(whoami) +DATETIME=$(date +%Y%m%d-%H%M%S) +``` +Write to `~/.gstack/projects/$SLUG/$USER-$BRANCH-ceo-handoff-$DATETIME.md`: +```markdown +# CEO Review Handoff Note + +Generated by /plan-ceo-review on {date} +Branch: {branch} +Repo: {owner/repo} + +## Why I paused +User chose to run /office-hours first (no design doc found). + +## System Audit Summary +{Summarize what the system audit found — recent git history, diff scope, +CLAUDE.md key points, TODOS.md relevant items, known pain points} + +## Discussion So Far +{Empty — handoff happened before Step 0. Frontend/UI scope detection has not +run yet — it will be assessed when the review resumes.} +``` + +Tell the user: "Context saved. Run /office-hours in another window. When you come back +and invoke /plan-ceo-review, I'll pick up the context automatically — including the +design doc /office-hours produces." + **Mid-session detection:** During Step 0A (Premise Challenge), if the user can't articulate the problem, keeps changing the problem statement, answers with "I'm not sure," or is clearly exploring rather than reviewing — offer `/office-hours`: @@ -363,6 +411,15 @@ sure," or is clearly exploring rather than reviewing — offer `/office-hours`: Options: A) Yes, run /office-hours first. B) No, keep going. If they keep going, proceed normally — no guilt, no re-asking. 
+**Handoff note save (mid-session):** If the user chose A (run /office-hours first from +mid-session detection), save a handoff context note with the same format above, but +include any Step 0A progress in the "Discussion So Far" section — premises discussed, +problem framing attempts, user answers so far. Use the same bash block to generate the +file path. + +Tell the user: "Context saved with your discussion so far. Run /office-hours, then +come back to /plan-ceo-review." + When reading TODOS.md, specifically: * Note any TODOs this plan touches, blocks, or unlocks * Check if deferred work from prior reviews relates to this plan @@ -950,6 +1007,16 @@ List every ASCII diagram in files this plan touches. Still accurate? ### Unresolved Decisions If any AskUserQuestion goes unanswered, note it here. Never silently default. +## Handoff Note Cleanup + +After producing the Completion Summary, clean up any handoff notes for this branch — +the review is complete and the context is no longer needed. + +```bash +source <(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) +rm -f ~/.gstack/projects/$SLUG/*-$BRANCH-ceo-handoff-*.md 2>/dev/null || true +``` + ## Review Log After producing the Completion Summary above, persist the review result. diff --git a/plan-ceo-review/SKILL.md.tmpl b/plan-ceo-review/SKILL.md.tmpl index 8dce40eb..fea6879c 100644 --- a/plan-ceo-review/SKILL.md.tmpl +++ b/plan-ceo-review/SKILL.md.tmpl @@ -111,8 +111,56 @@ DESIGN=$(ls -t ~/.gstack/projects/$SLUG/*-$BRANCH-design-*.md 2>/dev/null | head ``` If a design doc exists (from `/office-hours`), read it. Use it as the source of truth for the problem statement, constraints, and chosen approach. If it has a `Supersedes:` field, note that this is a revised design. 
+**Handoff note check** (reuses $SLUG and $BRANCH from the design doc check above): +```bash +HANDOFF=$(ls -t ~/.gstack/projects/$SLUG/*-$BRANCH-ceo-handoff-*.md 2>/dev/null | head -1) +[ -n "$HANDOFF" ] && echo "HANDOFF_FOUND: $HANDOFF" || echo "NO_HANDOFF" +``` +If this block runs in a separate shell from the design doc check, recompute $SLUG and $BRANCH first using the same commands from that block. +If a handoff note is found: read it. This contains system audit findings and discussion +from a prior CEO review session that paused so the user could run `/office-hours`. Use it +as additional context alongside the design doc. The handoff note helps you avoid re-asking +questions the user already answered. Do NOT skip any steps — run the full review, but use +the handoff note to inform your analysis and avoid redundant questions. + +Tell the user: "Found a handoff note from your prior CEO review session. I'll use that +context to pick up where we left off." + {{BENEFITS_FROM}} +**Handoff note save (BENEFITS_FROM):** If the user chose A (run /office-hours first), +save a handoff context note before they leave. Reuse $SLUG and $BRANCH from the +design doc check block above (they use the same `remote-slug || basename` fallback +that handles repos without an origin remote). Then run: +```bash +mkdir -p ~/.gstack/projects/$SLUG +USER=$(whoami) +DATETIME=$(date +%Y%m%d-%H%M%S) +``` +Write to `~/.gstack/projects/$SLUG/$USER-$BRANCH-ceo-handoff-$DATETIME.md`: +```markdown +# CEO Review Handoff Note + +Generated by /plan-ceo-review on {date} +Branch: {branch} +Repo: {owner/repo} + +## Why I paused +User chose to run /office-hours first (no design doc found). + +## System Audit Summary +{Summarize what the system audit found — recent git history, diff scope, +CLAUDE.md key points, TODOS.md relevant items, known pain points} + +## Discussion So Far +{Empty — handoff happened before Step 0. 
Frontend/UI scope detection has not +run yet — it will be assessed when the review resumes.} +``` + +Tell the user: "Context saved. Run /office-hours in another window. When you come back +and invoke /plan-ceo-review, I'll pick up the context automatically — including the +design doc /office-hours produces." + **Mid-session detection:** During Step 0A (Premise Challenge), if the user can't articulate the problem, keeps changing the problem statement, answers with "I'm not sure," or is clearly exploring rather than reviewing — offer `/office-hours`: @@ -125,6 +173,15 @@ sure," or is clearly exploring rather than reviewing — offer `/office-hours`: Options: A) Yes, run /office-hours first. B) No, keep going. If they keep going, proceed normally — no guilt, no re-asking. +**Handoff note save (mid-session):** If the user chose A (run /office-hours first from +mid-session detection), save a handoff context note using the same format as above, but +include any Step 0A progress in the "Discussion So Far" section — premises discussed, +problem framing attempts, user answers so far. Use the same bash block to generate the +file path. + +Tell the user: "Context saved with your discussion so far. Run /office-hours, then +come back to /plan-ceo-review." + When reading TODOS.md, specifically: * Note any TODOs this plan touches, blocks, or unlocks * Check if deferred work from prior reviews relates to this plan @@ -652,6 +709,16 @@ List every ASCII diagram in files this plan touches. Still accurate? ### Unresolved Decisions If any AskUserQuestion goes unanswered, note it here. Never silently default. +## Handoff Note Cleanup + +After producing the Completion Summary, clean up any handoff notes for this branch — +the review is complete and the context is no longer needed. 
+ +```bash +source <(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) +rm -f ~/.gstack/projects/$SLUG/*-$BRANCH-ceo-handoff-*.md 2>/dev/null || true +``` + ## Review Log After producing the Completion Summary above, persist the review result. From f075cb757f67cacd0062f975064978019080214a Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sat, 21 Mar 2026 11:46:06 -0700 Subject: [PATCH 6/8] =?UTF-8?q?feat:=20Search=20Before=20Building=20?= =?UTF-8?q?=E2=80=94=20builder=20ethos=20+=20skill=20integrations=20(v0.9.?= =?UTF-8?q?5.0)=20(#298)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: ETHOS.md — gstack builder philosophy Standalone document capturing the four principles: The Golden Age, Boil the Lake, Search Before Building, and Build for Yourself. Introduces the three-layer knowledge framework (tried-and-true, new-and-popular, first-principles) and the Eureka Moment concept — when first-principles reasoning reveals conventional wisdom is wrong. Co-Authored-By: Claude Opus 4.6 (1M context) * feat: Search Before Building preamble section + CLAUDE.md Add generateSearchBeforeBuildingSection(ctx) to gen-skill-docs.ts. Every workflow skill now gets a compact router section covering: - Three layers of knowledge (tried-and-true, new-and-popular, first-principles) - Eureka moment format and jq-based JSONL logging - WebSearch fallback clause - ETHOS.md reference via ctx.paths.skillRoot resolver Also adds compact "Search before building" section to CLAUDE.md. 
Co-Authored-By: Claude Opus 4.6 (1M context) * feat: skill-specific Search Before Building integrations 8 template changes: - /office-hours: Phase 2.75 Landscape Awareness (WebSearch + three-layer synthesis) - /plan-eng-review: Step 0 search check with layer provenance annotations - /investigate: external pattern search + search escalation on hypothesis failure - /plan-ceo-review: Landscape Check before scope challenge - /review: search-before-recommending for fix patterns - /qa-only: WebSearch in allowed-tools - /design-consultation: three-layer synthesis backport in Phase 2 Step 3 - /retro: eureka moment tracking from ~/.gstack/analytics/eureka.jsonl All search steps include WebSearch fallback clause. Co-Authored-By: Claude Opus 4.6 (1M context) * feat: v0.9.5.0 — Builder Ethos (CHANGELOG + VERSION + TODOS) ETHOS.md + Search Before Building across all workflow skills. Deferred: first-time intro flow (blocked on blog post). Co-Authored-By: Claude Opus 4.6 (1M context) * fix: address Codex review — sanitize search, privacy gate, ETHOS.md sidecar Three fixes from adversarial Codex review: - /investigate: sanitize error messages before searching (strip hostnames, IPs, file paths, SQL, customer data). Skip search if unsanitizable. - /office-hours: add privacy gate before landscape search. Use generalized category terms, never the user's specific product name or stealth idea. - setup: link ETHOS.md into .agents/skills/gstack/ sidecar so workspace- local Codex sessions can find the builder philosophy. Co-Authored-By: Claude Opus 4.6 (1M context) * fix: sanitize Phase 2 external pattern search in /investigate The Phase 2 external search also sent raw error messages to WebSearch. Apply same sanitization rule as Phase 3 search escalation. 
Co-Authored-By: Claude Opus 4.6 (1M context) * docs: sync documentation with shipped changes - ARCHITECTURE.md: preamble now handles 5 things (add Search Before Building) - CLAUDE.md: add ETHOS.md to project structure tree - README.md: add ETHOS.md to docs table Co-Authored-By: Claude Opus 4.6 (1M context) --------- Co-authored-by: Claude Opus 4.6 (1M context) --- .agents/skills/gstack-browse/SKILL.md | 20 +++ .../gstack-design-consultation/SKILL.md | 27 +++- .agents/skills/gstack-design-review/SKILL.md | 20 +++ .../skills/gstack-document-release/SKILL.md | 20 +++ .agents/skills/gstack-investigate/SKILL.md | 28 +++- .agents/skills/gstack-office-hours/SKILL.md | 57 ++++++++ .../skills/gstack-plan-ceo-review/SKILL.md | 36 +++++ .../skills/gstack-plan-design-review/SKILL.md | 20 +++ .../skills/gstack-plan-eng-review/SKILL.md | 30 +++- .agents/skills/gstack-qa-only/SKILL.md | 20 +++ .agents/skills/gstack-qa/SKILL.md | 20 +++ .agents/skills/gstack-retro/SKILL.md | 34 +++++ .agents/skills/gstack-review/SKILL.md | 27 ++++ .../gstack-setup-browser-cookies/SKILL.md | 20 +++ .agents/skills/gstack-ship/SKILL.md | 20 +++ .agents/skills/gstack/SKILL.md | 20 +++ ARCHITECTURE.md | 3 +- CHANGELOG.md | 9 +- CLAUDE.md | 14 ++ ETHOS.md | 129 ++++++++++++++++++ README.md | 1 + SKILL.md | 20 +++ TODOS.md | 14 ++ browse/SKILL.md | 20 +++ codex/SKILL.md | 20 +++ design-consultation/SKILL.md | 27 +++- design-consultation/SKILL.md.tmpl | 7 +- design-review/SKILL.md | 20 +++ document-release/SKILL.md | 20 +++ investigate/SKILL.md | 29 +++- investigate/SKILL.md.tmpl | 9 +- office-hours/SKILL.md | 58 ++++++++ office-hours/SKILL.md.tmpl | 38 ++++++ plan-ceo-review/SKILL.md | 37 +++++ plan-ceo-review/SKILL.md.tmpl | 17 +++ plan-design-review/SKILL.md | 20 +++ plan-eng-review/SKILL.md | 31 ++++- plan-eng-review/SKILL.md.tmpl | 11 +- qa-only/SKILL.md | 21 +++ qa-only/SKILL.md.tmpl | 1 + qa/SKILL.md | 20 +++ retro/SKILL.md | 34 +++++ retro/SKILL.md.tmpl | 14 ++ review/SKILL.md | 28 ++++ 
review/SKILL.md.tmpl | 8 ++ scripts/gen-skill-docs.ts | 23 ++++ setup | 11 ++ setup-browser-cookies/SKILL.md | 20 +++ ship/SKILL.md | 20 +++ 49 files changed, 1162 insertions(+), 11 deletions(-) create mode 100644 ETHOS.md diff --git a/.agents/skills/gstack-browse/SKILL.md b/.agents/skills/gstack-browse/SKILL.md index db405e47..45a59485 100644 --- a/.agents/skills/gstack-browse/SKILL.md +++ b/.agents/skills/gstack-browse/SKILL.md @@ -125,6 +125,26 @@ AI-assisted coding makes the marginal cost of completeness near-zero. When you p - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) - BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.codex/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." 
+ +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. diff --git a/.agents/skills/gstack-design-consultation/SKILL.md b/.agents/skills/gstack-design-consultation/SKILL.md index 8d018781..29e1a222 100644 --- a/.agents/skills/gstack-design-consultation/SKILL.md +++ b/.agents/skills/gstack-design-consultation/SKILL.md @@ -126,6 +126,26 @@ AI-assisted coding makes the marginal cost of completeness near-zero. When you p - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) - BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.codex/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. 
+ +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. @@ -323,7 +343,12 @@ If browse is not available, rely on WebSearch results and your built-in design k **Step 3: Synthesize findings** -The goal of research is NOT to copy. It is to get in the ballpark — to understand the visual language users in this category already expect. This gives you the baseline. The interesting design work starts after you have the baseline: deciding where to follow conventions (so the product feels literate) and where to break from them (so the product is memorable). +**Three-layer synthesis:** +- **Layer 1 (tried and true):** What design patterns does every product in this category share? These are table stakes — users expect them. +- **Layer 2 (new and popular):** What are the search results and current design discourse saying? What's trending? What new patterns are emerging? +- **Layer 3 (first principles):** Given what we know about THIS product's users and positioning — is there a reason the conventional design approach is wrong? Where should we deliberately break from the category norms? 
+ +**Eureka check:** If Layer 3 reasoning reveals a genuine design insight — a reason the category's visual language fails THIS product — name it: "EUREKA: Every [category] product does X because they assume [assumption]. But this product's users [evidence] — so we should do Y instead." Log the eureka moment (see preamble). Summarize conversationally: > "I looked at what's out there. Here's the landscape: they converge on [patterns]. Most of them feel [observation — e.g., interchangeable, polished but generic, etc.]. The opportunity to stand out is [gap]. Here's where I'd play it safe and where I'd take a risk..." diff --git a/.agents/skills/gstack-design-review/SKILL.md b/.agents/skills/gstack-design-review/SKILL.md index 2a7d26b2..700bd33e 100644 --- a/.agents/skills/gstack-design-review/SKILL.md +++ b/.agents/skills/gstack-design-review/SKILL.md @@ -126,6 +126,26 @@ AI-assisted coding makes the marginal cost of completeness near-zero. When you p - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) - BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.codex/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. 
+ +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. diff --git a/.agents/skills/gstack-document-release/SKILL.md b/.agents/skills/gstack-document-release/SKILL.md index 9249b0be..ccf34824 100644 --- a/.agents/skills/gstack-document-release/SKILL.md +++ b/.agents/skills/gstack-document-release/SKILL.md @@ -124,6 +124,26 @@ AI-assisted coding makes the marginal cost of completeness near-zero. When you p - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) - BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.codex/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. 
Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. diff --git a/.agents/skills/gstack-investigate/SKILL.md b/.agents/skills/gstack-investigate/SKILL.md index 39f5910e..0f53afef 100644 --- a/.agents/skills/gstack-investigate/SKILL.md +++ b/.agents/skills/gstack-investigate/SKILL.md @@ -127,6 +127,26 @@ AI-assisted coding makes the marginal cost of completeness near-zero. When you p - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) - BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.codex/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. 
But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. @@ -289,6 +309,12 @@ Also check: - `TODOS.md` for related known issues - `git log` for prior fixes in the same area — **recurring bugs in the same files are an architectural smell**, not a coincidence +**External pattern search:** If the bug doesn't match a known pattern above, WebSearch for: +- "{framework} {generic error type}" — **sanitize first:** strip hostnames, IPs, file paths, SQL, customer data. Search the error category, not the raw message. +- "{library} {component} known issues" + +If WebSearch is unavailable, skip this search and proceed with hypothesis testing. 
If a documented solution or known dependency bug surfaces, present it as a candidate hypothesis in Phase 3. + --- ## Phase 3: Hypothesis Testing @@ -297,7 +323,7 @@ Before writing ANY fix, verify your hypothesis. 1. **Confirm the hypothesis:** Add a temporary log statement, assertion, or debug output at the suspected root cause. Run the reproduction. Does the evidence match? -2. **If the hypothesis is wrong:** Return to Phase 1. Gather more evidence. Do not guess. +2. **If the hypothesis is wrong:** Before forming the next hypothesis, consider searching for the error. **Sanitize first** — strip hostnames, IPs, file paths, SQL fragments, customer identifiers, and any internal/proprietary data from the error message. Search only the generic error type and framework context: "{component} {sanitized error type} {framework version}". If the error message is too specific to sanitize safely, skip the search. If WebSearch is unavailable, skip and proceed. Then return to Phase 1. Gather more evidence. Do not guess. 3. **3-strike rule:** If 3 hypotheses fail, **STOP**. Use AskUserQuestion: ``` diff --git a/.agents/skills/gstack-office-hours/SKILL.md b/.agents/skills/gstack-office-hours/SKILL.md index eb1b7084..c464c88c 100644 --- a/.agents/skills/gstack-office-hours/SKILL.md +++ b/.agents/skills/gstack-office-hours/SKILL.md @@ -128,6 +128,26 @@ AI-assisted coding makes the marginal cost of completeness near-zero. When you p - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) - BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.codex/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. 
But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. @@ -447,6 +467,43 @@ If no matches found, proceed silently. --- +## Phase 2.75: Landscape Awareness + +Read ETHOS.md for the full Search Before Building framework (three layers, eureka moments). The preamble's Search Before Building section has the ETHOS.md path. + +After understanding the problem through questioning, search for what the world thinks. This is NOT competitive research (that's /design-consultation's job). This is understanding conventional wisdom so you can evaluate where it's wrong. + +**Privacy gate:** Before searching, use AskUserQuestion: "I'd like to search for what the world thinks about this space to inform our discussion. 
This sends generalized category terms (not your specific idea) to a search provider. OK to proceed?" +Options: A) Yes, search away B) Skip — keep this session private +If B: skip this phase entirely and proceed to Phase 3. Use only in-distribution knowledge. + +When searching, use **generalized category terms** — never the user's specific product name, proprietary concept, or stealth idea. For example, search "task management app landscape" not "SuperTodo AI-powered task killer." + +If WebSearch is unavailable, skip this phase and note: "Search unavailable — proceeding with in-distribution knowledge only." + +**Startup mode:** WebSearch for: +- "[problem space] startup approach {current year}" +- "[problem space] common mistakes" +- "why [incumbent solution] fails" OR "why [incumbent solution] works" + +**Builder mode:** WebSearch for: +- "[thing being built] existing solutions" +- "[thing being built] open source alternatives" +- "best [thing category] {current year}" + +Read the top 2-3 results. Run the three-layer synthesis: +- **[Layer 1]** What does everyone already know about this space? +- **[Layer 2]** What are the search results and current discourse saying? +- **[Layer 3]** Given what WE learned in Phase 2A/2B — is there a reason the conventional approach is wrong? + +**Eureka check:** If Layer 3 reasoning reveals a genuine insight, name it: "EUREKA: Everyone does X because they assume [assumption]. But [evidence from our conversation] suggests that's wrong here. This means [implication]." Log the eureka moment (see preamble). + +If no eureka moment exists, say: "The conventional wisdom seems sound here. Let's build on it." Proceed to Phase 3. + +**Important:** This search feeds Phase 3 (Premise Challenge). If you found reasons the conventional approach fails, those become premises to challenge. If conventional wisdom is solid, that raises the bar for any premise that contradicts it. 
+ +--- + ## Phase 3: Premise Challenge Before proposing solutions, challenge the premises: diff --git a/.agents/skills/gstack-plan-ceo-review/SKILL.md b/.agents/skills/gstack-plan-ceo-review/SKILL.md index b43b8e0a..290ddbb2 100644 --- a/.agents/skills/gstack-plan-ceo-review/SKILL.md +++ b/.agents/skills/gstack-plan-ceo-review/SKILL.md @@ -127,6 +127,26 @@ AI-assisted coding makes the marginal cost of completeness near-zero. When you p - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) - BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.codex/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. 
Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. @@ -434,6 +454,22 @@ Analyze the plan. If it involves ANY of: new UI screens/pages, changes to existi Identify 2-3 files or patterns in the existing codebase that are particularly well-designed. Note them as style references for the review. Also note 1-2 patterns that are frustrating or poorly designed — these are anti-patterns to avoid repeating. Report findings before proceeding to Step 0. +### Landscape Check + +Read ETHOS.md for the Search Before Building framework (the preamble's Search Before Building section has the path). Before challenging scope, understand the landscape. WebSearch for: +- "[product category] landscape {current year}" +- "[key feature] alternatives" +- "why [incumbent/conventional approach] [succeeds/fails]" + +If WebSearch is unavailable, skip this check and note: "Search unavailable — proceeding with in-distribution knowledge only." + +Run the three-layer synthesis: +- **[Layer 1]** What's the tried-and-true approach in this space? +- **[Layer 2]** What are the search results saying? +- **[Layer 3]** First-principles reasoning — where might the conventional wisdom be wrong? + +Feed into the Premise Challenge (0A) and Dream State Mapping (0C). If you find a eureka moment, surface it during the Expansion opt-in ceremony as a differentiation opportunity. Log it (see preamble). + ## Step 0: Nuclear Scope Challenge + Mode Selection ### 0A. 
Premise Challenge diff --git a/.agents/skills/gstack-plan-design-review/SKILL.md b/.agents/skills/gstack-plan-design-review/SKILL.md index e431d72c..8ee46259 100644 --- a/.agents/skills/gstack-plan-design-review/SKILL.md +++ b/.agents/skills/gstack-plan-design-review/SKILL.md @@ -126,6 +126,26 @@ AI-assisted coding makes the marginal cost of completeness near-zero. When you p - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) - BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.codex/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. 
+ +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. diff --git a/.agents/skills/gstack-plan-eng-review/SKILL.md b/.agents/skills/gstack-plan-eng-review/SKILL.md index 01233079..942d5822 100644 --- a/.agents/skills/gstack-plan-eng-review/SKILL.md +++ b/.agents/skills/gstack-plan-eng-review/SKILL.md @@ -125,6 +125,26 @@ AI-assisted coding makes the marginal cost of completeness near-zero. When you p - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) - BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.codex/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." 
+ +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. @@ -293,7 +313,15 @@ Before reviewing anything, answer these questions: 1. **What existing code already partially or fully solves each sub-problem?** Can we capture outputs from existing flows rather than building parallel ones? 2. **What is the minimum set of changes that achieves the stated goal?** Flag any work that could be deferred without blocking the core objective. Be ruthless about scope creep. 3. **Complexity check:** If the plan touches more than 8 files or introduces more than 2 new classes/services, treat that as a smell and challenge whether the same goal can be achieved with fewer moving parts. -4. **TODOS cross-reference:** Read `TODOS.md` if it exists. Are any deferred items blocking this plan? Can any deferred items be bundled into this PR without expanding scope? Does this plan create new work that should be captured as a TODO? +4. **Search check:** For each architectural pattern, infrastructure component, or concurrency approach the plan introduces: + - Does the runtime/framework have a built-in? Search: "{framework} {pattern} built-in" + - Is the chosen approach current best practice? Search: "{pattern} best practice {current year}" + - Are there known footguns? 
Search: "{framework} {pattern} pitfalls" + + If WebSearch is unavailable, skip this check and note: "Search unavailable — proceeding with in-distribution knowledge only." + + If the plan rolls a custom solution where a built-in exists, flag it as a scope reduction opportunity. Annotate recommendations with **[Layer 1]**, **[Layer 2]**, **[Layer 3]**, or **[EUREKA]** (see preamble's Search Before Building section). If you find a eureka moment — a reason the standard approach is wrong for this case — present it as an architectural insight. +5. **TODOS cross-reference:** Read `TODOS.md` if it exists. Are any deferred items blocking this plan? Can any deferred items be bundled into this PR without expanding scope? Does this plan create new work that should be captured as a TODO? 5. **Completeness check:** Is the plan doing the complete version or a shortcut? With AI-assisted coding, the cost of completeness (100% test coverage, full edge case handling, complete error paths) is 10-100x cheaper than with a human team. If the plan proposes a shortcut that saves human-hours but only saves minutes with CC+gstack, recommend the complete version. Boil the lake. diff --git a/.agents/skills/gstack-qa-only/SKILL.md b/.agents/skills/gstack-qa-only/SKILL.md index 6a6f8a7d..f310fb25 100644 --- a/.agents/skills/gstack-qa-only/SKILL.md +++ b/.agents/skills/gstack-qa-only/SKILL.md @@ -124,6 +124,26 @@ AI-assisted coding makes the marginal cost of completeness near-zero. When you p - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) - BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.codex/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). 
Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. diff --git a/.agents/skills/gstack-qa/SKILL.md b/.agents/skills/gstack-qa/SKILL.md index 0617c447..92e61a9a 100644 --- a/.agents/skills/gstack-qa/SKILL.md +++ b/.agents/skills/gstack-qa/SKILL.md @@ -127,6 +127,26 @@ AI-assisted coding makes the marginal cost of completeness near-zero. When you p - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) - BAD: Quoting only human-team effort: "This would take 2 weeks." 
(Say: "2 weeks human / ~1 hour CC.") +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.codex/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. diff --git a/.agents/skills/gstack-retro/SKILL.md b/.agents/skills/gstack-retro/SKILL.md index 5d17bf2b..a0b796ba 100644 --- a/.agents/skills/gstack-retro/SKILL.md +++ b/.agents/skills/gstack-retro/SKILL.md @@ -124,6 +124,26 @@ AI-assisted coding makes the marginal cost of completeness near-zero. 
When you p - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) - BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.codex/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. @@ -369,6 +389,20 @@ If TODOS.md doesn't exist, skip the Backlog Health row. 
If the JSONL file doesn't exist or has no entries in the window, skip the Skill Usage row. +**Eureka Moments (if logged):** Read `~/.gstack/analytics/eureka.jsonl` if it exists. Filter entries within the retro time window by `ts` field. For each eureka moment, show the skill that flagged it, the branch, and a one-line summary of the insight. Present as: + +``` +| Eureka Moments | 2 this period | +``` + +If moments exist, list them: +``` + EUREKA /office-hours (branch: garrytan/auth-rethink): "Session tokens don't need server storage — browser crypto API makes client-side JWT validation viable" + EUREKA /plan-eng-review (branch: garrytan/cache-layer): "Redis isn't needed here — Bun's built-in LRU cache handles this workload" +``` + +If the JSONL file doesn't exist or has no entries in the window, skip the Eureka Moments row. + ### Step 3: Commit Time Distribution Show hourly histogram in local time using bar chart: diff --git a/.agents/skills/gstack-review/SKILL.md b/.agents/skills/gstack-review/SKILL.md index ab00c53e..de1a0674 100644 --- a/.agents/skills/gstack-review/SKILL.md +++ b/.agents/skills/gstack-review/SKILL.md @@ -123,6 +123,26 @@ AI-assisted coding makes the marginal cost of completeness near-zero. When you p - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) - BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.codex/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. 
Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. @@ -319,6 +339,13 @@ Apply the checklist against the diff in two passes: **Enum & Value Completeness requires reading code OUTSIDE the diff.** When the diff introduces a new enum value, status, tier, or type constant, use Grep to find all files that reference sibling values, then Read those files to check if the new value is handled. This is the one category where within-diff review is insufficient. +**Search-before-recommending:** When recommending a fix pattern (especially for concurrency, caching, auth, or framework-specific behavior): +- Verify the pattern is current best practice for the framework version in use +- Check if a built-in solution exists in newer versions before recommending a workaround +- Verify API signatures against current docs (APIs change between versions) + +Takes seconds, prevents recommending outdated patterns. 
If WebSearch is unavailable, note it and proceed with in-distribution knowledge. + Follow the output format specified in the checklist. Respect the suppressions — do NOT flag items listed in the "DO NOT flag" section. --- diff --git a/.agents/skills/gstack-setup-browser-cookies/SKILL.md b/.agents/skills/gstack-setup-browser-cookies/SKILL.md index 34f95391..49e2e900 100644 --- a/.agents/skills/gstack-setup-browser-cookies/SKILL.md +++ b/.agents/skills/gstack-setup-browser-cookies/SKILL.md @@ -123,6 +123,26 @@ AI-assisted coding makes the marginal cost of completeness near-zero. When you p - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) - BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.codex/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." 
+ +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. diff --git a/.agents/skills/gstack-ship/SKILL.md b/.agents/skills/gstack-ship/SKILL.md index 24a770be..0809141b 100644 --- a/.agents/skills/gstack-ship/SKILL.md +++ b/.agents/skills/gstack-ship/SKILL.md @@ -121,6 +121,26 @@ AI-assisted coding makes the marginal cost of completeness near-zero. When you p - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) - BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.codex/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. 
+ +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. diff --git a/.agents/skills/gstack/SKILL.md b/.agents/skills/gstack/SKILL.md index 4bb9ba17..93128866 100644 --- a/.agents/skills/gstack/SKILL.md +++ b/.agents/skills/gstack/SKILL.md @@ -156,6 +156,26 @@ AI-assisted coding makes the marginal cost of completeness near-zero. When you p - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) - BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.codex/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. 
+- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index db55ee36..0a2b9313 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -210,12 +210,13 @@ This is structurally sound — if a command exists in code, it appears in docs. ### The preamble -Every skill starts with a `{{PREAMBLE}}` block that runs before the skill's own logic. It handles four things in a single bash command: +Every skill starts with a `{{PREAMBLE}}` block that runs before the skill's own logic. It handles five things in a single bash command: 1. **Update check** — calls `gstack-update-check`, reports if an upgrade is available. 2. **Session tracking** — touches `~/.gstack/sessions/$PPID` and counts active sessions (files modified in the last 2 hours). When 3+ sessions are running, all skills enter "ELI16 mode" — every question re-grounds the user on context because they're juggling windows. 3. **Contributor mode** — reads `gstack_contributor` from config. 
When true, the agent files casual field reports to `~/.gstack/contributor-logs/` when gstack itself misbehaves. 4. **AskUserQuestion format** — universal format: context, question, `RECOMMENDATION: Choose X because ___`, lettered options. Consistent across all skills. +5. **Search Before Building** — before building infrastructure or unfamiliar patterns, search first. Three layers of knowledge: tried-and-true (Layer 1), new-and-popular (Layer 2), first-principles (Layer 3). When first-principles reasoning reveals conventional wisdom is wrong, the agent names the "eureka moment" and logs it. See `ETHOS.md` for the full builder philosophy. ### Why committed, not generated at runtime? diff --git a/CHANGELOG.md b/CHANGELOG.md index 7a5aec9a..ee1d230d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,16 @@ # Changelog -## [0.9.5.0] - 2026-03-21 — CEO Review ↔ Office Hours Chaining +## [0.9.5.0] - 2026-03-21 — Builder Ethos ### Added +- **ETHOS.md — gstack's builder philosophy in one document.** Four principles: The Golden Age (AI compression ratios), Boil the Lake (completeness is cheap), Search Before Building (three layers of knowledge), and Build for Yourself. This is the philosophical source of truth that every workflow skill references. +- **Every workflow skill now searches before recommending.** Before suggesting infrastructure patterns, concurrency approaches, or framework-specific solutions, gstack checks if the runtime has a built-in and whether the pattern is current best practice. Three layers of knowledge — tried-and-true (Layer 1), new-and-popular (Layer 2), and first-principles (Layer 3) — with first-principles insights prized above all. +- **Eureka moments.** When first-principles reasoning reveals that conventional wisdom is wrong, gstack names it, celebrates it, and logs it. Your weekly `/retro` now surfaces these insights so you can see where your projects zigged while others zagged.
+- **`/office-hours` adds Landscape Awareness phase.** After understanding your problem through questioning but before challenging premises, gstack searches for what the world thinks — then runs a three-layer synthesis to find where conventional wisdom might be wrong for your specific case. +- **`/plan-eng-review` adds search check.** Step 0 now verifies architectural patterns against current best practices and flags custom solutions where built-ins exist. +- **`/investigate` searches on hypothesis failure.** When your first debugging hypothesis is wrong, gstack searches for the exact error message and known framework issues before guessing again. +- **`/design-consultation` three-layer synthesis.** Competitive research now uses the structured Layer 1/2/3 framework to find where your product should deliberately break from category norms. - **CEO review saves context when handing off to `/office-hours`.** When `/plan-ceo-review` suggests running `/office-hours` first, it now saves a handoff note with your system audit findings and any discussion so far. When you come back and re-invoke `/plan-ceo-review`, it picks up that context automatically — no more starting from scratch. ## [0.9.4.1] - 2026-03-20 diff --git a/CLAUDE.md b/CLAUDE.md index 9a7edc28..bd513552 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -78,6 +78,7 @@ gstack/ ├── setup # One-time setup: build binary + symlink skills ├── SKILL.md # Generated from SKILL.md.tmpl (don't edit directly) ├── SKILL.md.tmpl # Template: edit this, run gen:skill-docs +├── ETHOS.md # Builder philosophy (Boil the Lake, Search Before Building) └── package.json # Build scripts for browse ``` @@ -192,6 +193,19 @@ Completeness is cheap. Don't recommend shortcuts when the complete implementatio is a "lake" (achievable) not an "ocean" (multi-quarter migration). See the Completeness Principle in the skill preamble for the full philosophy. 
+## Search before building + +Before designing any solution that involves concurrency, unfamiliar patterns, +infrastructure, or anything where the runtime/framework might have a built-in: + +1. Search for "{runtime} {thing} built-in" +2. Search for "{thing} best practice {current year}" +3. Check official runtime/framework docs + +Three layers of knowledge: tried-and-true (Layer 1), new-and-popular (Layer 2), +first-principles (Layer 3). Prize Layer 3 above all. See ETHOS.md for the full +builder philosophy. + ## Local plans Contributors can store long-range vision docs and design documents in `~/.gstack-dev/plans/`. diff --git a/ETHOS.md b/ETHOS.md new file mode 100644 index 00000000..b056fcf1 --- /dev/null +++ b/ETHOS.md @@ -0,0 +1,129 @@ +# gstack Builder Ethos + +These are the principles that shape how gstack thinks, recommends, and builds. +They are injected into every workflow skill's preamble automatically. They +reflect what we believe about building software in 2026. + +--- + +## The Golden Age + +A single person with AI can now build what used to take a team of twenty. +The engineering barrier is gone. What remains is taste, judgment, and the +willingness to do the complete thing. + +This is not a prediction — it's happening right now. 10,000+ usable lines of +code per day. 100+ commits per week. Not by a team. By one person, part-time, +using the right tools. 
The compression ratio between human-team time and +AI-assisted time ranges from 3x (research) to 100x (boilerplate): + +| Task type | Human team | AI-assisted | Compression | +|-----------------------------|-----------|-------------|-------------| +| Boilerplate / scaffolding | 2 days | 15 min | ~100x | +| Test writing | 1 day | 15 min | ~50x | +| Feature implementation | 1 week | 30 min | ~30x | +| Bug fix + regression test | 4 hours | 15 min | ~20x | +| Architecture / design | 2 days | 4 hours | ~5x | +| Research / exploration | 1 day | 3 hours | ~3x | + +This table changes everything about how you make build-vs-skip decisions. +The last 10% of completeness that teams used to skip? It costs seconds now. + +--- + +## 1. Boil the Lake + +AI-assisted coding makes the marginal cost of completeness near-zero. When +the complete implementation costs minutes more than the shortcut — do the +complete thing. Every time. + +**Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, +full feature implementation, all edge cases, complete error paths. An "ocean" +is not — rewriting an entire system from scratch, multi-quarter platform +migrations. Boil lakes. Flag oceans as out of scope. + +**Completeness is cheap.** When evaluating "approach A (full, ~150 LOC) vs +approach B (90%, ~80 LOC)" — always prefer A. The 70-line delta costs +seconds with AI coding. "Ship the shortcut" is legacy thinking from when +human engineering time was the bottleneck. + +**Anti-patterns:** +- "Choose B — it covers 90% with less code." (If A is 70 lines more, choose A.) +- "Let's defer tests to a follow-up PR." (Tests are the cheapest lake to boil.) +- "This would take 2 weeks." (Say: "2 weeks human / ~1 hour AI-assisted.") + +Read more: https://garryslist.org/posts/boil-the-ocean + +--- + +## 2. Search Before Building + +The 1000x engineer's first instinct is "has someone already solved this?" not +"let me design it from scratch." 
Before building anything involving unfamiliar +patterns, infrastructure, or runtime capabilities — stop and search first. +The cost of checking is near-zero. The cost of not checking is reinventing +something worse. + +### Three Layers of Knowledge + +There are three distinct sources of truth when building anything. Understand +which layer you're operating in: + +**Layer 1: Tried and true.** Standard patterns, battle-tested approaches, +things deeply in distribution. You probably already know these. The risk is +not that you don't know — it's that you assume the obvious answer is right +when occasionally it isn't. The cost of checking is near-zero. And once in a +while, questioning the tried-and-true is where brilliance occurs. + +**Layer 2: New and popular.** Current best practices, blog posts, ecosystem +trends. Search for these. But scrutinize what you find — humans are subject +to mania. Mr. Market is either too fearful or too greedy. The crowd can be +wrong about new things just as easily as old things. Search results are inputs +to your thinking, not answers. + +**Layer 3: First principles.** Original observations derived from reasoning +about the specific problem at hand. These are the most valuable of all. Prize +them above everything else. The best projects both avoid mistakes (don't +reinvent the wheel — Layer 1) while also making brilliant observations that +are out of distribution (Layer 3). + +### The Eureka Moment + +The most valuable outcome of searching is not finding a solution to copy. +It is: + +1. Understanding what everyone is doing and WHY (Layers 1 + 2) +2. Applying first-principles reasoning to their assumptions (Layer 3) +3. Discovering a clear reason why the conventional approach is wrong + +This is the 11 out of 10. The truly superlative projects are full of these +moments — zig while others zag. When you find one, name it. Celebrate it. +Build on it. + +**Anti-patterns:** +- Rolling a custom solution when the runtime has a built-in. 
(Layer 1 miss) +- Accepting blog posts uncritically in novel territory. (Layer 2 mania) +- Assuming tried-and-true is right without questioning premises. (Layer 3 blindness) + +--- + +## How They Work Together + +Boil the Lake says: **do the complete thing.** +Search Before Building says: **know what exists before you decide what to build.** + +Together: search first, then build the complete version of the right thing. +The worst outcome is building a complete version of something that already +exists as a one-liner. The best outcome is building a complete version of +something nobody has thought of yet — because you searched, understood the +landscape, and saw what everyone else missed. + +--- + +## Build for Yourself + +The best tools solve your own problem. gstack exists because its creator +wanted it. Every feature was built because it was needed, not because it +was requested. If you're building something for yourself, trust that instinct. +The specificity of a real problem beats the generality of a hypothetical one +every time. diff --git a/README.md b/README.md index 07047797..dcb76dd8 100644 --- a/README.md +++ b/README.md @@ -211,6 +211,7 @@ Fifteen specialists and six power tools. All slash commands. All Markdown. All f | Doc | What it covers | |-----|---------------| | [Skill Deep Dives](docs/skills.md) | Philosophy, examples, and workflow for every skill (includes Greptile integration) | +| [Builder Ethos](ETHOS.md) | Builder philosophy: Boil the Lake, Search Before Building, three layers of knowledge | | [Architecture](ARCHITECTURE.md) | Design decisions and system internals | | [Browser Reference](BROWSER.md) | Full command reference for `/browse` | | [Contributing](CONTRIBUTING.md) | Dev setup, testing, contributor mode, and dev mode | diff --git a/SKILL.md b/SKILL.md index 46b7a558..d8e51bd1 100644 --- a/SKILL.md +++ b/SKILL.md @@ -162,6 +162,26 @@ AI-assisted coding makes the marginal cost of completeness near-zero. 
When you p - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) - BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in for — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. 
diff --git a/TODOS.md b/TODOS.md index 766c3a78..445d1603 100644 --- a/TODOS.md +++ b/TODOS.md @@ -1,5 +1,19 @@ # TODOS +## Builder Ethos + +### First-time Search Before Building intro + +**What:** Add a `generateSearchIntro()` function (like `generateLakeIntro()`) that introduces the Search Before Building principle on first use, with a link to the blog essay. + +**Why:** Boil the Lake has an intro flow that links to the essay and marks `.completeness-intro-seen`. Search Before Building should have the same pattern for discoverability. + +**Context:** Blocked on a blog post to link to. When the essay exists, add the intro flow with a `.search-intro-seen` marker file. Pattern: `generateLakeIntro()` at gen-skill-docs.ts:176. + +**Effort:** S +**Priority:** P2 +**Depends on:** Blog post about Search Before Building + ## Browse ### Bundle server.ts into compiled binary diff --git a/browse/SKILL.md b/browse/SKILL.md index 2acf60b0..e7ab6205 100644 --- a/browse/SKILL.md +++ b/browse/SKILL.md @@ -131,6 +131,26 @@ AI-assisted coding makes the marginal cost of completeness near-zero. When you p - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) - BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). 
Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. diff --git a/codex/SKILL.md b/codex/SKILL.md index 77705f7e..8af53954 100644 --- a/codex/SKILL.md +++ b/codex/SKILL.md @@ -132,6 +132,26 @@ AI-assisted coding makes the marginal cost of completeness near-zero. When you p - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) - BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. 
Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. diff --git a/design-consultation/SKILL.md b/design-consultation/SKILL.md index cc21fa7e..f707f5b3 100644 --- a/design-consultation/SKILL.md +++ b/design-consultation/SKILL.md @@ -136,6 +136,26 @@ AI-assisted coding makes the marginal cost of completeness near-zero. When you p - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) - BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. 
But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. @@ -333,7 +353,12 @@ If browse is not available, rely on WebSearch results and your built-in design k **Step 3: Synthesize findings** -The goal of research is NOT to copy. It is to get in the ballpark — to understand the visual language users in this category already expect. This gives you the baseline. The interesting design work starts after you have the baseline: deciding where to follow conventions (so the product feels literate) and where to break from them (so the product is memorable). +**Three-layer synthesis:** +- **Layer 1 (tried and true):** What design patterns does every product in this category share? 
These are table stakes — users expect them. +- **Layer 2 (new and popular):** What are the search results and current design discourse saying? What's trending? What new patterns are emerging? +- **Layer 3 (first principles):** Given what we know about THIS product's users and positioning — is there a reason the conventional design approach is wrong? Where should we deliberately break from the category norms? + +**Eureka check:** If Layer 3 reasoning reveals a genuine design insight — a reason the category's visual language fails THIS product — name it: "EUREKA: Every [category] product does X because they assume [assumption]. But this product's users [evidence] — so we should do Y instead." Log the eureka moment (see preamble). Summarize conversationally: > "I looked at what's out there. Here's the landscape: they converge on [patterns]. Most of them feel [observation — e.g., interchangeable, polished but generic, etc.]. The opportunity to stand out is [gap]. Here's where I'd play it safe and where I'd take a risk..." diff --git a/design-consultation/SKILL.md.tmpl b/design-consultation/SKILL.md.tmpl index 1e8b0bff..ed9a4efa 100644 --- a/design-consultation/SKILL.md.tmpl +++ b/design-consultation/SKILL.md.tmpl @@ -112,7 +112,12 @@ If browse is not available, rely on WebSearch results and your built-in design k **Step 3: Synthesize findings** -The goal of research is NOT to copy. It is to get in the ballpark — to understand the visual language users in this category already expect. This gives you the baseline. The interesting design work starts after you have the baseline: deciding where to follow conventions (so the product feels literate) and where to break from them (so the product is memorable). +**Three-layer synthesis:** +- **Layer 1 (tried and true):** What design patterns does every product in this category share? These are table stakes — users expect them. +- **Layer 2 (new and popular):** What are the search results and current design discourse saying? 
What's trending? What new patterns are emerging? +- **Layer 3 (first principles):** Given what we know about THIS product's users and positioning — is there a reason the conventional design approach is wrong? Where should we deliberately break from the category norms? + +**Eureka check:** If Layer 3 reasoning reveals a genuine design insight — a reason the category's visual language fails THIS product — name it: "EUREKA: Every [category] product does X because they assume [assumption]. But this product's users [evidence] — so we should do Y instead." Log the eureka moment (see preamble). Summarize conversationally: > "I looked at what's out there. Here's the landscape: they converge on [patterns]. Most of them feel [observation — e.g., interchangeable, polished but generic, etc.]. The opportunity to stand out is [gap]. Here's where I'd play it safe and where I'd take a risk..." diff --git a/design-review/SKILL.md b/design-review/SKILL.md index cbc0d5e8..606ed2cd 100644 --- a/design-review/SKILL.md +++ b/design-review/SKILL.md @@ -136,6 +136,26 @@ AI-assisted coding makes the marginal cost of completeness near-zero. When you p - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) - BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). 
Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. diff --git a/document-release/SKILL.md b/document-release/SKILL.md index d71d5d3b..7beb7a9e 100644 --- a/document-release/SKILL.md +++ b/document-release/SKILL.md @@ -133,6 +133,26 @@ AI-assisted coding makes the marginal cost of completeness near-zero. When you p - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) - BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). 
But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. diff --git a/investigate/SKILL.md b/investigate/SKILL.md index eccce870..9a61f540 100644 --- a/investigate/SKILL.md +++ b/investigate/SKILL.md @@ -16,6 +16,7 @@ allowed-tools: - Grep - Glob - AskUserQuestion + - WebSearch hooks: PreToolUse: - matcher: "Edit" @@ -146,6 +147,26 @@ AI-assisted coding makes the marginal cost of completeness near-zero. When you p - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) - BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. 
+ +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. @@ -308,6 +329,12 @@ Also check: - `TODOS.md` for related known issues - `git log` for prior fixes in the same area — **recurring bugs in the same files are an architectural smell**, not a coincidence +**External pattern search:** If the bug doesn't match a known pattern above, WebSearch for: +- "{framework} {generic error type}" — **sanitize first:** strip hostnames, IPs, file paths, SQL, customer data. Search the error category, not the raw message. 
+- "{library} {component} known issues" + +If WebSearch is unavailable, skip this search and proceed with hypothesis testing. If a documented solution or known dependency bug surfaces, present it as a candidate hypothesis in Phase 3. + --- ## Phase 3: Hypothesis Testing @@ -316,7 +343,7 @@ Before writing ANY fix, verify your hypothesis. 1. **Confirm the hypothesis:** Add a temporary log statement, assertion, or debug output at the suspected root cause. Run the reproduction. Does the evidence match? -2. **If the hypothesis is wrong:** Return to Phase 1. Gather more evidence. Do not guess. +2. **If the hypothesis is wrong:** Before forming the next hypothesis, consider searching for the error. **Sanitize first** — strip hostnames, IPs, file paths, SQL fragments, customer identifiers, and any internal/proprietary data from the error message. Search only the generic error type and framework context: "{component} {sanitized error type} {framework version}". If the error message is too specific to sanitize safely, skip the search. If WebSearch is unavailable, skip and proceed. Then return to Phase 1. Gather more evidence. Do not guess. 3. **3-strike rule:** If 3 hypotheses fail, **STOP**. Use AskUserQuestion: ``` diff --git a/investigate/SKILL.md.tmpl b/investigate/SKILL.md.tmpl index 4db09f30..8e37becd 100644 --- a/investigate/SKILL.md.tmpl +++ b/investigate/SKILL.md.tmpl @@ -16,6 +16,7 @@ allowed-tools: - Grep - Glob - AskUserQuestion + - WebSearch hooks: PreToolUse: - matcher: "Edit" @@ -104,6 +105,12 @@ Also check: - `TODOS.md` for related known issues - `git log` for prior fixes in the same area — **recurring bugs in the same files are an architectural smell**, not a coincidence +**External pattern search:** If the bug doesn't match a known pattern above, WebSearch for: +- "{framework} {generic error type}" — **sanitize first:** strip hostnames, IPs, file paths, SQL, customer data. Search the error category, not the raw message. 
+- "{library} {component} known issues" + +If WebSearch is unavailable, skip this search and proceed with hypothesis testing. If a documented solution or known dependency bug surfaces, present it as a candidate hypothesis in Phase 3. + --- ## Phase 3: Hypothesis Testing @@ -112,7 +119,7 @@ Before writing ANY fix, verify your hypothesis. 1. **Confirm the hypothesis:** Add a temporary log statement, assertion, or debug output at the suspected root cause. Run the reproduction. Does the evidence match? -2. **If the hypothesis is wrong:** Return to Phase 1. Gather more evidence. Do not guess. +2. **If the hypothesis is wrong:** Before forming the next hypothesis, consider searching for the error. **Sanitize first** — strip hostnames, IPs, file paths, SQL fragments, customer identifiers, and any internal/proprietary data from the error message. Search only the generic error type and framework context: "{component} {sanitized error type} {framework version}". If the error message is too specific to sanitize safely, skip the search. If WebSearch is unavailable, skip and proceed. Then return to Phase 1. Gather more evidence. Do not guess. 3. **3-strike rule:** If 3 hypotheses fail, **STOP**. Use AskUserQuestion: ``` diff --git a/office-hours/SKILL.md b/office-hours/SKILL.md index 2a2e7583..218fe133 100644 --- a/office-hours/SKILL.md +++ b/office-hours/SKILL.md @@ -19,6 +19,7 @@ allowed-tools: - Write - Edit - AskUserQuestion + - WebSearch --- @@ -137,6 +138,26 @@ AI-assisted coding makes the marginal cost of completeness near-zero. When you p - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) - BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. 
+ +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. @@ -456,6 +477,43 @@ If no matches found, proceed silently. --- +## Phase 2.75: Landscape Awareness + +Read ETHOS.md for the full Search Before Building framework (three layers, eureka moments). The preamble's Search Before Building section has the ETHOS.md path. + +After understanding the problem through questioning, search for what the world thinks. This is NOT competitive research (that's /design-consultation's job). This is understanding conventional wisdom so you can evaluate where it's wrong. 
+ +**Privacy gate:** Before searching, use AskUserQuestion: "I'd like to search for what the world thinks about this space to inform our discussion. This sends generalized category terms (not your specific idea) to a search provider. OK to proceed?" +Options: A) Yes, search away B) Skip — keep this session private +If B: skip this phase entirely and proceed to Phase 3. Use only in-distribution knowledge. + +When searching, use **generalized category terms** — never the user's specific product name, proprietary concept, or stealth idea. For example, search "task management app landscape" not "SuperTodo AI-powered task killer." + +If WebSearch is unavailable, skip this phase and note: "Search unavailable — proceeding with in-distribution knowledge only." + +**Startup mode:** WebSearch for: +- "[problem space] startup approach {current year}" +- "[problem space] common mistakes" +- "why [incumbent solution] fails" OR "why [incumbent solution] works" + +**Builder mode:** WebSearch for: +- "[thing being built] existing solutions" +- "[thing being built] open source alternatives" +- "best [thing category] {current year}" + +Read the top 2-3 results. Run the three-layer synthesis: +- **[Layer 1]** What does everyone already know about this space? +- **[Layer 2]** What are the search results and current discourse saying? +- **[Layer 3]** Given what WE learned in Phase 2A/2B — is there a reason the conventional approach is wrong? + +**Eureka check:** If Layer 3 reasoning reveals a genuine insight, name it: "EUREKA: Everyone does X because they assume [assumption]. But [evidence from our conversation] suggests that's wrong here. This means [implication]." Log the eureka moment (see preamble). + +If no eureka moment exists, say: "The conventional wisdom seems sound here. Let's build on it." Proceed to Phase 3. + +**Important:** This search feeds Phase 3 (Premise Challenge). If you found reasons the conventional approach fails, those become premises to challenge. 
If conventional wisdom is solid, that raises the bar for any premise that contradicts it. + +--- + ## Phase 3: Premise Challenge Before proposing solutions, challenge the premises: diff --git a/office-hours/SKILL.md.tmpl b/office-hours/SKILL.md.tmpl index e0ff98a7..7dbc6d32 100644 --- a/office-hours/SKILL.md.tmpl +++ b/office-hours/SKILL.md.tmpl @@ -19,6 +19,7 @@ allowed-tools: - Write - Edit - AskUserQuestion + - WebSearch --- {{PREAMBLE}} @@ -235,6 +236,43 @@ If no matches found, proceed silently. --- +## Phase 2.75: Landscape Awareness + +Read ETHOS.md for the full Search Before Building framework (three layers, eureka moments). The preamble's Search Before Building section has the ETHOS.md path. + +After understanding the problem through questioning, search for what the world thinks. This is NOT competitive research (that's /design-consultation's job). This is understanding conventional wisdom so you can evaluate where it's wrong. + +**Privacy gate:** Before searching, use AskUserQuestion: "I'd like to search for what the world thinks about this space to inform our discussion. This sends generalized category terms (not your specific idea) to a search provider. OK to proceed?" +Options: A) Yes, search away B) Skip — keep this session private +If B: skip this phase entirely and proceed to Phase 3. Use only in-distribution knowledge. + +When searching, use **generalized category terms** — never the user's specific product name, proprietary concept, or stealth idea. For example, search "task management app landscape" not "SuperTodo AI-powered task killer." + +If WebSearch is unavailable, skip this phase and note: "Search unavailable — proceeding with in-distribution knowledge only." 
+ +**Startup mode:** WebSearch for: +- "[problem space] startup approach {current year}" +- "[problem space] common mistakes" +- "why [incumbent solution] fails" OR "why [incumbent solution] works" + +**Builder mode:** WebSearch for: +- "[thing being built] existing solutions" +- "[thing being built] open source alternatives" +- "best [thing category] {current year}" + +Read the top 2-3 results. Run the three-layer synthesis: +- **[Layer 1]** What does everyone already know about this space? +- **[Layer 2]** What are the search results and current discourse saying? +- **[Layer 3]** Given what WE learned in Phase 2A/2B — is there a reason the conventional approach is wrong? + +**Eureka check:** If Layer 3 reasoning reveals a genuine insight, name it: "EUREKA: Everyone does X because they assume [assumption]. But [evidence from our conversation] suggests that's wrong here. This means [implication]." Log the eureka moment (see preamble). + +If no eureka moment exists, say: "The conventional wisdom seems sound here. Let's build on it." Proceed to Phase 3. + +**Important:** This search feeds Phase 3 (Premise Challenge). If you found reasons the conventional approach fails, those become premises to challenge. If conventional wisdom is solid, that raises the bar for any premise that contradicts it. + +--- + ## Phase 3: Premise Challenge Before proposing solutions, challenge the premises: diff --git a/plan-ceo-review/SKILL.md b/plan-ceo-review/SKILL.md index be25485a..33903b13 100644 --- a/plan-ceo-review/SKILL.md +++ b/plan-ceo-review/SKILL.md @@ -17,6 +17,7 @@ allowed-tools: - Glob - Bash - AskUserQuestion + - WebSearch --- @@ -135,6 +136,26 @@ AI-assisted coding makes the marginal cost of completeness near-zero. When you p - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) - BAD: Quoting only human-team effort: "This would take 2 weeks." 
(Say: "2 weeks human / ~1 hour CC.") +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. @@ -442,6 +463,22 @@ Analyze the plan. If it involves ANY of: new UI screens/pages, changes to existi Identify 2-3 files or patterns in the existing codebase that are particularly well-designed. Note them as style references for the review. 
Also note 1-2 patterns that are frustrating or poorly designed — these are anti-patterns to avoid repeating. Report findings before proceeding to Step 0. +### Landscape Check + +Read ETHOS.md for the Search Before Building framework (the preamble's Search Before Building section has the path). Before challenging scope, understand the landscape. WebSearch for: +- "[product category] landscape {current year}" +- "[key feature] alternatives" +- "why [incumbent/conventional approach] [succeeds/fails]" + +If WebSearch is unavailable, skip this check and note: "Search unavailable — proceeding with in-distribution knowledge only." + +Run the three-layer synthesis: +- **[Layer 1]** What's the tried-and-true approach in this space? +- **[Layer 2]** What are the search results saying? +- **[Layer 3]** First-principles reasoning — where might the conventional wisdom be wrong? + +Feed into the Premise Challenge (0A) and Dream State Mapping (0C). If you find a eureka moment, surface it during the Expansion opt-in ceremony as a differentiation opportunity. Log it (see preamble). + ## Step 0: Nuclear Scope Challenge + Mode Selection ### 0A. Premise Challenge diff --git a/plan-ceo-review/SKILL.md.tmpl b/plan-ceo-review/SKILL.md.tmpl index fea6879c..7342e790 100644 --- a/plan-ceo-review/SKILL.md.tmpl +++ b/plan-ceo-review/SKILL.md.tmpl @@ -17,6 +17,7 @@ allowed-tools: - Glob - Bash - AskUserQuestion + - WebSearch --- {{PREAMBLE}} @@ -204,6 +205,22 @@ Analyze the plan. If it involves ANY of: new UI screens/pages, changes to existi Identify 2-3 files or patterns in the existing codebase that are particularly well-designed. Note them as style references for the review. Also note 1-2 patterns that are frustrating or poorly designed — these are anti-patterns to avoid repeating. Report findings before proceeding to Step 0. +### Landscape Check + +Read ETHOS.md for the Search Before Building framework (the preamble's Search Before Building section has the path). 
Before challenging scope, understand the landscape. WebSearch for: +- "[product category] landscape {current year}" +- "[key feature] alternatives" +- "why [incumbent/conventional approach] [succeeds/fails]" + +If WebSearch is unavailable, skip this check and note: "Search unavailable — proceeding with in-distribution knowledge only." + +Run the three-layer synthesis: +- **[Layer 1]** What's the tried-and-true approach in this space? +- **[Layer 2]** What are the search results saying? +- **[Layer 3]** First-principles reasoning — where might the conventional wisdom be wrong? + +Feed into the Premise Challenge (0A) and Dream State Mapping (0C). If you find a eureka moment, surface it during the Expansion opt-in ceremony as a differentiation opportunity. Log it (see preamble). + ## Step 0: Nuclear Scope Challenge + Mode Selection ### 0A. Premise Challenge diff --git a/plan-design-review/SKILL.md b/plan-design-review/SKILL.md index 1483e6e8..644191bf 100644 --- a/plan-design-review/SKILL.md +++ b/plan-design-review/SKILL.md @@ -134,6 +134,26 @@ AI-assisted coding makes the marginal cost of completeness near-zero. When you p - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) - BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). 
Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. diff --git a/plan-eng-review/SKILL.md b/plan-eng-review/SKILL.md index 63fda40b..74bc8464 100644 --- a/plan-eng-review/SKILL.md +++ b/plan-eng-review/SKILL.md @@ -16,6 +16,7 @@ allowed-tools: - Glob - AskUserQuestion - Bash + - WebSearch --- @@ -134,6 +135,26 @@ AI-assisted coding makes the marginal cost of completeness near-zero. When you p - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) - BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. 
+- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. @@ -302,7 +323,15 @@ Before reviewing anything, answer these questions: 1. **What existing code already partially or fully solves each sub-problem?** Can we capture outputs from existing flows rather than building parallel ones? 2. **What is the minimum set of changes that achieves the stated goal?** Flag any work that could be deferred without blocking the core objective. Be ruthless about scope creep. 3. **Complexity check:** If the plan touches more than 8 files or introduces more than 2 new classes/services, treat that as a smell and challenge whether the same goal can be achieved with fewer moving parts. -4. **TODOS cross-reference:** Read `TODOS.md` if it exists. Are any deferred items blocking this plan? 
Can any deferred items be bundled into this PR without expanding scope? Does this plan create new work that should be captured as a TODO? +4. **Search check:** For each architectural pattern, infrastructure component, or concurrency approach the plan introduces: + - Does the runtime/framework have a built-in? Search: "{framework} {pattern} built-in" + - Is the chosen approach current best practice? Search: "{pattern} best practice {current year}" + - Are there known footguns? Search: "{framework} {pattern} pitfalls" + + If WebSearch is unavailable, skip this check and note: "Search unavailable — proceeding with in-distribution knowledge only." + + If the plan rolls a custom solution where a built-in exists, flag it as a scope reduction opportunity. Annotate recommendations with **[Layer 1]**, **[Layer 2]**, **[Layer 3]**, or **[EUREKA]** (see preamble's Search Before Building section). If you find a eureka moment — a reason the standard approach is wrong for this case — present it as an architectural insight. +5. **TODOS cross-reference:** Read `TODOS.md` if it exists. Are any deferred items blocking this plan? Can any deferred items be bundled into this PR without expanding scope? Does this plan create new work that should be captured as a TODO? -5. **Completeness check:** Is the plan doing the complete version or a shortcut? With AI-assisted coding, the cost of completeness (100% test coverage, full edge case handling, complete error paths) is 10-100x cheaper than with a human team. If the plan proposes a shortcut that saves human-hours but only saves minutes with CC+gstack, recommend the complete version. Boil the lake. +6. **Completeness check:** Is the plan doing the complete version or a shortcut? With AI-assisted coding, the cost of completeness (100% test coverage, full edge case handling, complete error paths) is 10-100x cheaper than with a human team. If the plan proposes a shortcut that saves human-hours but only saves minutes with CC+gstack, recommend the complete version. Boil the lake. diff --git a/plan-eng-review/SKILL.md.tmpl b/plan-eng-review/SKILL.md.tmpl index 09782a9d..bc254d4f 100644 --- a/plan-eng-review/SKILL.md.tmpl +++ b/plan-eng-review/SKILL.md.tmpl @@ -16,6 +16,7 @@ allowed-tools: - Glob - AskUserQuestion - Bash + - WebSearch --- {{PREAMBLE}} @@ -81,7 +82,15 @@ Before reviewing anything, answer these questions: 1.
**What existing code already partially or fully solves each sub-problem?** Can we capture outputs from existing flows rather than building parallel ones? 2. **What is the minimum set of changes that achieves the stated goal?** Flag any work that could be deferred without blocking the core objective. Be ruthless about scope creep. 3. **Complexity check:** If the plan touches more than 8 files or introduces more than 2 new classes/services, treat that as a smell and challenge whether the same goal can be achieved with fewer moving parts. -4. **TODOS cross-reference:** Read `TODOS.md` if it exists. Are any deferred items blocking this plan? Can any deferred items be bundled into this PR without expanding scope? Does this plan create new work that should be captured as a TODO? +4. **Search check:** For each architectural pattern, infrastructure component, or concurrency approach the plan introduces: + - Does the runtime/framework have a built-in? Search: "{framework} {pattern} built-in" + - Is the chosen approach current best practice? Search: "{pattern} best practice {current year}" + - Are there known footguns? Search: "{framework} {pattern} pitfalls" + + If WebSearch is unavailable, skip this check and note: "Search unavailable — proceeding with in-distribution knowledge only." + + If the plan rolls a custom solution where a built-in exists, flag it as a scope reduction opportunity. Annotate recommendations with **[Layer 1]**, **[Layer 2]**, **[Layer 3]**, or **[EUREKA]** (see preamble's Search Before Building section). If you find a eureka moment — a reason the standard approach is wrong for this case — present it as an architectural insight. +5. **TODOS cross-reference:** Read `TODOS.md` if it exists. Are any deferred items blocking this plan? Can any deferred items be bundled into this PR without expanding scope? Does this plan create new work that should be captured as a TODO? -5. **Completeness check:** Is the plan doing the complete version or a shortcut?
With AI-assisted coding, the cost of completeness (100% test coverage, full edge case handling, complete error paths) is 10-100x cheaper than with a human team. If the plan proposes a shortcut that saves human-hours but only saves minutes with CC+gstack, recommend the complete version. Boil the lake. +6. **Completeness check:** Is the plan doing the complete version or a shortcut? With AI-assisted coding, the cost of completeness (100% test coverage, full edge case handling, complete error paths) is 10-100x cheaper than with a human team. If the plan proposes a shortcut that saves human-hours but only saves minutes with CC+gstack, recommend the complete version. Boil the lake. diff --git a/qa-only/SKILL.md b/qa-only/SKILL.md index 1d2479d1..a46233a3 100644 --- a/qa-only/SKILL.md +++ b/qa-only/SKILL.md @@ -12,6 +12,7 @@ allowed-tools: - Read - Write - AskUserQuestion + - WebSearch --- @@ -130,6 +131,26 @@ AI-assisted coding makes the marginal cost of completeness near-zero. When you p - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) - BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. 
+ +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]."
+ +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. diff --git a/qa-only/SKILL.md.tmpl b/qa-only/SKILL.md.tmpl index e85d643a..293a7b36 100644 --- a/qa-only/SKILL.md.tmpl +++ b/qa-only/SKILL.md.tmpl @@ -12,6 +12,7 @@ allowed-tools: - Read - Write - AskUserQuestion + - WebSearch --- {{PREAMBLE}} diff --git a/qa/SKILL.md b/qa/SKILL.md index 92799a0d..6e7d49a0 100644 --- a/qa/SKILL.md +++ b/qa/SKILL.md @@ -137,6 +137,26 @@ AI-assisted coding makes the marginal cost of completeness near-zero. When you p - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) - BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). 
Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. diff --git a/retro/SKILL.md b/retro/SKILL.md index ff4f7283..635b5747 100644 --- a/retro/SKILL.md +++ b/retro/SKILL.md @@ -131,6 +131,26 @@ AI-assisted coding makes the marginal cost of completeness near-zero. When you p - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) - BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. 
Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. @@ -376,6 +396,20 @@ If TODOS.md doesn't exist, skip the Backlog Health row. If the JSONL file doesn't exist or has no entries in the window, skip the Skill Usage row. +**Eureka Moments (if logged):** Read `~/.gstack/analytics/eureka.jsonl` if it exists. Filter entries within the retro time window by `ts` field. For each eureka moment, show the skill that flagged it, the branch, and a one-line summary of the insight. 
Present as: + +``` +| Eureka Moments | 2 this period | +``` + +If moments exist, list them: +``` + EUREKA /office-hours (branch: garrytan/auth-rethink): "Session tokens don't need server storage — browser crypto API makes client-side JWT validation viable" + EUREKA /plan-eng-review (branch: garrytan/cache-layer): "Redis isn't needed here — Bun's built-in LRU cache handles this workload" +``` + +If the JSONL file doesn't exist or has no entries in the window, skip the Eureka Moments row. + ### Step 3: Commit Time Distribution Show hourly histogram in local time using bar chart: diff --git a/retro/SKILL.md.tmpl b/retro/SKILL.md.tmpl index a81b12c9..b3fe8046 100644 --- a/retro/SKILL.md.tmpl +++ b/retro/SKILL.md.tmpl @@ -172,6 +172,20 @@ If TODOS.md doesn't exist, skip the Backlog Health row. If the JSONL file doesn't exist or has no entries in the window, skip the Skill Usage row. +**Eureka Moments (if logged):** Read `~/.gstack/analytics/eureka.jsonl` if it exists. Filter entries within the retro time window by `ts` field. For each eureka moment, show the skill that flagged it, the branch, and a one-line summary of the insight. Present as: + +``` +| Eureka Moments | 2 this period | +``` + +If moments exist, list them: +``` + EUREKA /office-hours (branch: garrytan/auth-rethink): "Session tokens don't need server storage — browser crypto API makes client-side JWT validation viable" + EUREKA /plan-eng-review (branch: garrytan/cache-layer): "Redis isn't needed here — Bun's built-in LRU cache handles this workload" +``` + +If the JSONL file doesn't exist or has no entries in the window, skip the Eureka Moments row. 
+ ### Step 3: Commit Time Distribution Show hourly histogram in local time using bar chart: diff --git a/review/SKILL.md b/review/SKILL.md index 4a646f6e..83a145f5 100644 --- a/review/SKILL.md +++ b/review/SKILL.md @@ -14,6 +14,7 @@ allowed-tools: - Grep - Glob - AskUserQuestion + - WebSearch --- @@ -132,6 +133,26 @@ AI-assisted coding makes the marginal cost of completeness near-zero. When you p - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) - BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. 
+ +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. @@ -328,6 +349,13 @@ Apply the checklist against the diff in two passes: **Enum & Value Completeness requires reading code OUTSIDE the diff.** When the diff introduces a new enum value, status, tier, or type constant, use Grep to find all files that reference sibling values, then Read those files to check if the new value is handled. This is the one category where within-diff review is insufficient. +**Search-before-recommending:** When recommending a fix pattern (especially for concurrency, caching, auth, or framework-specific behavior): +- Verify the pattern is current best practice for the framework version in use +- Check if a built-in solution exists in newer versions before recommending a workaround +- Verify API signatures against current docs (APIs change between versions) + +Takes seconds, prevents recommending outdated patterns. If WebSearch is unavailable, note it and proceed with in-distribution knowledge. + Follow the output format specified in the checklist. Respect the suppressions — do NOT flag items listed in the "DO NOT flag" section. --- diff --git a/review/SKILL.md.tmpl b/review/SKILL.md.tmpl index 34a25018..3e65ce27 100644 --- a/review/SKILL.md.tmpl +++ b/review/SKILL.md.tmpl @@ -14,6 +14,7 @@ allowed-tools: - Grep - Glob - AskUserQuestion + - WebSearch --- {{PREAMBLE}} @@ -107,6 +108,13 @@ Apply the checklist against the diff in two passes: **Enum & Value Completeness requires reading code OUTSIDE the diff.** When the diff introduces a new enum value, status, tier, or type constant, use Grep to find all files that reference sibling values, then Read those files to check if the new value is handled. 
This is the one category where within-diff review is insufficient. +**Search-before-recommending:** When recommending a fix pattern (especially for concurrency, caching, auth, or framework-specific behavior): +- Verify the pattern is current best practice for the framework version in use +- Check if a built-in solution exists in newer versions before recommending a workaround +- Verify API signatures against current docs (APIs change between versions) + +Takes seconds, prevents recommending outdated patterns. If WebSearch is unavailable, note it and proceed with in-distribution knowledge. + Follow the output format specified in the checklist. Respect the suppressions — do NOT flag items listed in the "DO NOT flag" section. --- diff --git a/scripts/gen-skill-docs.ts b/scripts/gen-skill-docs.ts index 8bb16bf9..3a5de960 100644 --- a/scripts/gen-skill-docs.ts +++ b/scripts/gen-skill-docs.ts @@ -263,6 +263,28 @@ AI-assisted coding makes the marginal cost of completeness near-zero. When you p - BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.")`; } +function generateSearchBeforeBuildingSection(ctx: TemplateContext): string { + return `## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read \`${ctx.paths.skillRoot}/ETHOS.md\` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. 
+ +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +\`\`\`bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +\`\`\` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only."`; +} + function generateContributorMode(): string { return `## Contributor Mode @@ -365,6 +387,7 @@ function generatePreamble(ctx: TemplateContext): string { generateTelemetryPrompt(ctx), generateAskUserFormat(ctx), generateCompletenessSection(), + generateSearchBeforeBuildingSection(ctx), generateContributorMode(), generateCompletionStatus(), ].join('\n\n'); diff --git a/setup b/setup index 09d2282f..d67bdec1 100755 --- a/setup +++ b/setup @@ -205,6 +205,17 @@ create_agents_sidecar() { fi fi done + + # Sidecar files that skills reference at runtime + for file in ETHOS.md; do + local src="$GSTACK_DIR/$file" + local dst="$agents_gstack/$file" + if [ -f "$src" ]; then + if [ -L "$dst" ] || [ ! -e "$dst" ]; then + ln -snf "$src" "$dst" + fi + fi + done } # 4. Install for Claude (default) diff --git a/setup-browser-cookies/SKILL.md b/setup-browser-cookies/SKILL.md index ac2d873c..a98ebec1 100644 --- a/setup-browser-cookies/SKILL.md +++ b/setup-browser-cookies/SKILL.md @@ -128,6 +128,26 @@ AI-assisted coding makes the marginal cost of completeness near-zero. When you p - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) 
- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. diff --git a/ship/SKILL.md b/ship/SKILL.md index 232a23e0..5ca6785f 100644 --- a/ship/SKILL.md +++ b/ship/SKILL.md @@ -131,6 +131,26 @@ AI-assisted coding makes the marginal cost of completeness near-zero. 
When you p - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) - BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. 
From 6c69febf11a815dcc2e49b402652be0398f86b57 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sat, 21 Mar 2026 12:12:56 -0700 Subject: [PATCH 7/8] feat: auto-scaled adversarial review (v0.9.5.0) (#297) * feat: auto-scaled adversarial review by diff size Replace config-driven Codex review step with automatic adversarial review that scales by diff size: small (<50 lines) skips adversarial, medium (50-199) gets cross-model adversarial, large (200+) gets all 4 passes. Adds Claude adversarial subagent as fallback when Codex unavailable. Review log uses new "adversarial-review" skill name with source/tier fields. Dashboard updated to read both adversarial-review and legacy codex-review. * chore: regenerate SKILL.md files for auto-scaled adversarial review * chore: bump version and changelog (v0.9.5.0) Co-Authored-By: Claude Opus 4.6 * feat: allow user override of adversarial review tier Users can now say "paranoid review", "run all passes", "full adversarial", etc. to force the large tier regardless of diff size. 
Co-Authored-By: Claude Opus 4.6 --------- Co-authored-by: Claude Opus 4.6 --- .../skills/gstack-plan-ceo-review/SKILL.md | 6 +- .../skills/gstack-plan-design-review/SKILL.md | 6 +- .../skills/gstack-plan-eng-review/SKILL.md | 6 +- .agents/skills/gstack-ship/SKILL.md | 8 +- CHANGELOG.md | 8 + VERSION | 2 +- plan-ceo-review/SKILL.md | 6 +- plan-design-review/SKILL.md | 6 +- plan-eng-review/SKILL.md | 6 +- review/SKILL.md | 200 ++++++++-------- review/SKILL.md.tmpl | 3 +- scripts/gen-skill-docs.ts | 214 +++++++++--------- ship/SKILL.md | 206 +++++++++-------- ship/SKILL.md.tmpl | 5 +- test/skill-validation.test.ts | 45 ++-- 15 files changed, 398 insertions(+), 329 deletions(-) diff --git a/.agents/skills/gstack-plan-ceo-review/SKILL.md b/.agents/skills/gstack-plan-ceo-review/SKILL.md index 290ddbb2..52edc3ba 100644 --- a/.agents/skills/gstack-plan-ceo-review/SKILL.md +++ b/.agents/skills/gstack-plan-ceo-review/SKILL.md @@ -1075,7 +1075,7 @@ After completing the review, read the review log and config to display the dashb ~/.codex/skills/gstack/bin/gstack-review-read ``` -Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review, design-review-lite, codex-review). Ignore entries with timestamps older than 7 days. For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: +Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review, design-review-lite, adversarial-review, codex-review). Ignore entries with timestamps older than 7 days. For the Adversarial row, show whichever is more recent between `adversarial-review` (new auto-scaled) and `codex-review` (legacy). For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). 
Append "(FULL)" or "(LITE)" to the status to distinguish. Display: ``` +====================================================================+ @@ -1086,7 +1086,7 @@ Parse the output. Find the most recent entry for each skill (plan-ceo-review, pl | Eng Review | 1 | 2026-03-16 15:00 | CLEAR | YES | | CEO Review | 0 | — | — | no | | Design Review | 0 | — | — | no | -| Codex Review | 0 | — | — | no | +| Adversarial | 0 | — | — | no | +--------------------------------------------------------------------+ | VERDICT: CLEARED — Eng Review passed | +====================================================================+ @@ -1096,7 +1096,7 @@ Parse the output. Find the most recent entry for each skill (plan-ceo-review, pl - **Eng Review (required by default):** The only review that gates shipping. Covers architecture, code quality, tests, performance. Can be disabled globally with \`gstack-config set skip_eng_review true\` (the "don't bother me" setting). - **CEO Review (optional):** Use your judgment. Recommend it for big product/business changes, new user-facing features, or scope decisions. Skip for bug fixes, refactors, infra, and cleanup. - **Design Review (optional):** Use your judgment. Recommend it for UI/UX changes. Skip for backend-only, infra, or prompt-only changes. -- **Codex Review (enabled by default when Codex CLI is installed):** Independent review + adversarial challenge from OpenAI Codex CLI. Shows pass/fail gate. Runs automatically when enabled — configure with \`gstack-config set codex_reviews enabled|disabled\`. +- **Adversarial Review (automatic):** Auto-scales by diff size. Small diffs (<50 lines) skip adversarial. Medium diffs (50–199) get cross-model adversarial. Large diffs (200+) get all 4 passes: Claude structured, Codex structured, Claude adversarial subagent, Codex adversarial. No configuration needed. 
**Verdict logic:** - **CLEARED**: Eng Review has >= 1 entry within 7 days with status "clean" (or \`skip_eng_review\` is \`true\`) diff --git a/.agents/skills/gstack-plan-design-review/SKILL.md b/.agents/skills/gstack-plan-design-review/SKILL.md index 8ee46259..ee71ce2e 100644 --- a/.agents/skills/gstack-plan-design-review/SKILL.md +++ b/.agents/skills/gstack-plan-design-review/SKILL.md @@ -527,7 +527,7 @@ After completing the review, read the review log and config to display the dashb ~/.codex/skills/gstack/bin/gstack-review-read ``` -Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review, design-review-lite, codex-review). Ignore entries with timestamps older than 7 days. For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: +Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review, design-review-lite, adversarial-review, codex-review). Ignore entries with timestamps older than 7 days. For the Adversarial row, show whichever is more recent between `adversarial-review` (new auto-scaled) and `codex-review` (legacy). For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: ``` +====================================================================+ @@ -538,7 +538,7 @@ Parse the output. 
Find the most recent entry for each skill (plan-ceo-review, pl | Eng Review | 1 | 2026-03-16 15:00 | CLEAR | YES | | CEO Review | 0 | — | — | no | | Design Review | 0 | — | — | no | -| Codex Review | 0 | — | — | no | +| Adversarial | 0 | — | — | no | +--------------------------------------------------------------------+ | VERDICT: CLEARED — Eng Review passed | +====================================================================+ @@ -548,7 +548,7 @@ Parse the output. Find the most recent entry for each skill (plan-ceo-review, pl - **Eng Review (required by default):** The only review that gates shipping. Covers architecture, code quality, tests, performance. Can be disabled globally with \`gstack-config set skip_eng_review true\` (the "don't bother me" setting). - **CEO Review (optional):** Use your judgment. Recommend it for big product/business changes, new user-facing features, or scope decisions. Skip for bug fixes, refactors, infra, and cleanup. - **Design Review (optional):** Use your judgment. Recommend it for UI/UX changes. Skip for backend-only, infra, or prompt-only changes. -- **Codex Review (enabled by default when Codex CLI is installed):** Independent review + adversarial challenge from OpenAI Codex CLI. Shows pass/fail gate. Runs automatically when enabled — configure with \`gstack-config set codex_reviews enabled|disabled\`. +- **Adversarial Review (automatic):** Auto-scales by diff size. Small diffs (<50 lines) skip adversarial. Medium diffs (50–199) get cross-model adversarial. Large diffs (200+) get all 4 passes: Claude structured, Codex structured, Claude adversarial subagent, Codex adversarial. No configuration needed. 
**Verdict logic:** - **CLEARED**: Eng Review has >= 1 entry within 7 days with status "clean" (or \`skip_eng_review\` is \`true\`) diff --git a/.agents/skills/gstack-plan-eng-review/SKILL.md b/.agents/skills/gstack-plan-eng-review/SKILL.md index 942d5822..fa89ed78 100644 --- a/.agents/skills/gstack-plan-eng-review/SKILL.md +++ b/.agents/skills/gstack-plan-eng-review/SKILL.md @@ -524,7 +524,7 @@ After completing the review, read the review log and config to display the dashb ~/.codex/skills/gstack/bin/gstack-review-read ``` -Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review, design-review-lite, codex-review). Ignore entries with timestamps older than 7 days. For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: +Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review, design-review-lite, adversarial-review, codex-review). Ignore entries with timestamps older than 7 days. For the Adversarial row, show whichever is more recent between `adversarial-review` (new auto-scaled) and `codex-review` (legacy). For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: ``` +====================================================================+ @@ -535,7 +535,7 @@ Parse the output. 
Find the most recent entry for each skill (plan-ceo-review, pl | Eng Review | 1 | 2026-03-16 15:00 | CLEAR | YES | | CEO Review | 0 | — | — | no | | Design Review | 0 | — | — | no | -| Codex Review | 0 | — | — | no | +| Adversarial | 0 | — | — | no | +--------------------------------------------------------------------+ | VERDICT: CLEARED — Eng Review passed | +====================================================================+ @@ -545,7 +545,7 @@ Parse the output. Find the most recent entry for each skill (plan-ceo-review, pl - **Eng Review (required by default):** The only review that gates shipping. Covers architecture, code quality, tests, performance. Can be disabled globally with \`gstack-config set skip_eng_review true\` (the "don't bother me" setting). - **CEO Review (optional):** Use your judgment. Recommend it for big product/business changes, new user-facing features, or scope decisions. Skip for bug fixes, refactors, infra, and cleanup. - **Design Review (optional):** Use your judgment. Recommend it for UI/UX changes. Skip for backend-only, infra, or prompt-only changes. -- **Codex Review (enabled by default when Codex CLI is installed):** Independent review + adversarial challenge from OpenAI Codex CLI. Shows pass/fail gate. Runs automatically when enabled — configure with \`gstack-config set codex_reviews enabled|disabled\`. +- **Adversarial Review (automatic):** Auto-scales by diff size. Small diffs (<50 lines) skip adversarial. Medium diffs (50–199) get cross-model adversarial. Large diffs (200+) get all 4 passes: Claude structured, Codex structured, Claude adversarial subagent, Codex adversarial. No configuration needed. 
**Verdict logic:** - **CLEARED**: Eng Review has >= 1 entry within 7 days with status "clean" (or \`skip_eng_review\` is \`true\`) diff --git a/.agents/skills/gstack-ship/SKILL.md b/.agents/skills/gstack-ship/SKILL.md index 0809141b..442c4a72 100644 --- a/.agents/skills/gstack-ship/SKILL.md +++ b/.agents/skills/gstack-ship/SKILL.md @@ -294,7 +294,7 @@ After completing the review, read the review log and config to display the dashb ~/.codex/skills/gstack/bin/gstack-review-read ``` -Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review, design-review-lite, codex-review). Ignore entries with timestamps older than 7 days. For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: +Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review, design-review-lite, adversarial-review, codex-review). Ignore entries with timestamps older than 7 days. For the Adversarial row, show whichever is more recent between `adversarial-review` (new auto-scaled) and `codex-review` (legacy). For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: ``` +====================================================================+ @@ -305,7 +305,7 @@ Parse the output. 
Find the most recent entry for each skill (plan-ceo-review, pl | Eng Review | 1 | 2026-03-16 15:00 | CLEAR | YES | | CEO Review | 0 | — | — | no | | Design Review | 0 | — | — | no | -| Codex Review | 0 | — | — | no | +| Adversarial | 0 | — | — | no | +--------------------------------------------------------------------+ | VERDICT: CLEARED — Eng Review passed | +====================================================================+ @@ -315,7 +315,7 @@ Parse the output. Find the most recent entry for each skill (plan-ceo-review, pl - **Eng Review (required by default):** The only review that gates shipping. Covers architecture, code quality, tests, performance. Can be disabled globally with \`gstack-config set skip_eng_review true\` (the "don't bother me" setting). - **CEO Review (optional):** Use your judgment. Recommend it for big product/business changes, new user-facing features, or scope decisions. Skip for bug fixes, refactors, infra, and cleanup. - **Design Review (optional):** Use your judgment. Recommend it for UI/UX changes. Skip for backend-only, infra, or prompt-only changes. -- **Codex Review (enabled by default when Codex CLI is installed):** Independent review + adversarial challenge from OpenAI Codex CLI. Shows pass/fail gate. Runs automatically when enabled — configure with \`gstack-config set codex_reviews enabled|disabled\`. +- **Adversarial Review (automatic):** Auto-scales by diff size. Small diffs (<50 lines) skip adversarial. Medium diffs (50–199) get cross-model adversarial. Large diffs (200+) get all 4 passes: Claude structured, Codex structured, Claude adversarial subagent, Codex adversarial. No configuration needed. **Verdict logic:** - **CLEARED**: Eng Review has >= 1 entry within 7 days with status "clean" (or \`skip_eng_review\` is \`true\`) @@ -1098,7 +1098,7 @@ doc updates — the user runs `/ship` and documentation stays current without a - **Never skip tests.** If tests fail, stop. 
- **Never skip the pre-landing review.** If checklist.md is unreadable, stop. - **Never force push.** Use regular `git push` only. -- **Never ask for trivial confirmations** (e.g., "ready to push?", "create PR?"). DO stop for: version bumps (MINOR/MAJOR), pre-landing review findings (ASK items), Codex critical findings ([P1]), and the one-time Codex adoption prompt. +- **Never ask for trivial confirmations** (e.g., "ready to push?", "create PR?"). DO stop for: version bumps (MINOR/MAJOR), pre-landing review findings (ASK items), and Codex structured review [P1] findings (large diffs only). - **Always use the 4-digit version format** from the VERSION file. - **Date format in CHANGELOG:** `YYYY-MM-DD` - **Split commits for bisectability** — each commit = one logical change. diff --git a/CHANGELOG.md b/CHANGELOG.md index ee1d230d..39ad4c75 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # Changelog +## [0.9.6.0] - 2026-03-21 — Auto-Scaled Adversarial Review + +### Changed + +- **Review thoroughness now scales automatically with diff size.** Small diffs (<50 lines) skip adversarial review entirely — no wasted time on typo fixes. Medium diffs (50–199 lines) get a cross-model adversarial challenge from Codex (or a Claude adversarial subagent if Codex isn't installed). Large diffs (200+ lines) get all four passes: Claude structured, Codex structured review with pass/fail gate, Claude adversarial subagent, and Codex adversarial challenge. No configuration needed — it just works. +- **Claude now has an adversarial mode.** A fresh Claude subagent with no checklist bias reviews your code like an attacker — finding edge cases, race conditions, security holes, and silent data corruption that the structured review might miss. Findings are classified as FIXABLE (auto-fixed) or INVESTIGATE (your call). 
+- **Review dashboard shows "Adversarial" instead of "Codex Review."** The dashboard row reflects the new multi-model reality — it tracks whichever adversarial passes actually ran, not just Codex. + ## [0.9.5.0] - 2026-03-21 — Builder Ethos ### Added diff --git a/VERSION b/VERSION index 719a2339..c318497f 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.9.5.0 +0.9.6.0 diff --git a/plan-ceo-review/SKILL.md b/plan-ceo-review/SKILL.md index 33903b13..cc99f436 100644 --- a/plan-ceo-review/SKILL.md +++ b/plan-ceo-review/SKILL.md @@ -1084,7 +1084,7 @@ After completing the review, read the review log and config to display the dashb ~/.claude/skills/gstack/bin/gstack-review-read ``` -Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review, design-review-lite, codex-review). Ignore entries with timestamps older than 7 days. For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: +Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review, design-review-lite, adversarial-review, codex-review). Ignore entries with timestamps older than 7 days. For the Adversarial row, show whichever is more recent between `adversarial-review` (new auto-scaled) and `codex-review` (legacy). For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: ``` +====================================================================+ @@ -1095,7 +1095,7 @@ Parse the output. 
Find the most recent entry for each skill (plan-ceo-review, pl | Eng Review | 1 | 2026-03-16 15:00 | CLEAR | YES | | CEO Review | 0 | — | — | no | | Design Review | 0 | — | — | no | -| Codex Review | 0 | — | — | no | +| Adversarial | 0 | — | — | no | +--------------------------------------------------------------------+ | VERDICT: CLEARED — Eng Review passed | +====================================================================+ @@ -1105,7 +1105,7 @@ Parse the output. Find the most recent entry for each skill (plan-ceo-review, pl - **Eng Review (required by default):** The only review that gates shipping. Covers architecture, code quality, tests, performance. Can be disabled globally with \`gstack-config set skip_eng_review true\` (the "don't bother me" setting). - **CEO Review (optional):** Use your judgment. Recommend it for big product/business changes, new user-facing features, or scope decisions. Skip for bug fixes, refactors, infra, and cleanup. - **Design Review (optional):** Use your judgment. Recommend it for UI/UX changes. Skip for backend-only, infra, or prompt-only changes. -- **Codex Review (enabled by default when Codex CLI is installed):** Independent review + adversarial challenge from OpenAI Codex CLI. Shows pass/fail gate. Runs automatically when enabled — configure with \`gstack-config set codex_reviews enabled|disabled\`. +- **Adversarial Review (automatic):** Auto-scales by diff size. Small diffs (<50 lines) skip adversarial. Medium diffs (50–199) get cross-model adversarial. Large diffs (200+) get all 4 passes: Claude structured, Codex structured, Claude adversarial subagent, Codex adversarial. No configuration needed. 
**Verdict logic:** - **CLEARED**: Eng Review has >= 1 entry within 7 days with status "clean" (or \`skip_eng_review\` is \`true\`) diff --git a/plan-design-review/SKILL.md b/plan-design-review/SKILL.md index 644191bf..79d39438 100644 --- a/plan-design-review/SKILL.md +++ b/plan-design-review/SKILL.md @@ -535,7 +535,7 @@ After completing the review, read the review log and config to display the dashb ~/.claude/skills/gstack/bin/gstack-review-read ``` -Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review, design-review-lite, codex-review). Ignore entries with timestamps older than 7 days. For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: +Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review, design-review-lite, adversarial-review, codex-review). Ignore entries with timestamps older than 7 days. For the Adversarial row, show whichever is more recent between `adversarial-review` (new auto-scaled) and `codex-review` (legacy). For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: ``` +====================================================================+ @@ -546,7 +546,7 @@ Parse the output. 
Find the most recent entry for each skill (plan-ceo-review, pl | Eng Review | 1 | 2026-03-16 15:00 | CLEAR | YES | | CEO Review | 0 | — | — | no | | Design Review | 0 | — | — | no | -| Codex Review | 0 | — | — | no | +| Adversarial | 0 | — | — | no | +--------------------------------------------------------------------+ | VERDICT: CLEARED — Eng Review passed | +====================================================================+ @@ -556,7 +556,7 @@ Parse the output. Find the most recent entry for each skill (plan-ceo-review, pl - **Eng Review (required by default):** The only review that gates shipping. Covers architecture, code quality, tests, performance. Can be disabled globally with \`gstack-config set skip_eng_review true\` (the "don't bother me" setting). - **CEO Review (optional):** Use your judgment. Recommend it for big product/business changes, new user-facing features, or scope decisions. Skip for bug fixes, refactors, infra, and cleanup. - **Design Review (optional):** Use your judgment. Recommend it for UI/UX changes. Skip for backend-only, infra, or prompt-only changes. -- **Codex Review (enabled by default when Codex CLI is installed):** Independent review + adversarial challenge from OpenAI Codex CLI. Shows pass/fail gate. Runs automatically when enabled — configure with \`gstack-config set codex_reviews enabled|disabled\`. +- **Adversarial Review (automatic):** Auto-scales by diff size. Small diffs (<50 lines) skip adversarial. Medium diffs (50–199) get cross-model adversarial. Large diffs (200+) get all 4 passes: Claude structured, Codex structured, Claude adversarial subagent, Codex adversarial. No configuration needed. 
**Verdict logic:** - **CLEARED**: Eng Review has >= 1 entry within 7 days with status "clean" (or \`skip_eng_review\` is \`true\`) diff --git a/plan-eng-review/SKILL.md b/plan-eng-review/SKILL.md index 74bc8464..2ee0db68 100644 --- a/plan-eng-review/SKILL.md +++ b/plan-eng-review/SKILL.md @@ -534,7 +534,7 @@ After completing the review, read the review log and config to display the dashb ~/.claude/skills/gstack/bin/gstack-review-read ``` -Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review, design-review-lite, codex-review). Ignore entries with timestamps older than 7 days. For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: +Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review, design-review-lite, adversarial-review, codex-review). Ignore entries with timestamps older than 7 days. For the Adversarial row, show whichever is more recent between `adversarial-review` (new auto-scaled) and `codex-review` (legacy). For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: ``` +====================================================================+ @@ -545,7 +545,7 @@ Parse the output. Find the most recent entry for each skill (plan-ceo-review, pl | Eng Review | 1 | 2026-03-16 15:00 | CLEAR | YES | | CEO Review | 0 | — | — | no | | Design Review | 0 | — | — | no | -| Codex Review | 0 | — | — | no | +| Adversarial | 0 | — | — | no | +--------------------------------------------------------------------+ | VERDICT: CLEARED — Eng Review passed | +====================================================================+ @@ -555,7 +555,7 @@ Parse the output. 
Find the most recent entry for each skill (plan-ceo-review, pl - **Eng Review (required by default):** The only review that gates shipping. Covers architecture, code quality, tests, performance. Can be disabled globally with \`gstack-config set skip_eng_review true\` (the "don't bother me" setting). - **CEO Review (optional):** Use your judgment. Recommend it for big product/business changes, new user-facing features, or scope decisions. Skip for bug fixes, refactors, infra, and cleanup. - **Design Review (optional):** Use your judgment. Recommend it for UI/UX changes. Skip for backend-only, infra, or prompt-only changes. -- **Codex Review (enabled by default when Codex CLI is installed):** Independent review + adversarial challenge from OpenAI Codex CLI. Shows pass/fail gate. Runs automatically when enabled — configure with \`gstack-config set codex_reviews enabled|disabled\`. +- **Adversarial Review (automatic):** Auto-scales by diff size. Small diffs (<50 lines) skip adversarial. Medium diffs (50–199) get cross-model adversarial. Large diffs (200+) get all 4 passes: Claude structured, Codex structured, Claude adversarial subagent, Codex adversarial. No configuration needed. **Verdict logic:** - **CLEARED**: Eng Review has >= 1 entry within 7 days with status "clean" (or \`skip_eng_review\` is \`true\`) diff --git a/review/SKILL.md b/review/SKILL.md index 83a145f5..f3c427e0 100644 --- a/review/SKILL.md +++ b/review/SKILL.md @@ -13,6 +13,7 @@ allowed-tools: - Write - Grep - Glob + - Agent - AskUserQuestion - WebSearch --- @@ -511,128 +512,139 @@ If no documentation files exist, skip this step silently. --- -## Step 5.7: Codex review +## Step 5.7: Adversarial review (auto-scaled) -Check if the Codex CLI is available and read the user's Codex review preference: +Adversarial review thoroughness scales automatically based on diff size. No configuration needed. 
+ +**Detect diff size and tool availability:** ```bash +DIFF_INS=$(git diff origin/<base> --stat | tail -1 | grep -oE '[0-9]+ insertion' | grep -oE '[0-9]+' || echo "0") +DIFF_DEL=$(git diff origin/<base> --stat | tail -1 | grep -oE '[0-9]+ deletion' | grep -oE '[0-9]+' || echo "0") +DIFF_TOTAL=$((DIFF_INS + DIFF_DEL)) which codex 2>/dev/null && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE" -CODEX_REVIEWS_CFG=$(~/.claude/skills/gstack/bin/gstack-config get codex_reviews 2>/dev/null || true) -echo "CODEX_REVIEWS: ${CODEX_REVIEWS_CFG:-not_set}" +# Respect old opt-out +OLD_CFG=$(~/.claude/skills/gstack/bin/gstack-config get codex_reviews 2>/dev/null || true) +echo "DIFF_SIZE: $DIFF_TOTAL" +echo "OLD_CFG: ${OLD_CFG:-not_set}" ``` -If `CODEX_NOT_AVAILABLE`: skip this step silently. Continue to the next step. +If `OLD_CFG` is `disabled`: skip this step silently. Continue to the next step. -If `CODEX_REVIEWS` is `disabled`: skip this step silently. Continue to the next step. +**User override:** If the user explicitly requested a specific tier (e.g., "run all passes", "paranoid review", "full adversarial", "do all 4 passes", "thorough review"), honor that request regardless of diff size. Jump to the matching tier section. -If `CODEX_REVIEWS` is `enabled`: run both code review and adversarial challenge automatically (no prompt). Jump to the "Run Codex" section below. +**Auto-select tier based on diff size:** +- **Small (< 50 lines changed):** Skip adversarial review entirely. Print: "Small diff ($DIFF_TOTAL lines) — adversarial review skipped." Continue to the next step. +- **Medium (50–199 lines changed):** Run Codex adversarial challenge (or Claude adversarial subagent if Codex unavailable). Jump to the "Medium tier" section. +- **Large (200+ lines changed):** Run all remaining passes — Codex structured review + Claude adversarial subagent + Codex adversarial. Jump to the "Large tier" section. 
-If `CODEX_REVIEWS` is `not_set`: use AskUserQuestion to offer the one-time adoption prompt: +--- -``` -GStack recommends enabling Codex code reviews — Codex is the super smart quiet engineer friend who will save your butt. +### Medium tier (50–199 lines) -A) Enable for all future runs (recommended, default) -B) Try it for now, ask me again later -C) No thanks, don't ask me again -``` +Claude's structured review already ran. Now add a **cross-model adversarial challenge**. -If the user chooses A: persist the setting and run both: -```bash -~/.claude/skills/gstack/bin/gstack-config set codex_reviews enabled -``` +**If Codex is available:** run the Codex adversarial challenge. **If Codex is NOT available:** fall back to the Claude adversarial subagent instead. -If the user chooses B: run both this time but do not persist any setting. +**Codex adversarial:** -If the user chooses C: persist the opt-out and skip: -```bash -~/.claude/skills/gstack/bin/gstack-config set codex_reviews disabled -``` -Then skip this step. Continue to the next step. - -### Run Codex - -Always run **both** code review and adversarial challenge. Use a 5-minute timeout (`timeout: 300000`) on each Bash call. - -First, create a temp file for stderr capture: -```bash -TMPERR=$(mktemp /tmp/codex-review-XXXXXXXX) -``` - -**Code review:** Run: -```bash -codex review --base -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR" -``` - -After the command completes, read stderr for cost/error info: -```bash -cat "$TMPERR" -``` - -Present the full output verbatim under a `CODEX SAYS (code review):` header: - -``` -CODEX SAYS (code review): -════════════════════════════════════════════════════════════ - -════════════════════════════════════════════════════════════ -GATE: PASS Tokens: N | Est. cost: ~$X.XX -``` - -Check the output for `[P1]` markers. If found: `GATE: FAIL`. If no `[P1]`: `GATE: PASS`. 
- -**If GATE is FAIL:** use AskUserQuestion: - -``` -Codex found N critical issues in the diff. - -A) Investigate and fix now (recommended) -B) Ship anyway — these issues may cause production problems -``` - -If the user chooses A: read the Codex findings carefully and work to address them. Then re-run `codex review` to verify the gate is now PASS. - -If the user chooses B: continue to the next step. - -### Error handling (code review) - -Before persisting the gate result, check for errors. All errors are non-blocking — Codex is a quality enhancement, not a prerequisite. Check `$TMPERR` output (already read above) for error indicators: - -- **Auth failure:** If stderr contains "auth", "login", "unauthorized", or "API key", tell the user: "Codex authentication failed. Run \`codex login\` in your terminal to authenticate via ChatGPT." Do NOT persist a review log entry. Continue to the adversarial step (it will likely fail too, but try anyway). -- **Timeout:** If the Bash call times out (5 min), tell the user: "Codex timed out after 5 minutes. The diff may be too large or the API may be slow." Do NOT persist a review log entry. Skip to cleanup. -- **Empty response:** If codex returned no stdout output, tell the user: "Codex returned no response. Stderr: ." Do NOT persist a review log entry. Skip to cleanup. - -**Only if codex produced a real review (non-empty stdout):** Persist the code review result: -```bash -~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"codex-review","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","status":"STATUS","gate":"GATE","commit":"'"$(git rev-parse --short HEAD)"'"}' -``` - -Substitute: STATUS ("clean" if PASS, "issues_found" if FAIL), GATE ("pass" or "fail"). - -**Adversarial challenge:** Run: ```bash TMPERR_ADV=$(mktemp /tmp/codex-adv-XXXXXXXX) codex exec "Review the changes on this branch against the base branch. Run git diff origin/ to see the diff. Your job is to find ways this code will fail in production. 
Think like an attacker and a chaos engineer. Find edge cases, race conditions, security holes, resource leaks, failure modes, and silent data corruption paths. Be adversarial. Be thorough. No compliments — just the problems." -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_ADV" ``` -After the command completes, read adversarial stderr: +Use a 5-minute timeout (`timeout: 300000`). After the command completes, read stderr: ```bash cat "$TMPERR_ADV" ``` -Present the full output verbatim under a `CODEX SAYS (adversarial challenge):` header. This is informational — it never blocks shipping. If the adversarial command timed out or returned no output, note this to the user and continue. +Present the full output verbatim. This is informational — it never blocks shipping. -**Cross-model analysis:** After both Codex outputs are presented, compare Codex's findings with your own review findings from the earlier review steps and output: +**Error handling:** All errors are non-blocking — adversarial review is a quality enhancement, not a prerequisite. +- **Auth failure:** If stderr contains "auth", "login", "unauthorized", or "API key": "Codex authentication failed. Run \`codex login\` to authenticate." +- **Timeout:** "Codex timed out after 5 minutes." +- **Empty response:** "Codex returned no response. Stderr: <paste relevant error>." +On any Codex error, fall back to the Claude adversarial subagent automatically. + +**Claude adversarial subagent** (fallback when Codex unavailable or errored): + +Dispatch via the Agent tool. The subagent has fresh context — no checklist bias from the structured review. This genuine independence catches things the primary reviewer is blind to. + +Subagent prompt: +"Read the diff for this branch with `git diff origin/<base>`. Think like an attacker and a chaos engineer. Your job is to find ways this code will fail in production. 
Look for: edge cases, race conditions, security holes, resource leaks, failure modes, silent data corruption, logic errors that produce wrong results silently, error handling that swallows failures, and trust boundary violations. Be adversarial. Be thorough. No compliments — just the problems. For each finding, classify as FIXABLE (you know how to fix it) or INVESTIGATE (needs human judgment)." + +Present findings under an `ADVERSARIAL REVIEW (Claude subagent):` header. **FIXABLE findings** flow into the same Fix-First pipeline as the structured review. **INVESTIGATE findings** are presented as informational. + +If the subagent fails or times out: "Claude adversarial subagent unavailable. Continuing without adversarial review." + +**Persist the review result:** +```bash +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"adversarial-review","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","status":"STATUS","source":"SOURCE","tier":"medium","commit":"'"$(git rev-parse --short HEAD)"'"}' ``` -CROSS-MODEL ANALYSIS: - Both found: [findings that overlap between Claude and Codex] - Only Codex found: [findings unique to Codex] - Only Claude found: [findings unique to Claude's review] - Agreement rate: X% (N/M total unique findings overlap) +Substitute STATUS: "clean" if no findings, "issues_found" if findings exist. SOURCE: "codex" if Codex ran, "claude" if subagent ran. If both failed, do NOT persist. + +**Cleanup:** Run `rm -f "$TMPERR_ADV"` after processing (if Codex was used). + +--- + +### Large tier (200+ lines) + +Claude's structured review already ran. Now run **all three remaining passes** for maximum coverage: + +**1. Codex structured review (if available):** +```bash +TMPERR=$(mktemp /tmp/codex-review-XXXXXXXX) +codex review --base <base> -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR" ``` -**Cleanup:** Run `rm -f "$TMPERR" "$TMPERR_ADV"` after processing. +Use a 5-minute timeout. 
Present output under `CODEX SAYS (code review):` header. +Check for `[P1]` markers: found → `GATE: FAIL`, not found → `GATE: PASS`. + +If GATE is FAIL, use AskUserQuestion: +``` +Codex found N critical issues in the diff. + +A) Investigate and fix now (recommended) +B) Continue — review will still complete +``` + +If A: address the findings. Re-run `codex review` to verify. + +Read stderr for errors (same error handling as medium tier). + +After stderr: `rm -f "$TMPERR"` + +**2. Claude adversarial subagent:** Dispatch a subagent with the adversarial prompt (same prompt as medium tier). This always runs regardless of Codex availability. + +**3. Codex adversarial challenge (if available):** Run `codex exec` with the adversarial prompt (same as medium tier). + +If Codex is not available for steps 1 and 3, note to the user: "Codex CLI not found — large-diff review ran Claude structured + Claude adversarial (2 of 4 passes). Install Codex for full 4-pass coverage: `npm install -g @openai/codex`" + +**Persist the review result AFTER all passes complete** (not after each sub-step): +```bash +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"adversarial-review","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","status":"STATUS","source":"SOURCE","tier":"large","gate":"GATE","commit":"'"$(git rev-parse --short HEAD)"'"}' +``` +Substitute: STATUS = "clean" if no findings across ALL passes, "issues_found" if any pass found issues. SOURCE = "both" if Codex ran, "claude" if only Claude subagent ran. GATE = the Codex structured review gate result ("pass"/"fail"), or "informational" if Codex was unavailable. If all passes failed, do NOT persist. 
+ +--- + +### Cross-model synthesis (medium and large tiers) + +After all passes complete, synthesize findings across all sources: + +``` +ADVERSARIAL REVIEW SYNTHESIS (auto: TIER, N lines): +════════════════════════════════════════════════════════════ + High confidence (found by multiple sources): [findings agreed on by >1 pass] + Unique to Claude structured review: [from earlier step] + Unique to Claude adversarial: [from subagent, if ran] + Unique to Codex: [from codex adversarial or code review, if ran] + Models used: Claude structured ✓ Claude adversarial ✓/✗ Codex ✓/✗ +════════════════════════════════════════════════════════════ +``` + +High-confidence findings (agreed on by multiple sources) should be prioritized for fixes. --- diff --git a/review/SKILL.md.tmpl b/review/SKILL.md.tmpl index 3e65ce27..5ed337af 100644 --- a/review/SKILL.md.tmpl +++ b/review/SKILL.md.tmpl @@ -13,6 +13,7 @@ allowed-tools: - Write - Grep - Glob + - Agent - AskUserQuestion - WebSearch --- @@ -239,7 +240,7 @@ If no documentation files exist, skip this step silently. --- -{{CODEX_REVIEW_STEP}} +{{ADVERSARIAL_STEP}} ## Important Rules diff --git a/scripts/gen-skill-docs.ts b/scripts/gen-skill-docs.ts index 3a5de960..2d862383 100644 --- a/scripts/gen-skill-docs.ts +++ b/scripts/gen-skill-docs.ts @@ -1094,7 +1094,7 @@ After completing the review, read the review log and config to display the dashb ~/.claude/skills/gstack/bin/gstack-review-read \`\`\` -Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review, design-review-lite, codex-review). Ignore entries with timestamps older than 7 days. For Design Review, show whichever is more recent between \`plan-design-review\` (full visual audit) and \`design-review-lite\` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: +Parse the output. 
Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review, design-review-lite, adversarial-review, codex-review). Ignore entries with timestamps older than 7 days. For the Adversarial row, show whichever is more recent between \`adversarial-review\` (new auto-scaled) and \`codex-review\` (legacy). For Design Review, show whichever is more recent between \`plan-design-review\` (full visual audit) and \`design-review-lite\` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: \`\`\` +====================================================================+ @@ -1105,7 +1105,7 @@ Parse the output. Find the most recent entry for each skill (plan-ceo-review, pl | Eng Review | 1 | 2026-03-16 15:00 | CLEAR | YES | | CEO Review | 0 | — | — | no | | Design Review | 0 | — | — | no | -| Codex Review | 0 | — | — | no | +| Adversarial | 0 | — | — | no | +--------------------------------------------------------------------+ | VERDICT: CLEARED — Eng Review passed | +====================================================================+ @@ -1115,7 +1115,7 @@ Parse the output. Find the most recent entry for each skill (plan-ceo-review, pl - **Eng Review (required by default):** The only review that gates shipping. Covers architecture, code quality, tests, performance. Can be disabled globally with \\\`gstack-config set skip_eng_review true\\\` (the "don't bother me" setting). - **CEO Review (optional):** Use your judgment. Recommend it for big product/business changes, new user-facing features, or scope decisions. Skip for bug fixes, refactors, infra, and cleanup. - **Design Review (optional):** Use your judgment. Recommend it for UI/UX changes. Skip for backend-only, infra, or prompt-only changes. -- **Codex Review (enabled by default when Codex CLI is installed):** Independent review + adversarial challenge from OpenAI Codex CLI. Shows pass/fail gate. 
Runs automatically when enabled — configure with \\\`gstack-config set codex_reviews enabled|disabled\\\`. +- **Adversarial Review (automatic):** Auto-scales by diff size. Small diffs (<50 lines) skip adversarial. Medium diffs (50–199) get cross-model adversarial. Large diffs (200+) get all 4 passes: Claude structured, Codex structured, Claude adversarial subagent, Codex adversarial. No configuration needed. **Verdict logic:** - **CLEARED**: Eng Review has >= 1 entry within 7 days with status "clean" (or \\\`skip_eng_review\\\` is \\\`true\\\`) @@ -1435,135 +1435,146 @@ The screenshot file at \`/tmp/gstack-sketch.png\` can be referenced by downstrea (\`/plan-design-review\`, \`/design-review\`) to see what was originally envisioned.`; } -function generateCodexReviewStep(ctx: TemplateContext): string { +function generateAdversarialStep(ctx: TemplateContext): string { // Codex host: strip entirely — Codex should never invoke itself if (ctx.host === 'codex') return ''; const isShip = ctx.skillName === 'ship'; const stepNum = isShip ? '3.8' : '5.7'; - return `## Step ${stepNum}: Codex review + return `## Step ${stepNum}: Adversarial review (auto-scaled) -Check if the Codex CLI is available and read the user's Codex review preference: +Adversarial review thoroughness scales automatically based on diff size. No configuration needed. 
+ +**Detect diff size and tool availability:** \`\`\`bash +DIFF_INS=$(git diff origin/ --stat | tail -1 | grep -oE '[0-9]+ insertion' | grep -oE '[0-9]+' || echo "0") +DIFF_DEL=$(git diff origin/ --stat | tail -1 | grep -oE '[0-9]+ deletion' | grep -oE '[0-9]+' || echo "0") +DIFF_TOTAL=$((DIFF_INS + DIFF_DEL)) which codex 2>/dev/null && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE" -CODEX_REVIEWS_CFG=$(~/.claude/skills/gstack/bin/gstack-config get codex_reviews 2>/dev/null || true) -echo "CODEX_REVIEWS: \${CODEX_REVIEWS_CFG:-not_set}" +# Respect old opt-out +OLD_CFG=$(~/.claude/skills/gstack/bin/gstack-config get codex_reviews 2>/dev/null || true) +echo "DIFF_SIZE: $DIFF_TOTAL" +echo "OLD_CFG: \${OLD_CFG:-not_set}" \`\`\` -If \`CODEX_NOT_AVAILABLE\`: skip this step silently. Continue to the next step. +If \`OLD_CFG\` is \`disabled\`: skip this step silently. Continue to the next step. -If \`CODEX_REVIEWS\` is \`disabled\`: skip this step silently. Continue to the next step. +**User override:** If the user explicitly requested a specific tier (e.g., "run all passes", "paranoid review", "full adversarial", "do all 4 passes", "thorough review"), honor that request regardless of diff size. Jump to the matching tier section. -If \`CODEX_REVIEWS\` is \`enabled\`: run both code review and adversarial challenge automatically (no prompt). Jump to the "Run Codex" section below. +**Auto-select tier based on diff size:** +- **Small (< 50 lines changed):** Skip adversarial review entirely. Print: "Small diff ($DIFF_TOTAL lines) — adversarial review skipped." Continue to the next step. +- **Medium (50–199 lines changed):** Run Codex adversarial challenge (or Claude adversarial subagent if Codex unavailable). Jump to the "Medium tier" section. +- **Large (200+ lines changed):** Run all remaining passes — Codex structured review + Claude adversarial subagent + Codex adversarial. Jump to the "Large tier" section. 
-If \`CODEX_REVIEWS\` is \`not_set\`: use AskUserQuestion to offer the one-time adoption prompt: +--- -\`\`\` -GStack recommends enabling Codex code reviews — Codex is the super smart quiet engineer friend who will save your butt. +### Medium tier (50–199 lines) -A) Enable for all future runs (recommended, default) -B) Try it for now, ask me again later -C) No thanks, don't ask me again -\`\`\` +Claude's structured review already ran. Now add a **cross-model adversarial challenge**. -If the user chooses A: persist the setting and run both: -\`\`\`bash -~/.claude/skills/gstack/bin/gstack-config set codex_reviews enabled -\`\`\` +**If Codex is available:** run the Codex adversarial challenge. **If Codex is NOT available:** fall back to the Claude adversarial subagent instead. -If the user chooses B: run both this time but do not persist any setting. +**Codex adversarial:** -If the user chooses C: persist the opt-out and skip: -\`\`\`bash -~/.claude/skills/gstack/bin/gstack-config set codex_reviews disabled -\`\`\` -Then skip this step. Continue to the next step. - -### Run Codex - -Always run **both** code review and adversarial challenge. Use a 5-minute timeout (\`timeout: 300000\`) on each Bash call. - -First, create a temp file for stderr capture: -\`\`\`bash -TMPERR=$(mktemp /tmp/codex-review-XXXXXXXX) -\`\`\` - -**Code review:** Run: -\`\`\`bash -codex review --base -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR" -\`\`\` - -After the command completes, read stderr for cost/error info: -\`\`\`bash -cat "$TMPERR" -\`\`\` - -Present the full output verbatim under a \`CODEX SAYS (code review):\` header: - -\`\`\` -CODEX SAYS (code review): -════════════════════════════════════════════════════════════ - -════════════════════════════════════════════════════════════ -GATE: PASS Tokens: N | Est. cost: ~$X.XX -\`\`\` - -Check the output for \`[P1]\` markers. If found: \`GATE: FAIL\`. If no \`[P1]\`: \`GATE: PASS\`. 
- -**If GATE is FAIL:** use AskUserQuestion: - -\`\`\` -Codex found N critical issues in the diff. - -A) Investigate and fix now (recommended) -B) Ship anyway — these issues may cause production problems -\`\`\` - -If the user chooses A: read the Codex findings carefully and work to address them${isShip ? '. After fixing, re-run tests (Step 3) since code has changed' : ''}. Then re-run \`codex review\` to verify the gate is now PASS. - -If the user chooses B: continue to the next step. - -### Error handling (code review) - -Before persisting the gate result, check for errors. All errors are non-blocking — Codex is a quality enhancement, not a prerequisite. Check \`$TMPERR\` output (already read above) for error indicators: - -- **Auth failure:** If stderr contains "auth", "login", "unauthorized", or "API key", tell the user: "Codex authentication failed. Run \\\`codex login\\\` in your terminal to authenticate via ChatGPT." Do NOT persist a review log entry. Continue to the adversarial step (it will likely fail too, but try anyway). -- **Timeout:** If the Bash call times out (5 min), tell the user: "Codex timed out after 5 minutes. The diff may be too large or the API may be slow." Do NOT persist a review log entry. Skip to cleanup. -- **Empty response:** If codex returned no stdout output, tell the user: "Codex returned no response. Stderr: ." Do NOT persist a review log entry. Skip to cleanup. - -**Only if codex produced a real review (non-empty stdout):** Persist the code review result: -\`\`\`bash -~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"codex-review","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","status":"STATUS","gate":"GATE","commit":"'"$(git rev-parse --short HEAD)"'"}' -\`\`\` - -Substitute: STATUS ("clean" if PASS, "issues_found" if FAIL), GATE ("pass" or "fail"). - -**Adversarial challenge:** Run: \`\`\`bash TMPERR_ADV=$(mktemp /tmp/codex-adv-XXXXXXXX) codex exec "Review the changes on this branch against the base branch. 
Run git diff origin/ to see the diff. Your job is to find ways this code will fail in production. Think like an attacker and a chaos engineer. Find edge cases, race conditions, security holes, resource leaks, failure modes, and silent data corruption paths. Be adversarial. Be thorough. No compliments — just the problems." -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_ADV" \`\`\` -After the command completes, read adversarial stderr: +Use a 5-minute timeout (\`timeout: 300000\`). After the command completes, read stderr: \`\`\`bash cat "$TMPERR_ADV" \`\`\` -Present the full output verbatim under a \`CODEX SAYS (adversarial challenge):\` header. This is informational — it never blocks shipping. If the adversarial command timed out or returned no output, note this to the user and continue. -${!isShip ? ` -**Cross-model analysis:** After both Codex outputs are presented, compare Codex's findings with your own review findings from the earlier review steps and output: +Present the full output verbatim. This is informational — it never blocks shipping. + +**Error handling:** All errors are non-blocking — adversarial review is a quality enhancement, not a prerequisite. +- **Auth failure:** If stderr contains "auth", "login", "unauthorized", or "API key": "Codex authentication failed. Run \\\`codex login\\\` to authenticate." +- **Timeout:** "Codex timed out after 5 minutes." +- **Empty response:** "Codex returned no response. Stderr: ." + +On any Codex error, fall back to the Claude adversarial subagent automatically. + +**Claude adversarial subagent** (fallback when Codex unavailable or errored): + +Dispatch via the Agent tool. The subagent has fresh context — no checklist bias from the structured review. This genuine independence catches things the primary reviewer is blind to. + +Subagent prompt: +"Read the diff for this branch with \`git diff origin/\`. Think like an attacker and a chaos engineer. 
Your job is to find ways this code will fail in production. Look for: edge cases, race conditions, security holes, resource leaks, failure modes, silent data corruption, logic errors that produce wrong results silently, error handling that swallows failures, and trust boundary violations. Be adversarial. Be thorough. No compliments — just the problems. For each finding, classify as FIXABLE (you know how to fix it) or INVESTIGATE (needs human judgment)." + +Present findings under an \`ADVERSARIAL REVIEW (Claude subagent):\` header. **FIXABLE findings** flow into the same Fix-First pipeline as the structured review. **INVESTIGATE findings** are presented as informational. + +If the subagent fails or times out: "Claude adversarial subagent unavailable. Continuing without adversarial review." + +**Persist the review result:** +\`\`\`bash +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"adversarial-review","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","status":"STATUS","source":"SOURCE","tier":"medium","commit":"'"$(git rev-parse --short HEAD)"'"}' +\`\`\` +Substitute STATUS: "clean" if no findings, "issues_found" if findings exist. SOURCE: "codex" if Codex ran, "claude" if subagent ran. If both failed, do NOT persist. + +**Cleanup:** Run \`rm -f "$TMPERR_ADV"\` after processing (if Codex was used). + +--- + +### Large tier (200+ lines) + +Claude's structured review already ran. Now run **all three remaining passes** for maximum coverage: + +**1. Codex structured review (if available):** +\`\`\`bash +TMPERR=$(mktemp /tmp/codex-review-XXXXXXXX) +codex review --base -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR" +\`\`\` + +Use a 5-minute timeout. Present output under \`CODEX SAYS (code review):\` header. +Check for \`[P1]\` markers: found → \`GATE: FAIL\`, not found → \`GATE: PASS\`. + +If GATE is FAIL, use AskUserQuestion: +\`\`\` +Codex found N critical issues in the diff. 
+ +A) Investigate and fix now (recommended) +B) Continue — review will still complete +\`\`\` + +If A: address the findings${isShip ? '. After fixing, re-run tests (Step 3) since code has changed' : ''}. Re-run \`codex review\` to verify. + +Read stderr for errors (same error handling as medium tier). + +After stderr: \`rm -f "$TMPERR"\` + +**2. Claude adversarial subagent:** Dispatch a subagent with the adversarial prompt (same prompt as medium tier). This always runs regardless of Codex availability. + +**3. Codex adversarial challenge (if available):** Run \`codex exec\` with the adversarial prompt (same as medium tier). + +If Codex is not available for steps 1 and 3, note to the user: "Codex CLI not found — large-diff review ran Claude structured + Claude adversarial (2 of 4 passes). Install Codex for full 4-pass coverage: \`npm install -g @openai/codex\`" + +**Persist the review result AFTER all passes complete** (not after each sub-step): +\`\`\`bash +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"adversarial-review","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","status":"STATUS","source":"SOURCE","tier":"large","gate":"GATE","commit":"'"$(git rev-parse --short HEAD)"'"}' +\`\`\` +Substitute: STATUS = "clean" if no findings across ALL passes, "issues_found" if any pass found issues. SOURCE = "both" if Codex ran, "claude" if only Claude subagent ran. GATE = the Codex structured review gate result ("pass"/"fail"), or "informational" if Codex was unavailable. If all passes failed, do NOT persist. 
+ +--- + +### Cross-model synthesis (medium and large tiers) + +After all passes complete, synthesize findings across all sources: \`\`\` -CROSS-MODEL ANALYSIS: - Both found: [findings that overlap between Claude and Codex] - Only Codex found: [findings unique to Codex] - Only Claude found: [findings unique to Claude's review] - Agreement rate: X% (N/M total unique findings overlap) +ADVERSARIAL REVIEW SYNTHESIS (auto: TIER, N lines): +════════════════════════════════════════════════════════════ + High confidence (found by multiple sources): [findings agreed on by >1 pass] + Unique to Claude structured review: [from earlier step] + Unique to Claude adversarial: [from subagent, if ran] + Unique to Codex: [from codex adversarial or code review, if ran] + Models used: Claude structured ✓ Claude adversarial ✓/✗ Codex ✓/✗ +════════════════════════════════════════════════════════════ \`\`\` -` : ''} -**Cleanup:** Run \`rm -f "$TMPERR" "$TMPERR_ADV"\` after processing. + +High-confidence findings (agreed on by multiple sources) should be prioritized for fixes. ---`; } @@ -1582,7 +1593,8 @@ const RESOLVERS: Record string> = { SPEC_REVIEW_LOOP: generateSpecReviewLoop, DESIGN_SKETCH: generateDesignSketch, BENEFITS_FROM: generateBenefitsFrom, - CODEX_REVIEW_STEP: generateCodexReviewStep, + CODEX_REVIEW_STEP: generateAdversarialStep, + ADVERSARIAL_STEP: generateAdversarialStep, }; // ─── Codex Helpers ─────────────────────────────────────────── diff --git a/ship/SKILL.md b/ship/SKILL.md index 5ca6785f..6ad69ba7 100644 --- a/ship/SKILL.md +++ b/ship/SKILL.md @@ -11,6 +11,7 @@ allowed-tools: - Edit - Grep - Glob + - Agent - AskUserQuestion - WebSearch --- @@ -304,7 +305,7 @@ After completing the review, read the review log and config to display the dashb ~/.claude/skills/gstack/bin/gstack-review-read ``` -Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review, design-review-lite, codex-review). 
Ignore entries with timestamps older than 7 days. For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: +Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review, design-review-lite, adversarial-review, codex-review). Ignore entries with timestamps older than 7 days. For the Adversarial row, show whichever is more recent between `adversarial-review` (new auto-scaled) and `codex-review` (legacy). For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: ``` +====================================================================+ @@ -315,7 +316,7 @@ Parse the output. Find the most recent entry for each skill (plan-ceo-review, pl | Eng Review | 1 | 2026-03-16 15:00 | CLEAR | YES | | CEO Review | 0 | — | — | no | | Design Review | 0 | — | — | no | -| Codex Review | 0 | — | — | no | +| Adversarial | 0 | — | — | no | +--------------------------------------------------------------------+ | VERDICT: CLEARED — Eng Review passed | +====================================================================+ @@ -325,7 +326,7 @@ Parse the output. Find the most recent entry for each skill (plan-ceo-review, pl - **Eng Review (required by default):** The only review that gates shipping. Covers architecture, code quality, tests, performance. Can be disabled globally with \`gstack-config set skip_eng_review true\` (the "don't bother me" setting). - **CEO Review (optional):** Use your judgment. Recommend it for big product/business changes, new user-facing features, or scope decisions. Skip for bug fixes, refactors, infra, and cleanup. - **Design Review (optional):** Use your judgment. Recommend it for UI/UX changes. 
Skip for backend-only, infra, or prompt-only changes. -- **Codex Review (enabled by default when Codex CLI is installed):** Independent review + adversarial challenge from OpenAI Codex CLI. Shows pass/fail gate. Runs automatically when enabled — configure with \`gstack-config set codex_reviews enabled|disabled\`. +- **Adversarial Review (automatic):** Auto-scales by diff size. Small diffs (<50 lines) skip adversarial. Medium diffs (50–199) get cross-model adversarial. Large diffs (200+) get all 4 passes: Claude structured, Codex structured, Claude adversarial subagent, Codex adversarial. No configuration needed. **Verdict logic:** - **CLEARED**: Eng Review has >= 1 entry within 7 days with status "clean" (or \`skip_eng_review\` is \`true\`) @@ -867,118 +868,139 @@ For each classified comment: --- -## Step 3.8: Codex review +## Step 3.8: Adversarial review (auto-scaled) -Check if the Codex CLI is available and read the user's Codex review preference: +Adversarial review thoroughness scales automatically based on diff size. No configuration needed. + +**Detect diff size and tool availability:** ```bash +DIFF_INS=$(git diff origin/ --stat | tail -1 | grep -oE '[0-9]+ insertion' | grep -oE '[0-9]+' || echo "0") +DIFF_DEL=$(git diff origin/ --stat | tail -1 | grep -oE '[0-9]+ deletion' | grep -oE '[0-9]+' || echo "0") +DIFF_TOTAL=$((DIFF_INS + DIFF_DEL)) which codex 2>/dev/null && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE" -CODEX_REVIEWS_CFG=$(~/.claude/skills/gstack/bin/gstack-config get codex_reviews 2>/dev/null || true) -echo "CODEX_REVIEWS: ${CODEX_REVIEWS_CFG:-not_set}" +# Respect old opt-out +OLD_CFG=$(~/.claude/skills/gstack/bin/gstack-config get codex_reviews 2>/dev/null || true) +echo "DIFF_SIZE: $DIFF_TOTAL" +echo "OLD_CFG: ${OLD_CFG:-not_set}" ``` -If `CODEX_NOT_AVAILABLE`: skip this step silently. Continue to the next step. +If `OLD_CFG` is `disabled`: skip this step silently. Continue to the next step. 
-If `CODEX_REVIEWS` is `disabled`: skip this step silently. Continue to the next step. +**User override:** If the user explicitly requested a specific tier (e.g., "run all passes", "paranoid review", "full adversarial", "do all 4 passes", "thorough review"), honor that request regardless of diff size. Jump to the matching tier section. -If `CODEX_REVIEWS` is `enabled`: run both code review and adversarial challenge automatically (no prompt). Jump to the "Run Codex" section below. +**Auto-select tier based on diff size:** +- **Small (< 50 lines changed):** Skip adversarial review entirely. Print: "Small diff ($DIFF_TOTAL lines) — adversarial review skipped." Continue to the next step. +- **Medium (50–199 lines changed):** Run Codex adversarial challenge (or Claude adversarial subagent if Codex unavailable). Jump to the "Medium tier" section. +- **Large (200+ lines changed):** Run all remaining passes — Codex structured review + Claude adversarial subagent + Codex adversarial. Jump to the "Large tier" section. -If `CODEX_REVIEWS` is `not_set`: use AskUserQuestion to offer the one-time adoption prompt: +--- -``` -GStack recommends enabling Codex code reviews — Codex is the super smart quiet engineer friend who will save your butt. +### Medium tier (50–199 lines) -A) Enable for all future runs (recommended, default) -B) Try it for now, ask me again later -C) No thanks, don't ask me again -``` +Claude's structured review already ran. Now add a **cross-model adversarial challenge**. -If the user chooses A: persist the setting and run both: -```bash -~/.claude/skills/gstack/bin/gstack-config set codex_reviews enabled -``` +**If Codex is available:** run the Codex adversarial challenge. **If Codex is NOT available:** fall back to the Claude adversarial subagent instead. -If the user chooses B: run both this time but do not persist any setting. 
+**Codex adversarial:** -If the user chooses C: persist the opt-out and skip: -```bash -~/.claude/skills/gstack/bin/gstack-config set codex_reviews disabled -``` -Then skip this step. Continue to the next step. - -### Run Codex - -Always run **both** code review and adversarial challenge. Use a 5-minute timeout (`timeout: 300000`) on each Bash call. - -First, create a temp file for stderr capture: -```bash -TMPERR=$(mktemp /tmp/codex-review-XXXXXXXX) -``` - -**Code review:** Run: -```bash -codex review --base -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR" -``` - -After the command completes, read stderr for cost/error info: -```bash -cat "$TMPERR" -``` - -Present the full output verbatim under a `CODEX SAYS (code review):` header: - -``` -CODEX SAYS (code review): -════════════════════════════════════════════════════════════ - -════════════════════════════════════════════════════════════ -GATE: PASS Tokens: N | Est. cost: ~$X.XX -``` - -Check the output for `[P1]` markers. If found: `GATE: FAIL`. If no `[P1]`: `GATE: PASS`. - -**If GATE is FAIL:** use AskUserQuestion: - -``` -Codex found N critical issues in the diff. - -A) Investigate and fix now (recommended) -B) Ship anyway — these issues may cause production problems -``` - -If the user chooses A: read the Codex findings carefully and work to address them. After fixing, re-run tests (Step 3) since code has changed. Then re-run `codex review` to verify the gate is now PASS. - -If the user chooses B: continue to the next step. - -### Error handling (code review) - -Before persisting the gate result, check for errors. All errors are non-blocking — Codex is a quality enhancement, not a prerequisite. Check `$TMPERR` output (already read above) for error indicators: - -- **Auth failure:** If stderr contains "auth", "login", "unauthorized", or "API key", tell the user: "Codex authentication failed. Run \`codex login\` in your terminal to authenticate via ChatGPT." 
Do NOT persist a review log entry. Continue to the adversarial step (it will likely fail too, but try anyway). -- **Timeout:** If the Bash call times out (5 min), tell the user: "Codex timed out after 5 minutes. The diff may be too large or the API may be slow." Do NOT persist a review log entry. Skip to cleanup. -- **Empty response:** If codex returned no stdout output, tell the user: "Codex returned no response. Stderr: <paste relevant error>." Do NOT persist a review log entry. Skip to cleanup. - -**Only if codex produced a real review (non-empty stdout):** Persist the code review result: -```bash -~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"codex-review","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","status":"STATUS","gate":"GATE","commit":"'"$(git rev-parse --short HEAD)"'"}' -``` - -Substitute: STATUS ("clean" if PASS, "issues_found" if FAIL), GATE ("pass" or "fail"). - -**Adversarial challenge:** Run: ```bash TMPERR_ADV=$(mktemp /tmp/codex-adv-XXXXXXXX) codex exec "Review the changes on this branch against the base branch. Run git diff origin/<base> to see the diff. Your job is to find ways this code will fail in production. Think like an attacker and a chaos engineer. Find edge cases, race conditions, security holes, resource leaks, failure modes, and silent data corruption paths. Be adversarial. Be thorough. No compliments — just the problems." -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_ADV" ``` -After the command completes, read adversarial stderr: +Use a 5-minute timeout (`timeout: 300000`). After the command completes, read stderr: ```bash cat "$TMPERR_ADV" ``` -Present the full output verbatim under a `CODEX SAYS (adversarial challenge):` header. This is informational — it never blocks shipping. If the adversarial command timed out or returned no output, note this to the user and continue. +Present the full output verbatim. This is informational — it never blocks shipping.
-**Cleanup:** Run `rm -f "$TMPERR" "$TMPERR_ADV"` after processing. +**Error handling:** All errors are non-blocking — adversarial review is a quality enhancement, not a prerequisite. +- **Auth failure:** If stderr contains "auth", "login", "unauthorized", or "API key": "Codex authentication failed. Run \`codex login\` to authenticate." +- **Timeout:** "Codex timed out after 5 minutes." +- **Empty response:** "Codex returned no response. Stderr: <paste relevant error>." + +On any Codex error, fall back to the Claude adversarial subagent automatically. + +**Claude adversarial subagent** (fallback when Codex unavailable or errored): + +Dispatch via the Agent tool. The subagent has fresh context — no checklist bias from the structured review. This genuine independence catches things the primary reviewer is blind to. + +Subagent prompt: +"Read the diff for this branch with `git diff origin/<base>`. Think like an attacker and a chaos engineer. Your job is to find ways this code will fail in production. Look for: edge cases, race conditions, security holes, resource leaks, failure modes, silent data corruption, logic errors that produce wrong results silently, error handling that swallows failures, and trust boundary violations. Be adversarial. Be thorough. No compliments — just the problems. For each finding, classify as FIXABLE (you know how to fix it) or INVESTIGATE (needs human judgment)." + +Present findings under an `ADVERSARIAL REVIEW (Claude subagent):` header. **FIXABLE findings** flow into the same Fix-First pipeline as the structured review. **INVESTIGATE findings** are presented as informational. + +If the subagent fails or times out: "Claude adversarial subagent unavailable. Continuing without adversarial review."
+ +**Persist the review result:** +```bash +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"adversarial-review","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","status":"STATUS","source":"SOURCE","tier":"medium","commit":"'"$(git rev-parse --short HEAD)"'"}' +``` +Substitute STATUS: "clean" if no findings, "issues_found" if findings exist. SOURCE: "codex" if Codex ran, "claude" if subagent ran. If both failed, do NOT persist. + +**Cleanup:** Run `rm -f "$TMPERR_ADV"` after processing (if Codex was used). + +--- + +### Large tier (200+ lines) + +Claude's structured review already ran. Now run **all three remaining passes** for maximum coverage: + +**1. Codex structured review (if available):** +```bash +TMPERR=$(mktemp /tmp/codex-review-XXXXXXXX) +codex review --base <base> -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR" +``` + +Use a 5-minute timeout. Present output under `CODEX SAYS (code review):` header. +Check for `[P1]` markers: found → `GATE: FAIL`, not found → `GATE: PASS`. + +If GATE is FAIL, use AskUserQuestion: +``` +Codex found N critical issues in the diff. + +A) Investigate and fix now (recommended) +B) Continue — review will still complete +``` + +If A: address the findings. After fixing, re-run tests (Step 3) since code has changed. Re-run `codex review` to verify. + +Read stderr for errors (same error handling as medium tier). + +After reading stderr, clean up: `rm -f "$TMPERR"` + +**2. Claude adversarial subagent:** Dispatch a subagent with the adversarial prompt (same prompt as medium tier). This always runs regardless of Codex availability. + +**3. Codex adversarial challenge (if available):** Run `codex exec` with the adversarial prompt (same as medium tier). + +If Codex is not available for steps 1 and 3, note to the user: "Codex CLI not found — large-diff review ran Claude structured + Claude adversarial (2 of 4 passes).
Install Codex for full 4-pass coverage: `npm install -g @openai/codex`" + +**Persist the review result AFTER all passes complete** (not after each sub-step): +```bash +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"adversarial-review","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","status":"STATUS","source":"SOURCE","tier":"large","gate":"GATE","commit":"'"$(git rev-parse --short HEAD)"'"}' +``` +Substitute: STATUS = "clean" if no findings across ALL passes, "issues_found" if any pass found issues. SOURCE = "both" if Codex ran, "claude" if only Claude subagent ran. GATE = the Codex structured review gate result ("pass"/"fail"), or "informational" if Codex was unavailable. If all passes failed, do NOT persist. + +--- + +### Cross-model synthesis (medium and large tiers) + +After all passes complete, synthesize findings across all sources: + +``` +ADVERSARIAL REVIEW SYNTHESIS (auto: TIER, N lines): +════════════════════════════════════════════════════════════ + High confidence (found by multiple sources): [findings agreed on by >1 pass] + Unique to Claude structured review: [from earlier step] + Unique to Claude adversarial: [from subagent, if ran] + Unique to Codex: [from codex adversarial or code review, if ran] + Models used: Claude structured ✓ Claude adversarial ✓/✗ Codex ✓/✗ +════════════════════════════════════════════════════════════ +``` + +High-confidence findings (agreed on by multiple sources) should be prioritized for fixes. --- @@ -1221,7 +1243,7 @@ doc updates — the user runs `/ship` and documentation stays current without a - **Never skip tests.** If tests fail, stop. - **Never skip the pre-landing review.** If checklist.md is unreadable, stop. - **Never force push.** Use regular `git push` only. -- **Never ask for trivial confirmations** (e.g., "ready to push?", "create PR?"). DO stop for: version bumps (MINOR/MAJOR), pre-landing review findings (ASK items), Codex critical findings ([P1]), and the one-time Codex adoption prompt. 
+- **Never ask for trivial confirmations** (e.g., "ready to push?", "create PR?"). DO stop for: version bumps (MINOR/MAJOR), pre-landing review findings (ASK items), and Codex structured review [P1] findings (large diffs only). - **Always use the 4-digit version format** from the VERSION file. - **Date format in CHANGELOG:** `YYYY-MM-DD` - **Split commits for bisectability** — each commit = one logical change. diff --git a/ship/SKILL.md.tmpl b/ship/SKILL.md.tmpl index 6b441870..a748314d 100644 --- a/ship/SKILL.md.tmpl +++ b/ship/SKILL.md.tmpl @@ -11,6 +11,7 @@ allowed-tools: - Edit - Grep - Glob + - Agent - AskUserQuestion - WebSearch --- @@ -403,7 +404,7 @@ For each classified comment: --- -{{CODEX_REVIEW_STEP}} +{{ADVERSARIAL_STEP}} ## Step 4: Version bump (auto-decide) @@ -644,7 +645,7 @@ doc updates — the user runs `/ship` and documentation stays current without a - **Never skip tests.** If tests fail, stop. - **Never skip the pre-landing review.** If checklist.md is unreadable, stop. - **Never force push.** Use regular `git push` only. -- **Never ask for trivial confirmations** (e.g., "ready to push?", "create PR?"). DO stop for: version bumps (MINOR/MAJOR), pre-landing review findings (ASK items), Codex critical findings ([P1]), and the one-time Codex adoption prompt. +- **Never ask for trivial confirmations** (e.g., "ready to push?", "create PR?"). DO stop for: version bumps (MINOR/MAJOR), pre-landing review findings (ASK items), and Codex structured review [P1] findings (large diffs only). - **Always use the 4-digit version format** from the VERSION file. - **Date format in CHANGELOG:** `YYYY-MM-DD` - **Split commits for bisectability** — each commit = one logical change. 
diff --git a/test/skill-validation.test.ts b/test/skill-validation.test.ts index 6300803d..e84a2605 100644 --- a/test/skill-validation.test.ts +++ b/test/skill-validation.test.ts @@ -1256,35 +1256,48 @@ describe('Codex skill', () => { expect(content).toContain('mktemp'); }); - test('codex integration in /review has config-driven review step', () => { + test('adversarial review in /review auto-scales by diff size', () => { const content = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8'); - expect(content).toContain('Codex review'); - expect(content).toContain('codex_reviews'); - expect(content).toContain('codex review'); - expect(content).toContain('adversarial'); + expect(content).toContain('Adversarial review (auto-scaled)'); + // Diff size thresholds + expect(content).toContain('< 50'); + expect(content).toContain('50–199'); + expect(content).toContain('200+'); + // All three tiers present + expect(content).toContain('Small'); + expect(content).toContain('Medium tier'); + expect(content).toContain('Large tier'); + // Claude adversarial subagent dispatch + expect(content).toContain('Agent tool'); + expect(content).toContain('FIXABLE'); + expect(content).toContain('INVESTIGATE'); + // Codex fallback logic + expect(content).toContain('CODEX_NOT_AVAILABLE'); + expect(content).toContain('fall back to the Claude adversarial subagent'); + // Review log uses new skill name + expect(content).toContain('adversarial-review'); expect(content).toContain('xhigh'); - expect(content).toContain('Investigate and fix'); - expect(content).toContain('CROSS-MODEL'); + expect(content).toContain('ADVERSARIAL REVIEW SYNTHESIS'); }); - test('codex integration in /ship has config-driven review step', () => { + test('adversarial review in /ship auto-scales by diff size', () => { const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8'); - expect(content).toContain('Codex review'); - expect(content).toContain('codex_reviews'); - 
expect(content).toContain('codex review'); - expect(content).toContain('codex-review'); + expect(content).toContain('Adversarial review (auto-scaled)'); + expect(content).toContain('< 50'); + expect(content).toContain('200+'); + expect(content).toContain('adversarial-review'); expect(content).toContain('xhigh'); expect(content).toContain('Investigate and fix'); }); - test('codex-host ship/review do NOT contain codex review step', () => { + test('codex-host ship/review do NOT contain adversarial review step', () => { const shipContent = fs.readFileSync(path.join(ROOT, '.agents', 'skills', 'gstack-ship', 'SKILL.md'), 'utf-8'); expect(shipContent).not.toContain('codex review --base'); expect(shipContent).not.toContain('Investigate and fix'); const reviewContent = fs.readFileSync(path.join(ROOT, '.agents', 'skills', 'gstack-review', 'SKILL.md'), 'utf-8'); expect(reviewContent).not.toContain('codex review --base'); - expect(reviewContent).not.toContain('codex_reviews'); + expect(reviewContent).not.toContain('adversarial-review'); expect(reviewContent).not.toContain('Investigate and fix'); }); @@ -1294,9 +1307,9 @@ describe('Codex skill', () => { expect(content).toContain('codex exec'); }); - test('Review Readiness Dashboard includes Codex Review row', () => { + test('Review Readiness Dashboard includes Adversarial Review row', () => { const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8'); - expect(content).toContain('Codex Review'); + expect(content).toContain('Adversarial'); expect(content).toContain('codex-review'); }); }); From 8321115a4e4ad534a97a39ce8cf922e8e9bcee52 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sat, 21 Mar 2026 12:55:02 -0700 Subject: [PATCH 8/8] feat: plan file review report + enriched JSONL logging (v0.9.7.0) (#303) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: plan file review report — markdown table appended to plan files Adds {{PLAN_FILE_REVIEW_REPORT}} template 
resolver that instructs review skills to write a structured markdown table (with Trigger/Why/Status/Findings columns) to the plan file itself, so review status is visible to anyone reading the plan — not just in conversation output. Integrated into plan-ceo-review, plan-eng-review, plan-design-review, and codex skill templates. Co-Authored-By: Claude Opus 4.6 (1M context) * fix: enrich JSONL review logs for accurate plan file report CEO reviews now log scope_proposed/accepted/deferred counts, eng reviews log total issues_found, design reviews log initial_score for before→after tracking, and codex reviews log findings_fixed. Report generator references these fields directly instead of requiring agents to reconstruct from partial data. Also fixes footer replacement to handle mid-file sections robustly. Co-Authored-By: Claude Opus 4.6 (1M context) * chore: bump version and changelog (v0.9.7.0) Co-Authored-By: Claude Opus 4.6 (1M context) --------- Co-authored-by: Claude Opus 4.6 (1M context) --- .../skills/gstack-plan-ceo-review/SKILL.md | 72 ++++++++++++++++++- .../skills/gstack-plan-design-review/SKILL.md | 72 ++++++++++++++++++- .../skills/gstack-plan-eng-review/SKILL.md | 70 +++++++++++++++++- CHANGELOG.md | 7 ++ VERSION | 2 +- codex/SKILL.md | 72 ++++++++++++++++++- codex/SKILL.md.tmpl | 7 +- plan-ceo-review/SKILL.md | 72 ++++++++++++++++++- plan-ceo-review/SKILL.md.tmpl | 7 +- plan-design-review/SKILL.md | 72 ++++++++++++++++++- plan-design-review/SKILL.md.tmpl | 7 +- plan-eng-review/SKILL.md | 70 +++++++++++++++++- plan-eng-review/SKILL.md.tmpl | 5 +- scripts/gen-skill-docs.ts | 70 ++++++++++++++++++ test/gen-skill-docs.test.ts | 24 +++++++ 15 files changed, 612 insertions(+), 17 deletions(-) diff --git a/.agents/skills/gstack-plan-ceo-review/SKILL.md b/.agents/skills/gstack-plan-ceo-review/SKILL.md index 52edc3ba..f253d18d 100644 --- a/.agents/skills/gstack-plan-ceo-review/SKILL.md +++ b/.agents/skills/gstack-plan-ceo-review/SKILL.md @@ -1056,7 +1056,7 @@ the 
same pattern. The review dashboard depends on this data. Skipping this command breaks the review readiness dashboard in /ship. ```bash -~/.codex/skills/gstack/bin/gstack-review-log '{"skill":"plan-ceo-review","timestamp":"TIMESTAMP","status":"STATUS","unresolved":N,"critical_gaps":N,"mode":"MODE","commit":"COMMIT"}' +~/.codex/skills/gstack/bin/gstack-review-log '{"skill":"plan-ceo-review","timestamp":"TIMESTAMP","status":"STATUS","unresolved":N,"critical_gaps":N,"mode":"MODE","scope_proposed":N,"scope_accepted":N,"scope_deferred":N,"commit":"COMMIT"}' ``` Before running this command, substitute the placeholder values from the Completion Summary you just produced: @@ -1065,6 +1065,9 @@ Before running this command, substitute the placeholder values from the Completi - **unresolved**: number from "Unresolved decisions" in the summary - **critical_gaps**: number from "Failure modes: ___ CRITICAL GAPS" in the summary - **MODE**: the mode the user selected (SCOPE_EXPANSION / SELECTIVE_EXPANSION / HOLD_SCOPE / SCOPE_REDUCTION) +- **scope_proposed**: number from "Scope proposals: ___ proposed" in the summary (0 for HOLD/REDUCTION) +- **scope_accepted**: number from "Scope proposals: ___ accepted" in the summary (0 for HOLD/REDUCTION) +- **scope_deferred**: number of items deferred to TODOS.md from scope decisions (0 for HOLD/REDUCTION) - **COMMIT**: output of `git rev-parse --short HEAD` ## Review Readiness Dashboard @@ -1110,6 +1113,73 @@ Parse the output. Find the most recent entry for each skill (plan-ceo-review, pl - For entries without a \`commit\` field (legacy entries): display "Note: {skill} review from {date} has no commit tracking — consider re-running for accurate staleness detection" - If all reviews match the current HEAD, do not display any staleness notes +## Plan File Review Report + +After displaying the Review Readiness Dashboard in conversation output, also update the +**plan file** itself so review status is visible to anyone reading the plan. 
+ +### Detect the plan file + +1. Check if there is an active plan file in this conversation (the host provides plan file + paths in system messages — look for plan file references in the conversation context). +2. If not found, skip this section silently — not every review runs in plan mode. + +### Generate the report + +Read the review log output you already have from the Review Readiness Dashboard step above. +Parse each JSONL entry. Each skill logs different fields: + +- **plan-ceo-review**: \`status\`, \`unresolved\`, \`critical_gaps\`, \`mode\`, \`scope_proposed\`, \`scope_accepted\`, \`scope_deferred\`, \`commit\` + → Findings: "{scope_proposed} proposals, {scope_accepted} accepted, {scope_deferred} deferred" + → If scope fields are 0 or missing (HOLD/REDUCTION mode): "mode: {mode}, {critical_gaps} critical gaps" +- **plan-eng-review**: \`status\`, \`unresolved\`, \`critical_gaps\`, \`issues_found\`, \`mode\`, \`commit\` + → Findings: "{issues_found} issues, {critical_gaps} critical gaps" +- **plan-design-review**: \`status\`, \`initial_score\`, \`overall_score\`, \`unresolved\`, \`decisions_made\`, \`commit\` + → Findings: "score: {initial_score}/10 → {overall_score}/10, {decisions_made} decisions" +- **codex-review**: \`status\`, \`gate\`, \`findings\`, \`findings_fixed\` + → Findings: "{findings} findings, {findings_fixed}/{findings} fixed" + +All fields needed for the Findings column are now present in the JSONL entries. +For the review you just completed, you may use richer details from your own Completion +Summary. For prior reviews, use the JSONL fields directly — they contain all required data. 
+ +Produce this markdown table: + +\`\`\`markdown +## GSTACK REVIEW REPORT + +| Review | Trigger | Why | Runs | Status | Findings | +|--------|---------|-----|------|--------|----------| +| CEO Review | \`/plan-ceo-review\` | Scope & strategy | {runs} | {status} | {findings} | +| Codex Review | \`/codex review\` | Independent 2nd opinion | {runs} | {status} | {findings} | +| Eng Review | \`/plan-eng-review\` | Architecture & tests (required) | {runs} | {status} | {findings} | +| Design Review | \`/plan-design-review\` | UI/UX gaps | {runs} | {status} | {findings} | +\`\`\` + +Below the table, add these lines (omit any that are empty/not applicable): + +- **CODEX:** (only if codex-review ran) — one-line summary of codex fixes +- **CROSS-MODEL:** (only if both Claude and Codex reviews exist) — overlap analysis +- **UNRESOLVED:** total unresolved decisions across all reviews +- **VERDICT:** list reviews that are CLEAR (e.g., "CEO + ENG CLEARED — ready to implement"). + If Eng Review is not CLEAR and not skipped globally, append "eng review required". + +### Write to the plan file + +**PLAN MODE EXCEPTION — ALWAYS RUN:** This writes to the plan file, which is the one +file you are allowed to edit in plan mode. The plan file review report is part of the +plan's living status. + +- Search the plan file for a \`## GSTACK REVIEW REPORT\` section **anywhere** in the file + (not just at the end — content may have been added after it). +- If found, **replace it** entirely using the Edit tool. Match from \`## GSTACK REVIEW REPORT\` + through either the next \`## \` heading or end of file, whichever comes first. This ensures + content added after the report section is preserved, not eaten. If the Edit fails + (e.g., concurrent edit changed the content), re-read the plan file and retry once. +- If no such section exists, **append it** to the end of the plan file. +- Always place it as the very last section in the plan file. 
If it was found mid-file, + move it: delete the old location and append at the end. + ## Next Steps — Review Chaining After displaying the Review Readiness Dashboard, recommend the next review(s) based on what this CEO review discovered. Read the dashboard output to see which reviews have already been run and whether they are stale. diff --git a/.agents/skills/gstack-plan-design-review/SKILL.md b/.agents/skills/gstack-plan-design-review/SKILL.md index ee71ce2e..af092247 100644 --- a/.agents/skills/gstack-plan-design-review/SKILL.md +++ b/.agents/skills/gstack-plan-design-review/SKILL.md @@ -508,13 +508,14 @@ the same pattern. The review dashboard depends on this data. Skipping this command breaks the review readiness dashboard in /ship. ```bash -~/.codex/skills/gstack/bin/gstack-review-log '{"skill":"plan-design-review","timestamp":"TIMESTAMP","status":"STATUS","overall_score":N,"unresolved":N,"decisions_made":N,"commit":"COMMIT"}' +~/.codex/skills/gstack/bin/gstack-review-log '{"skill":"plan-design-review","timestamp":"TIMESTAMP","status":"STATUS","initial_score":N,"overall_score":N,"unresolved":N,"decisions_made":N,"commit":"COMMIT"}' ``` Substitute values from the Completion Summary: - **TIMESTAMP**: current ISO 8601 datetime - **STATUS**: "clean" if overall score 8+ AND 0 unresolved; otherwise "issues_open" -- **overall_score**: final overall design score (0-10) +- **initial_score**: initial overall design score before fixes (0-10) +- **overall_score**: final overall design score after fixes (0-10) - **unresolved**: number of unresolved design decisions - **decisions_made**: number of design decisions added to the plan - **COMMIT**: output of `git rev-parse --short HEAD` @@ -562,6 +563,73 @@ Parse the output. 
Find the most recent entry for each skill (plan-ceo-review, pl - For entries without a \`commit\` field (legacy entries): display "Note: {skill} review from {date} has no commit tracking — consider re-running for accurate staleness detection" - If all reviews match the current HEAD, do not display any staleness notes +## Plan File Review Report + +After displaying the Review Readiness Dashboard in conversation output, also update the +**plan file** itself so review status is visible to anyone reading the plan. + +### Detect the plan file + +1. Check if there is an active plan file in this conversation (the host provides plan file + paths in system messages — look for plan file references in the conversation context). +2. If not found, skip this section silently — not every review runs in plan mode. + +### Generate the report + +Read the review log output you already have from the Review Readiness Dashboard step above. +Parse each JSONL entry. Each skill logs different fields: + +- **plan-ceo-review**: \`status\`, \`unresolved\`, \`critical_gaps\`, \`mode\`, \`scope_proposed\`, \`scope_accepted\`, \`scope_deferred\`, \`commit\` + → Findings: "{scope_proposed} proposals, {scope_accepted} accepted, {scope_deferred} deferred" + → If scope fields are 0 or missing (HOLD/REDUCTION mode): "mode: {mode}, {critical_gaps} critical gaps" +- **plan-eng-review**: \`status\`, \`unresolved\`, \`critical_gaps\`, \`issues_found\`, \`mode\`, \`commit\` + → Findings: "{issues_found} issues, {critical_gaps} critical gaps" +- **plan-design-review**: \`status\`, \`initial_score\`, \`overall_score\`, \`unresolved\`, \`decisions_made\`, \`commit\` + → Findings: "score: {initial_score}/10 → {overall_score}/10, {decisions_made} decisions" +- **codex-review**: \`status\`, \`gate\`, \`findings\`, \`findings_fixed\` + → Findings: "{findings} findings, {findings_fixed}/{findings} fixed" + +All fields needed for the Findings column are now present in the JSONL entries. 
+For the review you just completed, you may use richer details from your own Completion +Summary. For prior reviews, use the JSONL fields directly — they contain all required data. + +Produce this markdown table: + +\`\`\`markdown +## GSTACK REVIEW REPORT + +| Review | Trigger | Why | Runs | Status | Findings | +|--------|---------|-----|------|--------|----------| +| CEO Review | \`/plan-ceo-review\` | Scope & strategy | {runs} | {status} | {findings} | +| Codex Review | \`/codex review\` | Independent 2nd opinion | {runs} | {status} | {findings} | +| Eng Review | \`/plan-eng-review\` | Architecture & tests (required) | {runs} | {status} | {findings} | +| Design Review | \`/plan-design-review\` | UI/UX gaps | {runs} | {status} | {findings} | +\`\`\` + +Below the table, add these lines (omit any that are empty/not applicable): + +- **CODEX:** (only if codex-review ran) — one-line summary of codex fixes +- **CROSS-MODEL:** (only if both Claude and Codex reviews exist) — overlap analysis +- **UNRESOLVED:** total unresolved decisions across all reviews +- **VERDICT:** list reviews that are CLEAR (e.g., "CEO + ENG CLEARED — ready to implement"). + If Eng Review is not CLEAR and not skipped globally, append "eng review required". + +### Write to the plan file + +**PLAN MODE EXCEPTION — ALWAYS RUN:** This writes to the plan file, which is the one +file you are allowed to edit in plan mode. The plan file review report is part of the +plan's living status. + +- Search the plan file for a \`## GSTACK REVIEW REPORT\` section **anywhere** in the file + (not just at the end — content may have been added after it). +- If found, **replace it** entirely using the Edit tool. Match from \`## GSTACK REVIEW REPORT\` + through either the next \`## \` heading or end of file, whichever comes first. This ensures + content added after the report section is preserved, not eaten. If the Edit fails + (e.g., concurrent edit changed the content), re-read the plan file and retry once. 
+- If no such section exists, **append it** to the end of the plan file. +- Always place it as the very last section in the plan file. If it was found mid-file, + move it: delete the old location and append at the end. + ## Next Steps — Review Chaining After displaying the Review Readiness Dashboard, recommend the next review(s) based on what this design review discovered. Read the dashboard output to see which reviews have already been run and whether they are stale. diff --git a/.agents/skills/gstack-plan-eng-review/SKILL.md b/.agents/skills/gstack-plan-eng-review/SKILL.md index fa89ed78..f2be53a3 100644 --- a/.agents/skills/gstack-plan-eng-review/SKILL.md +++ b/.agents/skills/gstack-plan-eng-review/SKILL.md @@ -505,7 +505,7 @@ the same pattern. The review dashboard depends on this data. Skipping this command breaks the review readiness dashboard in /ship. ```bash -~/.codex/skills/gstack/bin/gstack-review-log '{"skill":"plan-eng-review","timestamp":"TIMESTAMP","status":"STATUS","unresolved":N,"critical_gaps":N,"mode":"MODE","commit":"COMMIT"}' +~/.codex/skills/gstack/bin/gstack-review-log '{"skill":"plan-eng-review","timestamp":"TIMESTAMP","status":"STATUS","unresolved":N,"critical_gaps":N,"issues_found":N,"mode":"MODE","commit":"COMMIT"}' ``` Substitute values from the Completion Summary: @@ -513,6 +513,7 @@ Substitute values from the Completion Summary: - **STATUS**: "clean" if 0 unresolved decisions AND 0 critical gaps; otherwise "issues_open" - **unresolved**: number from "Unresolved decisions" count - **critical_gaps**: number from "Failure modes: ___ critical gaps flagged" +- **issues_found**: total issues found across all review sections (Architecture + Code Quality + Performance + Test gaps) - **MODE**: FULL_REVIEW / SCOPE_REDUCED - **COMMIT**: output of `git rev-parse --short HEAD` @@ -559,6 +560,73 @@ Parse the output. 
Find the most recent entry for each skill (plan-ceo-review, pl - For entries without a \`commit\` field (legacy entries): display "Note: {skill} review from {date} has no commit tracking — consider re-running for accurate staleness detection" - If all reviews match the current HEAD, do not display any staleness notes +## Plan File Review Report + +After displaying the Review Readiness Dashboard in conversation output, also update the +**plan file** itself so review status is visible to anyone reading the plan. + +### Detect the plan file + +1. Check if there is an active plan file in this conversation (the host provides plan file + paths in system messages — look for plan file references in the conversation context). +2. If not found, skip this section silently — not every review runs in plan mode. + +### Generate the report + +Read the review log output you already have from the Review Readiness Dashboard step above. +Parse each JSONL entry. Each skill logs different fields: + +- **plan-ceo-review**: \`status\`, \`unresolved\`, \`critical_gaps\`, \`mode\`, \`scope_proposed\`, \`scope_accepted\`, \`scope_deferred\`, \`commit\` + → Findings: "{scope_proposed} proposals, {scope_accepted} accepted, {scope_deferred} deferred" + → If scope fields are 0 or missing (HOLD/REDUCTION mode): "mode: {mode}, {critical_gaps} critical gaps" +- **plan-eng-review**: \`status\`, \`unresolved\`, \`critical_gaps\`, \`issues_found\`, \`mode\`, \`commit\` + → Findings: "{issues_found} issues, {critical_gaps} critical gaps" +- **plan-design-review**: \`status\`, \`initial_score\`, \`overall_score\`, \`unresolved\`, \`decisions_made\`, \`commit\` + → Findings: "score: {initial_score}/10 → {overall_score}/10, {decisions_made} decisions" +- **codex-review**: \`status\`, \`gate\`, \`findings\`, \`findings_fixed\` + → Findings: "{findings} findings, {findings_fixed}/{findings} fixed" + +All fields needed for the Findings column are now present in the JSONL entries. 
+For the review you just completed, you may use richer details from your own Completion +Summary. For prior reviews, use the JSONL fields directly — they contain all required data. + +Produce this markdown table: + +\`\`\`markdown +## GSTACK REVIEW REPORT + +| Review | Trigger | Why | Runs | Status | Findings | +|--------|---------|-----|------|--------|----------| +| CEO Review | \`/plan-ceo-review\` | Scope & strategy | {runs} | {status} | {findings} | +| Codex Review | \`/codex review\` | Independent 2nd opinion | {runs} | {status} | {findings} | +| Eng Review | \`/plan-eng-review\` | Architecture & tests (required) | {runs} | {status} | {findings} | +| Design Review | \`/plan-design-review\` | UI/UX gaps | {runs} | {status} | {findings} | +\`\`\` + +Below the table, add these lines (omit any that are empty/not applicable): + +- **CODEX:** (only if codex-review ran) — one-line summary of codex fixes +- **CROSS-MODEL:** (only if both Claude and Codex reviews exist) — overlap analysis +- **UNRESOLVED:** total unresolved decisions across all reviews +- **VERDICT:** list reviews that are CLEAR (e.g., "CEO + ENG CLEARED — ready to implement"). + If Eng Review is not CLEAR and not skipped globally, append "eng review required". + +### Write to the plan file + +**PLAN MODE EXCEPTION — ALWAYS RUN:** This writes to the plan file, which is the one +file you are allowed to edit in plan mode. The plan file review report is part of the +plan's living status. + +- Search the plan file for a \`## GSTACK REVIEW REPORT\` section **anywhere** in the file + (not just at the end — content may have been added after it). +- If found, **replace it** entirely using the Edit tool. Match from \`## GSTACK REVIEW REPORT\` + through either the next \`## \` heading or end of file, whichever comes first. This ensures + content added after the report section is preserved, not eaten. If the Edit fails + (e.g., concurrent edit changed the content), re-read the plan file and retry once. 
+- If no such section exists, **append it** to the end of the plan file. +- Always place it as the very last section in the plan file. If it was found mid-file, + move it: delete the old location and append at the end. + ## Next Steps — Review Chaining After displaying the Review Readiness Dashboard, check if additional reviews would be valuable. Read the dashboard output to see which reviews have already been run and whether they are stale. diff --git a/CHANGELOG.md b/CHANGELOG.md index 39ad4c75..dbfe286a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # Changelog +## [0.9.7.0] - 2026-03-21 — Plan File Review Report + +### Added + +- **Every plan file now shows which reviews have run.** After any review skill finishes (`/plan-ceo-review`, `/plan-eng-review`, `/plan-design-review`, `/codex review`), a `## GSTACK REVIEW REPORT` markdown table is written to the active plan file itself (replacing any earlier report; skipped when the session has no plan file) — showing each review's trigger command, purpose, run count, status, and findings summary. Anyone reading the plan can see review status at a glance without checking conversation history. +- **Review logs now capture richer data.** CEO reviews log scope proposal counts (proposed/accepted/deferred), eng reviews log total issues found, design reviews log before→after scores, and codex reviews log how many findings were fixed. The plan file report uses these fields directly — no more guessing from partial metadata. + ## [0.9.6.0] - 2026-03-21 — Auto-Scaled Adversarial Review ### Changed diff --git a/VERSION b/VERSION index c318497f..d9439040 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.9.6.0 +0.9.7.0 diff --git a/codex/SKILL.md b/codex/SKILL.md index 8af53954..86715597 100644 --- a/codex/SKILL.md +++ b/codex/SKILL.md @@ -367,17 +367,85 @@ CROSS-MODEL ANALYSIS: 7. 
Persist the review result: ```bash -~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"codex-review","timestamp":"TIMESTAMP","status":"STATUS","gate":"GATE","findings":N}' +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"codex-review","timestamp":"TIMESTAMP","status":"STATUS","gate":"GATE","findings":N,"findings_fixed":N}' ``` Substitute: TIMESTAMP (ISO 8601), STATUS ("clean" if PASS, "issues_found" if FAIL), -GATE ("pass" or "fail"), findings (count of [P1] + [P2] markers). +GATE ("pass" or "fail"), findings (count of [P1] + [P2] markers), +findings_fixed (count of findings that were addressed/fixed before shipping). 8. Clean up temp files: ```bash rm -f "$TMPERR" ``` +## Plan File Review Report + +After displaying the Review Readiness Dashboard in conversation output, also update the +**plan file** itself so review status is visible to anyone reading the plan. + +### Detect the plan file + +1. Check if there is an active plan file in this conversation (the host provides plan file + paths in system messages — look for plan file references in the conversation context). +2. If not found, skip this section silently — not every review runs in plan mode. + +### Generate the report + +Read the review log output you already have from the Review Readiness Dashboard step above. +Parse each JSONL entry. 
Each skill logs different fields: + +- **plan-ceo-review**: \`status\`, \`unresolved\`, \`critical_gaps\`, \`mode\`, \`scope_proposed\`, \`scope_accepted\`, \`scope_deferred\`, \`commit\` + → Findings: "{scope_proposed} proposals, {scope_accepted} accepted, {scope_deferred} deferred" + → If scope fields are 0 or missing (HOLD/REDUCTION mode): "mode: {mode}, {critical_gaps} critical gaps" +- **plan-eng-review**: \`status\`, \`unresolved\`, \`critical_gaps\`, \`issues_found\`, \`mode\`, \`commit\` + → Findings: "{issues_found} issues, {critical_gaps} critical gaps" +- **plan-design-review**: \`status\`, \`initial_score\`, \`overall_score\`, \`unresolved\`, \`decisions_made\`, \`commit\` + → Findings: "score: {initial_score}/10 → {overall_score}/10, {decisions_made} decisions" +- **codex-review**: \`status\`, \`gate\`, \`findings\`, \`findings_fixed\` + → Findings: "{findings} findings, {findings_fixed}/{findings} fixed" + +All fields needed for the Findings column are now present in the JSONL entries. +For the review you just completed, you may use richer details from your own Completion +Summary. For prior reviews, use the JSONL fields directly — they contain all required data. 
+ +Produce this markdown table: + +\`\`\`markdown +## GSTACK REVIEW REPORT + +| Review | Trigger | Why | Runs | Status | Findings | +|--------|---------|-----|------|--------|----------| +| CEO Review | \`/plan-ceo-review\` | Scope & strategy | {runs} | {status} | {findings} | +| Codex Review | \`/codex review\` | Independent 2nd opinion | {runs} | {status} | {findings} | +| Eng Review | \`/plan-eng-review\` | Architecture & tests (required) | {runs} | {status} | {findings} | +| Design Review | \`/plan-design-review\` | UI/UX gaps | {runs} | {status} | {findings} | +\`\`\` + +Below the table, add these lines (omit any that are empty/not applicable): + +- **CODEX:** (only if codex-review ran) — one-line summary of codex fixes +- **CROSS-MODEL:** (only if both Claude and Codex reviews exist) — overlap analysis +- **UNRESOLVED:** total unresolved decisions across all reviews +- **VERDICT:** list reviews that are CLEAR (e.g., "CEO + ENG CLEARED — ready to implement"). + If Eng Review is not CLEAR and not skipped globally, append "eng review required". + +### Write to the plan file + +**PLAN MODE EXCEPTION — ALWAYS RUN:** This writes to the plan file, which is the one +file you are allowed to edit in plan mode. The plan file review report is part of the +plan's living status. + +- Search the plan file for a \`## GSTACK REVIEW REPORT\` section **anywhere** in the file + (not just at the end — content may have been added after it). +- If found, **replace it** entirely using the Edit tool. Match from \`## GSTACK REVIEW REPORT\` + through either the next \`## \` heading or end of file, whichever comes first. This ensures + content added after the report section is preserved, not eaten. If the Edit fails + (e.g., concurrent edit changed the content), re-read the plan file and retry once. +- If no such section exists, **append it** to the end of the plan file. +- Always place it as the very last section in the plan file. 
If it was found mid-file, + move it: delete the old location and append at the end. + --- ## Step 2B: Challenge (Adversarial) Mode diff --git a/codex/SKILL.md.tmpl b/codex/SKILL.md.tmpl index 30b603ee..0aa7fec6 100644 --- a/codex/SKILL.md.tmpl +++ b/codex/SKILL.md.tmpl @@ -126,17 +126,20 @@ CROSS-MODEL ANALYSIS: 7. Persist the review result: ```bash -~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"codex-review","timestamp":"TIMESTAMP","status":"STATUS","gate":"GATE","findings":N}' +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"codex-review","timestamp":"TIMESTAMP","status":"STATUS","gate":"GATE","findings":N,"findings_fixed":N}' ``` Substitute: TIMESTAMP (ISO 8601), STATUS ("clean" if PASS, "issues_found" if FAIL), -GATE ("pass" or "fail"), findings (count of [P1] + [P2] markers). +GATE ("pass" or "fail"), findings (count of [P1] + [P2] markers), +findings_fixed (count of findings that were addressed/fixed before shipping). 8. Clean up temp files: ```bash rm -f "$TMPERR" ``` +{{PLAN_FILE_REVIEW_REPORT}} + --- ## Step 2B: Challenge (Adversarial) Mode diff --git a/plan-ceo-review/SKILL.md b/plan-ceo-review/SKILL.md index cc99f436..28ba5910 100644 --- a/plan-ceo-review/SKILL.md +++ b/plan-ceo-review/SKILL.md @@ -1065,7 +1065,7 @@ the same pattern. The review dashboard depends on this data. Skipping this command breaks the review readiness dashboard in /ship. 
```bash -~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"plan-ceo-review","timestamp":"TIMESTAMP","status":"STATUS","unresolved":N,"critical_gaps":N,"mode":"MODE","commit":"COMMIT"}' +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"plan-ceo-review","timestamp":"TIMESTAMP","status":"STATUS","unresolved":N,"critical_gaps":N,"mode":"MODE","scope_proposed":N,"scope_accepted":N,"scope_deferred":N,"commit":"COMMIT"}' ``` Before running this command, substitute the placeholder values from the Completion Summary you just produced: @@ -1074,6 +1074,9 @@ Before running this command, substitute the placeholder values from the Completi - **unresolved**: number from "Unresolved decisions" in the summary - **critical_gaps**: number from "Failure modes: ___ CRITICAL GAPS" in the summary - **MODE**: the mode the user selected (SCOPE_EXPANSION / SELECTIVE_EXPANSION / HOLD_SCOPE / SCOPE_REDUCTION) +- **scope_proposed**: number from "Scope proposals: ___ proposed" in the summary (0 for HOLD/REDUCTION) +- **scope_accepted**: number from "Scope proposals: ___ accepted" in the summary (0 for HOLD/REDUCTION) +- **scope_deferred**: number of items deferred to TODOS.md from scope decisions (0 for HOLD/REDUCTION) - **COMMIT**: output of `git rev-parse --short HEAD` ## Review Readiness Dashboard @@ -1119,6 +1122,73 @@ Parse the output. Find the most recent entry for each skill (plan-ceo-review, pl - For entries without a \`commit\` field (legacy entries): display "Note: {skill} review from {date} has no commit tracking — consider re-running for accurate staleness detection" - If all reviews match the current HEAD, do not display any staleness notes +## Plan File Review Report + +After displaying the Review Readiness Dashboard in conversation output, also update the +**plan file** itself so review status is visible to anyone reading the plan. + +### Detect the plan file + +1. 
Check if there is an active plan file in this conversation (the host provides plan file + paths in system messages — look for plan file references in the conversation context). +2. If not found, skip this section silently — not every review runs in plan mode. + +### Generate the report + +Read the review log output you already have from the Review Readiness Dashboard step above. +Parse each JSONL entry. Each skill logs different fields: + +- **plan-ceo-review**: \`status\`, \`unresolved\`, \`critical_gaps\`, \`mode\`, \`scope_proposed\`, \`scope_accepted\`, \`scope_deferred\`, \`commit\` + → Findings: "{scope_proposed} proposals, {scope_accepted} accepted, {scope_deferred} deferred" + → If scope fields are 0 or missing (HOLD/REDUCTION mode): "mode: {mode}, {critical_gaps} critical gaps" +- **plan-eng-review**: \`status\`, \`unresolved\`, \`critical_gaps\`, \`issues_found\`, \`mode\`, \`commit\` + → Findings: "{issues_found} issues, {critical_gaps} critical gaps" +- **plan-design-review**: \`status\`, \`initial_score\`, \`overall_score\`, \`unresolved\`, \`decisions_made\`, \`commit\` + → Findings: "score: {initial_score}/10 → {overall_score}/10, {decisions_made} decisions" +- **codex-review**: \`status\`, \`gate\`, \`findings\`, \`findings_fixed\` + → Findings: "{findings} findings, {findings_fixed}/{findings} fixed" + +All fields needed for the Findings column are now present in the JSONL entries. +For the review you just completed, you may use richer details from your own Completion +Summary. For prior reviews, use the JSONL fields directly — they contain all required data. 
+ +Produce this markdown table: + +\`\`\`markdown +## GSTACK REVIEW REPORT + +| Review | Trigger | Why | Runs | Status | Findings | +|--------|---------|-----|------|--------|----------| +| CEO Review | \`/plan-ceo-review\` | Scope & strategy | {runs} | {status} | {findings} | +| Codex Review | \`/codex review\` | Independent 2nd opinion | {runs} | {status} | {findings} | +| Eng Review | \`/plan-eng-review\` | Architecture & tests (required) | {runs} | {status} | {findings} | +| Design Review | \`/plan-design-review\` | UI/UX gaps | {runs} | {status} | {findings} | +\`\`\` + +Below the table, add these lines (omit any that are empty/not applicable): + +- **CODEX:** (only if codex-review ran) — one-line summary of codex fixes +- **CROSS-MODEL:** (only if both Claude and Codex reviews exist) — overlap analysis +- **UNRESOLVED:** total unresolved decisions across all reviews +- **VERDICT:** list reviews that are CLEAR (e.g., "CEO + ENG CLEARED — ready to implement"). + If Eng Review is not CLEAR and not skipped globally, append "eng review required". + +### Write to the plan file + +**PLAN MODE EXCEPTION — ALWAYS RUN:** This writes to the plan file, which is the one +file you are allowed to edit in plan mode. The plan file review report is part of the +plan's living status. + +- Search the plan file for a \`## GSTACK REVIEW REPORT\` section **anywhere** in the file + (not just at the end — content may have been added after it). +- If found, **replace it** entirely using the Edit tool. Match from \`## GSTACK REVIEW REPORT\` + through either the next \`## \` heading or end of file, whichever comes first. This ensures + content added after the report section is preserved, not eaten. If the Edit fails + (e.g., concurrent edit changed the content), re-read the plan file and retry once. +- If no such section exists, **append it** to the end of the plan file. +- Always place it as the very last section in the plan file. 
If it was found mid-file, + move it: delete the old location and append at the end. + ## Next Steps — Review Chaining After displaying the Review Readiness Dashboard, recommend the next review(s) based on what this CEO review discovered. Read the dashboard output to see which reviews have already been run and whether they are stale. diff --git a/plan-ceo-review/SKILL.md.tmpl b/plan-ceo-review/SKILL.md.tmpl index 7342e790..6b676a86 100644 --- a/plan-ceo-review/SKILL.md.tmpl +++ b/plan-ceo-review/SKILL.md.tmpl @@ -747,7 +747,7 @@ the same pattern. The review dashboard depends on this data. Skipping this command breaks the review readiness dashboard in /ship. ```bash -~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"plan-ceo-review","timestamp":"TIMESTAMP","status":"STATUS","unresolved":N,"critical_gaps":N,"mode":"MODE","commit":"COMMIT"}' +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"plan-ceo-review","timestamp":"TIMESTAMP","status":"STATUS","unresolved":N,"critical_gaps":N,"mode":"MODE","scope_proposed":N,"scope_accepted":N,"scope_deferred":N,"commit":"COMMIT"}' ``` Before running this command, substitute the placeholder values from the Completion Summary you just produced: @@ -756,10 +756,15 @@ Before running this command, substitute the placeholder values from the Completi - **unresolved**: number from "Unresolved decisions" in the summary - **critical_gaps**: number from "Failure modes: ___ CRITICAL GAPS" in the summary - **MODE**: the mode the user selected (SCOPE_EXPANSION / SELECTIVE_EXPANSION / HOLD_SCOPE / SCOPE_REDUCTION) +- **scope_proposed**: number from "Scope proposals: ___ proposed" in the summary (0 for HOLD/REDUCTION) +- **scope_accepted**: number from "Scope proposals: ___ accepted" in the summary (0 for HOLD/REDUCTION) +- **scope_deferred**: number of items deferred to TODOS.md from scope decisions (0 for HOLD/REDUCTION) - **COMMIT**: output of `git rev-parse --short HEAD` {{REVIEW_DASHBOARD}} +{{PLAN_FILE_REVIEW_REPORT}} + ## 
Next Steps — Review Chaining After displaying the Review Readiness Dashboard, recommend the next review(s) based on what this CEO review discovered. Read the dashboard output to see which reviews have already been run and whether they are stale. diff --git a/plan-design-review/SKILL.md b/plan-design-review/SKILL.md index 79d39438..d7aaa3e8 100644 --- a/plan-design-review/SKILL.md +++ b/plan-design-review/SKILL.md @@ -516,13 +516,14 @@ the same pattern. The review dashboard depends on this data. Skipping this command breaks the review readiness dashboard in /ship. ```bash -~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"plan-design-review","timestamp":"TIMESTAMP","status":"STATUS","overall_score":N,"unresolved":N,"decisions_made":N,"commit":"COMMIT"}' +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"plan-design-review","timestamp":"TIMESTAMP","status":"STATUS","initial_score":N,"overall_score":N,"unresolved":N,"decisions_made":N,"commit":"COMMIT"}' ``` Substitute values from the Completion Summary: - **TIMESTAMP**: current ISO 8601 datetime - **STATUS**: "clean" if overall score 8+ AND 0 unresolved; otherwise "issues_open" -- **overall_score**: final overall design score (0-10) +- **initial_score**: initial overall design score before fixes (0-10) +- **overall_score**: final overall design score after fixes (0-10) - **unresolved**: number of unresolved design decisions - **decisions_made**: number of design decisions added to the plan - **COMMIT**: output of `git rev-parse --short HEAD` @@ -570,6 +571,73 @@ Parse the output. 
Find the most recent entry for each skill (plan-ceo-review, pl - For entries without a \`commit\` field (legacy entries): display "Note: {skill} review from {date} has no commit tracking — consider re-running for accurate staleness detection" - If all reviews match the current HEAD, do not display any staleness notes +## Plan File Review Report + +After displaying the Review Readiness Dashboard in conversation output, also update the +**plan file** itself so review status is visible to anyone reading the plan. + +### Detect the plan file + +1. Check if there is an active plan file in this conversation (the host provides plan file + paths in system messages — look for plan file references in the conversation context). +2. If not found, skip this section silently — not every review runs in plan mode. + +### Generate the report + +Read the review log output you already have from the Review Readiness Dashboard step above. +Parse each JSONL entry. Each skill logs different fields: + +- **plan-ceo-review**: \`status\`, \`unresolved\`, \`critical_gaps\`, \`mode\`, \`scope_proposed\`, \`scope_accepted\`, \`scope_deferred\`, \`commit\` + → Findings: "{scope_proposed} proposals, {scope_accepted} accepted, {scope_deferred} deferred" + → If scope fields are 0 or missing (HOLD/REDUCTION mode): "mode: {mode}, {critical_gaps} critical gaps" +- **plan-eng-review**: \`status\`, \`unresolved\`, \`critical_gaps\`, \`issues_found\`, \`mode\`, \`commit\` + → Findings: "{issues_found} issues, {critical_gaps} critical gaps" +- **plan-design-review**: \`status\`, \`initial_score\`, \`overall_score\`, \`unresolved\`, \`decisions_made\`, \`commit\` + → Findings: "score: {initial_score}/10 → {overall_score}/10, {decisions_made} decisions" +- **codex-review**: \`status\`, \`gate\`, \`findings\`, \`findings_fixed\` + → Findings: "{findings} findings, {findings_fixed}/{findings} fixed" + +All fields needed for the Findings column are now present in the JSONL entries. 
+For the review you just completed, you may use richer details from your own Completion +Summary. For prior reviews, use the JSONL fields directly — they contain all required data. + +Produce this markdown table: + +\`\`\`markdown +## GSTACK REVIEW REPORT + +| Review | Trigger | Why | Runs | Status | Findings | +|--------|---------|-----|------|--------|----------| +| CEO Review | \`/plan-ceo-review\` | Scope & strategy | {runs} | {status} | {findings} | +| Codex Review | \`/codex review\` | Independent 2nd opinion | {runs} | {status} | {findings} | +| Eng Review | \`/plan-eng-review\` | Architecture & tests (required) | {runs} | {status} | {findings} | +| Design Review | \`/plan-design-review\` | UI/UX gaps | {runs} | {status} | {findings} | +\`\`\` + +Below the table, add these lines (omit any that are empty/not applicable): + +- **CODEX:** (only if codex-review ran) — one-line summary of codex fixes +- **CROSS-MODEL:** (only if both Claude and Codex reviews exist) — overlap analysis +- **UNRESOLVED:** total unresolved decisions across all reviews +- **VERDICT:** list reviews that are CLEAR (e.g., "CEO + ENG CLEARED — ready to implement"). + If Eng Review is not CLEAR and not skipped globally, append "eng review required". + +### Write to the plan file + +**PLAN MODE EXCEPTION — ALWAYS RUN:** This writes to the plan file, which is the one +file you are allowed to edit in plan mode. The plan file review report is part of the +plan's living status. + +- Search the plan file for a \`## GSTACK REVIEW REPORT\` section **anywhere** in the file + (not just at the end — content may have been added after it). +- If found, **replace it** entirely using the Edit tool. Match from \`## GSTACK REVIEW REPORT\` + through either the next \`## \` heading or end of file, whichever comes first. This ensures + content added after the report section is preserved, not eaten. If the Edit fails + (e.g., concurrent edit changed the content), re-read the plan file and retry once. 
+- If no such section exists, **append it** to the end of the plan file. +- Always place it as the very last section in the plan file. If it was found mid-file, + move it: delete the old location and append at the end. + ## Next Steps — Review Chaining After displaying the Review Readiness Dashboard, recommend the next review(s) based on what this design review discovered. Read the dashboard output to see which reviews have already been run and whether they are stale. diff --git a/plan-design-review/SKILL.md.tmpl b/plan-design-review/SKILL.md.tmpl index 597ff6a7..46e5b6f1 100644 --- a/plan-design-review/SKILL.md.tmpl +++ b/plan-design-review/SKILL.md.tmpl @@ -275,19 +275,22 @@ the same pattern. The review dashboard depends on this data. Skipping this command breaks the review readiness dashboard in /ship. ```bash -~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"plan-design-review","timestamp":"TIMESTAMP","status":"STATUS","overall_score":N,"unresolved":N,"decisions_made":N,"commit":"COMMIT"}' +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"plan-design-review","timestamp":"TIMESTAMP","status":"STATUS","initial_score":N,"overall_score":N,"unresolved":N,"decisions_made":N,"commit":"COMMIT"}' ``` Substitute values from the Completion Summary: - **TIMESTAMP**: current ISO 8601 datetime - **STATUS**: "clean" if overall score 8+ AND 0 unresolved; otherwise "issues_open" -- **overall_score**: final overall design score (0-10) +- **initial_score**: initial overall design score before fixes (0-10) +- **overall_score**: final overall design score after fixes (0-10) - **unresolved**: number of unresolved design decisions - **decisions_made**: number of design decisions added to the plan - **COMMIT**: output of `git rev-parse --short HEAD` {{REVIEW_DASHBOARD}} +{{PLAN_FILE_REVIEW_REPORT}} + ## Next Steps — Review Chaining After displaying the Review Readiness Dashboard, recommend the next review(s) based on what this design review discovered. 
Read the dashboard output to see which reviews have already been run and whether they are stale. diff --git a/plan-eng-review/SKILL.md b/plan-eng-review/SKILL.md index 2ee0db68..b3f099a0 100644 --- a/plan-eng-review/SKILL.md +++ b/plan-eng-review/SKILL.md @@ -515,7 +515,7 @@ the same pattern. The review dashboard depends on this data. Skipping this command breaks the review readiness dashboard in /ship. ```bash -~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"plan-eng-review","timestamp":"TIMESTAMP","status":"STATUS","unresolved":N,"critical_gaps":N,"mode":"MODE","commit":"COMMIT"}' +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"plan-eng-review","timestamp":"TIMESTAMP","status":"STATUS","unresolved":N,"critical_gaps":N,"issues_found":N,"mode":"MODE","commit":"COMMIT"}' ``` Substitute values from the Completion Summary: @@ -523,6 +523,7 @@ Substitute values from the Completion Summary: - **STATUS**: "clean" if 0 unresolved decisions AND 0 critical gaps; otherwise "issues_open" - **unresolved**: number from "Unresolved decisions" count - **critical_gaps**: number from "Failure modes: ___ critical gaps flagged" +- **issues_found**: total issues found across all review sections (Architecture + Code Quality + Performance + Test gaps) - **MODE**: FULL_REVIEW / SCOPE_REDUCED - **COMMIT**: output of `git rev-parse --short HEAD` @@ -569,6 +570,73 @@ Parse the output. Find the most recent entry for each skill (plan-ceo-review, pl - For entries without a \`commit\` field (legacy entries): display "Note: {skill} review from {date} has no commit tracking — consider re-running for accurate staleness detection" - If all reviews match the current HEAD, do not display any staleness notes +## Plan File Review Report + +After displaying the Review Readiness Dashboard in conversation output, also update the +**plan file** itself so review status is visible to anyone reading the plan. + +### Detect the plan file + +1. 
Check if there is an active plan file in this conversation (the host provides plan file + paths in system messages — look for plan file references in the conversation context). +2. If not found, skip this section silently — not every review runs in plan mode. + +### Generate the report + +Read the review log output you already have from the Review Readiness Dashboard step above. +Parse each JSONL entry. Each skill logs different fields: + +- **plan-ceo-review**: \`status\`, \`unresolved\`, \`critical_gaps\`, \`mode\`, \`scope_proposed\`, \`scope_accepted\`, \`scope_deferred\`, \`commit\` + → Findings: "{scope_proposed} proposals, {scope_accepted} accepted, {scope_deferred} deferred" + → If scope fields are 0 or missing (HOLD/REDUCTION mode): "mode: {mode}, {critical_gaps} critical gaps" +- **plan-eng-review**: \`status\`, \`unresolved\`, \`critical_gaps\`, \`issues_found\`, \`mode\`, \`commit\` + → Findings: "{issues_found} issues, {critical_gaps} critical gaps" +- **plan-design-review**: \`status\`, \`initial_score\`, \`overall_score\`, \`unresolved\`, \`decisions_made\`, \`commit\` + → Findings: "score: {initial_score}/10 → {overall_score}/10, {decisions_made} decisions" +- **codex-review**: \`status\`, \`gate\`, \`findings\`, \`findings_fixed\` + → Findings: "{findings} findings, {findings_fixed}/{findings} fixed" + +All fields needed for the Findings column are now present in the JSONL entries. +For the review you just completed, you may use richer details from your own Completion +Summary. For prior reviews, use the JSONL fields directly — they contain all required data. 
+ +Produce this markdown table: + +\`\`\`markdown +## GSTACK REVIEW REPORT + +| Review | Trigger | Why | Runs | Status | Findings | +|--------|---------|-----|------|--------|----------| +| CEO Review | \`/plan-ceo-review\` | Scope & strategy | {runs} | {status} | {findings} | +| Codex Review | \`/codex review\` | Independent 2nd opinion | {runs} | {status} | {findings} | +| Eng Review | \`/plan-eng-review\` | Architecture & tests (required) | {runs} | {status} | {findings} | +| Design Review | \`/plan-design-review\` | UI/UX gaps | {runs} | {status} | {findings} | +\`\`\` + +Below the table, add these lines (omit any that are empty/not applicable): + +- **CODEX:** (only if codex-review ran) — one-line summary of codex fixes +- **CROSS-MODEL:** (only if both Claude and Codex reviews exist) — overlap analysis +- **UNRESOLVED:** total unresolved decisions across all reviews +- **VERDICT:** list reviews that are CLEAR (e.g., "CEO + ENG CLEARED — ready to implement"). + If Eng Review is not CLEAR and not skipped globally, append "eng review required". + +### Write to the plan file + +**PLAN MODE EXCEPTION — ALWAYS RUN:** This writes to the plan file, which is the one +file you are allowed to edit in plan mode. The plan file review report is part of the +plan's living status. + +- Search the plan file for a \`## GSTACK REVIEW REPORT\` section **anywhere** in the file + (not just at the end — content may have been added after it). +- If found, **replace it** entirely using the Edit tool. Match from \`## GSTACK REVIEW REPORT\` + through either the next \`## \` heading or end of file, whichever comes first. This ensures + content added after the report section is preserved, not eaten. If the Edit fails + (e.g., concurrent edit changed the content), re-read the plan file and retry once. +- If no such section exists, **append it** to the end of the plan file. +- Always place it as the very last section in the plan file. 
If it was found mid-file, + move it: delete the old location and append at the end. + ## Next Steps — Review Chaining After displaying the Review Readiness Dashboard, check if additional reviews would be valuable. Read the dashboard output to see which reviews have already been run and whether they are stale. diff --git a/plan-eng-review/SKILL.md.tmpl b/plan-eng-review/SKILL.md.tmpl index bc254d4f..f48bdd49 100644 --- a/plan-eng-review/SKILL.md.tmpl +++ b/plan-eng-review/SKILL.md.tmpl @@ -274,7 +274,7 @@ the same pattern. The review dashboard depends on this data. Skipping this command breaks the review readiness dashboard in /ship. ```bash -~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"plan-eng-review","timestamp":"TIMESTAMP","status":"STATUS","unresolved":N,"critical_gaps":N,"mode":"MODE","commit":"COMMIT"}' +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"plan-eng-review","timestamp":"TIMESTAMP","status":"STATUS","unresolved":N,"critical_gaps":N,"issues_found":N,"mode":"MODE","commit":"COMMIT"}' ``` Substitute values from the Completion Summary: @@ -282,11 +282,14 @@ Substitute values from the Completion Summary: - **STATUS**: "clean" if 0 unresolved decisions AND 0 critical gaps; otherwise "issues_open" - **unresolved**: number from "Unresolved decisions" count - **critical_gaps**: number from "Failure modes: ___ critical gaps flagged" +- **issues_found**: total issues found across all review sections (Architecture + Code Quality + Performance + Test gaps) - **MODE**: FULL_REVIEW / SCOPE_REDUCED - **COMMIT**: output of `git rev-parse --short HEAD` {{REVIEW_DASHBOARD}} +{{PLAN_FILE_REVIEW_REPORT}} + ## Next Steps — Review Chaining After displaying the Review Readiness Dashboard, check if additional reviews would be valuable. Read the dashboard output to see which reviews have already been run and whether they are stale. 
diff --git a/scripts/gen-skill-docs.ts b/scripts/gen-skill-docs.ts index 2d862383..c347e69a 100644 --- a/scripts/gen-skill-docs.ts +++ b/scripts/gen-skill-docs.ts @@ -1130,6 +1130,75 @@ Parse the output. Find the most recent entry for each skill (plan-ceo-review, pl - If all reviews match the current HEAD, do not display any staleness notes`; } +/** Resolver registered as PLAN_FILE_REVIEW_REPORT in RESOLVERS. Returns a fixed markdown section (no interpolation; _ctx is unused) telling the agent to detect the active plan file, build a GSTACK REVIEW REPORT table from the review-log JSONL entries, and write or replace that section in the plan file. */ function generatePlanFileReviewReport(_ctx: TemplateContext): string { + return `## Plan File Review Report + +After displaying the Review Readiness Dashboard in conversation output, also update the +**plan file** itself so review status is visible to anyone reading the plan. + +### Detect the plan file + +1. Check if there is an active plan file in this conversation (the host provides plan file + paths in system messages — look for plan file references in the conversation context). +2. If not found, skip this section silently — not every review runs in plan mode. + +### Generate the report + +Read the review log output you already have from the Review Readiness Dashboard step above. +Parse each JSONL entry. 
Each skill logs different fields: + +- **plan-ceo-review**: \\\`status\\\`, \\\`unresolved\\\`, \\\`critical_gaps\\\`, \\\`mode\\\`, \\\`scope_proposed\\\`, \\\`scope_accepted\\\`, \\\`scope_deferred\\\`, \\\`commit\\\` + → Findings: "{scope_proposed} proposals, {scope_accepted} accepted, {scope_deferred} deferred" + → If scope fields are 0 or missing (HOLD/REDUCTION mode): "mode: {mode}, {critical_gaps} critical gaps" +- **plan-eng-review**: \\\`status\\\`, \\\`unresolved\\\`, \\\`critical_gaps\\\`, \\\`issues_found\\\`, \\\`mode\\\`, \\\`commit\\\` + → Findings: "{issues_found} issues, {critical_gaps} critical gaps" +- **plan-design-review**: \\\`status\\\`, \\\`initial_score\\\`, \\\`overall_score\\\`, \\\`unresolved\\\`, \\\`decisions_made\\\`, \\\`commit\\\` + → Findings: "score: {initial_score}/10 → {overall_score}/10, {decisions_made} decisions" +- **codex-review**: \\\`status\\\`, \\\`gate\\\`, \\\`findings\\\`, \\\`findings_fixed\\\` + → Findings: "{findings} findings, {findings_fixed}/{findings} fixed" + +All fields needed for the Findings column are now present in the JSONL entries. +For the review you just completed, you may use richer details from your own Completion +Summary. For prior reviews, use the JSONL fields directly — they contain all required data. 
+ +Produce this markdown table: + +\\\`\\\`\\\`markdown +## GSTACK REVIEW REPORT + +| Review | Trigger | Why | Runs | Status | Findings | +|--------|---------|-----|------|--------|----------| +| CEO Review | \\\`/plan-ceo-review\\\` | Scope & strategy | {runs} | {status} | {findings} | +| Codex Review | \\\`/codex review\\\` | Independent 2nd opinion | {runs} | {status} | {findings} | +| Eng Review | \\\`/plan-eng-review\\\` | Architecture & tests (required) | {runs} | {status} | {findings} | +| Design Review | \\\`/plan-design-review\\\` | UI/UX gaps | {runs} | {status} | {findings} | +\\\`\\\`\\\` + +Below the table, add these lines (omit any that are empty/not applicable): + +- **CODEX:** (only if codex-review ran) — one-line summary of codex fixes +- **CROSS-MODEL:** (only if both Claude and Codex reviews exist) — overlap analysis +- **UNRESOLVED:** total unresolved decisions across all reviews +- **VERDICT:** list reviews that are CLEAR (e.g., "CEO + ENG CLEARED — ready to implement"). + If Eng Review is not CLEAR and not skipped globally, append "eng review required". + +### Write to the plan file + +**PLAN MODE EXCEPTION — ALWAYS RUN:** This writes to the plan file, which is the one +file you are allowed to edit in plan mode. The plan file review report is part of the +plan's living status. + +- Search the plan file for a \\\`## GSTACK REVIEW REPORT\\\` section **anywhere** in the file + (not just at the end — content may have been added after it). +- If found, **replace it** entirely using the Edit tool. Match from \\\`## GSTACK REVIEW REPORT\\\` + through either the next \\\`## \\\` heading or end of file, whichever comes first. This ensures + content added after the report section is preserved, not eaten. If the Edit fails + (e.g., concurrent edit changed the content), re-read the plan file and retry once. +- If no such section exists, **append it** to the end of the plan file. +- Always place it as the very last section in the plan file. 
If it was found mid-file, + move it: delete the old location and append at the end.`; +} + function generateTestBootstrap(_ctx: TemplateContext): string { return `## Test Framework Bootstrap @@ -1589,6 +1658,7 @@ const RESOLVERS: Record string> = { DESIGN_METHODOLOGY: generateDesignMethodology, DESIGN_REVIEW_LITE: generateDesignReviewLite, REVIEW_DASHBOARD: generateReviewDashboard, + PLAN_FILE_REVIEW_REPORT: generatePlanFileReviewReport, TEST_BOOTSTRAP: generateTestBootstrap, SPEC_REVIEW_LOOP: generateSpecReviewLoop, DESIGN_SKETCH: generateDesignSketch, diff --git a/test/gen-skill-docs.test.ts b/test/gen-skill-docs.test.ts index 64b39118..cc75da65 100644 --- a/test/gen-skill-docs.test.ts +++ b/test/gen-skill-docs.test.ts @@ -416,6 +416,30 @@ describe('REVIEW_DASHBOARD resolver', () => { }); }); +// --- {{PLAN_FILE_REVIEW_REPORT}} resolver tests --- + +describe('PLAN_FILE_REVIEW_REPORT resolver', () => { + /* Skill directory names; each one's generated SKILL.md under ROOT must contain the report heading text. */ const REVIEW_SKILLS = ['plan-ceo-review', 'plan-eng-review', 'plan-design-review', 'codex']; + + for (const skill of REVIEW_SKILLS) { + test(`plan file review report appears in ${skill} generated file`, () => { + const content = fs.readFileSync(path.join(ROOT, skill, 'SKILL.md'), 'utf-8'); + expect(content).toContain('GSTACK REVIEW REPORT'); + }); + } + + /* Checks plan-ceo-review/SKILL.md only: the Trigger and Findings column names, the VERDICT line, and all four trigger commands. */ test('resolver output contains key report elements', () => { + const content = fs.readFileSync(path.join(ROOT, 'plan-ceo-review', 'SKILL.md'), 'utf-8'); + expect(content).toContain('Trigger'); + expect(content).toContain('Findings'); + expect(content).toContain('VERDICT'); + expect(content).toContain('/plan-ceo-review'); + expect(content).toContain('/plan-eng-review'); + expect(content).toContain('/plan-design-review'); + expect(content).toContain('/codex review'); + }); +}); + // --- {{SPEC_REVIEW_LOOP}} resolver tests --- describe('SPEC_REVIEW_LOOP resolver', () => {