From ab4eb3558a1306100ce755cf053f536cb83456a3 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Wed, 13 May 2026 12:01:55 -0700 Subject: [PATCH] test(gbrain): periodic E2E for /setup-gbrain Path 4 + Step 4.5 Yes flow End-to-end coverage of the new opt-in question via runAgentSdkTest. Stubs the MCP endpoint at /tools/list with a 200 response carrying a fake gbrain v0.32.3.0 serverInfo, and fakes the gbrain + claude CLIs so init writes a PGLite config and mcp add succeeds. Asserts the model: 1. invokes gstack-gbrain-install (Step 4.5 Yes branch) 2. invokes `gbrain init --pglite --json` 3. writes a working ~/.gbrain/config.json with engine=pglite 4. registers the remote MCP via `claude mcp add --transport http` 5. never leaks the bearer token to CLAUDE.md Classified as periodic-tier per plan D6 (codex #12 flagged AgentSDK flakiness; gate-tier coverage of the split-engine behavior lives in the deterministic unit tests at gbrain-local-status.test.ts and gbrain-sync-skip.test.ts). Touchfile fires the test when the skill template, install/verify/init helpers, the local-status classifier, or the agent-sdk-runner harness changes. Co-Authored-By: Claude Opus 4.7 (1M context) --- test/helpers/touchfiles.ts | 6 + ...2e-setup-gbrain-path4-local-pglite.test.ts | 263 ++++++++++++++++++ 2 files changed, 269 insertions(+) create mode 100644 test/skill-e2e-setup-gbrain-path4-local-pglite.test.ts diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index 5043884c3..093855c18 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -157,6 +157,11 @@ export const E2E_TOUCHFILES: Record = { // or the detect script changes. 
'setup-gbrain-remote': ['setup-gbrain/SKILL.md.tmpl', 'bin/gstack-gbrain-mcp-verify', 'bin/gstack-artifacts-init', 'bin/gstack-gbrain-detect', 'test/helpers/agent-sdk-runner.ts'], 'setup-gbrain-bad-token': ['setup-gbrain/SKILL.md.tmpl', 'bin/gstack-gbrain-mcp-verify', 'test/helpers/agent-sdk-runner.ts'], + // v1.34.0.0 split-engine Path 4 + Step 4.5 Yes (local PGLite for code). + // Periodic-tier per codex #12 (AgentSDK harness is non-deterministic). + // Fires when the setup-gbrain template, the install/verify/detect helpers, + // the local-status classifier, or the agent-sdk-runner harness changes. + 'setup-gbrain-path4-local-pglite': ['setup-gbrain/SKILL.md.tmpl', 'bin/gstack-gbrain-mcp-verify', 'bin/gstack-gbrain-install', 'bin/gstack-gbrain-detect', 'lib/gbrain-local-status.ts', 'test/helpers/agent-sdk-runner.ts'], // AskUserQuestion format regression (RECOMMENDATION + Completeness: N/10) // Fires when either template OR the two preamble resolvers change. @@ -471,6 +476,7 @@ export const E2E_TIERS: Record = { // model's behavior against a stub MCP server. 'setup-gbrain-remote': 'periodic', 'setup-gbrain-bad-token': 'periodic', + 'setup-gbrain-path4-local-pglite': 'periodic', // AskUserQuestion format regression — periodic (Opus 4.7 non-deterministic benchmark) 'plan-ceo-review-format-mode': 'periodic', diff --git a/test/skill-e2e-setup-gbrain-path4-local-pglite.test.ts b/test/skill-e2e-setup-gbrain-path4-local-pglite.test.ts new file mode 100644 index 000000000..9cf8fed9c --- /dev/null +++ b/test/skill-e2e-setup-gbrain-path4-local-pglite.test.ts @@ -0,0 +1,263 @@ +// E2E: /setup-gbrain Path 4 with Step 4.5 "Yes" — local PGLite for code search. +// +// Drives the skill against a stub HTTP MCP server (200 OK on tools/list). +// Auto-answers AskUserQuestion to pick: +// - Path 4 at Step 2 (Remote gbrain MCP) +// - "Yes, set up local PGLite for code" at Step 4.5 +// +// Asserts that the model: +// 1. ran the verify helper successfully (got past Step 4c; implied, not checked directly) +// 2. 
invoked gstack-gbrain-install (Step 4.5 Yes branch)
+//   3. invoked `gbrain init --pglite --json` (also Step 4.5 Yes branch)
+//   4. registered the remote MCP via claude mcp add --transport http
+//   5. wrote ~/.gbrain/config.json with engine=pglite and kept the bearer token out of CLAUDE.md
+//
+// Periodic-tier (codex #12: AgentSDK harness is non-deterministic; gate-tier
+// coverage of the split-engine behavior lives in the deterministic unit
+// tests at gbrain-local-status.test.ts, gbrain-sync-skip.test.ts, etc).
+//
+// Cost: ~$0.50-$1.00 per run. Periodic-tier (EVALS=1 EVALS_TIER=periodic).
+
+import { describe, test, expect } from 'bun:test';
+import * as fs from 'fs';
+import * as os from 'os';
+import * as path from 'path';
+import * as http from 'http';
+import {
+  runAgentSdkTest,
+  passThroughNonAskUserQuestion,
+  resolveClaudeBinary,
+} from './helpers/agent-sdk-runner';
+
+const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic'; // opt-in: EVALS set AND tier = periodic
+const describeE2E = shouldRun ? describe : describe.skip;
+
+/**
+ * Minimal stub MCP server: always answers 200 with one SSE `message` event (fake serverInfo for `initialize`, else an empty tools list).
+ * Verify helper calls /tools/list with a Bearer header and inspects the body; the stub itself never looks at the path or headers.
+ */
+function startStubMcp(): Promise<{ url: string; close: () => Promise<void> }> {
+  return new Promise((resolve, reject) => {
+    const server = http.createServer((req, res) => {
+      let body = '';
+      req.on('data', (c) => (body += c));
+      req.on('end', () => {
+        res.statusCode = 200;
+        res.setHeader('Content-Type', 'text/event-stream');
+        // Try to be useful: respond with a fake initialize + tools/list payload.
+        let payload: unknown = { jsonrpc: '2.0', id: 1, result: { tools: [] } };
+        try {
+          const rpc = JSON.parse(body) as { id?: unknown; method?: unknown }; // named `rpc` so it no longer shadows the HTTP `req`
+          if (rpc.method === 'initialize') {
+            payload = {
+              jsonrpc: '2.0',
+              id: rpc.id,
+              result: {
+                protocolVersion: '2024-11-05',
+                capabilities: { tools: {} },
+                serverInfo: { name: 'gbrain', version: '0.32.3.0' },
+              },
+            };
+          }
+        } catch {
+          // ignore parse failure; default payload
+        }
+        res.end(`event: message\ndata: ${JSON.stringify(payload)}\n\n`);
+      });
+    });
+    server.listen(0, '127.0.0.1', () => { // port 0: let the OS pick a free port
+      const addr = server.address();
+      if (!addr || typeof addr === 'string') return reject(new Error('no address')); // reject: a throw here would escape the promise
+      resolve({
+        url: `http://127.0.0.1:${addr.port}/mcp`,
+        close: () => new Promise<void>((r) => server.close(() => r())),
+      });
+    });
+  });
+}
+
+/**
+ * Fake gbrain CLI:
+ *   - --version → echoes a version
+ *   - init --pglite --json → writes a pglite config, exits 0
+ *   - everything else → exits 0 quietly
+ *
+ * Logs every invocation so we can assert init was called.
+ */
+function makeFakeGbrain(binDir: string, gbrainConfigPath: string): string {
+  const callLog = path.join(binDir, 'gbrain-calls.log');
+  const script = `#!/bin/bash
+echo "gbrain $@" >> "${callLog}"
+case "$1 $2" in
+  "--version "*) echo "gbrain 0.33.1.0"; exit 0 ;;
+  "init --pglite") cat > "${gbrainConfigPath}" <> "${callLog}"
+case "$1 $2" in
+  "mcp add") exit 0 ;;
+  "mcp list") echo "gbrain: http://stub/mcp (HTTP) — connected" ; exit 0 ;;
+  "mcp remove") exit 0 ;;
+  "mcp get") echo '{"type":"http","url":"http://stub/mcp"}'; exit 0 ;;
+esac
+exit 0
+`;
+  fs.writeFileSync(path.join(binDir, 'claude'), script, { mode: 0o755 });
+  return callLog;
+}
+
+/**
+ * Fake gstack-gbrain-install so we don't actually clone the gbrain repo +
+ * bun-link. The test only cares that the skill INVOKED it on the Yes branch.
+ */
+function makeFakeInstall(binDir: string): string {
+  const callLog = path.join(binDir, 'install-calls.log');
+  const script = `#!/bin/bash
+echo "install $@" >> "${callLog}"
+exit 0
+`;
+  fs.writeFileSync(path.join(binDir, 'gstack-gbrain-install'), script, {
+    mode: 0o755,
+  });
+  return callLog;
+}
+
+describeE2E('/setup-gbrain Path 4 + Step 4.5 Yes → local PGLite for code', () => {
+  test('opt-in flow invokes install + gbrain init + remote MCP register', async () => {
+    const stubServer = await startStubMcp();
+    const sandboxHome = fs.mkdtempSync(path.join(os.tmpdir(), 'path4-pglite-'));
+    const fakeBinDir = fs.mkdtempSync(path.join(os.tmpdir(), 'path4-pglite-bin-'));
+    const gbrainConfigDir = path.join(sandboxHome, '.gbrain');
+    fs.mkdirSync(gbrainConfigDir, { recursive: true });
+    const gbrainConfigPath = path.join(gbrainConfigDir, 'config.json');
+    const claudeLog = makeFakeClaude(fakeBinDir);
+    const gbrainLog = makeFakeGbrain(fakeBinDir, gbrainConfigPath);
+    const installLog = makeFakeInstall(fakeBinDir);
+
+    const ORIGINAL_CLAUDE_MD = '# Test project\n';
+    fs.writeFileSync(path.join(sandboxHome, 'CLAUDE.md'), ORIGINAL_CLAUDE_MD);
+
+    const askLog: Array<{ question: string; choice: string }> = [];
+    const binary = resolveClaudeBinary();
+
+    const orig = { // snapshot of the env vars mutated below; restored in `finally`
+      home: process.env.HOME,
+      pathEnv: process.env.PATH,
+      mcpToken: process.env.GBRAIN_MCP_TOKEN,
+    };
+    process.env.HOME = sandboxHome;
+    process.env.PATH = `${fakeBinDir}:${path.join(path.resolve(import.meta.dir, '..'), 'bin')}:${process.env.PATH ?? '/usr/bin:/bin:/opt/homebrew/bin'}`;
+    process.env.GBRAIN_MCP_TOKEN = 'gbrain_fake_token_for_test';
+
+    try {
+      const skillPath = path.resolve(
+        import.meta.dir,
+        '..',
+        'setup-gbrain',
+        'SKILL.md',
+      );
+      const result = await runAgentSdkTest({
+        systemPrompt: { type: 'preset', preset: 'claude_code' },
+        userPrompt:
+          `Read the skill file at ${skillPath} and follow Path 4 (Remote MCP). ` +
+          `Use this MCP URL: ${stubServer.url}. ` +
+          `The bearer token is already in GBRAIN_MCP_TOKEN. ` +
+          `At Step 4.5 (the new "Want symbol-aware code search?" question), PICK YES — set up local PGLite for code. ` +
+          `Then continue through Step 5a (MCP registration) → Step 10 (verdict). ` +
+          `Do not skip Step 4.5; the test depends on the Yes path being taken.`,
+        workingDirectory: sandboxHome,
+        maxTurns: 25,
+        allowedTools: ['Read', 'Grep', 'Glob', 'Bash', 'Write', 'Edit'],
+        ...(binary ? { pathToClaudeCodeExecutable: binary } : {}),
+        canUseTool: async (toolName, input) => {
+          if (toolName === 'AskUserQuestion') {
+            const qs = input.questions as Array<{
+              question: string;
+              options: Array<{ label: string }>;
+            }>;
+            const answers: Record<string, string> = {};
+            for (const q of qs) {
+              // Heuristic: prefer a "yes/local PGLite/code search/opt in" label, else a "remote MCP/Path 4" label, else the first option.
+              const yes =
+                q.options.find((o) =>
+                  /yes.*local|local.*pglite|code search|opt in/i.test(o.label),
+                ) ??
+                q.options.find((o) => /remote.*mcp|path 4/i.test(o.label)) ??
+                q.options[0]!;
+              answers[q.question] = yes.label;
+              askLog.push({ question: q.question, choice: yes.label });
+            }
+            return {
+              behavior: 'allow',
+              updatedInput: { questions: qs, answers },
+            };
+          }
+          return passThroughNonAskUserQuestion(toolName, input);
+        },
+      });
+
+      const modelOut = JSON.stringify(result); // NOTE(review): not read by any assertion below — confirm whether a verdict check was intended
+
+      // Assertion 1: gstack-gbrain-install was invoked (Step 4.5 Yes branch).
+      const installCalls = fs.existsSync(installLog)
+        ? fs.readFileSync(installLog, 'utf-8')
+        : '';
+      expect(installCalls.length).toBeGreaterThan(0);
+
+      // Assertion 2: `gbrain init --pglite` was invoked.
+      const gbrainCalls = fs.existsSync(gbrainLog)
+        ? fs.readFileSync(gbrainLog, 'utf-8')
+        : '';
+      expect(gbrainCalls).toMatch(/gbrain init --pglite/);
+
+      // Assertion 3: local PGLite config was written.
+      expect(fs.existsSync(gbrainConfigPath)).toBe(true);
+      const cfg = JSON.parse(fs.readFileSync(gbrainConfigPath, 'utf-8')) as {
+        engine: string;
+      };
+      expect(cfg.engine).toBe('pglite');
+
+      // Assertion 4: `claude mcp add` was invoked with --transport http or --header (remote MCP register).
+      const claudeCalls = fs.existsSync(claudeLog)
+        ? fs.readFileSync(claudeLog, 'utf-8')
+        : '';
+      expect(claudeCalls).toMatch(/mcp add.*--transport http|mcp add.*--header/);
+
+      // Assertion 5: token never leaked to CLAUDE.md
+      const finalClaudeMd = fs.readFileSync(
+        path.join(sandboxHome, 'CLAUDE.md'),
+        'utf-8',
+      );
+      expect(finalClaudeMd).not.toContain('gbrain_fake_token_for_test');
+
+      // Sanity check: the AskUserQuestion interceptor above answered at least one question.
+      expect(askLog.length).toBeGreaterThan(0);
+    } finally {
+      if (orig.home === undefined) delete process.env.HOME;
+      else process.env.HOME = orig.home;
+      if (orig.pathEnv === undefined) delete process.env.PATH;
+      else process.env.PATH = orig.pathEnv;
+      if (orig.mcpToken === undefined) delete process.env.GBRAIN_MCP_TOKEN;
+      else process.env.GBRAIN_MCP_TOKEN = orig.mcpToken;
+      await stubServer.close();
+      fs.rmSync(sandboxHome, { recursive: true, force: true });
+      fs.rmSync(fakeBinDir, { recursive: true, force: true });
+    }
+  }, 300_000);
+});