diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts
index c42a03c7..c03974f7 100644
--- a/test/helpers/touchfiles.ts
+++ b/test/helpers/touchfiles.ts
@@ -141,6 +141,10 @@ export const E2E_TOUCHFILES: Record = {
   'benchmark-workflow': ['benchmark/**', 'browse/src/**'],
   'setup-deploy-workflow': ['setup-deploy/**', 'scripts/gen-skill-docs.ts'],
 
+  // Sidebar agent
+  'sidebar-navigate': ['browse/src/server.ts', 'browse/src/sidebar-agent.ts', 'browse/src/sidebar-utils.ts', 'extension/**'],
+  'sidebar-url-accuracy': ['browse/src/server.ts', 'browse/src/sidebar-agent.ts', 'browse/src/sidebar-utils.ts', 'extension/background.js'],
+
   // Autoplan
   'autoplan-core': ['autoplan/**', 'plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**'],
 
@@ -262,6 +266,10 @@ export const E2E_TIERS: Record = {
   'benchmark-workflow': 'gate',
   'setup-deploy-workflow': 'gate',
 
+  // Sidebar agent
+  'sidebar-navigate': 'periodic',
+  'sidebar-url-accuracy': 'periodic',
+
   // Autoplan — periodic (not yet implemented)
   'autoplan-core': 'periodic',
 
diff --git a/test/skill-e2e-sidebar.test.ts b/test/skill-e2e-sidebar.test.ts
new file mode 100644
index 00000000..e986bd85
--- /dev/null
+++ b/test/skill-e2e-sidebar.test.ts
@@ -0,0 +1,272 @@
+/**
+ * Layer 4: E2E tests for the sidebar agent with real Claude.
+ * Starts browse server + fixture server + sidebar-agent, POSTs to /sidebar-command
+ * (simulating what the Chrome extension does), and verifies Claude actually processes
+ * the request and responds through the chat buffer.
+ *
+ * Both tests share one server, chat session and queue file, so their bodies are
+ * serialized through `runExclusive` even though they are registered as concurrent.
+ *
+ * These tests cost ~$0.80 total and run as periodic tier.
+ */
+
+import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
+import { spawn, type Subprocess } from 'bun';
+import * as fs from 'fs';
+import * as os from 'os';
+import * as path from 'path';
+import {
+  ROOT, evalsEnabled,
+  describeIfSelected, testConcurrentIfSelected,
+  logCost, recordE2E,
+  createEvalCollector, finalizeEvalCollector,
+} from './helpers/e2e-helpers';
+import { startTestServer } from '../browse/test/test-server';
+
+const evalCollector = createEvalCollector('e2e-sidebar');
+
+/** Resolves after `ms` milliseconds. */
+const sleep = (ms: number): Promise<void> => new Promise((resolve) => setTimeout(resolve, ms));
+
+// --- Sidebar Agent E2E ---
+
+describeIfSelected('Sidebar agent E2E', ['sidebar-navigate', 'sidebar-url-accuracy'], () => {
+  let serverProc: Subprocess | null = null;
+  let agentProc: Subprocess | null = null;
+  let fixtureServer: ReturnType<typeof startTestServer> | null = null;
+  let serverPort = 0;
+  let authToken = '';
+  let tmpDir = '';
+  let stateFile = '';
+  let queueFile = '';
+
+  // Tail of the chain of test bodies that have been scheduled so far.
+  let exclusive: Promise<void> = Promise.resolve();
+
+  /**
+   * Runs `body` only after every previously scheduled body has settled.
+   * The tests reset the shared session/queue and one of them kills the agent,
+   * so overlapping runs would corrupt each other.
+   */
+  function runExclusive(body: () => Promise<void>): Promise<void> {
+    const run = exclusive.then(body);
+    // A failed body must not block the ones queued behind it.
+    exclusive = run.catch(() => {});
+    return run;
+  }
+
+  /** Calls the browse server, adding JSON and bearer-token headers unless overridden. */
+  async function api(pathname: string, opts: RequestInit = {}): Promise<Response> {
+    const headers: Record<string, string> = {
+      'Content-Type': 'application/json',
+      ...((opts.headers as Record<string, string> | undefined) ?? {}),
+    };
+    if (!headers['Authorization'] && authToken) {
+      headers['Authorization'] = `Bearer ${authToken}`;
+    }
+    return fetch(`http://127.0.0.1:${serverPort}${pathname}`, { ...opts, headers });
+  }
+
+  /** Starts a fresh sidebar session and truncates the queue file. */
+  async function resetState(): Promise<void> {
+    await api('/sidebar-session/new', { method: 'POST' });
+    fs.writeFileSync(queueFile, '');
+  }
+
+  /** Fetches the whole chat buffer; a response without `entries` yields []. */
+  async function readChat(): Promise<any[]> {
+    const resp = await api('/sidebar-chat?after=0');
+    const data = await resp.json();
+    return Array.isArray(data?.entries) ? data.entries : [];
+  }
+
+  /**
+   * Polls the chat buffer every 2s until `predicate` accepts it or `timeoutMs`
+   * elapses. On timeout the latest entries are returned so the caller's
+   * assertions decide the outcome.
+   */
+  async function pollChatUntil(
+    predicate: (entries: any[]) => boolean,
+    timeoutMs = 60000,
+  ): Promise<any[]> {
+    const deadline = Date.now() + timeoutMs;
+    while (Date.now() < deadline) {
+      const entries = await readChat();
+      if (predicate(entries)) return entries;
+      await sleep(2000);
+    }
+    return readChat();
+  }
+
+  /**
+   * Returns the non-empty lines of the queue file, waiting up to `timeoutMs`
+   * for the first one instead of relying on a fixed delay.
+   */
+  async function readQueueLines(timeoutMs = 5000): Promise<string[]> {
+    const deadline = Date.now() + timeoutMs;
+    for (;;) {
+      const lines = fs.readFileSync(queueFile, 'utf-8').trim().split('\n').filter(Boolean);
+      if (lines.length > 0 || Date.now() >= deadline) return lines;
+      await sleep(50);
+    }
+  }
+
+  beforeAll(async () => {
+    tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'sidebar-e2e-'));
+    stateFile = path.join(tmpDir, 'browse.json');
+    queueFile = path.join(tmpDir, 'sidebar-queue.jsonl');
+    fs.mkdirSync(path.dirname(queueFile), { recursive: true });
+
+    // Start fixture server for test pages
+    fixtureServer = startTestServer(0);
+
+    // Start browse server (no browser — sidebar agent uses `browse` commands
+    // which will fail without a browser, but we're testing the message flow)
+    const serverScript = path.resolve(ROOT, 'browse', 'src', 'server.ts');
+    serverProc = spawn(['bun', 'run', serverScript], {
+      env: {
+        ...process.env,
+        BROWSE_STATE_FILE: stateFile,
+        BROWSE_HEADLESS_SKIP: '1',
+        BROWSE_PORT: '0',
+        SIDEBAR_QUEUE_PATH: queueFile,
+        BROWSE_IDLE_TIMEOUT: '300',
+      },
+      stdio: ['ignore', 'pipe', 'pipe'],
+    });
+
+    // Wait for the server to publish its port and token in the state file
+    const deadline = Date.now() + 15000;
+    while (Date.now() < deadline) {
+      if (fs.existsSync(stateFile)) {
+        try {
+          const state = JSON.parse(fs.readFileSync(stateFile, 'utf-8'));
+          if (state.port && state.token) {
+            serverPort = state.port;
+            authToken = state.token;
+            break;
+          }
+        } catch {
+          // State file may be partially written; retry on the next tick
+        }
+      }
+      await sleep(100);
+    }
+    if (!serverPort) throw new Error('Server did not start in time');
+
+    // Start sidebar-agent with real claude
+    const agentScript = path.resolve(ROOT, 'browse', 'src', 'sidebar-agent.ts');
+    const browseBin = path.resolve(ROOT, 'browse', 'dist', 'browse');
+    agentProc = spawn(['bun', 'run', agentScript], {
+      env: {
+        ...process.env,
+        BROWSE_SERVER_PORT: String(serverPort),
+        BROWSE_STATE_FILE: stateFile,
+        SIDEBAR_QUEUE_PATH: queueFile,
+        SIDEBAR_AGENT_TIMEOUT: '120000',
+        BROWSE_BIN: fs.existsSync(browseBin) ? browseBin : 'browse',
+      },
+      stdio: ['ignore', 'pipe', 'pipe'],
+    });
+
+    // Give the agent a moment to boot before the first command is queued
+    await sleep(1500);
+  }, 25000);
+
+  afterAll(async () => {
+    if (agentProc) { try { agentProc.kill(); } catch {} }
+    if (serverProc) { try { serverProc.kill(); } catch {} }
+    if (fixtureServer) { try { fixtureServer.server.stop(); } catch {} }
+    try {
+      // Awaited in case the collector flushes asynchronously
+      await finalizeEvalCollector(evalCollector);
+    } finally {
+      if (tmpDir) {
+        try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch {}
+      }
+    }
+  });
+
+  testConcurrentIfSelected('sidebar-navigate', () => runExclusive(async () => {
+    await resetState();
+    const startTime = Date.now();
+
+    // Ask Claude to describe the page at the fixture URL
+    const fixtureUrl = `${fixtureServer!.url}/basic.html`;
+    const resp = await api('/sidebar-command', {
+      method: 'POST',
+      body: JSON.stringify({
+        message: `What is the title of the page at ${fixtureUrl}? Just tell me the title text, nothing else.`,
+        activeTabUrl: fixtureUrl,
+      }),
+    });
+    expect(resp.status).toBe(200);
+
+    // Wait for Claude to finish (agent_done)
+    const entries = await pollChatUntil(
+      (chat) => chat.some((e: any) => e.type === 'agent_done'),
+      90000,
+    );
+
+    const duration = Date.now() - startTime;
+    const doneEntry = entries.find((e: any) => e.type === 'agent_done');
+
+    // Claude should have responded with something about the page
+    const agentEntries = entries.filter((e: any) => e.role === 'agent' && (e.type === 'text' || e.type === 'result'));
+
+    // Check that Claude mentioned the page title or content
+    const allText = agentEntries.map((e: any) => e.text || '').join(' ').toLowerCase();
+    const mentionsPage = allText.includes('test page') || allText.includes('basic') || allText.includes('hello');
+
+    // Record before asserting so a timed-out run is still captured as 'timeout'
+    recordE2E(evalCollector, 'sidebar-navigate', 'Sidebar agent E2E', {
+      exitReason: doneEntry ? 'success' : 'timeout',
+      durationMs: duration,
+      toolCalls: entries.filter((e: any) => e.type === 'tool_use').length,
+      cost: 0, // we can't easily measure cost from chat entries
+    } as any);
+
+    expect(doneEntry).toBeDefined();
+    expect(agentEntries.length).toBeGreaterThan(0);
+    expect(mentionsPage).toBe(true);
+  }), 150_000); // 120s for this test + headroom for a queued sibling
+
+  testConcurrentIfSelected('sidebar-url-accuracy', () => runExclusive(async () => {
+    await resetState();
+    const startTime = Date.now();
+
+    try {
+      // POST with an activeTabUrl that differs from any Playwright URL
+      const fakeExtensionUrl = `${fixtureServer!.url}/forms.html`;
+      const resp = await api('/sidebar-command', {
+        method: 'POST',
+        body: JSON.stringify({
+          message: 'What URL am I on?',
+          activeTabUrl: fakeExtensionUrl,
+        }),
+      });
+      expect(resp.status).toBe(200);
+
+      // Verify the queue entry has the extension URL, not the Playwright URL
+      const lines = await readQueueLines();
+      expect(lines.length).toBeGreaterThan(0);
+      const lastEntry = JSON.parse(lines[lines.length - 1]);
+
+      // The prompt should contain the extension URL
+      expect(lastEntry.pageUrl).toBe(fakeExtensionUrl);
+      expect(lastEntry.prompt).toContain(fakeExtensionUrl);
+      // Should NOT contain 'about:blank' (the no-browser fallback)
+      expect(lastEntry.pageUrl).not.toBe('about:blank');
+
+      recordE2E(evalCollector, 'sidebar-url-accuracy', 'Sidebar agent E2E', {
+        exitReason: 'success',
+        durationMs: Date.now() - startTime,
+        toolCalls: 0,
+        cost: 0,
+      } as any);
+    } finally {
+      // Kill the agent even when an assertion failed so it doesn't keep running
+      await api('/sidebar-agent/kill', { method: 'POST' }).catch(() => {});
+    }
+  }), 150_000); // must outlast a sidebar-navigate run queued ahead of it
+});