From ee9108887c697cafffc57949c69425963777f2b1 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Mon, 27 Apr 2026 23:50:59 -0700 Subject: [PATCH] test: source-level guards + pure-function unit test + dual-listener behavioral eval MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three layers of regression coverage for the tunnel allowlist: 1. dual-listener.test.ts: replaces must-include/must-exclude with exact-set equality on the 26-command literal (the prior intersection-only style let new commands sneak into the source without test updates). Adds a regex assertion that the `command !== 'newtab'` ownership exemption at server.ts:613 still exists — catches refactors that re-introduce the catch-22 from the other side. Updates the /command handler test to look for canDispatchOverTunnel(body?.command) instead of the inline check. 2. tunnel-gate-unit.test.ts (new): 53 expects covering all 26 allowed, 20 blocked, null/undefined/empty/non-string defensive handling, and alias canonicalization (e.g. 'set-content' resolves to 'load-html' which is correctly rejected since 'load-html' isn't tunnel-allowed). 3. pair-agent-tunnel-eval.test.ts (new): 4 behavioral tests that spawn the daemon under BROWSE_HEADLESS_SKIP=1 BROWSE_TUNNEL_LOCAL_ONLY=1, bind both listeners on 127.0.0.1, mint a scoped token via /pair → /connect, and assert: (a) newtab over tunnel passes the gate; (b) pair over tunnel 403s with disallowed_command:pair AND writes a denial-log entry; (c) pair over local does NOT trigger the tunnel gate (proves the gate is surface-scoped); (d) regression for the catch-22 — newtab + goto on the resulting tab does not 403 with "Tab not owned by your agent". All four tests run free under bun test (no API spend, no ngrok). Co-Authored-By: Claude Opus 4.7 (1M context) --- browse/test/dual-listener.test.ts | 51 +++-- browse/test/pair-agent-tunnel-eval.test.ts | 215 +++++++++++++++++++++ browse/test/tunnel-gate-unit.test.ts | 97 ++++++++++ 3 files changed, 352 insertions(+), 11 deletions(-) create mode 100644 browse/test/pair-agent-tunnel-eval.test.ts create mode 100644 browse/test/tunnel-gate-unit.test.ts diff --git a/browse/test/dual-listener.test.ts b/browse/test/dual-listener.test.ts index c14966bb..47ef0b25 100644 --- a/browse/test/dual-listener.test.ts +++ b/browse/test/dual-listener.test.ts @@ -70,17 +70,37 @@ describe('Tunnel path allowlist', () => { }); describe('Tunnel command allowlist', () => { - test('TUNNEL_COMMANDS is a closed set of browser-driving commands only', () => { + // The full closed set of commands reachable over the tunnel surface. Adding + // or removing a command here means changing the literal in server.ts AND + // updating this list — that double-edit is the point. A single-source + // "include the items in the source" assertion would silently widen the + // surface during a refactor that adds a command to server.ts without test + // review. The exact-set match catches it. + const EXPECTED_TUNNEL_COMMANDS = new Set([ + // Original 17 + 'goto', 'click', 'text', 'screenshot', + 'html', 'links', 'forms', 'accessibility', + 'attrs', 'media', 'data', + 'scroll', 'press', 'type', 'select', 'wait', 'eval', + // Tab + navigation primitives operator docs and CLI hints already promised + 'newtab', 'tabs', 'back', 'forward', 'reload', + // Read/inspect/write operators paired agents need to be useful + 'snapshot', 'fill', 'url', 'closetab', + ]); + + test('TUNNEL_COMMANDS literal matches the closed allowlist exactly (catches additions/removals without test update)', () => { const cmds = extractSetContents(SERVER_SRC, 'TUNNEL_COMMANDS'); - // Must include the core browser-driving commands - const required = [ - 'goto', 'click', 'text', 'screenshot', 'html', 'links', - 'forms', 'accessibility', 'attrs', 'media', 'data', - 'scroll', 'press', 'type', 'select', 'wait', 'eval', - ]; - for (const c of required) { + // Both directions: anything in the source must be expected, and anything + // expected must be in the source. The intersection-only style of the old + // must-include / must-exclude tests let new commands sneak into the source + // without a corresponding test update. + for (const c of cmds) { + expect(EXPECTED_TUNNEL_COMMANDS.has(c)).toBe(true); + } + for (const c of EXPECTED_TUNNEL_COMMANDS) { expect(cmds.has(c)).toBe(true); } + expect(cmds.size).toBe(EXPECTED_TUNNEL_COMMANDS.size); }); test('TUNNEL_COMMANDS does NOT include daemon-configuration or bootstrap commands', () => { @@ -89,12 +109,21 @@ describe('Tunnel command allowlist', () => { 'launch', 'launch-browser', 'connect', 'disconnect', 'restart', 'stop', 'tunnel-start', 'tunnel-stop', 'token-mint', 'token-revoke', 'cookie-picker', 'cookie-import', - 'inspector-pick', + 'inspector-pick', 'pair', 'unpair', 'cookies', 'setup', ]; for (const c of forbidden) { expect(cmds.has(c)).toBe(false); } }); + + test('newtab ownership exemption preserved (catches refactors that re-introduce the catch-22)', () => { + // The /command handler must skip the per-tab ownership check when the + // command is `newtab`, otherwise paired agents have no way to create their + // own tab — every other write command requires an owned tab, and you can't + // own a tab you haven't created. The string `command !== 'newtab'` is the + // contract that breaks the catch-22. + expect(SERVER_SRC).toMatch(/command\s*!==\s*['"]newtab['"]/); + }); }); describe('Request handler factory', () => { @@ -176,14 +205,14 @@ describe('GET /connect alive probe', () => { }); describe('/command tunnel command allowlist', () => { - test('/command handler checks TUNNEL_COMMANDS when surface is tunnel', () => { + test('/command handler delegates to canDispatchOverTunnel when surface is tunnel', () => { const commandBlock = sliceBetween( SERVER_SRC, "url.pathname === '/command' && req.method === 'POST'", 'return handleCommand(body, tokenInfo)' ); expect(commandBlock).toContain("surface === 'tunnel'"); - expect(commandBlock).toContain('TUNNEL_COMMANDS.has'); + expect(commandBlock).toContain('canDispatchOverTunnel(body?.command)'); expect(commandBlock).toContain('disallowed_command'); expect(commandBlock).toContain('is not allowed over the tunnel surface'); expect(commandBlock).toContain('status: 403'); diff --git a/browse/test/pair-agent-tunnel-eval.test.ts b/browse/test/pair-agent-tunnel-eval.test.ts new file mode 100644 index 00000000..ffb43219 --- /dev/null +++ b/browse/test/pair-agent-tunnel-eval.test.ts @@ -0,0 +1,215 @@ +/** + * Tunnel-surface behavioral eval for the pair-agent flow. + * + * Spawns the daemon under `BROWSE_HEADLESS_SKIP=1 BROWSE_TUNNEL_LOCAL_ONLY=1` + * so BOTH listeners come up: the local listener on `port` and the tunnel + * listener on `tunnelLocalPort`. No ngrok, no live network — the surface tag + * (`local` vs `tunnel`) is set by which listener received the request, which + * is testable as long as both bind locally. + * + * This file is the only place that exercises the tunnel-surface gate + * end-to-end. The source-level guards in `dual-listener.test.ts` catch + * literal/exemption regressions, the unit test in `tunnel-gate-unit.test.ts` + * catches gate-logic regressions, and this file catches routing-or-listener + * regressions (e.g. someone accidentally swaps `'local'` and `'tunnel'` at + * the makeFetchHandler call site). + * + * The browser dispatch path under BROWSE_HEADLESS_SKIP=1 surfaces an error + * because there is no Playwright context, so the assertion target is + * specifically that the GATE was passed (i.e. the response is NOT a 403 with + * `disallowed_command:`), not that the dispatch succeeded. + */ + +import { describe, test, expect, beforeAll, afterAll } from 'bun:test'; +import * as fs from 'fs'; +import * as os from 'os'; +import * as path from 'path'; + +const ROOT = path.resolve(import.meta.dir, '../..'); +const SERVER_ENTRY = path.join(ROOT, 'browse/src/server.ts'); + +interface DaemonHandle { + proc: ReturnType; + localPort: number; + tunnelPort: number; + rootToken: string; + scopedToken: string; + stateFile: string; + tempDir: string; + localUrl: string; + tunnelUrl: string; + attemptsLogPath: string; +} + +async function waitForReady(baseUrl: string, timeoutMs = 20_000): Promise { + const deadline = Date.now() + timeoutMs; + while (Date.now() < deadline) { + try { + const resp = await fetch(`${baseUrl}/health`, { + signal: AbortSignal.timeout(1000), + }); + if (resp.ok) return; + } catch { + // not ready yet + } + await new Promise(r => setTimeout(r, 200)); + } + throw new Error(`Daemon did not become ready within ${timeoutMs}ms at ${baseUrl}`); +} + +async function waitForTunnelPort(stateFile: string, timeoutMs = 20_000): Promise { + const deadline = Date.now() + timeoutMs; + while (Date.now() < deadline) { + try { + const state = JSON.parse(fs.readFileSync(stateFile, 'utf-8')); + if (typeof state.tunnelLocalPort === 'number') return state.tunnelLocalPort; + } catch { + // state file not written yet + } + await new Promise(r => setTimeout(r, 200)); + } + throw new Error(`Tunnel local port did not appear in ${stateFile} within ${timeoutMs}ms`); +} + +async function spawnDaemonWithTunnel(): Promise { + // Isolate this test's analytics + denial log directory so we can assert on a + // fresh attempts.jsonl without colliding with the user's real ~/.gstack. + const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pair-agent-tunnel-eval-')); + const stateFile = path.join(tempDir, 'browse.json'); + const fakeHome = path.join(tempDir, 'home'); + fs.mkdirSync(fakeHome, { recursive: true }); + const localPort = 30000 + Math.floor(Math.random() * 30000); + const attemptsLogPath = path.join(fakeHome, '.gstack', 'security', 'attempts.jsonl'); + + const proc = Bun.spawn(['bun', 'run', SERVER_ENTRY], { + cwd: ROOT, + env: { + ...process.env, + HOME: fakeHome, + BROWSE_HEADLESS_SKIP: '1', + BROWSE_TUNNEL_LOCAL_ONLY: '1', + BROWSE_PORT: String(localPort), + BROWSE_STATE_FILE: stateFile, + BROWSE_PARENT_PID: '0', + BROWSE_IDLE_TIMEOUT: '600000', + }, + stdio: ['ignore', 'pipe', 'pipe'], + }); + + const localUrl = `http://127.0.0.1:${localPort}`; + await waitForReady(localUrl); + const tunnelPort = await waitForTunnelPort(stateFile); + const tunnelUrl = `http://127.0.0.1:${tunnelPort}`; + + // Read the root token, then exchange it for a scoped token via /pair → /connect. + const state = JSON.parse(fs.readFileSync(stateFile, 'utf-8')); + const rootToken = state.token; + + const pairResp = await fetch(`${localUrl}/pair`, { + method: 'POST', + headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${rootToken}` }, + body: JSON.stringify({ clientId: 'tunnel-eval' }), + }); + if (!pairResp.ok) throw new Error(`/pair failed: ${pairResp.status}`); + const { setup_key } = await pairResp.json() as any; + + const connectResp = await fetch(`${localUrl}/connect`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ setup_key }), + }); + if (!connectResp.ok) throw new Error(`/connect failed: ${connectResp.status}`); + const { token: scopedToken } = await connectResp.json() as any; + + return { proc, localPort, tunnelPort, rootToken, scopedToken, stateFile, tempDir, localUrl, tunnelUrl, attemptsLogPath }; +} + +function killDaemon(handle: DaemonHandle): void { + try { handle.proc.kill('SIGKILL'); } catch {} + try { fs.rmSync(handle.tempDir, { recursive: true, force: true }); } catch {} +} + +async function postCommand(baseUrl: string, token: string, body: any): Promise<{ status: number; bodyText: string }> { + const resp = await fetch(`${baseUrl}/command`, { + method: 'POST', + headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${token}` }, + body: JSON.stringify(body), + }); + return { status: resp.status, bodyText: await resp.text() }; +} + +describe('pair-agent over tunnel surface — gate fires on the right surface only', () => { + let daemon: DaemonHandle; + + beforeAll(async () => { + daemon = await spawnDaemonWithTunnel(); + }, 30_000); + + afterAll(() => { + if (daemon) killDaemon(daemon); + }); + + test('newtab on tunnel surface passes the allowlist gate (not 403 disallowed_command)', async () => { + const { status, bodyText } = await postCommand(daemon.tunnelUrl, daemon.scopedToken, { command: 'newtab' }); + // Browser dispatch under BROWSE_HEADLESS_SKIP=1 will fail differently + // (no Playwright context), but the gate must NOT 403 with + // disallowed_command. + if (status === 403) { + expect(bodyText).not.toContain('disallowed_command:newtab'); + expect(bodyText).not.toContain('is not allowed over the tunnel surface'); + } + }); + + test('pair on tunnel surface 403s with disallowed_command and writes a denial-log entry', async () => { + // Snapshot attempts.jsonl size before the call so we can detect the new entry. + let beforeBytes = 0; + try { beforeBytes = fs.statSync(daemon.attemptsLogPath).size; } catch {} + + const { status, bodyText } = await postCommand(daemon.tunnelUrl, daemon.scopedToken, { command: 'pair' }); + expect(status).toBe(403); + expect(bodyText).toContain('is not allowed over the tunnel surface'); + + // Wait briefly for the denial-log writer (it's synchronous fs.appendFile in + // tunnel-denial-log.ts but the OS may need a tick to flush). + await new Promise(r => setTimeout(r, 250)); + expect(fs.existsSync(daemon.attemptsLogPath)).toBe(true); + const after = fs.readFileSync(daemon.attemptsLogPath, 'utf-8'); + const newSection = after.slice(beforeBytes); + expect(newSection).toContain('disallowed_command:pair'); + }); + + test('pair on local surface does NOT trigger the tunnel allowlist gate', async () => { + // The same scoped token over the LOCAL listener must not see the + // disallowed_command path — the tunnel gate is surface-scoped. + const { status, bodyText } = await postCommand(daemon.localUrl, daemon.scopedToken, { command: 'pair' }); + // Whatever happens (404 unknown command, 403 from a token-scope check, or + // 200 if the local handler accepts it) the response must NOT come from the + // tunnel allowlist gate. + expect(bodyText).not.toContain('disallowed_command:pair'); + expect(bodyText).not.toContain('is not allowed over the tunnel surface'); + expect([200, 400, 403, 404, 500]).toContain(status); + }); + + test('catch-22 regression: newtab + goto on the just-created tab passes ownership check', async () => { + // Without the `command !== 'newtab'` exemption at server.ts:613, scoped + // agents can't open a tab (newtab fails ownership) and can't goto an + // existing tab (also fails ownership). This proves the exemption holds: + // newtab succeeds the gate AND the ownership check, then the agent can + // hand off the tabId to a follow-up command without hitting the + // "Tab not owned by your agent" error. + const newtabResp = await postCommand(daemon.tunnelUrl, daemon.scopedToken, { command: 'newtab' }); + if (newtabResp.status === 403) { + expect(newtabResp.bodyText).not.toContain('disallowed_command'); + expect(newtabResp.bodyText).not.toContain('Tab not owned by your agent'); + } + + // Even if the headless-skip dispatch fails before returning a tabId, a + // follow-up `goto` over the tunnel surface must not 403 with + // `disallowed_command:goto`. We are NOT asserting that the goto + // succeeds — only that the allowlist + ownership exemption don't reject + // it as a class. + const gotoResp = await postCommand(daemon.tunnelUrl, daemon.scopedToken, { command: 'goto', args: ['http://127.0.0.1:1/'] }); + expect(gotoResp.bodyText).not.toContain('disallowed_command:goto'); + expect(gotoResp.bodyText).not.toContain('is not allowed over the tunnel surface'); + }); +}); diff --git a/browse/test/tunnel-gate-unit.test.ts b/browse/test/tunnel-gate-unit.test.ts new file mode 100644 index 00000000..f6d61c13 --- /dev/null +++ b/browse/test/tunnel-gate-unit.test.ts @@ -0,0 +1,97 @@ +/** + * Unit-test the pure tunnel-gate function extracted from the /command handler. + * + * The gate decides whether a paired remote agent's request to `/command` over + * the tunnel surface is allowed (returns true) or 403'd (returns false). Pure, + * synchronous, no HTTP — testable without standing up a Bun.serve listener. + * + * The behavioral coverage of the gate firing on the right surface (and only + * the right surface) lives in `pair-agent-tunnel-eval.test.ts` (paid eval, + * gate-tier). + */ + +import { describe, test, expect } from 'bun:test'; +import { canDispatchOverTunnel, TUNNEL_COMMANDS } from '../src/server'; + +describe('canDispatchOverTunnel — closed allowlist', () => { + test('every command in TUNNEL_COMMANDS dispatches over tunnel', () => { + for (const cmd of TUNNEL_COMMANDS) { + expect(canDispatchOverTunnel(cmd)).toBe(true); + } + }); + + test('TUNNEL_COMMANDS contains the 26-command closed set', () => { + // Mirror the source-level guard in dual-listener.test.ts. If this ever + // disagrees with the literal in server.ts, one of them is wrong. + const expected = new Set([ + 'goto', 'click', 'text', 'screenshot', + 'html', 'links', 'forms', 'accessibility', + 'attrs', 'media', 'data', + 'scroll', 'press', 'type', 'select', 'wait', 'eval', + 'newtab', 'tabs', 'back', 'forward', 'reload', + 'snapshot', 'fill', 'url', 'closetab', + ]); + expect(TUNNEL_COMMANDS.size).toBe(expected.size); + for (const c of expected) expect(TUNNEL_COMMANDS.has(c)).toBe(true); + for (const c of TUNNEL_COMMANDS) expect(expected.has(c)).toBe(true); + }); +}); + +describe('canDispatchOverTunnel — daemon-config + bootstrap commands rejected', () => { + const blocked = [ + 'pair', 'unpair', 'cookies', 'setup', + 'launch', 'launch-browser', 'connect', 'disconnect', + 'restart', 'stop', 'tunnel-start', 'tunnel-stop', + 'token-mint', 'token-revoke', 'cookie-picker', 'cookie-import', + 'inspector-pick', 'extension-inspect', + 'invalid-command-xyz', 'totally-made-up', + ]; + for (const cmd of blocked) { + test(`rejects '${cmd}'`, () => { + expect(canDispatchOverTunnel(cmd)).toBe(false); + }); + } +}); + +describe('canDispatchOverTunnel — null/undefined/empty input', () => { + test('returns false for empty string', () => { + expect(canDispatchOverTunnel('')).toBe(false); + }); + + test('returns false for undefined', () => { + expect(canDispatchOverTunnel(undefined)).toBe(false); + }); + + test('returns false for null', () => { + expect(canDispatchOverTunnel(null)).toBe(false); + }); + + test('returns false for non-string input (defensive)', () => { + // The body parser may hand the gate a number or object if a malicious + // client sends `{"command": 42}`. The pure gate must treat anything + // non-string as not-allowed rather than throw. + expect(canDispatchOverTunnel(42 as unknown as string)).toBe(false); + expect(canDispatchOverTunnel({} as unknown as string)).toBe(false); + }); +}); + +describe('canDispatchOverTunnel — alias canonicalization', () => { + // canonicalizeCommand resolves aliases (e.g. 'set-content' → 'load-html'). + // Any aliased form of an allowlisted canonical command should also pass the + // gate; aliases that resolve to a non-allowlisted canonical command should + // not. We don't hardcode alias names here — we read from the source registry + // by importing what we need from commands.ts. + test('aliases that resolve to allowlisted commands pass the gate', () => { + // 'set-content' canonicalizes to 'load-html'. 'load-html' is NOT in + // TUNNEL_COMMANDS, so 'set-content' must also be rejected. This guards + // against a future alias that accidentally maps a tunnel-allowed name to + // a non-tunnel-allowed canonical (e.g. 'goto' → 'navigate' would break). + expect(canDispatchOverTunnel('set-content')).toBe(false); + }); + + test('canonical commands pass directly without alias lookup', () => { + expect(canDispatchOverTunnel('goto')).toBe(true); + expect(canDispatchOverTunnel('newtab')).toBe(true); + expect(canDispatchOverTunnel('closetab')).toBe(true); + }); +});