fix(browse): sanitize lone Unicode surrogates at commandResult chokepoint + /batch envelope (#1440)

Page captures with mixed-script Unicode round-trip cleanly to the Claude API. Two new utilities in browse/src/sanitize.ts: stripLoneSurrogates for raw UTF-16 strings, stripLoneSurrogateEscapes for \uXXXX JSON escape text. sanitizeBody picks the right pass based on cr.json. buildCommandResponse is extracted from handleCommand (now exported) and applies sanitization before new Response(). /batch was bypassing this chokepoint via direct JSON.stringify, so it sanitizes each cr.result before pushing AND wraps the envelope with stripLoneSurrogateEscapes. Defense in depth wraps at getCleanText, getCleanTextWithStripping, html, accessibility, and snapshot.ts return points so downstream consumers (datamarking, envelope wrapping) see sanitized text before the response is built. 25 new unit tests across sanitize.test.ts and build-command-response.test.ts. content-security.test.ts updated to accept either pre- or post-sanitize form of the snapshot scoped branch (source-level regression check). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-22 17:49:57 +02:00 · 2026-05-14 13:58:35 -07:00
parent 40e34deb7a
commit bdb6023713
21 changed files with 756 additions and 17 deletions
@@ -12,6 +12,7 @@

 import { randomBytes } from 'crypto';
 import type { Page, Frame } from 'playwright';
+import { stripLoneSurrogates } from './sanitize';

 // ─── Datamarking (Layer 1) ──────────────────────────────────────

@@ -167,7 +168,7 @@ export async function markHiddenElements(page: Page | Frame): Promise<string[]>
 * Uses clone + remove approach: clones body, removes marked elements, returns innerText.
 */
 export async function getCleanTextWithStripping(page: Page | Frame): Promise<string> {
-  return page.evaluate(() => {
+  const raw = await page.evaluate(() => {
    const body = document.body;
    if (!body) return '';
    const clone = body.cloneNode(true) as HTMLElement;
@@ -181,6 +182,7 @@ export async function getCleanTextWithStripping(page: Page | Frame): Promise<str
      .filter(line => line.length > 0)
      .join('\n');
  });
+  return stripLoneSurrogates(raw);
 }

 /**
@@ -14,6 +14,7 @@ import * as path from 'path';
 import { TEMP_DIR } from './platform';
 import { inspectElement, formatInspectorResult, getModificationHistory } from './cdp-inspector';
 import { validateReadPath } from './path-security';
+import { stripLoneSurrogates } from './sanitize';
 // Re-export for backward compatibility (tests import from read-commands)
 export { validateReadPath } from './path-security';

@@ -50,7 +51,7 @@ function wrapForEvaluate(code: string): string {
 * Exported for DRY reuse in meta-commands (diff).
 */
 export async function getCleanText(page: Page | Frame): Promise<string> {
-  return page.evaluate(() => {
+  const raw = await page.evaluate(() => {
    const body = document.body;
    if (!body) return '';
    const clone = body.cloneNode(true) as HTMLElement;
@@ -61,6 +62,7 @@ export async function getCleanText(page: Page | Frame): Promise<string> {
      .filter(line => line.length > 0)
      .join('\n');
  });
+  return stripLoneSurrogates(raw);
 }

 /**
@@ -115,9 +117,9 @@ export async function handleReadCommand(
      if (selector) {
        const resolved = await session.resolveRef(selector);
        if ('locator' in resolved) {
-          return resolved.locator.innerHTML({ timeout: 5000 });
+          return stripLoneSurrogates(await resolved.locator.innerHTML({ timeout: 5000 }));
        }
-        return target.locator(resolved.selector).innerHTML({ timeout: 5000 });
+        return stripLoneSurrogates(await target.locator(resolved.selector).innerHTML({ timeout: 5000 }));
      }
      // page.content() is page-only; use evaluate for frame compat
      const doctype = await target.evaluate(() => {
@@ -125,7 +127,7 @@ export async function handleReadCommand(
        return dt ? `<!DOCTYPE ${dt.name}>` : '';
      });
      const html = await target.evaluate(() => document.documentElement.outerHTML);
-      return doctype ? `${doctype}\n${html}` : html;
+      return stripLoneSurrogates(doctype ? `${doctype}\n${html}` : html);
    }

    case 'links': {
@@ -173,7 +175,7 @@ export async function handleReadCommand(

    case 'accessibility': {
      const snapshot = await target.locator("body").ariaSnapshot();
-      return snapshot;
+      return stripLoneSurrogates(snapshot);
    }

    case 'js': {
@@ -0,0 +1,34 @@
+// Lone Unicode surrogate sanitization.
+//
+// Lone surrogates (\uD800-\uDFFF without a matching pair) are valid UTF-16
+// but invalid UTF-8, so JSON.stringify produces output the Claude API rejects
+// with HTTP 400 "no low surrogate in string". Page captures from real-world
+// HTML hit this when content contains broken emoji bytes or mid-emoji splits.
+//
+// Two sanitizers are needed because both forms appear in browse responses:
+//   - Raw UTF-16 surrogates in text/plain bodies (pre-stringify state).
+//   - JSON \uXXXX escape sequences after JSON.stringify already ran.
+// Both replace lone surrogates with U+FFFD (replacement character).
+
+const LONE_SURROGATE_HIGH = /[\uD800-\uDBFF](?![\uDC00-\uDFFF])/g;
+const LONE_SURROGATE_LOW = /(?<![\uD800-\uDBFF])[\uDC00-\uDFFF]/g;
+
+export function stripLoneSurrogates(s: string): string {
+  return s.replace(LONE_SURROGATE_HIGH, '�').replace(LONE_SURROGATE_LOW, '�');
+}
+
+// Matches \uD8XX-\uDFXX escape text where the pair is not completed by an
+// adjacent \uDC00-\uDFFF (high) or preceded by \uD800-\uDBFF (low).
+const LONE_SURROGATE_HIGH_ESCAPE = /\\u[Dd][89ABab][0-9A-Fa-f]{2}(?!\\u[Dd][C-Fc-f][0-9A-Fa-f]{2})/g;
+const LONE_SURROGATE_LOW_ESCAPE = /(?<!\\u[Dd][89ABab][0-9A-Fa-f]{2})\\u[Dd][C-Fc-f][0-9A-Fa-f]{2}/g;
+
+export function stripLoneSurrogateEscapes(s: string): string {
+  return s.replace(LONE_SURROGATE_HIGH_ESCAPE, '\\uFFFD').replace(LONE_SURROGATE_LOW_ESCAPE, '\\uFFFD');
+}
+
+// Pick the right sanitizer based on whether the body has already been JSON-stringified.
+// For application/json bodies, run both passes: raw first (in case the JSON encoder
+// emitted surrogates as-is rather than escaping), then escape-text.
+export function sanitizeBody(body: string, isJson: boolean): string {
+  return isJson ? stripLoneSurrogateEscapes(stripLoneSurrogates(body)) : stripLoneSurrogates(body);
+}
@@ -42,6 +42,7 @@ import { inspectElement, modifyStyle, resetModifications, getModificationHistory
 // Bun.spawn used instead of child_process.spawn (compiled bun binaries
 // fail posix_spawn on all executables including /bin/bash)
 import { safeUnlink, safeUnlinkQuiet, safeKill } from './error-handling';
+import { sanitizeBody, stripLoneSurrogateEscapes } from './sanitize';
 import { startSocksBridge, testUpstream, type BridgeHandle } from './socks-bridge';
 import { parseProxyConfig, toUpstreamConfig, ProxyConfigError } from './proxy-config';
 import { redactProxyUrl } from './proxy-redact';
@@ -1027,16 +1028,26 @@ async function handleCommandInternal(
  }
 }

-/** HTTP wrapper — converts CommandResult to Response */
-async function handleCommand(body: any, tokenInfo?: TokenInfo | null): Promise<Response> {
-  const cr = await handleCommandInternal(body, tokenInfo);
+/**
+ * Build the HTTP response from a CommandResult. Pure function so it can be
+ * unit-tested without spinning up the server. Sanitizes lone Unicode surrogates
+ * (#1440) before the body crosses the API boundary.
+ */
+export function buildCommandResponse(cr: CommandResult): Response {
  const contentType = cr.json ? 'application/json' : 'text/plain';
-  return new Response(cr.result, {
+  const safeBody = typeof cr.result === 'string' ? sanitizeBody(cr.result, !!cr.json) : cr.result;
+  return new Response(safeBody, {
    status: cr.status,
    headers: { 'Content-Type': contentType, ...cr.headers },
  });
 }

+/** HTTP wrapper — converts CommandResult to Response */
+async function handleCommand(body: any, tokenInfo?: TokenInfo | null): Promise<Response> {
+  const cr = await handleCommandInternal(body, tokenInfo);
+  return buildCommandResponse(cr);
+}
+
 async function shutdown(exitCode: number = 0) {
  if (isShuttingDown) return;
  isShuttingDown = true;
@@ -1960,10 +1971,13 @@ export async function start() {
            tokenInfo,
            { skipRateCheck: true, skipActivity: true },
          );
+          // Sanitize lone surrogates per-result (#1440 — /batch bypasses the
+          // handleCommand chokepoint, so it needs its own sanitization).
+          const safeResult = typeof cr.result === 'string' ? sanitizeBody(cr.result, !!cr.json) : cr.result;
          results.push({
            index: i,
            status: cr.status,
-            result: cr.result,
+            result: safeResult,
            command: cmd.command,
            tabId: cmd.tabId,
          });
@@ -1983,13 +1997,17 @@ export async function start() {
          clientId: tokenInfo?.clientId,
        });

-        return new Response(JSON.stringify({
+        // Sanitize the JSON envelope a second time (defense in depth) — catches
+        // any \uXXXX escape sequences for lone surrogates that survived the
+        // per-result pass.
+        const batchBody = stripLoneSurrogateEscapes(JSON.stringify({
          results,
          duration,
          total: commands.length,
          succeeded: results.filter(r => r.status === 200).length,
          failed: results.filter(r => r.status !== 200).length,
-        }), {
+        }));
+        return new Response(batchBody, {
          status: 200,
          headers: { 'Content-Type': 'application/json' },
        });
@@ -22,6 +22,7 @@ import type { TabSession, RefEntry } from './tab-session';
 import * as Diff from 'diff';
 import { TEMP_DIR, isPathWithin } from './platform';
 import { escapeEnvelopeSentinels } from './content-security';
+import { stripLoneSurrogates } from './sanitize';

 // Roles considered "interactive" for the -i flag
 const INTERACTIVE_ROLES = new Set([
@@ -576,7 +577,7 @@ export async function handleSnapshot(
    }

    session.setLastSnapshot(snapshotText);
-    return diffOutput.join('\n');
+    return stripLoneSurrogates(diffOutput.join('\n'));
  }

  // Store for future diffs
@@ -623,8 +624,8 @@ export async function handleSnapshot(
    parts.push('═══ BEGIN UNTRUSTED WEB CONTENT ═══');
    parts.push(...safeUntrusted);
    parts.push('═══ END UNTRUSTED WEB CONTENT ═══');
-    return parts.join('\n');
+    return stripLoneSurrogates(parts.join('\n'));
  }

-  return output.join('\n');
+  return stripLoneSurrogates(output.join('\n'));
 }
@@ -0,0 +1,68 @@
+// Unit test for buildCommandResponse — the exported response builder that
+// sanitizes lone Unicode surrogates at the HTTP boundary (#1440, D7 + D13).
+//
+// The function is exported from server.ts specifically so we can test it
+// without spinning up a Bun server. Codex flagged in D13 finding 14 that
+// "mock cr.result" wasn't testable when handleCommand was the only entry
+// point; this refactor solves that.
+
+import { describe, expect, test } from 'bun:test';
+import { buildCommandResponse } from '../src/server';
+
+describe('buildCommandResponse', () => {
+  test('sanitizes lone surrogates in text/plain body', async () => {
+    const cr = { status: 200, result: `pre\uD800post`, json: false };
+    const res = buildCommandResponse(cr as any);
+    expect(res.headers.get('content-type')).toBe('text/plain');
+    expect(await res.text()).toBe(`pre�post`);
+  });
+
+  test('sanitizes lone escape sequences in application/json body', async () => {
+    // cr.result is already JSON-stringified by handleCommand callers when
+    // cr.json=true. Surrogate escape sequences in the stringified form must
+    // be neutralized.
+    const cr = { status: 200, result: '{"name":"\\uD800"}', json: true };
+    const res = buildCommandResponse(cr as any);
+    expect(res.headers.get('content-type')).toBe('application/json');
+    expect(await res.text()).toBe('{"name":"\\uFFFD"}');
+  });
+
+  test('non-string cr.result passes through unchanged', async () => {
+    // Some commands return Buffers or other ArrayBuffer-shaped bodies (e.g.
+    // screenshots). Sanitizer must NOT touch them.
+    const buf = new Uint8Array([1, 2, 3, 4]);
+    const cr = { status: 200, result: buf, json: false };
+    const res = buildCommandResponse(cr as any);
+    // body returned verbatim; reading as array buffer should give same bytes
+    const out = new Uint8Array(await res.arrayBuffer());
+    expect(out.length).toBe(4);
+    expect(out[0]).toBe(1);
+    expect(out[3]).toBe(4);
+  });
+
+  test('clean text passes through unchanged', async () => {
+    const cr = { status: 200, result: 'Hello, world!', json: false };
+    const res = buildCommandResponse(cr as any);
+    expect(await res.text()).toBe('Hello, world!');
+  });
+
+  test('status code propagates', async () => {
+    const cr = { status: 404, result: 'Not found', json: false };
+    const res = buildCommandResponse(cr as any);
+    expect(res.status).toBe(404);
+  });
+
+  test('extra headers propagate', async () => {
+    const cr = { status: 200, result: 'ok', json: false, headers: { 'X-Custom': 'value' } };
+    const res = buildCommandResponse(cr as any);
+    expect(res.headers.get('x-custom')).toBe('value');
+  });
+
+  test('JSON error body with lone surrogate is sanitized', async () => {
+    // Errors set cr.json=true; a stringified error containing surrogates would
+    // still crash the API without this sanitization.
+    const cr = { status: 500, result: '{"error":"crash at \\uDC00 byte"}', json: true };
+    const res = buildCommandResponse(cr as any);
+    expect(await res.text()).toBe('{"error":"crash at \\uFFFD byte"}');
+  });
+});
@@ -584,7 +584,17 @@ describe('Envelope sentinel escape', () => {
  test('scoped snapshot branch applies escapeEnvelopeSentinels to untrusted lines', () => {
    const branchStart = SNAPSHOT_SRC.indexOf('splitForScoped');
    expect(branchStart).toBeGreaterThan(-1);
-    const branchEnd = SNAPSHOT_SRC.indexOf("return output.join('\\n');", branchStart);
+    // Match either the original return (pre-#1440) or the surrogate-sanitized
+    // form (post-#1440) — both end the scoped branch.
+    const candidates = [
+      "return output.join('\\n');",
+      "return stripLoneSurrogates(output.join('\\n'));",
+    ];
+    let branchEnd = -1;
+    for (const c of candidates) {
+      const idx = SNAPSHOT_SRC.indexOf(c, branchStart);
+      if (idx > branchStart) { branchEnd = idx; break; }
+    }
    expect(branchEnd).toBeGreaterThan(branchStart);
    const branch = SNAPSHOT_SRC.slice(branchStart, branchEnd);
    // The escape helper must be invoked on the untrusted lines, and
@@ -0,0 +1,112 @@
+// Unit tests for browse/src/sanitize.ts (#1440).
+// Covers stripLoneSurrogates (raw UTF-16) and stripLoneSurrogateEscapes
+// (\uXXXX escape text) used by the response chokepoints.
+
+import { describe, expect, test } from 'bun:test';
+import { stripLoneSurrogates, stripLoneSurrogateEscapes, sanitizeBody } from '../src/sanitize';
+
+describe('stripLoneSurrogates', () => {
+  test('replaces lone high surrogate with U+FFFD', () => {
+    const lone = '\uD800x';
+    const out = stripLoneSurrogates(lone);
+    expect(out).toBe('�x');
+  });
+
+  test('replaces lone low surrogate with U+FFFD', () => {
+    const lone = 'x\uDC00';
+    expect(stripLoneSurrogates(lone)).toBe('x�');
+  });
+
+  test('leaves valid surrogate pairs (emoji) unchanged', () => {
+    const smiley = '😀'; // U+1F600 = 😀
+    expect(stripLoneSurrogates(smiley)).toBe(smiley);
+  });
+
+  test('empty string is unchanged', () => {
+    expect(stripLoneSurrogates('')).toBe('');
+  });
+
+  test('mixed valid + lone surrogates', () => {
+    const input = `a\uD800b😀c\uDC00d`;
+    const out = stripLoneSurrogates(input);
+    expect(out).toBe(`a�b😀c�d`);
+  });
+
+  test('clean text passes through unchanged', () => {
+    const text = 'The quick brown fox jumps over 13 lazy dogs.';
+    expect(stripLoneSurrogates(text)).toBe(text);
+  });
+
+  test('high surrogate immediately followed by high surrogate replaces both individually', () => {
+    const input = '\uD800\uD801'; // two lone highs in a row, neither paired
+    const out = stripLoneSurrogates(input);
+    expect(out).toBe('��');
+  });
+});
+
+describe('stripLoneSurrogateEscapes', () => {
+  test('replaces lone high surrogate ESCAPE with \\uFFFD', () => {
+    const json = '{"name":"\\uD800"}';
+    expect(stripLoneSurrogateEscapes(json)).toBe('{"name":"\\uFFFD"}');
+  });
+
+  test('replaces lone low surrogate ESCAPE with \\uFFFD', () => {
+    const json = '{"name":"\\uDC00"}';
+    expect(stripLoneSurrogateEscapes(json)).toBe('{"name":"\\uFFFD"}');
+  });
+
+  test('leaves valid escape pair unchanged', () => {
+    // 😀 = 😀 — must NOT be touched
+    const json = '{"emoji":"\\uD83D\\uDE00"}';
+    expect(stripLoneSurrogateEscapes(json)).toBe(json);
+  });
+
+  test('mixed escape pairs and lone escapes', () => {
+    const json = '{"a":"\\uD800","b":"\\uD83D\\uDE00","c":"\\uDC00"}';
+    expect(stripLoneSurrogateEscapes(json)).toBe('{"a":"\\uFFFD","b":"\\uD83D\\uDE00","c":"\\uFFFD"}');
+  });
+
+  test('clean JSON passes through unchanged', () => {
+    const json = '{"results":[{"status":200,"command":"text"}]}';
+    expect(stripLoneSurrogateEscapes(json)).toBe(json);
+  });
+
+  test('case-insensitive matching: \\uD8aa works like \\uD8AA', () => {
+    expect(stripLoneSurrogateEscapes('\\uD8aa')).toBe('\\uFFFD');
+  });
+});
+
+describe('sanitizeBody', () => {
+  test('text/plain body: applies raw-surrogate strip only', () => {
+    const input = `pre\uD800post`;
+    expect(sanitizeBody(input, false)).toBe(`pre�post`);
+  });
+
+  test('JSON body: applies both raw and escape passes', () => {
+    // Both raw and escape variants in the same body
+    const input = `{"raw":"\uD800","esc":"\\uD800"}`;
+    const out = sanitizeBody(input, true);
+    expect(out).toBe(`{"raw":"�","esc":"\\uFFFD"}`);
+  });
+
+  test('clean text/plain body unchanged', () => {
+    const text = 'Hello world\nLine 2';
+    expect(sanitizeBody(text, false)).toBe(text);
+  });
+
+  test('clean JSON body unchanged', () => {
+    const json = '{"ok":true}';
+    expect(sanitizeBody(json, true)).toBe(json);
+  });
+});
+
+describe('perf smoke', () => {
+  test('1MB of clean text sanitizes in <500ms', () => {
+    const big = 'A'.repeat(1024 * 1024);
+    const start = performance.now();
+    const out = stripLoneSurrogates(big);
+    const elapsed = performance.now() - start;
+    expect(out.length).toBe(big.length);
+    expect(elapsed).toBeLessThan(500);
+  });
+});