mirror of
https://github.com/garrytan/gstack.git
synced 2026-06-28 20:50:05 +02:00
Merge remote-tracking branch 'origin/main' into garrytan/boston-v2
# Conflicts: # CHANGELOG.md # VERSION # browse/src/server.ts # package.json
This commit is contained in:
@@ -12,6 +12,7 @@
|
||||
|
||||
import { randomBytes } from 'crypto';
|
||||
import type { Page, Frame } from 'playwright';
|
||||
import { stripLoneSurrogates } from './sanitize';
|
||||
|
||||
// ─── Datamarking (Layer 1) ──────────────────────────────────────
|
||||
|
||||
@@ -167,7 +168,7 @@ export async function markHiddenElements(page: Page | Frame): Promise<string[]>
|
||||
* Uses clone + remove approach: clones body, removes marked elements, returns innerText.
|
||||
*/
|
||||
export async function getCleanTextWithStripping(page: Page | Frame): Promise<string> {
|
||||
return page.evaluate(() => {
|
||||
const raw = await page.evaluate(() => {
|
||||
const body = document.body;
|
||||
if (!body) return '';
|
||||
const clone = body.cloneNode(true) as HTMLElement;
|
||||
@@ -181,6 +182,7 @@ export async function getCleanTextWithStripping(page: Page | Frame): Promise<str
|
||||
.filter(line => line.length > 0)
|
||||
.join('\n');
|
||||
});
|
||||
return stripLoneSurrogates(raw);
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -14,6 +14,7 @@ import * as path from 'path';
|
||||
import { TEMP_DIR } from './platform';
|
||||
import { inspectElement, formatInspectorResult, getModificationHistory } from './cdp-inspector';
|
||||
import { validateReadPath } from './path-security';
|
||||
import { stripLoneSurrogates } from './sanitize';
|
||||
// Re-export for backward compatibility (tests import from read-commands)
|
||||
export { validateReadPath } from './path-security';
|
||||
|
||||
@@ -50,7 +51,7 @@ function wrapForEvaluate(code: string): string {
|
||||
* Exported for DRY reuse in meta-commands (diff).
|
||||
*/
|
||||
export async function getCleanText(page: Page | Frame): Promise<string> {
|
||||
return page.evaluate(() => {
|
||||
const raw = await page.evaluate(() => {
|
||||
const body = document.body;
|
||||
if (!body) return '';
|
||||
const clone = body.cloneNode(true) as HTMLElement;
|
||||
@@ -61,6 +62,7 @@ export async function getCleanText(page: Page | Frame): Promise<string> {
|
||||
.filter(line => line.length > 0)
|
||||
.join('\n');
|
||||
});
|
||||
return stripLoneSurrogates(raw);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -115,9 +117,9 @@ export async function handleReadCommand(
|
||||
if (selector) {
|
||||
const resolved = await session.resolveRef(selector);
|
||||
if ('locator' in resolved) {
|
||||
return resolved.locator.innerHTML({ timeout: 5000 });
|
||||
return stripLoneSurrogates(await resolved.locator.innerHTML({ timeout: 5000 }));
|
||||
}
|
||||
return target.locator(resolved.selector).innerHTML({ timeout: 5000 });
|
||||
return stripLoneSurrogates(await target.locator(resolved.selector).innerHTML({ timeout: 5000 }));
|
||||
}
|
||||
// page.content() is page-only; use evaluate for frame compat
|
||||
const doctype = await target.evaluate(() => {
|
||||
@@ -125,7 +127,7 @@ export async function handleReadCommand(
|
||||
return dt ? `<!DOCTYPE ${dt.name}>` : '';
|
||||
});
|
||||
const html = await target.evaluate(() => document.documentElement.outerHTML);
|
||||
return doctype ? `${doctype}\n${html}` : html;
|
||||
return stripLoneSurrogates(doctype ? `${doctype}\n${html}` : html);
|
||||
}
|
||||
|
||||
case 'links': {
|
||||
@@ -173,7 +175,7 @@ export async function handleReadCommand(
|
||||
|
||||
case 'accessibility': {
|
||||
const snapshot = await target.locator("body").ariaSnapshot();
|
||||
return snapshot;
|
||||
return stripLoneSurrogates(snapshot);
|
||||
}
|
||||
|
||||
case 'js': {
|
||||
|
||||
@@ -0,0 +1,34 @@
|
||||
// Lone Unicode surrogate sanitization.
|
||||
//
|
||||
// Lone surrogates (\uD800-\uDFFF without a matching pair) are valid UTF-16
|
||||
// but invalid UTF-8, so JSON.stringify produces output the Claude API rejects
|
||||
// with HTTP 400 "no low surrogate in string". Page captures from real-world
|
||||
// HTML hit this when content contains broken emoji bytes or mid-emoji splits.
|
||||
//
|
||||
// Two sanitizers are needed because both forms appear in browse responses:
|
||||
// - Raw UTF-16 surrogates in text/plain bodies (pre-stringify state).
|
||||
// - JSON \uXXXX escape sequences after JSON.stringify already ran.
|
||||
// Both replace lone surrogates with U+FFFD (replacement character).
|
||||
|
||||
const LONE_SURROGATE_HIGH = /[\uD800-\uDBFF](?![\uDC00-\uDFFF])/g;
|
||||
const LONE_SURROGATE_LOW = /(?<![\uD800-\uDBFF])[\uDC00-\uDFFF]/g;
|
||||
|
||||
export function stripLoneSurrogates(s: string): string {
|
||||
return s.replace(LONE_SURROGATE_HIGH, '�').replace(LONE_SURROGATE_LOW, '�');
|
||||
}
|
||||
|
||||
// Matches \uD8XX-\uDFXX escape text where the pair is not completed by an
|
||||
// adjacent \uDC00-\uDFFF (high) or preceded by \uD800-\uDBFF (low).
|
||||
const LONE_SURROGATE_HIGH_ESCAPE = /\\u[Dd][89ABab][0-9A-Fa-f]{2}(?!\\u[Dd][C-Fc-f][0-9A-Fa-f]{2})/g;
|
||||
const LONE_SURROGATE_LOW_ESCAPE = /(?<!\\u[Dd][89ABab][0-9A-Fa-f]{2})\\u[Dd][C-Fc-f][0-9A-Fa-f]{2}/g;
|
||||
|
||||
export function stripLoneSurrogateEscapes(s: string): string {
|
||||
return s.replace(LONE_SURROGATE_HIGH_ESCAPE, '\\uFFFD').replace(LONE_SURROGATE_LOW_ESCAPE, '\\uFFFD');
|
||||
}
|
||||
|
||||
// Pick the right sanitizer based on whether the body has already been JSON-stringified.
|
||||
// For application/json bodies, run both passes: raw first (in case the JSON encoder
|
||||
// emitted surrogates as-is rather than escaping), then escape-text.
|
||||
export function sanitizeBody(body: string, isJson: boolean): string {
|
||||
return isJson ? stripLoneSurrogateEscapes(stripLoneSurrogates(body)) : stripLoneSurrogates(body);
|
||||
}
|
||||
+31
-8
@@ -42,6 +42,7 @@ import { inspectElement, modifyStyle, resetModifications, getModificationHistory
|
||||
// Bun.spawn used instead of child_process.spawn (compiled bun binaries
|
||||
// fail posix_spawn on all executables including /bin/bash)
|
||||
import { safeUnlink, safeUnlinkQuiet, safeKill } from './error-handling';
|
||||
import { sanitizeBody, stripLoneSurrogateEscapes } from './sanitize';
|
||||
import { startSocksBridge, testUpstream, type BridgeHandle } from './socks-bridge';
|
||||
import { parseProxyConfig, toUpstreamConfig, ProxyConfigError } from './proxy-config';
|
||||
import { redactProxyUrl } from './proxy-redact';
|
||||
@@ -1092,17 +1093,32 @@ async function handleCommandInternal(
|
||||
return { ...cr, result: sanitizeLoneSurrogates(cr.result) };
|
||||
}
|
||||
|
||||
/** HTTP wrapper — converts CommandResult to Response */
|
||||
async function handleCommand(body: any, tokenInfo?: TokenInfo | null): Promise<Response> {
|
||||
const cr = await handleCommandInternal(body, tokenInfo);
|
||||
/**
|
||||
* Build the HTTP response from a CommandResult. Pure function so it can be
|
||||
* unit-tested without spinning up the server (#1440). Defense in depth on top
|
||||
* of handleCommandInternal's choke-point sanitization: this catches any
|
||||
* \uXXXX JSON-escape surrogate forms that the raw-codepoint regex above
|
||||
* misses when the body has already been JSON-stringified.
|
||||
*/
|
||||
export function buildCommandResponse(cr: CommandResult): Response {
|
||||
const contentType = cr.json ? 'application/json' : 'text/plain';
|
||||
return new Response(cr.result, {
|
||||
const safeBody = typeof cr.result === 'string' ? sanitizeBody(cr.result, !!cr.json) : cr.result;
|
||||
return new Response(safeBody, {
|
||||
status: cr.status,
|
||||
headers: { 'Content-Type': contentType, ...cr.headers },
|
||||
});
|
||||
}
|
||||
|
||||
// Module-level shutdown function deleted in v1.35.0.0; it now lives inside
|
||||
/** HTTP wrapper — converts CommandResult to Response. Used by the /command
|
||||
* route dispatcher (line ~2158). The wrapper layer exists so
|
||||
* `buildCommandResponse` is independently unit-testable (v1.38.1.0).
|
||||
*/
|
||||
async function handleCommand(body: any, tokenInfo?: TokenInfo | null): Promise<Response> {
|
||||
const cr = await handleCommandInternal(body, tokenInfo);
|
||||
return buildCommandResponse(cr);
|
||||
}
|
||||
|
||||
// Module-level shutdown function deleted in v1.39.0.0; it now lives inside
|
||||
// the buildFetchHandler closure so it closes the cfg-provided browserManager.
|
||||
// Signal handlers below call activeShutdown which buildFetchHandler assigns.
|
||||
|
||||
@@ -1979,10 +1995,13 @@ export function buildFetchHandler(cfg: ServerConfig): ServerHandle {
|
||||
tokenInfo,
|
||||
{ skipRateCheck: true, skipActivity: true },
|
||||
);
|
||||
// Sanitize lone surrogates per-result (#1440 — /batch bypasses the
|
||||
// handleCommand chokepoint, so it needs its own sanitization).
|
||||
const safeResult = typeof cr.result === 'string' ? sanitizeBody(cr.result, !!cr.json) : cr.result;
|
||||
results.push({
|
||||
index: i,
|
||||
status: cr.status,
|
||||
result: cr.result,
|
||||
result: safeResult,
|
||||
command: cmd.command,
|
||||
tabId: cmd.tabId,
|
||||
});
|
||||
@@ -2002,13 +2021,17 @@ export function buildFetchHandler(cfg: ServerConfig): ServerHandle {
|
||||
clientId: tokenInfo?.clientId,
|
||||
});
|
||||
|
||||
return new Response(JSON.stringify({
|
||||
// Sanitize the JSON envelope a second time (defense in depth) — catches
|
||||
// any \uXXXX escape sequences for lone surrogates that survived the
|
||||
// per-result pass.
|
||||
const batchBody = stripLoneSurrogateEscapes(JSON.stringify({
|
||||
results,
|
||||
duration,
|
||||
total: commands.length,
|
||||
succeeded: results.filter(r => r.status === 200).length,
|
||||
failed: results.filter(r => r.status !== 200).length,
|
||||
}), {
|
||||
}));
|
||||
return new Response(batchBody, {
|
||||
status: 200,
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
});
|
||||
|
||||
@@ -22,6 +22,7 @@ import type { TabSession, RefEntry } from './tab-session';
|
||||
import * as Diff from 'diff';
|
||||
import { TEMP_DIR, isPathWithin } from './platform';
|
||||
import { escapeEnvelopeSentinels } from './content-security';
|
||||
import { stripLoneSurrogates } from './sanitize';
|
||||
|
||||
// Roles considered "interactive" for the -i flag
|
||||
const INTERACTIVE_ROLES = new Set([
|
||||
@@ -576,7 +577,7 @@ export async function handleSnapshot(
|
||||
}
|
||||
|
||||
session.setLastSnapshot(snapshotText);
|
||||
return diffOutput.join('\n');
|
||||
return stripLoneSurrogates(diffOutput.join('\n'));
|
||||
}
|
||||
|
||||
// Store for future diffs
|
||||
@@ -623,8 +624,8 @@ export async function handleSnapshot(
|
||||
parts.push('═══ BEGIN UNTRUSTED WEB CONTENT ═══');
|
||||
parts.push(...safeUntrusted);
|
||||
parts.push('═══ END UNTRUSTED WEB CONTENT ═══');
|
||||
return parts.join('\n');
|
||||
return stripLoneSurrogates(parts.join('\n'));
|
||||
}
|
||||
|
||||
return output.join('\n');
|
||||
return stripLoneSurrogates(output.join('\n'));
|
||||
}
|
||||
|
||||
@@ -0,0 +1,68 @@
|
||||
// Unit test for buildCommandResponse — the exported response builder that
|
||||
// sanitizes lone Unicode surrogates at the HTTP boundary (#1440, D7 + D13).
|
||||
//
|
||||
// The function is exported from server.ts specifically so we can test it
|
||||
// without spinning up a Bun server. Codex flagged in D13 finding 14 that
|
||||
// "mock cr.result" wasn't testable when handleCommand was the only entry
|
||||
// point; this refactor solves that.
|
||||
|
||||
import { describe, expect, test } from 'bun:test';
|
||||
import { buildCommandResponse } from '../src/server';
|
||||
|
||||
describe('buildCommandResponse', () => {
|
||||
test('sanitizes lone surrogates in text/plain body', async () => {
|
||||
const cr = { status: 200, result: `pre\uD800post`, json: false };
|
||||
const res = buildCommandResponse(cr as any);
|
||||
expect(res.headers.get('content-type')).toBe('text/plain');
|
||||
expect(await res.text()).toBe(`pre�post`);
|
||||
});
|
||||
|
||||
test('sanitizes lone escape sequences in application/json body', async () => {
|
||||
// cr.result is already JSON-stringified by handleCommand callers when
|
||||
// cr.json=true. Surrogate escape sequences in the stringified form must
|
||||
// be neutralized.
|
||||
const cr = { status: 200, result: '{"name":"\\uD800"}', json: true };
|
||||
const res = buildCommandResponse(cr as any);
|
||||
expect(res.headers.get('content-type')).toBe('application/json');
|
||||
expect(await res.text()).toBe('{"name":"\\uFFFD"}');
|
||||
});
|
||||
|
||||
test('non-string cr.result passes through unchanged', async () => {
|
||||
// Some commands return Buffers or other ArrayBuffer-shaped bodies (e.g.
|
||||
// screenshots). Sanitizer must NOT touch them.
|
||||
const buf = new Uint8Array([1, 2, 3, 4]);
|
||||
const cr = { status: 200, result: buf, json: false };
|
||||
const res = buildCommandResponse(cr as any);
|
||||
// body returned verbatim; reading as array buffer should give same bytes
|
||||
const out = new Uint8Array(await res.arrayBuffer());
|
||||
expect(out.length).toBe(4);
|
||||
expect(out[0]).toBe(1);
|
||||
expect(out[3]).toBe(4);
|
||||
});
|
||||
|
||||
test('clean text passes through unchanged', async () => {
|
||||
const cr = { status: 200, result: 'Hello, world!', json: false };
|
||||
const res = buildCommandResponse(cr as any);
|
||||
expect(await res.text()).toBe('Hello, world!');
|
||||
});
|
||||
|
||||
test('status code propagates', async () => {
|
||||
const cr = { status: 404, result: 'Not found', json: false };
|
||||
const res = buildCommandResponse(cr as any);
|
||||
expect(res.status).toBe(404);
|
||||
});
|
||||
|
||||
test('extra headers propagate', async () => {
|
||||
const cr = { status: 200, result: 'ok', json: false, headers: { 'X-Custom': 'value' } };
|
||||
const res = buildCommandResponse(cr as any);
|
||||
expect(res.headers.get('x-custom')).toBe('value');
|
||||
});
|
||||
|
||||
test('JSON error body with lone surrogate is sanitized', async () => {
|
||||
// Errors set cr.json=true; a stringified error containing surrogates would
|
||||
// still crash the API without this sanitization.
|
||||
const cr = { status: 500, result: '{"error":"crash at \\uDC00 byte"}', json: true };
|
||||
const res = buildCommandResponse(cr as any);
|
||||
expect(await res.text()).toBe('{"error":"crash at \\uFFFD byte"}');
|
||||
});
|
||||
});
|
||||
@@ -584,7 +584,17 @@ describe('Envelope sentinel escape', () => {
|
||||
test('scoped snapshot branch applies escapeEnvelopeSentinels to untrusted lines', () => {
|
||||
const branchStart = SNAPSHOT_SRC.indexOf('splitForScoped');
|
||||
expect(branchStart).toBeGreaterThan(-1);
|
||||
const branchEnd = SNAPSHOT_SRC.indexOf("return output.join('\\n');", branchStart);
|
||||
// Match either the original return (pre-#1440) or the surrogate-sanitized
|
||||
// form (post-#1440) — both end the scoped branch.
|
||||
const candidates = [
|
||||
"return output.join('\\n');",
|
||||
"return stripLoneSurrogates(output.join('\\n'));",
|
||||
];
|
||||
let branchEnd = -1;
|
||||
for (const c of candidates) {
|
||||
const idx = SNAPSHOT_SRC.indexOf(c, branchStart);
|
||||
if (idx > branchStart) { branchEnd = idx; break; }
|
||||
}
|
||||
expect(branchEnd).toBeGreaterThan(branchStart);
|
||||
const branch = SNAPSHOT_SRC.slice(branchStart, branchEnd);
|
||||
// The escape helper must be invoked on the untrusted lines, and
|
||||
|
||||
@@ -0,0 +1,112 @@
|
||||
// Unit tests for browse/src/sanitize.ts (#1440).
|
||||
// Covers stripLoneSurrogates (raw UTF-16) and stripLoneSurrogateEscapes
|
||||
// (\uXXXX escape text) used by the response chokepoints.
|
||||
|
||||
import { describe, expect, test } from 'bun:test';
|
||||
import { stripLoneSurrogates, stripLoneSurrogateEscapes, sanitizeBody } from '../src/sanitize';
|
||||
|
||||
describe('stripLoneSurrogates', () => {
|
||||
test('replaces lone high surrogate with U+FFFD', () => {
|
||||
const lone = '\uD800x';
|
||||
const out = stripLoneSurrogates(lone);
|
||||
expect(out).toBe('�x');
|
||||
});
|
||||
|
||||
test('replaces lone low surrogate with U+FFFD', () => {
|
||||
const lone = 'x\uDC00';
|
||||
expect(stripLoneSurrogates(lone)).toBe('x�');
|
||||
});
|
||||
|
||||
test('leaves valid surrogate pairs (emoji) unchanged', () => {
|
||||
const smiley = '😀'; // U+1F600 = 😀
|
||||
expect(stripLoneSurrogates(smiley)).toBe(smiley);
|
||||
});
|
||||
|
||||
test('empty string is unchanged', () => {
|
||||
expect(stripLoneSurrogates('')).toBe('');
|
||||
});
|
||||
|
||||
test('mixed valid + lone surrogates', () => {
|
||||
const input = `a\uD800b😀c\uDC00d`;
|
||||
const out = stripLoneSurrogates(input);
|
||||
expect(out).toBe(`a�b😀c�d`);
|
||||
});
|
||||
|
||||
test('clean text passes through unchanged', () => {
|
||||
const text = 'The quick brown fox jumps over 13 lazy dogs.';
|
||||
expect(stripLoneSurrogates(text)).toBe(text);
|
||||
});
|
||||
|
||||
test('high surrogate immediately followed by high surrogate replaces both individually', () => {
|
||||
const input = '\uD800\uD801'; // two lone highs in a row, neither paired
|
||||
const out = stripLoneSurrogates(input);
|
||||
expect(out).toBe('��');
|
||||
});
|
||||
});
|
||||
|
||||
describe('stripLoneSurrogateEscapes', () => {
|
||||
test('replaces lone high surrogate ESCAPE with \\uFFFD', () => {
|
||||
const json = '{"name":"\\uD800"}';
|
||||
expect(stripLoneSurrogateEscapes(json)).toBe('{"name":"\\uFFFD"}');
|
||||
});
|
||||
|
||||
test('replaces lone low surrogate ESCAPE with \\uFFFD', () => {
|
||||
const json = '{"name":"\\uDC00"}';
|
||||
expect(stripLoneSurrogateEscapes(json)).toBe('{"name":"\\uFFFD"}');
|
||||
});
|
||||
|
||||
test('leaves valid escape pair unchanged', () => {
|
||||
// 😀 = 😀 — must NOT be touched
|
||||
const json = '{"emoji":"\\uD83D\\uDE00"}';
|
||||
expect(stripLoneSurrogateEscapes(json)).toBe(json);
|
||||
});
|
||||
|
||||
test('mixed escape pairs and lone escapes', () => {
|
||||
const json = '{"a":"\\uD800","b":"\\uD83D\\uDE00","c":"\\uDC00"}';
|
||||
expect(stripLoneSurrogateEscapes(json)).toBe('{"a":"\\uFFFD","b":"\\uD83D\\uDE00","c":"\\uFFFD"}');
|
||||
});
|
||||
|
||||
test('clean JSON passes through unchanged', () => {
|
||||
const json = '{"results":[{"status":200,"command":"text"}]}';
|
||||
expect(stripLoneSurrogateEscapes(json)).toBe(json);
|
||||
});
|
||||
|
||||
test('case-insensitive matching: \\uD8aa works like \\uD8AA', () => {
|
||||
expect(stripLoneSurrogateEscapes('\\uD8aa')).toBe('\\uFFFD');
|
||||
});
|
||||
});
|
||||
|
||||
describe('sanitizeBody', () => {
|
||||
test('text/plain body: applies raw-surrogate strip only', () => {
|
||||
const input = `pre\uD800post`;
|
||||
expect(sanitizeBody(input, false)).toBe(`pre�post`);
|
||||
});
|
||||
|
||||
test('JSON body: applies both raw and escape passes', () => {
|
||||
// Both raw and escape variants in the same body
|
||||
const input = `{"raw":"\uD800","esc":"\\uD800"}`;
|
||||
const out = sanitizeBody(input, true);
|
||||
expect(out).toBe(`{"raw":"�","esc":"\\uFFFD"}`);
|
||||
});
|
||||
|
||||
test('clean text/plain body unchanged', () => {
|
||||
const text = 'Hello world\nLine 2';
|
||||
expect(sanitizeBody(text, false)).toBe(text);
|
||||
});
|
||||
|
||||
test('clean JSON body unchanged', () => {
|
||||
const json = '{"ok":true}';
|
||||
expect(sanitizeBody(json, true)).toBe(json);
|
||||
});
|
||||
});
|
||||
|
||||
describe('perf smoke', () => {
|
||||
test('1MB of clean text sanitizes in <500ms', () => {
|
||||
const big = 'A'.repeat(1024 * 1024);
|
||||
const start = performance.now();
|
||||
const out = stripLoneSurrogates(big);
|
||||
const elapsed = performance.now() - start;
|
||||
expect(out.length).toBe(big.length);
|
||||
expect(elapsed).toBeLessThan(500);
|
||||
});
|
||||
});
|
||||
Reference in New Issue
Block a user