Files
gstack/browse/test/sanitize.test.ts
T
Garry Tan bdb6023713 fix(browse): sanitize lone Unicode surrogates at commandResult chokepoint + /batch envelope (#1440)
Page captures with mixed-script Unicode round-trip cleanly to the Claude API.
Two new utilities in browse/src/sanitize.ts: stripLoneSurrogates for raw UTF-16
strings, stripLoneSurrogateEscapes for \uXXXX JSON escape text. sanitizeBody
picks the right pass based on cr.json.

buildCommandResponse is extracted from handleCommand (now exported) and
applies sanitization before new Response(). /batch was bypassing this
chokepoint via direct JSON.stringify, so it sanitizes each cr.result before
pushing AND wraps the envelope with stripLoneSurrogateEscapes. Defense in
depth wraps at getCleanText, getCleanTextWithStripping, html, accessibility,
and snapshot.ts return points so downstream consumers (datamarking, envelope
wrapping) see sanitized text before the response is built.

25 new unit tests across sanitize.test.ts and build-command-response.test.ts.
content-security.test.ts updated to accept either pre- or post-sanitize form
of the snapshot scoped branch (source-level regression check).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-14 13:58:35 -07:00

113 lines
3.8 KiB
TypeScript

// Unit tests for browse/src/sanitize.ts (#1440).
// Covers stripLoneSurrogates (raw UTF-16) and stripLoneSurrogateEscapes
// (\uXXXX escape text) used by the response chokepoints.
import { describe, expect, test } from 'bun:test';
import { stripLoneSurrogates, stripLoneSurrogateEscapes, sanitizeBody } from '../src/sanitize';
// Raw UTF-16 pass. Per the test titles below, every unpaired surrogate code
// unit is expected to come back as U+FFFD (the Unicode replacement character),
// while well-formed surrogate pairs and ordinary text pass through untouched.
// Expected values spell U+FFFD as an explicit \uFFFD escape so the literal
// cannot be silently dropped by editors or text pipelines.
describe('stripLoneSurrogates', () => {
  test('replaces lone high surrogate with U+FFFD', () => {
    const lone = '\uD800x';
    const out = stripLoneSurrogates(lone);
    expect(out).toBe('\uFFFDx');
  });

  test('replaces lone low surrogate with U+FFFD', () => {
    const lone = 'x\uDC00';
    expect(stripLoneSurrogates(lone)).toBe('x\uFFFD');
  });

  test('leaves valid surrogate pairs (emoji) unchanged', () => {
    const smiley = '\uD83D\uDE00'; // U+1F600 (😀) written as its surrogate pair
    expect(stripLoneSurrogates(smiley)).toBe(smiley);
  });

  test('empty string is unchanged', () => {
    expect(stripLoneSurrogates('')).toBe('');
  });

  test('mixed valid + lone surrogates', () => {
    // Lone high, then a valid pair (😀), then a lone low: only the two lone
    // code units should be substituted.
    const input = 'a\uD800b\uD83D\uDE00c\uDC00d';
    const out = stripLoneSurrogates(input);
    expect(out).toBe('a\uFFFDb\uD83D\uDE00c\uFFFDd');
  });

  test('clean text passes through unchanged', () => {
    const text = 'The quick brown fox jumps over 13 lazy dogs.';
    expect(stripLoneSurrogates(text)).toBe(text);
  });

  test('high surrogate immediately followed by high surrogate replaces both individually', () => {
    const input = '\uD800\uD801'; // two lone highs in a row, neither paired
    const out = stripLoneSurrogates(input);
    // One replacement character per lone code unit.
    expect(out).toBe('\uFFFD\uFFFD');
  });
});
// Escape-text pass: the inputs here contain the six-character text \uXXXX
// (a backslash followed by "uXXXX"), not raw surrogate code units.
describe('stripLoneSurrogateEscapes', () => {
  // Each row is [test title, input text, expected output].
  const cases: ReadonlyArray<readonly [string, string, string]> = [
    [
      'replaces lone high surrogate ESCAPE with \\uFFFD',
      '{"name":"\\uD800"}',
      '{"name":"\\uFFFD"}',
    ],
    [
      'replaces lone low surrogate ESCAPE with \\uFFFD',
      '{"name":"\\uDC00"}',
      '{"name":"\\uFFFD"}',
    ],
    [
      // The escaped pair \uD83D\uDE00 encodes 😀 (U+1F600) and must NOT be touched.
      'leaves valid escape pair unchanged',
      '{"emoji":"\\uD83D\\uDE00"}',
      '{"emoji":"\\uD83D\\uDE00"}',
    ],
    [
      'mixed escape pairs and lone escapes',
      '{"a":"\\uD800","b":"\\uD83D\\uDE00","c":"\\uDC00"}',
      '{"a":"\\uFFFD","b":"\\uD83D\\uDE00","c":"\\uFFFD"}',
    ],
    [
      'clean JSON passes through unchanged',
      '{"results":[{"status":200,"command":"text"}]}',
      '{"results":[{"status":200,"command":"text"}]}',
    ],
    [
      // Lower-case hex digits in the escape are matched as well.
      'case-insensitive matching: \\uD8aa works like \\uD8AA',
      '\\uD8aa',
      '\\uFFFD',
    ],
  ];

  for (const [title, input, expected] of cases) {
    test(title, () => {
      expect(stripLoneSurrogateEscapes(input)).toBe(expected);
    });
  }
});
// sanitizeBody(body, isJson): the second argument selects which passes run.
// The cases below exercise a text/plain body (false) and a JSON body (true).
// Raw lone surrogates are expected to surface as U+FFFD, matching the
// "replaces ... with U+FFFD" contract stated by the stripLoneSurrogates test
// titles; the expected literals use an explicit \uFFFD escape so the character
// cannot be lost in transit.
describe('sanitizeBody', () => {
  test('text/plain body: applies raw-surrogate strip only', () => {
    const input = 'pre\uD800post';
    expect(sanitizeBody(input, false)).toBe('pre\uFFFDpost');
  });

  test('JSON body: applies both raw and escape passes', () => {
    // Both raw and escape variants in the same body: "raw" holds an actual
    // lone code unit, "esc" holds the six-character text \uD800.
    const input = '{"raw":"\uD800","esc":"\\uD800"}';
    const out = sanitizeBody(input, true);
    expect(out).toBe('{"raw":"\uFFFD","esc":"\\uFFFD"}');
  });

  test('clean text/plain body unchanged', () => {
    const text = 'Hello world\nLine 2';
    expect(sanitizeBody(text, false)).toBe(text);
  });

  test('clean JSON body unchanged', () => {
    const json = '{"ok":true}';
    expect(sanitizeBody(json, true)).toBe(json);
  });
});
// Coarse wall-clock guard: a surrogate-free 1 MiB string must come back with
// the same length, and the call must finish inside the 500 ms budget.
describe('perf smoke', () => {
  test('1MB of clean text sanitizes in <500ms', () => {
    const ONE_MIB = 1024 * 1024;
    const payload = 'A'.repeat(ONE_MIB);

    const startedAt = performance.now();
    const sanitized = stripLoneSurrogates(payload);
    const elapsedMs = performance.now() - startedAt;

    expect(sanitized.length).toBe(payload.length);
    expect(elapsedMs).toBeLessThan(500);
  });
});