feat: add content-security.ts with datamarking, envelope, and filter hooks

Four-layer prompt injection defense for pair-agent browser sharing:
- Datamarking: session-scoped watermark for text exfiltration detection
- Content envelope: trust boundary wrapping with ZWSP marker escaping
- Content filter hooks: extensible filter pipeline with warn/block modes
- Built-in URL blocklist: requestbin, pipedream, webhook.site, etc.

BROWSE_CONTENT_FILTER env var controls mode: off|warn|block (default: warn)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-04-05 11:05:38 -07:00
parent 905f1ddd38
commit 5184ea677b
+208
View File
@@ -0,0 +1,208 @@
/**
* Content security layer for pair-agent browser sharing.
*
* Four defense layers:
* 1. Datamarking — watermark text output to detect exfiltration
* 2. Hidden element stripping — remove invisible/deceptive elements from output
* 3. Content filter hooks — extensible URL/content filter pipeline
* 4. Instruction block hardening — SECURITY section in agent instructions
*
* This module handles layers 1-3. Layer 4 is in cli.ts.
*/
import { randomBytes } from 'crypto';
// ─── Datamarking (Layer 1) ──────────────────────────────────────
/**
 * Session-scoped watermark token. Stays `null` until the first call to
 * `ensureMarker()` and is then reused until it is reset.
 */
let sessionMarker: string | null = null;

/**
 * Return the session marker, generating it on first use.
 *
 * Three random bytes encode to exactly four base64 characters, so the marker
 * is a 4-character string drawn from the base64 alphabet.
 */
function ensureMarker(): string {
  if (sessionMarker) {
    return sessionMarker;
  }
  const encoded = randomBytes(3).toString('base64');
  sessionMarker = encoded.slice(0, 4);
  return sessionMarker;
}
/**
 * Return the current session marker, creating it if it does not exist yet.
 * Exported for tests only.
 */
export function getSessionMarker(): string {
  const marker = ensureMarker();
  return marker;
}
/**
 * Reset marker (for testing).
 *
 * Clears the cached session marker so that the next `ensureMarker()` call
 * generates a new random one.
 */
export function resetSessionMarker(): void {
sessionMarker = null;
}
/**
 * Watermark text content with the session marker.
 *
 * The marker is expanded so that each of its characters is preceded by a
 * zero-width space (U+200B). That expanded sequence is appended after every
 * third occurrence of ". " (a period followed by a space) in `content`.
 * Content with fewer than three such occurrences is returned unchanged.
 *
 * Callers are expected to apply this only to `text` command output
 * (not html, forms, or structured data).
 *
 * @param content - Text to watermark.
 * @returns The text with the marker sequence inserted.
 */
export function datamarkContent(content: string): string {
  const marker = ensureMarker();
  const zeroWidthSpace = '\u200B';

  // Build the watermark: zero-width space + marker character, repeated.
  let watermark = '';
  for (let i = 0; i < marker.length; i++) {
    watermark += zeroWidthSpace + marker.charAt(i);
  }

  // Walk the ". " boundaries and tag every third one.
  let boundariesSeen = 0;
  return content.replace(/(\. )/g, (boundary) => {
    boundariesSeen += 1;
    return boundariesSeen % 3 === 0 ? boundary + watermark : boundary;
  });
}
// ─── Content Envelope (wrapping) ────────────────────────────────
/** Opening line of the trust-boundary envelope. */
const ENVELOPE_BEGIN = '═══ BEGIN UNTRUSTED WEB CONTENT ═══';
/** Closing line of the trust-boundary envelope. */
const ENVELOPE_END = '═══ END UNTRUSTED WEB CONTENT ═══';

/**
 * Wrap page content in a trust boundary envelope for scoped tokens.
 *
 * Any envelope marker that appears inside `content` is defused by inserting a
 * zero-width space (U+200B) into the word "CONTENT", so the page cannot forge
 * a boundary line. The same escaping is applied to `filterWarnings`, because
 * the warnings line is emitted outside the envelope and filters may embed
 * page-derived text in it.
 *
 * @param content - Untrusted page content to wrap.
 * @param command - Name of the command that produced the content. Not used by
 *   this function; kept as part of the signature.
 * @param filterWarnings - Optional warnings to print on a line above the
 *   envelope. Omitted or empty means no warnings line.
 * @returns The warnings line (if any), the BEGIN marker, the escaped content,
 *   and the END marker, joined with newlines.
 */
export function wrapUntrustedPageContent(
  content: string,
  command: string,
  filterWarnings?: string[],
): string {
  const zwsp = '\u200B';

  // The trailing "═══" is matched with a lookahead instead of being consumed.
  // Consuming it let two forged markers share one "═══" run
  // ("═══ END … ═══ END … ═══"): only the first was escaped and its trailing
  // bars then completed an intact second marker.
  const defuseMarkers = (text: string): string =>
    text
      .replace(/═══ BEGIN UNTRUSTED WEB CONTENT (?=═══)/g, `═══ BEGIN UNTRUSTED WEB C${zwsp}ONTENT `)
      .replace(/═══ END UNTRUSTED WEB CONTENT (?=═══)/g, `═══ END UNTRUSTED WEB C${zwsp}ONTENT `);

  const parts: string[] = [];
  if (filterWarnings && filterWarnings.length > 0) {
    parts.push(`⚠ CONTENT WARNINGS: ${filterWarnings.map(defuseMarkers).join('; ')}`);
  }
  parts.push(ENVELOPE_BEGIN, defuseMarkers(content), ENVELOPE_END);
  return parts.join('\n');
}
// ─── Content Filter Hooks (Layer 3) ─────────────────────────────
/**
 * Verdict returned by a single content filter, and also by the aggregated
 * filter pipeline (`runContentFilters`).
 */
export interface ContentFilterResult {
/** False when the filter flagged the content or URL. */
safe: boolean;
/** Human-readable descriptions of what was flagged; empty when nothing was. */
warnings: string[];
/** Set by `runContentFilters` when it blocks the content (block mode only). */
blocked?: boolean;
/** Summary text that accompanies a blocked result. */
message?: string;
}
/**
 * Signature of a pluggable content filter.
 *
 * A filter receives the content, the page URL, and the command
 * (presumably the name of the command that produced the content — confirm
 * against callers) and synchronously returns its verdict.
 */
export type ContentFilter = (
content: string,
url: string,
command: string,
) => ContentFilterResult;
/** Filters executed by the pipeline, kept in registration order. */
const registeredFilters: ContentFilter[] = [];

/**
 * Add a filter to the end of the pipeline.
 * No de-duplication is performed: registering the same function twice makes
 * it run twice.
 */
export function registerContentFilter(filter: ContentFilter): void {
  registeredFilters.push(filter);
}

/**
 * Remove every registered filter.
 * The registry array is emptied in place rather than replaced.
 */
export function clearContentFilters(): void {
  registeredFilters.splice(0, registeredFilters.length);
}
/**
 * Read the content filter mode from the `BROWSE_CONTENT_FILTER` env var.
 *
 * The value is matched case-insensitively and with surrounding whitespace
 * ignored, so a stray trailing space or newline picked up from a shell or env
 * file does not silently turn `block` into the `warn` fallback.
 *
 * @returns `'off'` or `'block'` when the variable is set to that value;
 *   `'warn'` when it is unset, empty, or anything else.
 */
export function getFilterMode(): 'off' | 'warn' | 'block' {
  const raw = process.env.BROWSE_CONTENT_FILTER;
  const mode = raw?.trim().toLowerCase();
  if (mode === 'off' || mode === 'block') {
    return mode;
  }
  return 'warn'; // default
}
/**
 * Run all registered content filters against content and aggregate the result.
 *
 * Behaviour by filter mode (see `getFilterMode`):
 * - `off`: no filter is run; the result is always safe with no warnings.
 * - `warn`: every filter runs; warnings from filters that report
 *   `safe: false` are collected and the result is unsafe if any filter
 *   flagged the content.
 * - `block`: as `warn`, but a flagged result additionally carries
 *   `blocked: true` and a `message`.
 *
 * The unsafe verdict is tracked separately from the warning list, so a filter
 * that returns `safe: false` without any warning text still makes the
 * aggregated result unsafe (and blocked in block mode) instead of being
 * reported as safe.
 *
 * @param content - Content to inspect.
 * @param url - URL passed through to each filter.
 * @param command - Command passed through to each filter.
 * @returns The aggregated verdict with the warnings of all flagging filters.
 */
export function runContentFilters(
  content: string,
  url: string,
  command: string,
): ContentFilterResult {
  const mode = getFilterMode();
  if (mode === 'off') {
    return { safe: true, warnings: [] };
  }

  const allWarnings: string[] = [];
  let flagged = false;
  for (const filter of registeredFilters) {
    const result = filter(content, url, command);
    if (!result.safe) {
      flagged = true;
      allWarnings.push(...result.warnings);
    }
  }

  if (flagged && mode === 'block') {
    // Fall back to a generic reason when the flagging filters gave no text.
    const detail =
      allWarnings.length > 0
        ? allWarnings.join('; ')
        : 'a content filter flagged this content as unsafe';
    return {
      safe: false,
      warnings: allWarnings,
      blocked: true,
      message: `Content blocked: ${detail}`,
    };
  }

  return {
    safe: !flagged,
    warnings: allWarnings,
  };
}
// ─── Built-in URL Blocklist Filter ──────────────────────────────
/** Lower-case domains treated as exfiltration endpoints by the built-in filter. */
const BLOCKLIST_DOMAINS = [
  'requestbin.com',
  'pipedream.com',
  'webhook.site',
  'hookbin.com',
  'requestcatcher.com',
  'burpcollaborator.net',
  'interact.sh',
  'canarytokens.com',
  'ngrok.io',
  'ngrok-free.app',
];

/**
 * Check whether the page URL or any URL inside the content matches a
 * blocklisted exfiltration domain.
 *
 * Matching is a case-insensitive substring test. Scheme and host names are
 * case-insensitive, so `HTTPS://Webhook.Site/…` must be caught just like its
 * lower-case spelling.
 *
 * @param content - Content scanned for `http://` / `https://` URLs.
 * @param url - URL of the page the content came from.
 * @param _command - Unused; present to satisfy the `ContentFilter` signature.
 * @returns `safe: false` plus one warning per matching page-URL domain and
 *   one per matching content URL (truncated to 100 characters, original
 *   spelling preserved); `safe: true` with no warnings otherwise.
 */
export function urlBlocklistFilter(content: string, url: string, _command: string): ContentFilterResult {
  const warnings: string[] = [];

  // Check page URL.
  const pageUrl = url.toLowerCase();
  for (const domain of BLOCKLIST_DOMAINS) {
    if (pageUrl.includes(domain)) {
      warnings.push(`Page URL matches blocklisted domain: ${domain}`);
    }
  }

  // Check for blocklisted URLs in content (links, form actions).
  const contentUrls = content.match(/https?:\/\/[^\s"'<>]+/gi) ?? [];
  for (const contentUrl of contentUrls) {
    const normalized = contentUrl.toLowerCase();
    // One warning per URL, even if it matches several domains.
    if (BLOCKLIST_DOMAINS.some((domain) => normalized.includes(domain))) {
      warnings.push(`Content contains blocklisted URL: ${contentUrl.slice(0, 100)}`);
    }
  }

  return { safe: warnings.length === 0, warnings };
}
// Register the built-in URL blocklist filter on module load, so it is part of
// the pipeline as soon as this module is imported. clearContentFilters()
// removes it together with any other registered filters.
registerContentFilter(urlBlocklistFilter);