security: gate download + scrape through validateNavigationUrl (SSRF)

The `goto` command was correctly wired through validateNavigationUrl,
but `download` and `scrape` called page.request.fetch(url, ...) directly.
A caller with the default write scope could hit the /command endpoint
and ask the daemon to fetch http://169.254.169.254/latest/meta-data/
(AWS IMDSv1) or the GCP/Azure/internal equivalents. The response body
comes back as base64 or lands on disk where GET /file serves it.

Fix: call validateNavigationUrl(url) immediately before each
page.request.fetch() call site in download and in the scrape loop.
Same blocklist that already protects `goto`: file://, javascript:,
data:, chrome://, cloud metadata (IPv4 all encodings, IPv6 ULA,
metadata.*.internal).

Tests: extend browse/test/url-validation.test.ts with a source-level
guard that walks every `await page.request.fetch(` call site and
asserts a validateNavigationUrl call precedes it within the same
branch. The regression test trips before code review if a future
refactor drops the gate.
This commit is contained in:
gus
2026-04-16 19:47:19 -03:00
committed by Garry Tan
parent de972b514f
commit b7cf46d6e6
2 changed files with 88 additions and 1 deletions
+14 -1
View File
@@ -1188,7 +1188,16 @@ export async function handleWriteCommand(
contentType = match[1];
buffer = Buffer.from(match[2], 'base64');
} else {
// Strategy 1: Direct URL via page.request.fetch()
// Strategy 1: Direct URL via page.request.fetch().
// Gate the URL through the same validator `goto` uses. Without
// this check, download + scrape bypass the navigation
// blocklist and a caller with write scope can read
// http://169.254.169.254/latest/meta-data/ (AWS IMDSv1), the
// GCP/Azure metadata equivalents, or any internal IPv4/IPv6
// the server happens to route to. The response body is then
// returned to the caller (base64) or written to disk where
// GET /file serves it back.
await validateNavigationUrl(url);
const response = await page.request.fetch(url, { timeout: 30000 });
const status = response.status();
if (status >= 400) {
@@ -1286,6 +1295,10 @@ export async function handleWriteCommand(
for (let i = 0; i < toDownload.length; i++) {
const { url, type } = toDownload[i];
try {
// Same gate as the download command — page.request.fetch
// must not reach cloud metadata, ULA ranges, or the rest of
// the blocklist. See url-validation.ts for the full list.
await validateNavigationUrl(url);
const response = await page.request.fetch(url, { timeout: 30000 });
if (response.status() >= 400) throw new Error(`HTTP ${response.status()}`);
const ct = response.headers()['content-type'] || 'application/octet-stream';
+74
View File
@@ -221,3 +221,77 @@ describe('validateNavigationUrl — file:// URL-encoding', () => {
).rejects.toThrow(/encoded \/|Path must be within/i);
});
});
// ---------------------------------------------------------------------------
// download + scrape must gate page.request.fetch through validateNavigationUrl
//
// Regression: the `goto` command was correctly wired through
// validateNavigationUrl, but the `download` and `scrape` commands
// called page.request.fetch(url, ...) directly. A caller with the
// default write scope could hit the /command endpoint and ask the
// daemon to fetch http://169.254.169.254/latest/meta-data/ (AWS
// IMDSv1) or the GCP/Azure/internal equivalents; the body comes back
// as base64 or lands on disk where GET /file serves it.
//
// Source-level check: both page.request.fetch call sites must have a
// validateNavigationUrl invocation immediately before them.
// ---------------------------------------------------------------------------
import { readFileSync } from 'fs';
import { join } from 'path';
describe('download + scrape SSRF gate', () => {
  const WRITE_COMMANDS_SRC = readFileSync(
    join(import.meta.dir, '..', 'src', 'write-commands.ts'),
    'utf-8',
  );

  // Needles must match the actual *call sites*, never the bare token.
  // The SSRF fix added code comments that name both validateNavigationUrl
  // and page.request.fetch near the call sites; a bare-token search would
  // let one of those comments satisfy the check even after a refactor
  // removed the real gate — exactly the regression this suite exists to
  // catch.
  const FETCH_CALL = 'await page.request.fetch(';
  const GATE_CALL = 'await validateNavigationUrl(';

  /** Byte offsets of every occurrence of `needle` in write-commands.ts. */
  function callsitesOf(needle: string): number[] {
    const idxs: number[] = [];
    let at = 0;
    while ((at = WRITE_COMMANDS_SRC.indexOf(needle, at)) !== -1) {
      idxs.push(at);
      at += needle.length;
    }
    return idxs;
  }

  /**
   * indexOf that fails the test instead of returning -1 when an anchor
   * is missing. Without this, a renamed `case 'download'` label would
   * make `slice(-1, …)` produce a nonsense block that can pass or fail
   * for the wrong reason.
   */
  function mustFind(haystack: string, needle: string): number {
    const at = haystack.indexOf(needle);
    expect(at).toBeGreaterThan(-1);
    return at;
  }

  it('every page.request.fetch sits under a preceding validateNavigationUrl', () => {
    const fetches = callsitesOf(FETCH_CALL);
    expect(fetches.length).toBeGreaterThan(0);
    for (const idx of fetches) {
      // Look at the 400 chars preceding the call — the gate must live
      // within the same branch / try block. 400 covers the comment +
      // await invocation without letting an unrelated upstream gate
      // pass as evidence.
      const lead = WRITE_COMMANDS_SRC.slice(Math.max(0, idx - 400), idx);
      expect(lead).toContain(GATE_CALL);
    }
  });

  it('download command validates the URL before fetch', () => {
    // The download branch is everything between the two case labels;
    // both anchors must exist for the slice to be meaningful.
    const start = mustFind(WRITE_COMMANDS_SRC, "case 'download'");
    const end = mustFind(WRITE_COMMANDS_SRC, "case 'scrape'");
    const block = WRITE_COMMANDS_SRC.slice(start, end);
    const vIdx = mustFind(block, GATE_CALL);
    const fIdx = mustFind(block, FETCH_CALL);
    expect(vIdx).toBeLessThan(fIdx);
  });

  it('scrape command validates each URL before fetch in the loop', () => {
    const start = mustFind(WRITE_COMMANDS_SRC, "case 'scrape'");
    const block = WRITE_COMMANDS_SRC.slice(start);
    // First actual fetch call site in scrape, and the nearest gate
    // preceding it within the same branch.
    const fIdx = mustFind(block, FETCH_CALL);
    const vIdx = block.slice(0, fIdx).lastIndexOf(GATE_CALL);
    expect(vIdx).toBeGreaterThan(-1);
  });
});