feat: add download, scrape, and archive commands

download: fetch any URL or @ref element to disk using browser session
cookies via page.request.fetch(). Supports blob: URLs via in-page
base64 conversion. --base64 flag returns inline data URI (cap 10MB).
Detects HLS/DASH and rejects with yt-dlp hint.

scrape: bulk media download composing media discovery + download loop.
Sequential with 100ms delay, URL deduplication, configurable --limit.
Writes manifest.json with per-file metadata for machine consumption.

archive: saves complete page as MHTML via CDP Page.captureSnapshot.
No silent fallback -- errors clearly if CDP unavailable.

All three are WRITE scope (write to disk, blocked in watch mode).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-04-07 19:02:34 -10:00
parent cc63edb006
commit c1581c9b0a
3 changed files with 229 additions and 0 deletions
+5
View File
@@ -25,6 +25,7 @@ export const WRITE_COMMANDS = new Set([
'viewport', 'cookie', 'cookie-import', 'cookie-import-browser', 'header', 'useragent',
'upload', 'dialog-accept', 'dialog-dismiss',
'style', 'cleanup', 'prettyscreenshot',
'download', 'scrape', 'archive',
]);
export const META_COMMANDS = new Set([
@@ -104,6 +105,10 @@ export const COMMAND_DESCRIPTIONS: Record<string, { category: string; descriptio
'useragent': { category: 'Interaction', description: 'Set user agent', usage: 'useragent <string>' },
'dialog-accept': { category: 'Interaction', description: 'Auto-accept next alert/confirm/prompt. Optional text is sent as the prompt response', usage: 'dialog-accept [text]' },
'dialog-dismiss': { category: 'Interaction', description: 'Auto-dismiss next dialog' },
// Data extraction
'download': { category: 'Extraction', description: 'Download URL or media element to disk using browser cookies', usage: 'download <url|@ref> [path] [--base64]' },
'scrape': { category: 'Extraction', description: 'Bulk download all media from page. Writes manifest.json', usage: 'scrape <images|videos|media> [--selector sel] [--dir path] [--limit N]' },
'archive': { category: 'Extraction', description: 'Save complete page as MHTML via CDP', usage: 'archive [path]' },
// Visual
'screenshot': { category: 'Visual', description: 'Save screenshot (supports element crop via CSS/@ref, --clip region, --viewport)', usage: 'screenshot [--viewport] [--clip x,y,w,h] [selector|@ref] [path]' },
'pdf': { category: 'Visual', description: 'Save as PDF', usage: 'pdf [path]' },
+1
View File
@@ -49,6 +49,7 @@ export const SCOPE_WRITE = new Set([
'click', 'fill', 'select', 'hover', 'type', 'press', 'scroll', 'wait',
'upload', 'viewport', 'newtab', 'closetab',
'dialog-accept', 'dialog-dismiss',
'download', 'scrape', 'archive',
]);
/** Page-level power tools — JS execution, credential access, page mutations */
+223
View File
@@ -869,7 +869,230 @@ export async function handleWriteCommand(
return parts.join(' ');
}
case 'download': {
// download <url|@ref> [path] [--base64]
// Fetches a URL -- or the media URL behind an @ref element -- through the
// browser session via page.request.fetch(), so session cookies apply.
// Three strategies: @ref resolution -> URL, blob: in-page decode, direct fetch.
if (args.length === 0) throw new Error('Usage: download <url|@ref> [path] [--base64]');
const isBase64 = args.includes('--base64');
// Strip the flag so positional args (url, path) keep their indices.
const filteredArgs = args.filter(a => a !== '--base64');
let url = filteredArgs[0];
const outputPath = filteredArgs[1];
// Resolve @ref to element src
// Per-tag resolution order: <img> currentSrc > src > data-src (lazy-load);
// <video>/<audio> currentSrc > src; any other element: raw src attribute.
if (url.startsWith('@')) {
const resolved = await bm.resolveRef(url);
if (!('locator' in resolved)) throw new Error(`Expected @ref, got CSS selector: ${url}`);
const locator = resolved.locator;
const tagName = await locator.evaluate(el => el.tagName.toLowerCase());
if (tagName === 'img') {
url = await locator.evaluate(el => {
const img = el as HTMLImageElement;
return img.currentSrc || img.src || img.getAttribute('data-src') || '';
});
} else if (tagName === 'video') {
url = await locator.evaluate(el => (el as HTMLVideoElement).currentSrc || (el as HTMLVideoElement).src || '');
} else if (tagName === 'audio') {
url = await locator.evaluate(el => (el as HTMLAudioElement).currentSrc || (el as HTMLAudioElement).src || '');
} else {
// Try src attribute on any element
url = await locator.evaluate(el => el.getAttribute('src') || '');
}
if (!url) throw new Error(`Could not extract URL from ${filteredArgs[0]} (${tagName})`);
}
// Check for HLS/DASH
// Substring match on manifest extensions; adaptive streams need a
// segment-aware downloader, so reject with a hint rather than saving a playlist.
if (url.includes('.m3u8') || url.includes('.mpd')) {
throw new Error('This is an HLS/DASH stream. Use yt-dlp or ffmpeg for adaptive stream downloads.');
}
// Determine output path and extension
const page = bm.getPage();
let contentType = 'application/octet-stream';
let buffer: Buffer;
if (url.startsWith('blob:')) {
// Strategy 3: Blob URL -- in-page fetch + base64
// blob: URLs are only resolvable inside the page context, so fetch there
// and shuttle the bytes out as a data: URI. Capped at 100MB in-page.
const dataUrl = await page.evaluate(async (blobUrl) => {
try {
const resp = await fetch(blobUrl);
const blob = await resp.blob();
if (blob.size > 100 * 1024 * 1024) return 'ERROR:TOO_LARGE';
return new Promise<string>((resolve, reject) => {
const reader = new FileReader();
reader.onloadend = () => resolve(reader.result as string);
reader.onerror = () => reject('Failed to read blob');
reader.readAsDataURL(blob);
});
} catch {
return 'ERROR:EXPIRED';
}
}, url);
if (dataUrl === 'ERROR:TOO_LARGE') throw new Error('Blob too large (>100MB). Use a different approach.');
if (dataUrl === 'ERROR:EXPIRED') throw new Error('Blob URL expired or inaccessible.');
// Split "data:<mime>;base64,<payload>" into MIME type and payload.
const match = dataUrl.match(/^data:([^;]+);base64,(.+)$/);
if (!match) throw new Error('Failed to decode blob data');
contentType = match[1];
buffer = Buffer.from(match[2], 'base64');
} else {
// Strategy 1: Direct URL via page.request.fetch()
// Uses the browser context's cookies/auth; 30s timeout.
const response = await page.request.fetch(url, { timeout: 30000 });
const status = response.status();
if (status >= 400) {
throw new Error(`Download failed: HTTP ${status} ${response.statusText()}`);
}
contentType = response.headers()['content-type'] || 'application/octet-stream';
buffer = Buffer.from(await response.body());
// Size cap checked after buffering -- the body is already in memory here.
if (buffer.length > 200 * 1024 * 1024) {
throw new Error('File too large (>200MB).');
}
}
// --base64 mode: return inline
// Tighter 10MB cap since the data URI travels through the command response.
if (isBase64) {
if (buffer.length > 10 * 1024 * 1024) {
throw new Error('File too large for --base64 (>10MB). Use disk download + GET /file instead.');
}
const mimeType = contentType.split(';')[0].trim();
return `data:${mimeType};base64,${buffer.toString('base64')}`;
}
// Write to disk
// Extension derived from Content-Type; mimeToExt itself falls back to
// '.bin' for unknown types, so the includes('/') guard is belt-and-braces.
const ext = contentType.split(';')[0].includes('/')
? mimeToExt(contentType.split(';')[0].trim())
: '.bin';
const destPath = outputPath || path.join(TEMP_DIR, `browse-download-${Date.now()}${ext}`);
validateOutputPath(destPath);
fs.writeFileSync(destPath, buffer);
const sizeKB = Math.round(buffer.length / 1024);
return `Downloaded: ${destPath} (${sizeKB}KB, ${contentType.split(';')[0].trim()})`;
}
case 'scrape': {
  // scrape <images|videos|media> [--selector sel] [--dir path] [--limit N]
  // Bulk-downloads media discovered on the page (sequentially, with a 100ms
  // delay between fetches), writing each file plus a manifest.json that
  // records per-file metadata and failures for machine consumption.
  if (args.length === 0) throw new Error('Usage: scrape <images|videos|media> [--selector sel] [--dir path] [--limit N]');
  const mediaType = args[0];
  if (!['images', 'videos', 'media'].includes(mediaType)) {
    throw new Error(`Invalid type: ${mediaType}. Use: images, videos, or media`);
  }
  // Parse flags
  const selectorIdx = args.indexOf('--selector');
  const selector = selectorIdx >= 0 ? args[selectorIdx + 1] : undefined;
  const dirIdx = args.indexOf('--dir');
  const dir = dirIdx >= 0 ? args[dirIdx + 1] : path.join(TEMP_DIR, `browse-scrape-${Date.now()}`);
  const limitIdx = args.indexOf('--limit');
  // Default 50 items, hard cap 200; a non-numeric --limit falls back to 50.
  const limit = Math.min(limitIdx >= 0 ? parseInt(args[limitIdx + 1], 10) || 50 : 50, 200);
  validateOutputPath(dir);
  fs.mkdirSync(dir, { recursive: true });
  const { extractMedia } = await import('./media-extract');
  const target = bm.getActiveFrameOrPage();
  const filter = mediaType === 'images' ? 'images' as const
    : mediaType === 'videos' ? 'videos' as const
    : undefined;
  const mediaResult = await extractMedia(target, { selector, filter });
  // Collect URLs to download, deduplicated. Skip inline data: URIs and
  // non-fetchable video sources (blob:, HLS/DASH adaptive streams).
  const urls: Array<{ url: string; type: string }> = [];
  const seen = new Set<string>();
  for (const img of mediaResult.images) {
    const url = img.currentSrc || img.src || img.dataSrc;
    if (url && !seen.has(url) && !url.startsWith('data:')) {
      seen.add(url);
      urls.push({ url, type: 'image' });
    }
  }
  for (const vid of mediaResult.videos) {
    const url = vid.currentSrc || vid.src;
    if (url && !seen.has(url) && !url.startsWith('blob:') && !vid.isHLS && !vid.isDASH) {
      seen.add(url);
      urls.push({ url, type: 'video' });
    }
  }
  for (const bg of mediaResult.backgroundImages) {
    // Exclude data: URIs here too, consistent with the <img> loop above.
    if (bg.url && !seen.has(bg.url) && !bg.url.startsWith('data:')) {
      seen.add(bg.url);
      urls.push({ url: bg.url, type: 'image' });
    }
  }
  const toDownload = urls.slice(0, limit);
  const page = bm.getPage();
  const manifest: any = {
    url: page.url(),
    scraped_at: new Date().toISOString(),
    files: [] as any[],
    total_size: 0,
    succeeded: 0,
    failed: 0,
  };
  const lines: string[] = [];
  for (let i = 0; i < toDownload.length; i++) {
    const { url, type } = toDownload[i];
    try {
      const response = await page.request.fetch(url, { timeout: 30000 });
      if (response.status() >= 400) throw new Error(`HTTP ${response.status()}`);
      const ct = response.headers()['content-type'] || 'application/octet-stream';
      const ext = mimeToExt(ct.split(';')[0].trim());
      // Zero-padded sequential names: image-001.png, video-002.mp4, ...
      const filename = `${type}-${String(i + 1).padStart(3, '0')}${ext}`;
      const filePath = path.join(dir, filename);
      const body = Buffer.from(await response.body());
      try {
        fs.writeFileSync(filePath, body);
      } catch (writeErr: any) {
        throw new Error(`Disk write failed: ${writeErr.message}`);
      }
      manifest.files.push({ path: filename, src: url, size: body.length, type: ct.split(';')[0].trim() });
      manifest.total_size += body.length;
      manifest.succeeded++;
      // Fix: the original emitted the literal text "$(unknown)" (shell-style
      // substitution, not a JS template) instead of interpolating the filename.
      lines.push(` [${i + 1}/${toDownload.length}] ${filename} (${Math.round(body.length / 1024)}KB)`);
    } catch (err: any) {
      manifest.files.push({ path: null, src: url, size: 0, type: '', error: err.message });
      manifest.failed++;
      lines.push(` [${i + 1}/${toDownload.length}] FAILED: ${err.message}`);
    }
    // 100ms delay between downloads
    if (i < toDownload.length - 1) await new Promise(r => setTimeout(r, 100));
  }
  // Write manifest
  fs.writeFileSync(path.join(dir, 'manifest.json'), JSON.stringify(manifest, null, 2));
  return `Scraped ${toDownload.length} items to ${dir}/\n${lines.join('\n')}\n\nSummary: ${manifest.succeeded} succeeded, ${manifest.failed} failed, ${Math.round(manifest.total_size / 1024)}KB total`;
}
case 'archive': {
  // archive [path]
  // Saves the complete page as MHTML via CDP Page.captureSnapshot.
  // Chromium-only: errors clearly instead of silently falling back.
  const page = bm.getPage();
  const outputPath = args[0] || path.join(TEMP_DIR, `browse-archive-${Date.now()}.mhtml`);
  validateOutputPath(outputPath);
  let data: string;
  try {
    const cdp = await page.context().newCDPSession(page);
    try {
      ({ data } = await cdp.send('Page.captureSnapshot', { format: 'mhtml' }));
    } finally {
      // Fix: always detach -- the original leaked the CDP session when
      // captureSnapshot threw. Detach failure itself is non-fatal.
      await cdp.detach().catch(() => {});
    }
  } catch (err: any) {
    throw new Error(`MHTML archive requires Chromium CDP. Use 'text' or 'html' for raw page content. (${err.message})`);
  }
  // Fix: write outside the try so a disk error propagates as-is rather than
  // being mislabeled as a CDP-unavailable failure.
  fs.writeFileSync(outputPath, data);
  return `Archive saved: ${outputPath} (${Math.round(data.length / 1024)}KB, MHTML)`;
}
default:
throw new Error(`Unknown write command: ${command}`);
}
}
/**
 * Map a bare MIME type (no parameters, e.g. 'image/png') to a file extension.
 * Callers strip any ';charset=...' suffix before calling. Unknown types fall
 * back to '.bin'.
 */
function mimeToExt(mime: string): string {
  const map: Record<string, string> = {
    'image/png': '.png', 'image/jpeg': '.jpg', 'image/gif': '.gif',
    'image/webp': '.webp', 'image/svg+xml': '.svg', 'image/avif': '.avif',
    'image/bmp': '.bmp', 'image/x-icon': '.ico',
    'video/mp4': '.mp4', 'video/webm': '.webm', 'video/quicktime': '.mov',
    'audio/mpeg': '.mp3', 'audio/wav': '.wav', 'audio/ogg': '.ogg',
    'audio/mp4': '.m4a', 'audio/webm': '.weba',
    'application/pdf': '.pdf', 'application/json': '.json',
    'application/zip': '.zip', 'application/xml': '.xml',
    'application/javascript': '.js',
    'text/html': '.html', 'text/plain': '.txt',
    'text/css': '.css', 'text/csv': '.csv',
  };
  // ?? (not ||): only a missing key falls through, per nullish-default idiom.
  return map[mime] ?? '.bin';
}