diff --git a/browse/src/commands.ts b/browse/src/commands.ts index 944b397c..eacdf0cd 100644 --- a/browse/src/commands.ts +++ b/browse/src/commands.ts @@ -25,6 +25,7 @@ export const WRITE_COMMANDS = new Set([ 'viewport', 'cookie', 'cookie-import', 'cookie-import-browser', 'header', 'useragent', 'upload', 'dialog-accept', 'dialog-dismiss', 'style', 'cleanup', 'prettyscreenshot', + 'download', 'scrape', 'archive', ]); export const META_COMMANDS = new Set([ @@ -104,6 +105,10 @@ export const COMMAND_DESCRIPTIONS: Record' }, 'dialog-accept': { category: 'Interaction', description: 'Auto-accept next alert/confirm/prompt. Optional text is sent as the prompt response', usage: 'dialog-accept [text]' }, 'dialog-dismiss': { category: 'Interaction', description: 'Auto-dismiss next dialog' }, + // Data extraction + 'download': { category: 'Extraction', description: 'Download URL or media element to disk using browser cookies', usage: 'download [path] [--base64]' }, + 'scrape': { category: 'Extraction', description: 'Bulk download all media from page. 
Writes manifest.json', usage: 'scrape [--selector sel] [--dir path] [--limit N]' }, + 'archive': { category: 'Extraction', description: 'Save complete page as MHTML via CDP', usage: 'archive [path]' }, // Visual 'screenshot': { category: 'Visual', description: 'Save screenshot (supports element crop via CSS/@ref, --clip region, --viewport)', usage: 'screenshot [--viewport] [--clip x,y,w,h] [selector|@ref] [path]' }, 'pdf': { category: 'Visual', description: 'Save as PDF', usage: 'pdf [path]' }, diff --git a/browse/src/token-registry.ts b/browse/src/token-registry.ts index 8e6976b1..56d3234d 100644 --- a/browse/src/token-registry.ts +++ b/browse/src/token-registry.ts @@ -49,6 +49,7 @@ export const SCOPE_WRITE = new Set([ 'click', 'fill', 'select', 'hover', 'type', 'press', 'scroll', 'wait', 'upload', 'viewport', 'newtab', 'closetab', 'dialog-accept', 'dialog-dismiss', + 'download', 'scrape', 'archive', ]); /** Page-level power tools — JS execution, credential access, page mutations */ diff --git a/browse/src/write-commands.ts b/browse/src/write-commands.ts index 07899616..4f7d0d31 100644 --- a/browse/src/write-commands.ts +++ b/browse/src/write-commands.ts @@ -869,7 +869,230 @@ export async function handleWriteCommand( return parts.join(' '); } + case 'download': { + if (args.length === 0) throw new Error('Usage: download [path] [--base64]'); + const isBase64 = args.includes('--base64'); + const filteredArgs = args.filter(a => a !== '--base64'); + let url = filteredArgs[0]; + const outputPath = filteredArgs[1]; + + // Resolve @ref to element src + if (url.startsWith('@')) { + const resolved = await bm.resolveRef(url); + if (!('locator' in resolved)) throw new Error(`Expected @ref, got CSS selector: ${url}`); + const locator = resolved.locator; + const tagName = await locator.evaluate(el => el.tagName.toLowerCase()); + if (tagName === 'img') { + url = await locator.evaluate(el => { + const img = el as HTMLImageElement; + return img.currentSrc || img.src || 
img.getAttribute('data-src') || ''; + }); + } else if (tagName === 'video') { + url = await locator.evaluate(el => (el as HTMLVideoElement).currentSrc || (el as HTMLVideoElement).src || ''); + } else if (tagName === 'audio') { + url = await locator.evaluate(el => (el as HTMLAudioElement).currentSrc || (el as HTMLAudioElement).src || ''); + } else { + // Try src attribute on any element + url = await locator.evaluate(el => el.getAttribute('src') || ''); + } + if (!url) throw new Error(`Could not extract URL from ${filteredArgs[0]} (${tagName})`); + } + + // Check for HLS/DASH + if (url.includes('.m3u8') || url.includes('.mpd')) { + throw new Error('This is an HLS/DASH stream. Use yt-dlp or ffmpeg for adaptive stream downloads.'); + } + + // Determine output path and extension + const page = bm.getPage(); + let contentType = 'application/octet-stream'; + let buffer: Buffer; + + if (url.startsWith('blob:')) { + // Strategy 3: Blob URL -- in-page fetch + base64 + const dataUrl = await page.evaluate(async (blobUrl) => { + try { + const resp = await fetch(blobUrl); + const blob = await resp.blob(); + if (blob.size > 100 * 1024 * 1024) return 'ERROR:TOO_LARGE'; + return new Promise((resolve, reject) => { + const reader = new FileReader(); + reader.onloadend = () => resolve(reader.result as string); + reader.onerror = () => reject('Failed to read blob'); + reader.readAsDataURL(blob); + }); + } catch { + return 'ERROR:EXPIRED'; + } + }, url); + + if (dataUrl === 'ERROR:TOO_LARGE') throw new Error('Blob too large (>100MB). 
Use a different approach.'); + if (dataUrl === 'ERROR:EXPIRED') throw new Error('Blob URL expired or inaccessible.'); + + const match = dataUrl.match(/^data:([^;]+);base64,(.+)$/); + if (!match) throw new Error('Failed to decode blob data'); + contentType = match[1]; + buffer = Buffer.from(match[2], 'base64'); + } else { + // Strategy 1: Direct URL via page.request.fetch() + const response = await page.request.fetch(url, { timeout: 30000 }); + const status = response.status(); + if (status >= 400) { + throw new Error(`Download failed: HTTP ${status} ${response.statusText()}`); + } + contentType = response.headers()['content-type'] || 'application/octet-stream'; + buffer = Buffer.from(await response.body()); + if (buffer.length > 200 * 1024 * 1024) { + throw new Error('File too large (>200MB).'); + } + } + + // --base64 mode: return inline + if (isBase64) { + if (buffer.length > 10 * 1024 * 1024) { + throw new Error('File too large for --base64 (>10MB). Use disk download + GET /file instead.'); + } + const mimeType = contentType.split(';')[0].trim(); + return `data:${mimeType};base64,${buffer.toString('base64')}`; + } + + // Write to disk + const ext = contentType.split(';')[0].includes('/') + ? mimeToExt(contentType.split(';')[0].trim()) + : '.bin'; + const destPath = outputPath || path.join(TEMP_DIR, `browse-download-${Date.now()}${ext}`); + validateOutputPath(destPath); + fs.writeFileSync(destPath, buffer); + const sizeKB = Math.round(buffer.length / 1024); + return `Downloaded: ${destPath} (${sizeKB}KB, ${contentType.split(';')[0].trim()})`; + } + + case 'scrape': { + if (args.length === 0) throw new Error('Usage: scrape [--selector sel] [--dir path] [--limit N]'); + const mediaType = args[0]; + if (!['images', 'videos', 'media'].includes(mediaType)) { + throw new Error(`Invalid type: ${mediaType}. Use: images, videos, or media`); + } + + // Parse flags + const selectorIdx = args.indexOf('--selector'); + const selector = selectorIdx >= 0 ? 
args[selectorIdx + 1] : undefined; + const dirIdx = args.indexOf('--dir'); + const dir = dirIdx >= 0 ? args[dirIdx + 1] : path.join(TEMP_DIR, `browse-scrape-${Date.now()}`); + const limitIdx = args.indexOf('--limit'); + const limit = Math.min(limitIdx >= 0 ? parseInt(args[limitIdx + 1], 10) || 50 : 50, 200); + + validateOutputPath(dir); + fs.mkdirSync(dir, { recursive: true }); + + const { extractMedia } = await import('./media-extract'); + const target = bm.getActiveFrameOrPage(); + const filter = mediaType === 'images' ? 'images' as const + : mediaType === 'videos' ? 'videos' as const + : undefined; + const mediaResult = await extractMedia(target, { selector, filter }); + + // Collect URLs to download + const urls: Array<{ url: string; type: string }> = []; + const seen = new Set(); + + for (const img of mediaResult.images) { + const url = img.currentSrc || img.src || img.dataSrc; + if (url && !seen.has(url) && !url.startsWith('data:')) { + seen.add(url); + urls.push({ url, type: 'image' }); + } + } + for (const vid of mediaResult.videos) { + const url = vid.currentSrc || vid.src; + if (url && !seen.has(url) && !url.startsWith('blob:') && !vid.isHLS && !vid.isDASH) { + seen.add(url); + urls.push({ url, type: 'video' }); + } + } + for (const bg of mediaResult.backgroundImages) { + if (bg.url && !seen.has(bg.url)) { + seen.add(bg.url); + urls.push({ url: bg.url, type: 'image' }); + } + } + + const toDownload = urls.slice(0, limit); + const page = bm.getPage(); + const manifest: any = { + url: page.url(), + scraped_at: new Date().toISOString(), + files: [] as any[], + total_size: 0, + succeeded: 0, + failed: 0, + }; + + const lines: string[] = []; + for (let i = 0; i < toDownload.length; i++) { + const { url, type } = toDownload[i]; + try { + const response = await page.request.fetch(url, { timeout: 30000 }); + if (response.status() >= 400) throw new Error(`HTTP ${response.status()}`); + const ct = response.headers()['content-type'] || 'application/octet-stream'; + 
const ext = mimeToExt(ct.split(';')[0].trim()); + const filename = `${type}-${String(i + 1).padStart(3, '0')}${ext}`; + const filePath = path.join(dir, filename); + const body = Buffer.from(await response.body()); + try { + fs.writeFileSync(filePath, body); + } catch (writeErr: any) { + throw new Error(`Disk write failed: ${writeErr.message}`); + } + manifest.files.push({ path: filename, src: url, size: body.length, type: ct.split(';')[0].trim() }); + manifest.total_size += body.length; + manifest.succeeded++; + lines.push(` [${i + 1}/${toDownload.length}] ${filename} (${Math.round(body.length / 1024)}KB)`); + } catch (err: any) { + manifest.files.push({ path: null, src: url, size: 0, type: '', error: err.message }); + manifest.failed++; + lines.push(` [${i + 1}/${toDownload.length}] FAILED: ${err.message}`); + } + // 100ms delay between downloads + if (i < toDownload.length - 1) await new Promise(r => setTimeout(r, 100)); + } + + // Write manifest + fs.writeFileSync(path.join(dir, 'manifest.json'), JSON.stringify(manifest, null, 2)); + + return `Scraped ${toDownload.length} items to ${dir}/\n${lines.join('\n')}\n\nSummary: ${manifest.succeeded} succeeded, ${manifest.failed} failed, ${Math.round(manifest.total_size / 1024)}KB total`; + } + + case 'archive': { + const page = bm.getPage(); + const outputPath = args[0] || path.join(TEMP_DIR, `browse-archive-${Date.now()}.mhtml`); + validateOutputPath(outputPath); + + try { + const cdp = await page.context().newCDPSession(page); + const { data } = await cdp.send('Page.captureSnapshot', { format: 'mhtml' }); + await cdp.detach(); + fs.writeFileSync(outputPath, data); + return `Archive saved: ${outputPath} (${Math.round(data.length / 1024)}KB, MHTML)`; + } catch (err: any) { + throw new Error(`MHTML archive requires Chromium CDP. Use 'text' or 'html' for raw page content. (${err.message})`); + } + } + default: throw new Error(`Unknown write command: ${command}`); } } + +/** Map MIME type to file extension. 
/**
 * Map a MIME type to a file extension (including the leading dot).
 *
 * The lookup is forgiving about spelling: anything after the first `;`
 * (e.g. `; charset=utf-8`) is dropped, surrounding whitespace is trimmed and
 * the comparison is case-insensitive, so a raw Content-Type header value can
 * be passed as-is.
 *
 * @param mime - MIME type, optionally with parameters.
 * @returns The matching extension, or `.bin` when the type is not in the table.
 */
function mimeToExt(mime: string): string {
  // A Map rather than an object literal: looking up keys such as "constructor"
  // or "toString" on a plain object returns inherited Object.prototype members
  // instead of falling through to the '.bin' default.
  const extensions = new Map<string, string>([
    ['image/png', '.png'],
    ['image/jpeg', '.jpg'],
    ['image/gif', '.gif'],
    ['image/webp', '.webp'],
    ['image/svg+xml', '.svg'],
    ['image/avif', '.avif'],
    ['video/mp4', '.mp4'],
    ['video/webm', '.webm'],
    ['video/quicktime', '.mov'],
    ['audio/mpeg', '.mp3'],
    ['audio/wav', '.wav'],
    ['audio/ogg', '.ogg'],
    ['application/pdf', '.pdf'],
    ['application/json', '.json'],
    ['text/html', '.html'],
    ['text/plain', '.txt'],
  ]);

  // Keep only the "type/subtype" part and normalise it before the lookup.
  const [rawType = ''] = mime.split(';');
  const normalized = rawType.trim().toLowerCase();
  return extensions.get(normalized) ?? '.bin';
}