feat: add download, scrape, and archive commands

download: fetch any URL or @ref element to disk using browser session
cookies via page.request.fetch(). Supports blob: URLs via in-page
base64 conversion. --base64 flag returns inline data URI (cap 10MB).
Detects HLS/DASH and rejects with yt-dlp hint.

scrape: bulk media download composing media discovery + download loop.
Sequential with 100ms delay, URL deduplication, configurable --limit.
Writes manifest.json with per-file metadata for machine consumption.

archive: saves complete page as MHTML via CDP Page.captureSnapshot.
No silent fallback -- errors clearly if CDP unavailable.

All three are WRITE scope (write to disk, blocked in watch mode).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-04-07 19:02:34 -10:00
parent cc63edb006
commit c1581c9b0a
3 changed files with 229 additions and 0 deletions
+5
View File
@@ -25,6 +25,7 @@ export const WRITE_COMMANDS = new Set([
'viewport', 'cookie', 'cookie-import', 'cookie-import-browser', 'header', 'useragent',
'upload', 'dialog-accept', 'dialog-dismiss',
'style', 'cleanup', 'prettyscreenshot',
'download', 'scrape', 'archive',
]);
export const META_COMMANDS = new Set([
@@ -104,6 +105,10 @@ export const COMMAND_DESCRIPTIONS: Record<string, { category: string; descriptio
'useragent': { category: 'Interaction', description: 'Set user agent', usage: 'useragent <string>' },
'dialog-accept': { category: 'Interaction', description: 'Auto-accept next alert/confirm/prompt. Optional text is sent as the prompt response', usage: 'dialog-accept [text]' },
'dialog-dismiss': { category: 'Interaction', description: 'Auto-dismiss next dialog' },
// Data extraction
'download': { category: 'Extraction', description: 'Download URL or media element to disk using browser cookies', usage: 'download <url|@ref> [path] [--base64]' },
'scrape': { category: 'Extraction', description: 'Bulk download all media from page. Writes manifest.json', usage: 'scrape <images|videos|media> [--selector sel] [--dir path] [--limit N]' },
'archive': { category: 'Extraction', description: 'Save complete page as MHTML via CDP', usage: 'archive [path]' },
// Visual
'screenshot': { category: 'Visual', description: 'Save screenshot (supports element crop via CSS/@ref, --clip region, --viewport)', usage: 'screenshot [--viewport] [--clip x,y,w,h] [selector|@ref] [path]' },
'pdf': { category: 'Visual', description: 'Save as PDF', usage: 'pdf [path]' },
+1
View File
@@ -49,6 +49,7 @@ export const SCOPE_WRITE = new Set([
'click', 'fill', 'select', 'hover', 'type', 'press', 'scroll', 'wait',
'upload', 'viewport', 'newtab', 'closetab',
'dialog-accept', 'dialog-dismiss',
'download', 'scrape', 'archive',
]);
/** Page-level power tools — JS execution, credential access, page mutations */
+223
View File
@@ -869,7 +869,230 @@ export async function handleWriteCommand(
return parts.join(' ');
}
case 'download': {
// download <url|@ref> [path] [--base64]
// Fetches a URL -- or the media URL behind an @ref element -- through the
// browser session via page.request.fetch(), so session cookies apply.
// Three strategies: @ref resolution -> URL, blob: in-page decode, direct fetch.
if (args.length === 0) throw new Error('Usage: download <url|@ref> [path] [--base64]');
const isBase64 = args.includes('--base64');
// Strip the flag so positional args (url, path) keep their indices.
const filteredArgs = args.filter(a => a !== '--base64');
let url = filteredArgs[0];
const outputPath = filteredArgs[1];
// Resolve @ref to element src
// Per-tag resolution order: <img> currentSrc > src > data-src (lazy-load);
// <video>/<audio> currentSrc > src; any other element: raw src attribute.
if (url.startsWith('@')) {
const resolved = await bm.resolveRef(url);
if (!('locator' in resolved)) throw new Error(`Expected @ref, got CSS selector: ${url}`);
const locator = resolved.locator;
const tagName = await locator.evaluate(el => el.tagName.toLowerCase());
if (tagName === 'img') {
url = await locator.evaluate(el => {
const img = el as HTMLImageElement;
return img.currentSrc || img.src || img.getAttribute('data-src') || '';
});
} else if (tagName === 'video') {
url = await locator.evaluate(el => (el as HTMLVideoElement).currentSrc || (el as HTMLVideoElement).src || '');
} else if (tagName === 'audio') {
url = await locator.evaluate(el => (el as HTMLAudioElement).currentSrc || (el as HTMLAudioElement).src || '');
} else {
// Try src attribute on any element
url = await locator.evaluate(el => el.getAttribute('src') || '');
}
if (!url) throw new Error(`Could not extract URL from ${filteredArgs[0]} (${tagName})`);
}
// Check for HLS/DASH
// Substring match on manifest extensions; adaptive streams need a
// segment-aware downloader, so reject with a hint rather than saving a playlist.
if (url.includes('.m3u8') || url.includes('.mpd')) {
throw new Error('This is an HLS/DASH stream. Use yt-dlp or ffmpeg for adaptive stream downloads.');
}
// Determine output path and extension
const page = bm.getPage();
let contentType = 'application/octet-stream';
let buffer: Buffer;
if (url.startsWith('blob:')) {
// Strategy 3: Blob URL -- in-page fetch + base64
// blob: URLs are only resolvable inside the page context, so fetch there
// and shuttle the bytes out as a data: URI. Capped at 100MB in-page.
const dataUrl = await page.evaluate(async (blobUrl) => {
try {
const resp = await fetch(blobUrl);
const blob = await resp.blob();
if (blob.size > 100 * 1024 * 1024) return 'ERROR:TOO_LARGE';
return new Promise<string>((resolve, reject) => {
const reader = new FileReader();
reader.onloadend = () => resolve(reader.result as string);
reader.onerror = () => reject('Failed to read blob');
reader.readAsDataURL(blob);
});
} catch {
return 'ERROR:EXPIRED';
}
}, url);
if (dataUrl === 'ERROR:TOO_LARGE') throw new Error('Blob too large (>100MB). Use a different approach.');
if (dataUrl === 'ERROR:EXPIRED') throw new Error('Blob URL expired or inaccessible.');
// Split "data:<mime>;base64,<payload>" into MIME type and payload.
const match = dataUrl.match(/^data:([^;]+);base64,(.+)$/);
if (!match) throw new Error('Failed to decode blob data');
contentType = match[1];
buffer = Buffer.from(match[2], 'base64');
} else {
// Strategy 1: Direct URL via page.request.fetch()
// Uses the browser context's cookies/auth; 30s timeout.
const response = await page.request.fetch(url, { timeout: 30000 });
const status = response.status();
if (status >= 400) {
throw new Error(`Download failed: HTTP ${status} ${response.statusText()}`);
}
contentType = response.headers()['content-type'] || 'application/octet-stream';
buffer = Buffer.from(await response.body());
// Size cap checked after buffering -- the body is already in memory here.
if (buffer.length > 200 * 1024 * 1024) {
throw new Error('File too large (>200MB).');
}
}
// --base64 mode: return inline
// Tighter 10MB cap since the data URI travels through the command response.
if (isBase64) {
if (buffer.length > 10 * 1024 * 1024) {
throw new Error('File too large for --base64 (>10MB). Use disk download + GET /file instead.');
}
const mimeType = contentType.split(';')[0].trim();
return `data:${mimeType};base64,${buffer.toString('base64')}`;
}
// Write to disk
// Extension derived from Content-Type; mimeToExt itself falls back to
// '.bin' for unknown types, so the includes('/') guard is belt-and-braces.
const ext = contentType.split(';')[0].includes('/')
? mimeToExt(contentType.split(';')[0].trim())
: '.bin';
const destPath = outputPath || path.join(TEMP_DIR, `browse-download-${Date.now()}${ext}`);
validateOutputPath(destPath);
fs.writeFileSync(destPath, buffer);
const sizeKB = Math.round(buffer.length / 1024);
return `Downloaded: ${destPath} (${sizeKB}KB, ${contentType.split(';')[0].trim()})`;
}
case 'scrape': {
  // scrape <images|videos|media> [--selector sel] [--dir path] [--limit N]
  // Bulk-downloads media discovered on the page (sequentially, with a 100ms
  // delay between fetches), writing each file plus a manifest.json that
  // records per-file metadata and failures for machine consumption.
  if (args.length === 0) throw new Error('Usage: scrape <images|videos|media> [--selector sel] [--dir path] [--limit N]');
  const mediaType = args[0];
  if (!['images', 'videos', 'media'].includes(mediaType)) {
    throw new Error(`Invalid type: ${mediaType}. Use: images, videos, or media`);
  }
  // Parse flags
  const selectorIdx = args.indexOf('--selector');
  const selector = selectorIdx >= 0 ? args[selectorIdx + 1] : undefined;
  const dirIdx = args.indexOf('--dir');
  const dir = dirIdx >= 0 ? args[dirIdx + 1] : path.join(TEMP_DIR, `browse-scrape-${Date.now()}`);
  const limitIdx = args.indexOf('--limit');
  // Default 50 items, hard cap 200; a non-numeric --limit falls back to 50.
  const limit = Math.min(limitIdx >= 0 ? parseInt(args[limitIdx + 1], 10) || 50 : 50, 200);
  validateOutputPath(dir);
  fs.mkdirSync(dir, { recursive: true });
  const { extractMedia } = await import('./media-extract');
  const target = bm.getActiveFrameOrPage();
  const filter = mediaType === 'images' ? 'images' as const
    : mediaType === 'videos' ? 'videos' as const
    : undefined;
  const mediaResult = await extractMedia(target, { selector, filter });
  // Collect URLs to download, deduplicated. Skip inline data: URIs and
  // non-fetchable video sources (blob:, HLS/DASH adaptive streams).
  const urls: Array<{ url: string; type: string }> = [];
  const seen = new Set<string>();
  for (const img of mediaResult.images) {
    const url = img.currentSrc || img.src || img.dataSrc;
    if (url && !seen.has(url) && !url.startsWith('data:')) {
      seen.add(url);
      urls.push({ url, type: 'image' });
    }
  }
  for (const vid of mediaResult.videos) {
    const url = vid.currentSrc || vid.src;
    if (url && !seen.has(url) && !url.startsWith('blob:') && !vid.isHLS && !vid.isDASH) {
      seen.add(url);
      urls.push({ url, type: 'video' });
    }
  }
  for (const bg of mediaResult.backgroundImages) {
    // Exclude data: URIs here too, consistent with the <img> loop above.
    if (bg.url && !seen.has(bg.url) && !bg.url.startsWith('data:')) {
      seen.add(bg.url);
      urls.push({ url: bg.url, type: 'image' });
    }
  }
  const toDownload = urls.slice(0, limit);
  const page = bm.getPage();
  const manifest: any = {
    url: page.url(),
    scraped_at: new Date().toISOString(),
    files: [] as any[],
    total_size: 0,
    succeeded: 0,
    failed: 0,
  };
  const lines: string[] = [];
  for (let i = 0; i < toDownload.length; i++) {
    const { url, type } = toDownload[i];
    try {
      const response = await page.request.fetch(url, { timeout: 30000 });
      if (response.status() >= 400) throw new Error(`HTTP ${response.status()}`);
      const ct = response.headers()['content-type'] || 'application/octet-stream';
      const ext = mimeToExt(ct.split(';')[0].trim());
      // Zero-padded sequential names: image-001.png, video-002.mp4, ...
      const filename = `${type}-${String(i + 1).padStart(3, '0')}${ext}`;
      const filePath = path.join(dir, filename);
      const body = Buffer.from(await response.body());
      try {
        fs.writeFileSync(filePath, body);
      } catch (writeErr: any) {
        throw new Error(`Disk write failed: ${writeErr.message}`);
      }
      manifest.files.push({ path: filename, src: url, size: body.length, type: ct.split(';')[0].trim() });
      manifest.total_size += body.length;
      manifest.succeeded++;
      // Fix: the original emitted the literal text "$(unknown)" (shell-style
      // substitution, not a JS template) instead of interpolating the filename.
      lines.push(` [${i + 1}/${toDownload.length}] ${filename} (${Math.round(body.length / 1024)}KB)`);
    } catch (err: any) {
      manifest.files.push({ path: null, src: url, size: 0, type: '', error: err.message });
      manifest.failed++;
      lines.push(` [${i + 1}/${toDownload.length}] FAILED: ${err.message}`);
    }
    // 100ms delay between downloads
    if (i < toDownload.length - 1) await new Promise(r => setTimeout(r, 100));
  }
  // Write manifest
  fs.writeFileSync(path.join(dir, 'manifest.json'), JSON.stringify(manifest, null, 2));
  return `Scraped ${toDownload.length} items to ${dir}/\n${lines.join('\n')}\n\nSummary: ${manifest.succeeded} succeeded, ${manifest.failed} failed, ${Math.round(manifest.total_size / 1024)}KB total`;
}
case 'archive': {
  // archive [path]
  // Saves the complete page as MHTML via CDP Page.captureSnapshot.
  // Chromium-only: errors clearly instead of silently falling back.
  const page = bm.getPage();
  const outputPath = args[0] || path.join(TEMP_DIR, `browse-archive-${Date.now()}.mhtml`);
  validateOutputPath(outputPath);
  let data: string;
  try {
    const cdp = await page.context().newCDPSession(page);
    try {
      ({ data } = await cdp.send('Page.captureSnapshot', { format: 'mhtml' }));
    } finally {
      // Fix: always detach -- the original leaked the CDP session when
      // captureSnapshot threw. Detach failure itself is non-fatal.
      await cdp.detach().catch(() => {});
    }
  } catch (err: any) {
    throw new Error(`MHTML archive requires Chromium CDP. Use 'text' or 'html' for raw page content. (${err.message})`);
  }
  // Fix: write outside the try so a disk error propagates as-is rather than
  // being mislabeled as a CDP-unavailable failure.
  fs.writeFileSync(outputPath, data);
  return `Archive saved: ${outputPath} (${Math.round(data.length / 1024)}KB, MHTML)`;
}
default:
throw new Error(`Unknown write command: ${command}`);
}
}
/**
 * Map a bare MIME type (no parameters, e.g. 'image/png') to a file extension.
 * Callers strip any ';charset=...' suffix before calling. Unknown types fall
 * back to '.bin'.
 */
function mimeToExt(mime: string): string {
  const map: Record<string, string> = {
    'image/png': '.png', 'image/jpeg': '.jpg', 'image/gif': '.gif',
    'image/webp': '.webp', 'image/svg+xml': '.svg', 'image/avif': '.avif',
    'image/bmp': '.bmp', 'image/x-icon': '.ico',
    'video/mp4': '.mp4', 'video/webm': '.webm', 'video/quicktime': '.mov',
    'audio/mpeg': '.mp3', 'audio/wav': '.wav', 'audio/ogg': '.ogg',
    'audio/mp4': '.m4a', 'audio/webm': '.weba',
    'application/pdf': '.pdf', 'application/json': '.json',
    'application/zip': '.zip', 'application/xml': '.xml',
    'application/javascript': '.js',
    'text/html': '.html', 'text/plain': '.txt',
    'text/css': '.css', 'text/csv': '.csv',
  };
  // ?? (not ||): only a missing key falls through, per nullish-default idiom.
  return map[mime] ?? '.bin';
}