mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-06 13:45:35 +02:00
feat: add download, scrape, and archive commands
download: fetch any URL or @ref element to disk using browser session cookies via page.request.fetch(). Supports blob: URLs via in-page base64 conversion. --base64 flag returns inline data URI (cap 10MB). Detects HLS/DASH and rejects with yt-dlp hint. scrape: bulk media download composing media discovery + download loop. Sequential with 100ms delay, URL deduplication, configurable --limit. Writes manifest.json with per-file metadata for machine consumption. archive: saves complete page as MHTML via CDP Page.captureSnapshot. No silent fallback -- errors clearly if CDP unavailable. All three are WRITE scope (write to disk, blocked in watch mode). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -25,6 +25,7 @@ export const WRITE_COMMANDS = new Set([
|
||||
'viewport', 'cookie', 'cookie-import', 'cookie-import-browser', 'header', 'useragent',
|
||||
'upload', 'dialog-accept', 'dialog-dismiss',
|
||||
'style', 'cleanup', 'prettyscreenshot',
|
||||
'download', 'scrape', 'archive',
|
||||
]);
|
||||
|
||||
export const META_COMMANDS = new Set([
|
||||
@@ -104,6 +105,10 @@ export const COMMAND_DESCRIPTIONS: Record<string, { category: string; descriptio
|
||||
'useragent': { category: 'Interaction', description: 'Set user agent', usage: 'useragent <string>' },
|
||||
'dialog-accept': { category: 'Interaction', description: 'Auto-accept next alert/confirm/prompt. Optional text is sent as the prompt response', usage: 'dialog-accept [text]' },
|
||||
'dialog-dismiss': { category: 'Interaction', description: 'Auto-dismiss next dialog' },
|
||||
// Data extraction
|
||||
'download': { category: 'Extraction', description: 'Download URL or media element to disk using browser cookies', usage: 'download <url|@ref> [path] [--base64]' },
|
||||
'scrape': { category: 'Extraction', description: 'Bulk download all media from page. Writes manifest.json', usage: 'scrape <images|videos|media> [--selector sel] [--dir path] [--limit N]' },
|
||||
'archive': { category: 'Extraction', description: 'Save complete page as MHTML via CDP', usage: 'archive [path]' },
|
||||
// Visual
|
||||
'screenshot': { category: 'Visual', description: 'Save screenshot (supports element crop via CSS/@ref, --clip region, --viewport)', usage: 'screenshot [--viewport] [--clip x,y,w,h] [selector|@ref] [path]' },
|
||||
'pdf': { category: 'Visual', description: 'Save as PDF', usage: 'pdf [path]' },
|
||||
|
||||
@@ -49,6 +49,7 @@ export const SCOPE_WRITE = new Set([
|
||||
'click', 'fill', 'select', 'hover', 'type', 'press', 'scroll', 'wait',
|
||||
'upload', 'viewport', 'newtab', 'closetab',
|
||||
'dialog-accept', 'dialog-dismiss',
|
||||
'download', 'scrape', 'archive',
|
||||
]);
|
||||
|
||||
/** Page-level power tools — JS execution, credential access, page mutations */
|
||||
|
||||
@@ -869,7 +869,230 @@ export async function handleWriteCommand(
|
||||
return parts.join(' ');
|
||||
}
|
||||
|
||||
case 'download': {
  // Fetch a URL (or a media element's source) to disk, reusing the browser
  // session's cookies. blob: URLs are converted in-page; --base64 returns
  // an inline data URI instead of writing a file.
  if (args.length === 0) throw new Error('Usage: download <url|@ref> [path] [--base64]');
  const inlineBase64 = args.includes('--base64');
  const positional = args.filter(a => a !== '--base64');
  let url = positional[0];
  const outputPath = positional[1];

  // @ref targets: pull the source URL off the element itself.
  if (url.startsWith('@')) {
    const resolved = await bm.resolveRef(url);
    if (!('locator' in resolved)) throw new Error(`Expected @ref, got CSS selector: ${url}`);
    // One in-page pass: report the tag plus the best available source URL.
    const info = await resolved.locator.evaluate(el => {
      const tag = el.tagName.toLowerCase();
      if (tag === 'img') {
        const img = el as HTMLImageElement;
        return { tag, src: img.currentSrc || img.src || img.getAttribute('data-src') || '' };
      }
      if (tag === 'video' || tag === 'audio') {
        const media = el as HTMLMediaElement;
        return { tag, src: media.currentSrc || media.src || '' };
      }
      // Anything else: fall back to a plain src attribute.
      return { tag, src: el.getAttribute('src') || '' };
    });
    if (!info.src) throw new Error(`Could not extract URL from ${positional[0]} (${info.tag})`);
    url = info.src;
  }

  // Adaptive streams cannot be saved as a single file.
  if (url.includes('.m3u8') || url.includes('.mpd')) {
    throw new Error('This is an HLS/DASH stream. Use yt-dlp or ffmpeg for adaptive stream downloads.');
  }

  const page = bm.getPage();
  let contentType = 'application/octet-stream';
  let buffer: Buffer;

  if (url.startsWith('blob:')) {
    // blob: URLs only resolve inside the page -- fetch there and ferry the
    // bytes out as a base64 data URL (capped at 100MB in-page).
    const dataUrl = await page.evaluate(async (blobUrl) => {
      try {
        const resp = await fetch(blobUrl);
        const blob = await resp.blob();
        if (blob.size > 100 * 1024 * 1024) return 'ERROR:TOO_LARGE';
        return await new Promise<string>((resolve, reject) => {
          const reader = new FileReader();
          reader.onloadend = () => resolve(reader.result as string);
          reader.onerror = () => reject('Failed to read blob');
          reader.readAsDataURL(blob);
        });
      } catch {
        return 'ERROR:EXPIRED';
      }
    }, url);

    if (dataUrl === 'ERROR:TOO_LARGE') throw new Error('Blob too large (>100MB). Use a different approach.');
    if (dataUrl === 'ERROR:EXPIRED') throw new Error('Blob URL expired or inaccessible.');

    const parsed = dataUrl.match(/^data:([^;]+);base64,(.+)$/);
    if (!parsed) throw new Error('Failed to decode blob data');
    contentType = parsed[1];
    buffer = Buffer.from(parsed[2], 'base64');
  } else {
    // Regular URL: fetch through Playwright's request context so the
    // session cookies apply.
    const response = await page.request.fetch(url, { timeout: 30000 });
    const status = response.status();
    if (status >= 400) {
      throw new Error(`Download failed: HTTP ${status} ${response.statusText()}`);
    }
    contentType = response.headers()['content-type'] || 'application/octet-stream';
    buffer = Buffer.from(await response.body());
    if (buffer.length > 200 * 1024 * 1024) {
      throw new Error('File too large (>200MB).');
    }
  }

  // --base64: hand the bytes back inline instead of touching disk.
  if (inlineBase64) {
    if (buffer.length > 10 * 1024 * 1024) {
      throw new Error('File too large for --base64 (>10MB). Use disk download + GET /file instead.');
    }
    const mimeType = contentType.split(';')[0].trim();
    return `data:${mimeType};base64,${buffer.toString('base64')}`;
  }

  // Write to disk, deriving the extension from the content type.
  const baseType = contentType.split(';')[0];
  const ext = baseType.includes('/') ? mimeToExt(baseType.trim()) : '.bin';
  const destPath = outputPath || path.join(TEMP_DIR, `browse-download-${Date.now()}${ext}`);
  validateOutputPath(destPath);
  fs.writeFileSync(destPath, buffer);
  const sizeKB = Math.round(buffer.length / 1024);
  return `Downloaded: ${destPath} (${sizeKB}KB, ${contentType.split(';')[0].trim()})`;
}
|
||||
|
||||
case 'scrape': {
  // Bulk media download: discover media on the page, then fetch each URL
  // sequentially with the browser's session cookies. Writes manifest.json
  // describing every attempt (success or failure) for machine consumption.
  if (args.length === 0) throw new Error('Usage: scrape <images|videos|media> [--selector sel] [--dir path] [--limit N]');
  const mediaType = args[0];
  if (!['images', 'videos', 'media'].includes(mediaType)) {
    throw new Error(`Invalid type: ${mediaType}. Use: images, videos, or media`);
  }

  // Parse flags
  const selectorIdx = args.indexOf('--selector');
  const selector = selectorIdx >= 0 ? args[selectorIdx + 1] : undefined;
  const dirIdx = args.indexOf('--dir');
  const dir = dirIdx >= 0 ? args[dirIdx + 1] : path.join(TEMP_DIR, `browse-scrape-${Date.now()}`);
  const limitIdx = args.indexOf('--limit');
  // Default 50, hard cap 200. Invalid, zero, or negative values fall back
  // to the default (previously a negative --limit reached slice(0, -N) and
  // silently dropped items off the END of the list).
  const requestedLimit = limitIdx >= 0 ? parseInt(args[limitIdx + 1], 10) : NaN;
  const limit = Math.min(Number.isFinite(requestedLimit) && requestedLimit > 0 ? requestedLimit : 50, 200);

  validateOutputPath(dir);
  fs.mkdirSync(dir, { recursive: true });

  // Discover media on the active frame/page.
  const { extractMedia } = await import('./media-extract');
  const target = bm.getActiveFrameOrPage();
  const filter = mediaType === 'images' ? 'images' as const
    : mediaType === 'videos' ? 'videos' as const
    : undefined;
  const mediaResult = await extractMedia(target, { selector, filter });

  // Collect candidate URLs, deduplicated. Skipped: data: URIs (already
  // inline) and blob:/HLS/DASH videos (need in-page or external tooling).
  const urls: Array<{ url: string; type: string }> = [];
  const seen = new Set<string>();

  for (const img of mediaResult.images) {
    const url = img.currentSrc || img.src || img.dataSrc;
    if (url && !seen.has(url) && !url.startsWith('data:')) {
      seen.add(url);
      urls.push({ url, type: 'image' });
    }
  }
  for (const vid of mediaResult.videos) {
    const url = vid.currentSrc || vid.src;
    if (url && !seen.has(url) && !url.startsWith('blob:') && !vid.isHLS && !vid.isDASH) {
      seen.add(url);
      urls.push({ url, type: 'video' });
    }
  }
  for (const bg of mediaResult.backgroundImages) {
    if (bg.url && !seen.has(bg.url)) {
      seen.add(bg.url);
      urls.push({ url: bg.url, type: 'image' });
    }
  }

  const toDownload = urls.slice(0, limit);
  const page = bm.getPage();
  // Per-file manifest record; failed entries keep the source URL and the
  // error message so consumers can retry.
  type ManifestFile = { path: string | null; src: string; size: number; type: string; error?: string };
  const manifest = {
    url: page.url(),
    scraped_at: new Date().toISOString(),
    files: [] as ManifestFile[],
    total_size: 0,
    succeeded: 0,
    failed: 0,
  };

  const lines: string[] = [];
  for (let i = 0; i < toDownload.length; i++) {
    const { url, type } = toDownload[i];
    try {
      const response = await page.request.fetch(url, { timeout: 30000 });
      if (response.status() >= 400) throw new Error(`HTTP ${response.status()}`);
      const ct = response.headers()['content-type'] || 'application/octet-stream';
      const ext = mimeToExt(ct.split(';')[0].trim());
      const filename = `${type}-${String(i + 1).padStart(3, '0')}${ext}`;
      const filePath = path.join(dir, filename);
      const body = Buffer.from(await response.body());
      try {
        fs.writeFileSync(filePath, body);
      } catch (writeErr: any) {
        throw new Error(`Disk write failed: ${writeErr.message}`);
      }
      manifest.files.push({ path: filename, src: url, size: body.length, type: ct.split(';')[0].trim() });
      manifest.total_size += body.length;
      manifest.succeeded++;
      // FIX: progress line previously printed the literal "$(unknown)"
      // instead of the saved filename.
      lines.push(` [${i + 1}/${toDownload.length}] ${filename} (${Math.round(body.length / 1024)}KB)`);
    } catch (err: any) {
      manifest.files.push({ path: null, src: url, size: 0, type: '', error: err.message });
      manifest.failed++;
      lines.push(` [${i + 1}/${toDownload.length}] FAILED: ${err.message}`);
    }
    // Be polite to the origin: 100ms delay between downloads.
    if (i < toDownload.length - 1) await new Promise(r => setTimeout(r, 100));
  }

  // Machine-readable manifest alongside the files.
  fs.writeFileSync(path.join(dir, 'manifest.json'), JSON.stringify(manifest, null, 2));

  return `Scraped ${toDownload.length} items to ${dir}/\n${lines.join('\n')}\n\nSummary: ${manifest.succeeded} succeeded, ${manifest.failed} failed, ${Math.round(manifest.total_size / 1024)}KB total`;
}
|
||||
|
||||
case 'archive': {
  // Save the complete page (resources inlined) as MHTML via the Chrome
  // DevTools Protocol. Errors clearly if CDP is unavailable (non-Chromium
  // engines) -- no silent fallback.
  const page = bm.getPage();
  const outputPath = args[0] || path.join(TEMP_DIR, `browse-archive-${Date.now()}.mhtml`);
  validateOutputPath(outputPath);

  let data: string;
  try {
    const cdp = await page.context().newCDPSession(page);
    try {
      ({ data } = await cdp.send('Page.captureSnapshot', { format: 'mhtml' }));
    } finally {
      // FIX: always release the CDP session, even when the snapshot call
      // throws (previously the session leaked on failure).
      await cdp.detach().catch(() => {});
    }
  } catch (err: any) {
    throw new Error(`MHTML archive requires Chromium CDP. Use 'text' or 'html' for raw page content. (${err.message})`);
  }
  // FIX: write outside the CDP try/catch so a disk-write failure is
  // reported as such, not mislabeled as a missing-CDP problem.
  fs.writeFileSync(outputPath, data);
  return `Archive saved: ${outputPath} (${Math.round(data.length / 1024)}KB, MHTML)`;
}
|
||||
|
||||
default:
|
||||
throw new Error(`Unknown write command: ${command}`);
|
||||
}
|
||||
}
|
||||
|
||||
/** Map MIME type to file extension. */
|
||||
function mimeToExt(mime: string): string {
|
||||
const map: Record<string, string> = {
|
||||
'image/png': '.png', 'image/jpeg': '.jpg', 'image/gif': '.gif',
|
||||
'image/webp': '.webp', 'image/svg+xml': '.svg', 'image/avif': '.avif',
|
||||
'video/mp4': '.mp4', 'video/webm': '.webm', 'video/quicktime': '.mov',
|
||||
'audio/mpeg': '.mp3', 'audio/wav': '.wav', 'audio/ogg': '.ogg',
|
||||
'application/pdf': '.pdf', 'application/json': '.json',
|
||||
'text/html': '.html', 'text/plain': '.txt',
|
||||
};
|
||||
return map[mime] || '.bin';
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user