/** * Write commands — navigate and interact with pages (side effects) * * goto, back, forward, reload, click, fill, select, hover, type, * press, scroll, wait, viewport, cookie, header, useragent */ import type { TabSession } from './tab-session'; import type { BrowserManager } from './browser-manager'; import { findInstalledBrowsers, importCookies, importCookiesViaCdp, hasV20Cookies, listSupportedBrowserNames } from './cookie-import-browser'; import { generatePickerCode } from './cookie-picker-routes'; import { validateNavigationUrl } from './url-validation'; import { validateOutputPath, validateReadPath } from './path-security'; import * as fs from 'fs'; import * as path from 'path'; import type { SetContentWaitUntil } from './tab-session'; import { TEMP_DIR, isPathWithin } from './platform'; import { SAFE_DIRECTORIES } from './path-security'; import { modifyStyle, undoModification, resetModifications, getModificationHistory } from './cdp-inspector'; /** * Aggressive page cleanup selectors and heuristics. * Goal: make the page readable and clean while keeping it recognizable. * Inspired by uBlock Origin filter lists, Readability.js, and reader mode heuristics. 
*/
const CLEANUP_SELECTORS = {
  // Each key names a cleanup category; each value is a list of CSS selector
  // strings for that category. Most entries use `[attr*="…"]`, which matches
  // when the attribute value contains the given substring anywhere, so every
  // pattern here is deliberately loose rather than an exact class/id match.

  // Ad containers, ad-network iframes, and sponsored/promoted content.
  ads: [
    // Google Ads
    'ins.adsbygoogle', '[id^="google_ads"]', '[id^="div-gpt-ad"]',
    'iframe[src*="doubleclick"]', 'iframe[src*="googlesyndication"]',
    '[data-google-query-id]', '.google-auto-placed',
    // Generic ad patterns (uBlock Origin common filters)
    '[class*="ad-banner"]', '[class*="ad-wrapper"]', '[class*="ad-container"]',
    '[class*="ad-slot"]', '[class*="ad-unit"]', '[class*="ad-zone"]',
    '[class*="ad-placement"]', '[class*="ad-holder"]', '[class*="ad-block"]',
    '[class*="adbox"]', '[class*="adunit"]', '[class*="adwrap"]',
    '[id*="ad-banner"]', '[id*="ad-wrapper"]', '[id*="ad-container"]',
    '[id*="ad-slot"]', '[id*="ad_banner"]', '[id*="ad_container"]',
    '[data-ad]', '[data-ad-slot]', '[data-ad-unit]', '[data-adunit]',
    '[class*="sponsored"]', '[class*="Sponsored"]',
    '.ad', '.ads', '.advert', '.advertisement',
    '#ad', '#ads', '#advert', '#advertisement',
    // Common ad network iframes
    'iframe[src*="amazon-adsystem"]', 'iframe[src*="outbrain"]',
    'iframe[src*="taboola"]', 'iframe[src*="criteo"]',
    'iframe[src*="adsafeprotected"]', 'iframe[src*="moatads"]',
    // Promoted/sponsored content
    '[class*="promoted"]', '[class*="Promoted"]',
    '[data-testid*="promo"]', '[class*="native-ad"]',
    // Empty ad placeholders (divs with only ad classes, no real content)
    // NOTE(review): these two target <aside>/<section>, not <div>, and are plain
    // substring matches — `aside[class*="ad"]` also matches class names such as
    // "shadow", "header" or "loading", and `section[class*="ad-"]` matches
    // "read-more" or "lead-capture". Confirm this breadth is intended.
    'aside[class*="ad"]', 'section[class*="ad-"]',
  ],
  // Cookie / consent / privacy banners, including vendor-specific IDs and
  // classes (OneTrust, Cookiebot, TrustArc, Quantcast).
  cookies: [
    // Cookie consent frameworks
    '[class*="cookie-consent"]', '[class*="cookie-banner"]', '[class*="cookie-notice"]',
    '[id*="cookie-consent"]', '[id*="cookie-banner"]', '[id*="cookie-notice"]',
    '[class*="consent-banner"]', '[class*="consent-modal"]', '[class*="consent-wall"]',
    '[class*="gdpr"]', '[id*="gdpr"]', '[class*="GDPR"]',
    '[class*="CookieConsent"]', '[id*="CookieConsent"]',
    // OneTrust (very common)
    '#onetrust-consent-sdk', '.onetrust-pc-dark-filter', '#onetrust-banner-sdk',
    // Cookiebot
    '#CybotCookiebotDialog', '#CybotCookiebotDialogBodyUnderlay',
    // TrustArc / TRUSTe
    '#truste-consent-track', '.truste_overlay', '.truste_box_overlay',
    // Quantcast
    '.qc-cmp2-container', '#qc-cmp2-main',
    // Generic patterns
    '[class*="cc-banner"]', '[class*="cc-window"]', '[class*="cc-overlay"]',
    '[class*="privacy-banner"]', '[class*="privacy-notice"]',
    '[id*="privacy-banner"]', '[id*="privacy-notice"]',
    '[class*="accept-cookies"]', '[id*="accept-cookies"]',
  ],
  // Paywalls, signup/newsletter popups, interstitials, notification and survey
  // prompts, app-download banners, and cross-promotion widgets.
  overlays: [
    // Paywall / subscription overlays
    '[class*="paywall"]', '[class*="Paywall"]', '[id*="paywall"]',
    '[class*="subscribe-wall"]', '[class*="subscription-wall"]',
    '[class*="meter-wall"]', '[class*="regwall"]', '[class*="reg-wall"]',
    // Newsletter / signup popups
    '[class*="newsletter-popup"]', '[class*="newsletter-modal"]',
    '[class*="signup-modal"]', '[class*="signup-popup"]',
    '[class*="email-capture"]', '[class*="lead-capture"]',
    '[class*="popup-modal"]', '[class*="modal-overlay"]',
    // Interstitials
    '[class*="interstitial"]', '[id*="interstitial"]',
    // Push notification prompts
    '[class*="push-notification"]', '[class*="notification-prompt"]',
    '[class*="web-push"]',
    // Survey / feedback popups
    '[class*="survey-"]', '[class*="feedback-modal"]',
    '[id*="survey-"]', '[class*="nps-"]',
    // App download banners
    '[class*="app-banner"]', '[class*="smart-banner"]', '[class*="app-download"]',
    '[id*="branch-banner"]', '.smartbanner',
    // Cross-promotion / "follow us" / "preferred source" widgets
    '[class*="promo-banner"]', '[class*="cross-promo"]', '[class*="partner-promo"]',
    '[class*="preferred-source"]', '[class*="google-promo"]',
  ],
  // Non-article page furniture: audio players, game widgets, sidebar
  // recirculation lists, and related-article modules.
  clutter: [
    // Audio/podcast player widgets (not part of the article text)
    '[class*="audio-player"]', '[class*="podcast-player"]',
    '[class*="listen-widget"]', '[class*="everlit"]', '[class*="Everlit"]',
    'audio', // bare audio elements
    // Sidebar games/puzzles widgets
    '[class*="puzzle"]', '[class*="daily-game"]', '[class*="games-widget"]',
    '[class*="crossword-promo"]', '[class*="mini-game"]',
    // "Most Popular" / "Trending" sidebar recirculation (not the top nav trending bar)
    // These are descendant selectors scoped to <aside>, so matching elements
    // outside an <aside> are not selected.
    'aside [class*="most-popular"]', 'aside [class*="trending"]',
    'aside [class*="most-read"]', 'aside [class*="recommended"]',
    // Related articles / recirculation at bottom
    '[class*="related-articles"]', '[class*="more-stories"]',
    '[class*="recirculation"]', '[class*="taboola"]', '[class*="outbrain"]',
    // Hearst-specific (SF Chronicle, etc.)
    '[class*="nativo"]', '[data-tb-region]',
  ],
  // Intentionally empty: no selectors are listed for this category.
  sticky: [
    // Handled via JavaScript evaluation, not pure selectors
  ],
  // Share bars, embedded social-network widgets, and follow prompts.
  social: [
    '[class*="social-share"]', '[class*="share-buttons"]', '[class*="share-bar"]',
    '[class*="social-widget"]', '[class*="social-icons"]', '[class*="share-tools"]',
    'iframe[src*="facebook.com/plugins"]', 'iframe[src*="platform.twitter"]',
    '[class*="fb-like"]', '[class*="tweet-button"]',
    '[class*="addthis"]', '[class*="sharethis"]',
    // Follow prompts
    '[class*="follow-us"]', '[class*="social-follow"]',
  ],
};

export async function handleWriteCommand( command: string, args: string[], session: TabSession, bm: BrowserManager ): Promise { const page = session.getPage(); // Frame-aware target for locator-based operations (click, fill, etc.) const target = session.getActiveFrameOrPage(); const inFrame = session.getFrame() !== null; switch (command) { case 'goto': { if (inFrame) throw new Error('Cannot use goto inside a frame. Run \'frame main\' first.'); const url = args[0]; if (!url) throw new Error('Usage: browse goto '); // Clear loadedHtml BEFORE navigation — a timeout after the main-frame commit // must not leave stale content that could resurrect on a later context recreation. session.clearLoadedHtml(); const normalizedUrl = await validateNavigationUrl(url); const response = await page.goto(normalizedUrl, { waitUntil: 'domcontentloaded', timeout: 15000 }); const status = response?.status() || 'unknown'; return `Navigated to ${normalizedUrl} (${status})`; } case 'back': { if (inFrame) throw new Error('Cannot use back inside a frame. 
Run \'frame main\' first.'); session.clearLoadedHtml(); await page.goBack({ waitUntil: 'domcontentloaded', timeout: 15000 }); return `Back → ${page.url()}`; } case 'forward': { if (inFrame) throw new Error('Cannot use forward inside a frame. Run \'frame main\' first.'); session.clearLoadedHtml(); await page.goForward({ waitUntil: 'domcontentloaded', timeout: 15000 }); return `Forward → ${page.url()}`; } case 'reload': { if (inFrame) throw new Error('Cannot use reload inside a frame. Run \'frame main\' first.'); session.clearLoadedHtml(); await page.reload({ waitUntil: 'domcontentloaded', timeout: 15000 }); return `Reloaded ${page.url()}`; } case 'load-html': { if (inFrame) throw new Error('Cannot use load-html inside a frame. Run \'frame main\' first.'); // --from-file : read inline HTML from a JSON payload. Used by // make-pdf to dodge Windows argv size limits on large rendered HTML. // The JSON shape is { html: string, waitUntil?: "load"|"domcontentloaded"|"networkidle" }. // The safe-dirs + magic-byte + size-cap checks below still apply to the // INLINE HTML content, not to the payload file path itself. let fromFilePayload: { html: string; waitUntil?: SetContentWaitUntil } | null = null; let filePath: string | undefined; let waitUntil: SetContentWaitUntil = 'domcontentloaded'; for (let i = 0; i < args.length; i++) { if (args[i] === '--from-file') { const payloadPath = args[++i]; if (!payloadPath) throw new Error('load-html: --from-file requires a path'); // Parity with the sibling `load-html ` path below (line 249): // that branch runs every `file://` target through validateReadPath // so the safe-dirs policy can't be side-stepped. Same policy must // apply here — otherwise --from-file becomes a read-anywhere escape // hatch for any caller that can pick the payload path (e.g., an // MCP caller issuing load-html with an attacker-influenced path). 
try { validateReadPath(path.resolve(payloadPath)); } catch { throw new Error( `load-html: --from-file ${payloadPath} must be under ${SAFE_DIRECTORIES.join(' or ')} (security policy). Copy the payload into the project tree or /tmp first.` ); } const raw = fs.readFileSync(payloadPath, 'utf8'); let json: any; try { json = JSON.parse(raw); } catch (e: any) { throw new Error(`load-html: --from-file JSON parse failed: ${e.message}`); } if (typeof json.html !== 'string') { throw new Error('load-html: --from-file JSON must have a "html" string field'); } if (json.waitUntil && json.waitUntil !== 'load' && json.waitUntil !== 'domcontentloaded' && json.waitUntil !== 'networkidle') { throw new Error(`load-html: --from-file waitUntil '${json.waitUntil}' invalid`); } fromFilePayload = { html: json.html, waitUntil: json.waitUntil }; } else if (args[i] === '--wait-until') { const val = args[++i]; if (val !== 'load' && val !== 'domcontentloaded' && val !== 'networkidle') { throw new Error(`Invalid --wait-until '${val}'. Must be one of: load, domcontentloaded, networkidle.`); } waitUntil = val; } else if (args[i].startsWith('--')) { throw new Error(`Unknown flag: ${args[i]}`); } else if (!filePath) { filePath = args[i]; } } // Inline HTML path: validate size + magic byte, then setContent directly. if (fromFilePayload) { const MAX_BYTES = parseInt(process.env.GSTACK_BROWSE_MAX_HTML_BYTES || '', 10) || (50 * 1024 * 1024); if (Buffer.byteLength(fromFilePayload.html, 'utf8') > MAX_BYTES) { throw new Error( `load-html: --from-file html too large (> ${MAX_BYTES} bytes). ` + 'Raise with GSTACK_BROWSE_MAX_HTML_BYTES=.' ); } const peek = fromFilePayload.html.trimStart(); if (!/^<[a-zA-Z!?]/.test(peek)) { throw new Error('load-html: --from-file html does not start with a valid markup opener'); } const finalWaitUntil = fromFilePayload.waitUntil ?? 
waitUntil; await session.setTabContent(fromFilePayload.html, { waitUntil: finalWaitUntil }); return `Loaded HTML: (inline from --from-file, ${fromFilePayload.html.length} chars)`; } if (!filePath) throw new Error('Usage: browse load-html [--wait-until load|domcontentloaded|networkidle] [--tab-id ] | load-html --from-file [--tab-id ]'); // Extension allowlist const ALLOWED_EXT = ['.html', '.htm', '.xhtml', '.svg']; const ext = path.extname(filePath).toLowerCase(); if (!ALLOWED_EXT.includes(ext)) { throw new Error( `load-html: file does not appear to be HTML. Expected .html/.htm/.xhtml/.svg, got ${ext || '(no extension)'}. Rename the file if it's really HTML.` ); } const absolutePath = path.resolve(filePath); // Safe-dirs check (reuses canonical read-side policy) try { validateReadPath(absolutePath); } catch (e: any) { throw new Error( `load-html: ${absolutePath} must be under ${SAFE_DIRECTORIES.join(' or ')} (security policy). Copy the file into the project tree or /tmp first.` ); } // stat check — reject non-file targets with actionable error let stat: fs.Stats; try { stat = await fs.promises.stat(absolutePath); } catch (e: any) { if (e.code === 'ENOENT') { throw new Error( `load-html: file not found at ${absolutePath}. Check spelling or copy the file under ${process.cwd()} or ${TEMP_DIR}.` ); } throw e; } if (stat.isDirectory()) { throw new Error(`load-html: ${absolutePath} is a directory, not a file. Pass a .html file.`); } if (!stat.isFile()) { throw new Error(`load-html: ${absolutePath} is not a regular file.`); } // Size cap const MAX_BYTES = parseInt(process.env.GSTACK_BROWSE_MAX_HTML_BYTES || '', 10) || (50 * 1024 * 1024); if (stat.size > MAX_BYTES) { throw new Error( `load-html: file too large (${stat.size} bytes > ${MAX_BYTES} cap). 
Raise with GSTACK_BROWSE_MAX_HTML_BYTES= or split the HTML.` ); } // Single read: Buffer → magic-byte peek → utf-8 string const buf = await fs.promises.readFile(absolutePath); // Magic-byte check: strip UTF-8 BOM + leading whitespace, then verify the first // non-whitespace byte starts a markup construct. Accepts any ...` // which setContent wraps in a full document. Rejects binary files mis-renamed .html // (first byte won't be `<`). let peek = buf.slice(0, 200); if (peek[0] === 0xEF && peek[1] === 0xBB && peek[2] === 0xBF) { peek = peek.slice(3); } const peekStr = peek.toString('utf8').trimStart(); // Valid markup opener: '<' followed by alpha (tag), '!' (doctype/comment), or '?' (xml prolog) const looksLikeMarkup = /^<[a-zA-Z!?]/.test(peekStr); if (!looksLikeMarkup) { const hexDump = Array.from(buf.slice(0, 16)).map(b => b.toString(16).padStart(2, '0')).join(' '); throw new Error( `load-html: ${absolutePath} has ${ext} extension but content does not look like HTML. First bytes: ${hexDump}` ); } const html = buf.toString('utf8'); await session.setTabContent(html, { waitUntil }); return `Loaded HTML: ${absolutePath} (${stat.size} bytes)`; } case 'click': { const selector = args[0]; if (!selector) throw new Error('Usage: browse click '); // Auto-route: if ref points to a real