From cc63edb006cc3a836efda6838c01f13fc7a5feb3 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Tue, 7 Apr 2026 19:01:09 -1000 Subject: [PATCH] feat: add media and data commands for page content extraction media command: discovers all img/video/audio/background-image elements on the page. Returns JSON with URLs, dimensions, srcset, loading state, HLS/DASH detection. Supports --images/--videos/--audio filters and optional CSS selector scoping. data command: extracts structured data embedded in pages (JSON-LD, Open Graph, Twitter Cards, meta tags). One command returns product prices, article metadata, social share info without DOM scraping. Both are READ scope with untrusted content wrapping. Shared media-extract.ts helper for reuse by the upcoming scrape command. Co-Authored-By: Claude Opus 4.6 (1M context) --- browse/src/commands.ts | 4 + browse/src/media-extract.ts | 177 +++++++++++++++++++++++++++++++++++ browse/src/read-commands.ts | 70 ++++++++++++++ browse/src/token-registry.ts | 1 + 4 files changed, 252 insertions(+) create mode 100644 browse/src/media-extract.ts diff --git a/browse/src/commands.ts b/browse/src/commands.ts index ceb089f3..944b397c 100644 --- a/browse/src/commands.ts +++ b/browse/src/commands.ts @@ -16,6 +16,7 @@ export const READ_COMMANDS = new Set([ 'console', 'network', 'cookies', 'storage', 'perf', 'dialog', 'is', 'inspect', + 'media', 'data', ]); export const WRITE_COMMANDS = new Set([ @@ -46,6 +47,7 @@ export const ALL_COMMANDS = new Set([...READ_COMMANDS, ...WRITE_COMMANDS, ...MET export const PAGE_CONTENT_COMMANDS = new Set([ 'text', 'html', 'links', 'forms', 'accessibility', 'attrs', 'console', 'dialog', + 'media', 'data', ]); /** Wrap output from untrusted-content commands with trust boundary markers */ @@ -70,6 +72,8 @@ export const COMMAND_DESCRIPTIONS: Record' }, 'eval': { category: 'Inspection', description: 'Run JavaScript from file and return result as string (path must be under /tmp or cwd)', usage: 'eval ' }, diff --git 
a/browse/src/media-extract.ts b/browse/src/media-extract.ts new file mode 100644 index 00000000..4ff9b252 --- /dev/null +++ b/browse/src/media-extract.ts @@ -0,0 +1,177 @@ +/** + * Media extraction helper — shared between `media` (read) and `scrape` (write) commands. + * + * Runs page.evaluate() to discover all media elements on the page: + * - <img> with src, srcset, currentSrc, alt, dimensions, loading, data-src + * -