mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-02 11:45:20 +02:00
b73f364411
* refactor: extract path-security.ts shared module validateOutputPath, validateReadPath, and SAFE_DIRECTORIES were duplicated across write-commands.ts, meta-commands.ts, and read-commands.ts. Extract to a single shared module with re-exports for backward compatibility. Also adds validateTempPath() for the upcoming GET /file endpoint (TEMP_DIR only, not cwd, to prevent remote agents from reading project files). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat: default paired agents to full access, split SCOPE_CONTROL The trust boundary for paired agents is the pairing ceremony itself, not the scope. An agent with write scope can already click anything and navigate anywhere. Gating js/cookies behind --admin was security theater. Changes: - Default pair scopes: read+write+admin+meta (was read+write) - New SCOPE_CONTROL for browser-wide destructive ops (stop, restart, disconnect, state, handoff, resume, connect) - --admin flag now grants control scope (backward compat) - New --restrict flag for limited access (e.g., --restrict read) - Updated hint text: "re-pair with --control" instead of "--admin" Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat: add media and data commands for page content extraction media command: discovers all img/video/audio/background-image elements on the page. Returns JSON with URLs, dimensions, srcset, loading state, HLS/DASH detection. Supports --images/--videos/--audio filters and optional CSS selector scoping. data command: extracts structured data embedded in pages (JSON-LD, Open Graph, Twitter Cards, meta tags). One command returns product prices, article metadata, social share info without DOM scraping. Both are READ scope with untrusted content wrapping. Shared media-extract.ts helper for reuse by the upcoming scrape command. 
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat: add download, scrape, and archive commands download: fetch any URL or @ref element to disk using browser session cookies via page.request.fetch(). Supports blob: URLs via in-page base64 conversion. --base64 flag returns inline data URI (cap 10MB). Detects HLS/DASH and rejects with yt-dlp hint. scrape: bulk media download composing media discovery + download loop. Sequential with 100ms delay, URL deduplication, configurable --limit. Writes manifest.json with per-file metadata for machine consumption. archive: saves complete page as MHTML via CDP Page.captureSnapshot. No silent fallback -- errors clearly if CDP unavailable. All three are WRITE scope (write to disk, blocked in watch mode). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat: add GET /file endpoint for remote agent file retrieval Remote paired agents can now retrieve downloaded files over HTTP. TEMP_DIR only (not cwd) to prevent project file exfiltration. - Bearer token auth (root or scoped with read scope) - Path validation via validateTempPath() (symlink-aware) - 200MB size cap - Extension-based MIME detection - Zero-copy streaming via Bun.file() Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat: add scroll --times N for automated repeated scrolling Extends the scroll command with --times N flag for infinite feed scraping. Scrolls N times with configurable --wait delay (default 1000ms) between each scroll for content loading. Usage: scroll --times 10 scroll --times 5 --wait 2000 scroll --times 3 .feed-container Composable with scrape: scroll to load content, then scrape images. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat: add network response body capture (--capture/--export/--bodies) The killer feature for social media scraping. 
Extends the existing network command to intercept API response bodies: network --capture [--filter graphql] # start capturing network --capture stop # stop network --export /tmp/api.jsonl # export as JSONL network --bodies # show summary Uses page.on('response') listener with URL pattern filtering. SizeCappedBuffer (50MB total, 5MB per-entry cap) evicts oldest entries when full. Binary responses stored as base64, text as-is. This lets agents tap Instagram's GraphQL API, TikTok's hydration data, and any SPA's internal API responses instead of fragile DOM scraping. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat: add screenshot --base64 for inline image return Returns data:image/png;base64,... instead of writing to disk. Cap at 10MB. Works with all screenshot modes (element, clip, viewport). Eliminates the two-step screenshot+file-serve dance for remote agents. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * test: add data platform tests and media fixture Tests for SizeCappedBuffer (eviction, export, summary), validateTempPath (TEMP_DIR only, rejects cwd), command registration (all new commands in correct scope sets), and MIME mapping source checks. Rich HTML fixture with: standard images, lazy-loaded images, srcset, video with sources + HLS, audio, CSS background-images, JSON-LD, Open Graph, Twitter Cards, and meta tags. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * docs: regenerate SKILL.md with Extraction category Add Extraction category to browse command table ordering. Regenerate SKILL.md files to include media, data, download, scrape, archive commands in the generated documentation. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * chore: bump version and changelog (v0.16.0.0) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
180 lines
5.1 KiB
TypeScript
180 lines
5.1 KiB
TypeScript
/**
 * Network response body capture — SizeCappedBuffer + capture lifecycle.
 *
 * Architecture:
 *   page.on('response') listener → filter by URL pattern → store body
 *   SizeCappedBuffer: evicts oldest entries when total size exceeds cap
 *   Export: writes JSONL file (one response per line)
 *
 * Memory management:
 * - 50MB total buffer cap (configurable)
 * - 5MB per-entry body cap (larger responses stored as metadata only)
 * - Binary responses stored as base64
 * - Text responses stored as-is
 */
|
|
|
|
import * as fs from 'fs';
|
|
import type { Response as PlaywrightResponse } from 'playwright';
|
|
|
|
export interface CapturedResponse {
|
|
url: string;
|
|
status: number;
|
|
headers: Record<string, string>;
|
|
body: string;
|
|
contentType: string;
|
|
timestamp: number;
|
|
size: number;
|
|
bodyTruncated: boolean;
|
|
}
|
|
|
|
const MAX_BUFFER_SIZE = 50 * 1024 * 1024; // 50MB total
|
|
const MAX_ENTRY_SIZE = 5 * 1024 * 1024; // 5MB per response body
|
|
|
|
export class SizeCappedBuffer {
|
|
private entries: CapturedResponse[] = [];
|
|
private totalSize = 0;
|
|
private readonly maxSize: number;
|
|
|
|
constructor(maxSize = MAX_BUFFER_SIZE) {
|
|
this.maxSize = maxSize;
|
|
}
|
|
|
|
push(entry: CapturedResponse): void {
|
|
// Evict oldest entries until we have room
|
|
while (this.entries.length > 0 && this.totalSize + entry.size > this.maxSize) {
|
|
const evicted = this.entries.shift()!;
|
|
this.totalSize -= evicted.size;
|
|
}
|
|
this.entries.push(entry);
|
|
this.totalSize += entry.size;
|
|
}
|
|
|
|
toArray(): CapturedResponse[] {
|
|
return [...this.entries];
|
|
}
|
|
|
|
get length(): number {
|
|
return this.entries.length;
|
|
}
|
|
|
|
get byteSize(): number {
|
|
return this.totalSize;
|
|
}
|
|
|
|
clear(): void {
|
|
this.entries = [];
|
|
this.totalSize = 0;
|
|
}
|
|
|
|
/** Export to JSONL file. */
|
|
exportToFile(filePath: string): number {
|
|
const lines = this.entries.map(e => JSON.stringify(e));
|
|
fs.writeFileSync(filePath, lines.join('\n') + '\n');
|
|
return this.entries.length;
|
|
}
|
|
|
|
/** Summary of captured responses (URL, status, size). */
|
|
summary(): string {
|
|
if (this.entries.length === 0) return 'No captured responses.';
|
|
const lines = this.entries.map((e, i) =>
|
|
` [${i + 1}] ${e.status} ${e.url.slice(0, 100)} (${Math.round(e.size / 1024)}KB${e.bodyTruncated ? ', truncated' : ''})`
|
|
);
|
|
return `${this.entries.length} responses (${Math.round(this.totalSize / 1024)}KB total):\n${lines.join('\n')}`;
|
|
}
|
|
}
|
|
|
|
/** Global capture state. */
|
|
let captureBuffer = new SizeCappedBuffer();
|
|
let captureActive = false;
|
|
let captureFilter: RegExp | null = null;
|
|
let captureListener: ((response: PlaywrightResponse) => Promise<void>) | null = null;
|
|
|
|
export function isCaptureActive(): boolean {
|
|
return captureActive;
|
|
}
|
|
|
|
export function getCaptureBuffer(): SizeCappedBuffer {
|
|
return captureBuffer;
|
|
}
|
|
|
|
/** Create the response listener function. */
|
|
function createResponseListener(filter: RegExp | null): (response: PlaywrightResponse) => Promise<void> {
|
|
return async (response: PlaywrightResponse) => {
|
|
const url = response.url();
|
|
if (filter && !filter.test(url)) return;
|
|
|
|
// Skip non-content responses (redirects, 204, etc.)
|
|
const status = response.status();
|
|
if (status === 204 || status === 301 || status === 302 || status === 304) return;
|
|
|
|
const contentType = response.headers()['content-type'] || '';
|
|
let body = '';
|
|
let bodySize = 0;
|
|
let truncated = false;
|
|
|
|
try {
|
|
const rawBody = await response.body();
|
|
bodySize = rawBody.length;
|
|
|
|
if (bodySize > MAX_ENTRY_SIZE) {
|
|
truncated = true;
|
|
body = '';
|
|
} else if (contentType.includes('json') || contentType.includes('text') || contentType.includes('xml') || contentType.includes('html')) {
|
|
body = rawBody.toString('utf-8');
|
|
} else {
|
|
body = rawBody.toString('base64');
|
|
}
|
|
} catch {
|
|
// Response body may be unavailable (e.g., streaming, aborted)
|
|
body = '';
|
|
truncated = true;
|
|
}
|
|
|
|
const entry: CapturedResponse = {
|
|
url,
|
|
status,
|
|
headers: response.headers(),
|
|
body,
|
|
contentType,
|
|
timestamp: Date.now(),
|
|
size: bodySize,
|
|
bodyTruncated: truncated,
|
|
};
|
|
|
|
captureBuffer.push(entry);
|
|
};
|
|
}
|
|
|
|
/** Start capturing response bodies. */
|
|
export function startCapture(filterPattern?: string): { filter: string | null } {
|
|
captureFilter = filterPattern ? new RegExp(filterPattern) : null;
|
|
captureActive = true;
|
|
captureListener = createResponseListener(captureFilter);
|
|
return { filter: filterPattern || null };
|
|
}
|
|
|
|
/**
 * Get the active listener (to attach to page).
 *
 * Returns null when no session is running (before startCapture or after
 * stopCapture).
 */
export function getCaptureListener(): ((response: PlaywrightResponse) => Promise<void>) | null {
  return captureListener;
}
|
|
|
|
/** Stop capturing. */
|
|
export function stopCapture(): { count: number; sizeKB: number } {
|
|
captureActive = false;
|
|
captureListener = null;
|
|
return {
|
|
count: captureBuffer.length,
|
|
sizeKB: Math.round(captureBuffer.byteSize / 1024),
|
|
};
|
|
}
|
|
|
|
/**
 * Clear the capture buffer.
 *
 * Only empties stored entries; it does not stop an active capture session
 * (captureActive and the listener are untouched).
 */
export function clearCapture(): void {
  captureBuffer.clear();
}
|
|
|
|
/**
 * Export captured responses to JSONL file (one JSON object per line).
 *
 * @param filePath destination path, written synchronously.
 * @returns the number of responses written.
 */
export function exportCapture(filePath: string): number {
  return captureBuffer.exportToFile(filePath);
}
|