From 8a60d99c74deba54e6710b6158b57861efff1d51 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sun, 26 Apr 2026 05:06:27 -0700 Subject: [PATCH] feat(browser-skills): bundled hackernews-frontpage reference skill Smallest interesting browser-skill: scrapes HN front page, returns 30 stories as JSON. No auth, stable HTML, fully fixture-tested. Files: SKILL.md frontmatter + prose script.ts exports parseStoriesFromHtml(html) main: goto + html + parse + JSON.stringify _lib/browse-client.ts vendored copy of the SDK fixtures/hn-2026-04-26.html captured front page (5 stories) script.test.ts 13 assertions against the fixture The parser is a pure function over HTML so script.test.ts runs without a daemon (just imports parseStoriesFromHtml and asserts). This exercises every Phase 1 component end-to-end: - browse-client SDK (script imports browse from ./_lib/) - 3-tier lookup (hackernews-frontpage lives in the bundled tier) - scoped tokens (read+write is enough for goto + html) - spawn lifecycle (\$B skill run hackernews-frontpage) - file-fixture testing (\$B skill test hackernews-frontpage) Co-Authored-By: Claude Opus 4.7 (1M context) --- browser-skills/hackernews-frontpage/SKILL.md | 52 ++++ .../_lib/browse-client.ts | 257 ++++++++++++++++++ .../fixtures/hn-2026-04-26.html | 52 ++++ .../hackernews-frontpage/script.test.ts | 105 +++++++ browser-skills/hackernews-frontpage/script.ts | 132 +++++++++ 5 files changed, 598 insertions(+) create mode 100644 browser-skills/hackernews-frontpage/SKILL.md create mode 100644 browser-skills/hackernews-frontpage/_lib/browse-client.ts create mode 100644 browser-skills/hackernews-frontpage/fixtures/hn-2026-04-26.html create mode 100644 browser-skills/hackernews-frontpage/script.test.ts create mode 100644 browser-skills/hackernews-frontpage/script.ts diff --git a/browser-skills/hackernews-frontpage/SKILL.md b/browser-skills/hackernews-frontpage/SKILL.md new file mode 100644 index 00000000..aa90258f --- /dev/null +++ 
b/browser-skills/hackernews-frontpage/SKILL.md @@ -0,0 +1,52 @@ +--- +name: hackernews-frontpage +description: Scrape the Hacker News front page (titles, points, comment counts). +host: news.ycombinator.com +trusted: true +source: human +version: 1.0.0 +args: [] +triggers: + - scrape hacker news frontpage + - scrape hn frontpage + - get hn top stories + - latest hacker news stories +--- + +# Hacker News front-page scraper + +Scrapes the Hacker News (`news.ycombinator.com`) front page and returns the +top 30 stories as JSON. Each story has its rank, title, link URL, point count, +and comment count. + +## Usage + +``` +$ $B skill run hackernews-frontpage +{ + "stories": [ + { "rank": 1, "title": "...", "url": "...", "points": 412, "comments": 87 }, + ... + ], + "count": 30 +} +``` + +## How it works + +1. Navigates to `https://news.ycombinator.com` via the daemon. +2. Reads the page HTML. +3. Parses each story row (HN's stable `tr.athing` structure) into a typed + `Story` record. +4. Emits a single JSON document on stdout. + +## Why this is the reference skill + +`hackernews-frontpage` is the smallest interesting browser-skill: no auth, +stable HTML, deterministic output, file-fixture-friendly. Every Phase 1 +component (SDK, scoped tokens, three-tier lookup, spawn lifecycle) is +exercised by `$B skill run hackernews-frontpage` and the bundled +`script.test.ts`. + +When the HN HTML rotates and our selectors break, the test fails against the +captured fixture before users notice. That's the point. diff --git a/browser-skills/hackernews-frontpage/_lib/browse-client.ts b/browser-skills/hackernews-frontpage/_lib/browse-client.ts new file mode 100644 index 00000000..a33681f7 --- /dev/null +++ b/browser-skills/hackernews-frontpage/_lib/browse-client.ts @@ -0,0 +1,257 @@ +/** + * browse-client — canonical SDK that browser-skill scripts import to drive the + * gstack daemon over loopback HTTP. + * + * Distribution model: + * This file is the canonical source. 
/** Options accepted by `resolveBrowseAuth` and the `BrowseClient` constructor. */
export interface BrowseClientOptions {
  /** Override port. Default: GSTACK_PORT env or state file. */
  port?: number;
  /** Override token. Default: GSTACK_SKILL_TOKEN env, then state file root token. */
  token?: string;
  /** Tab id to target (every command can scope to a tab). Default: BROWSE_TAB env or undefined (active tab). */
  tabId?: number;
  /** Per-request timeout in milliseconds. Default: 30_000. */
  timeoutMs?: number;
  /** Override state-file path. Default: BROWSE_STATE_FILE env or `<git-root>/.gstack/browse.json`. */
  stateFile?: string;
}

/** The daemon port and credential chosen by `resolveBrowseAuth`, plus where they came from. */
interface ResolvedAuth {
  port: number;
  token: string;
  /** 'env' covers explicit options and environment variables; 'state-file' is the fallback. */
  source: 'env' | 'state-file';
}

/**
 * Parse a TCP port from an environment string.
 * Returns undefined for anything that is not a plain decimal integer in
 * 1..65535, so values such as "80abc" are rejected instead of truncated.
 */
function parsePort(raw: string | undefined): number | undefined {
  const trimmed = raw?.trim();
  if (trimmed === undefined || !/^\d+$/.test(trimmed)) return undefined;
  const port = Number(trimmed);
  return port >= 1 && port <= 65535 ? port : undefined;
}

/**
 * Read `port` (number) and `token` (string) from a JSON state file.
 * A missing path, unreadable file, invalid JSON, or wrongly typed field
 * yields undefined for the affected value; this function never throws.
 */
function readStateFile(file: string | null | undefined): { port?: number; token?: string } {
  if (!file) return {};
  try {
    const data: unknown = JSON.parse(fs.readFileSync(file, 'utf-8'));
    if (typeof data !== 'object' || data === null) return {};
    const { port, token } = data as Record<string, unknown>;
    return {
      port: typeof port === 'number' ? port : undefined,
      token: typeof token === 'string' ? token : undefined,
    };
  } catch {
    // Best effort: the caller reports one clear error when nothing resolves.
    return {};
  }
}

/**
 * Resolve the daemon port + token. Throws a clear error if neither path works.
 *
 * Each value is resolved independently, explicit option first:
 *   1. `opts.port` / `opts.token`, else GSTACK_PORT / GSTACK_SKILL_TOKEN.
 *      When both values are known here the result has `source: 'env'`.
 *   2. Otherwise the state file (`opts.stateFile`, BROWSE_STATE_FILE, or the
 *      default path) supplies whatever the explicit options did not;
 *      the result has `source: 'state-file'`.
 *
 * @throws Error when no combination yields both a port and a token.
 */
export function resolveBrowseAuth(opts: BrowseClientOptions = {}): ResolvedAuth {
  // 1. Explicit options, then env vars (set by $B skill run when spawning).
  const envPort = opts.port ?? parsePort(process.env.GSTACK_PORT);
  const envToken = opts.token ?? (process.env.GSTACK_SKILL_TOKEN || undefined);
  if (envPort !== undefined && envToken !== undefined) {
    return { port: envPort, token: envToken, source: 'env' };
  }

  // 2. State file fallback (developer running `bun run script.ts` directly).
  const stateFile = opts.stateFile ?? process.env.BROWSE_STATE_FILE ?? defaultStateFile();
  const state = readStateFile(stateFile);
  const port = opts.port ?? state.port;
  const token = opts.token ?? state.token;
  if (port !== undefined && token !== undefined) {
    return { port, token, source: 'state-file' };
  }

  throw new Error(
    'browse-client: cannot find daemon port + token. Either spawn via `$B skill run` ' +
    '(sets GSTACK_PORT + GSTACK_SKILL_TOKEN) or run from a project with a live daemon ' +
    '(.gstack/browse.json must exist).'
  );
}

/**
 * Default state-file location: `.gstack/browse.json` under the git top-level
 * directory, or under the current working directory when `git rev-parse`
 * fails or prints nothing.
 */
function defaultStateFile(): string | null {
  let base = process.cwd();
  try {
    const proc = cp.spawnSync('git', ['rev-parse', '--show-toplevel'], { encoding: 'utf-8', timeout: 2000 });
    const root = proc.status === 0 ? proc.stdout.trim() : '';
    if (root) base = root;
  } catch {
    // Fall back to the working directory.
  }
  return path.join(base, '.gstack', 'browse.json');
}

/** Error thrown by `BrowseClient` for transport failures and non-2xx responses. */
export class BrowseClientError extends Error {
  /**
   * @param message Human-readable description, prefixed with `browse-client:` by callers in this module.
   * @param status HTTP status of the failed response, when there was one.
   * @param body Raw response body of the failed response, when there was one.
   */
  constructor(
    message: string,
    public readonly status?: number,
    public readonly body?: string,
  ) {
    super(message);
    this.name = 'BrowseClientError';
  }
}

/** Return `value[key]` when `value` is an object and that property is a string. */
function stringField(value: unknown, key: string): string | undefined {
  if (typeof value !== 'object' || value === null) return undefined;
  const field = (value as Record<string, unknown>)[key];
  return typeof field === 'string' ? field : undefined;
}

/** Map a failure thrown by `fetch` (or while reading the body) to a BrowseClientError. */
function toTransportError(err: unknown, cmd: string, port: number, timeoutMs: number): BrowseClientError {
  const name = stringField(err, 'name');
  if (name === 'TimeoutError' || name === 'AbortError') {
    return new BrowseClientError(`browse-client: command "${cmd}" timed out after ${timeoutMs}ms`);
  }
  // Node's fetch wraps the socket error as `err.cause`, so check both levels.
  // NOTE(review): 'ConnectionRefused' is believed to be the code Bun's fetch reports — confirm.
  const cause = typeof err === 'object' && err !== null ? (err as { cause?: unknown }).cause : undefined;
  const code = stringField(err, 'code') ?? stringField(cause, 'code');
  if (code === 'ECONNREFUSED' || code === 'ConnectionRefused') {
    return new BrowseClientError(`browse-client: daemon not running on port ${port}`);
  }
  return new BrowseClientError(`browse-client: ${stringField(err, 'message') ?? String(err)}`);
}

/** Extract a short detail string from a failed response body (JSON `error` field, else the first 200 chars). */
function describeErrorBody(text: string): string {
  try {
    const parsed: unknown = JSON.parse(text);
    if (typeof parsed === 'object' && parsed !== null) {
      const error = (parsed as Record<string, unknown>).error;
      if (!error) return '';
      return typeof error === 'string' ? error : JSON.stringify(error);
    }
  } catch {
    // Not JSON: fall through to the raw text.
  }
  return text.slice(0, 200);
}

/**
 * Thin client over the daemon's POST /command endpoint.
 *
 * Convenience methods cover the common cases (goto, click, text, snapshot,
 * etc.). For anything not exposed as a method, use `command(cmd, args)`.
 */
export class BrowseClient {
  readonly port: number;
  readonly token: string;
  readonly tabId?: number;
  readonly timeoutMs: number;

  /**
   * @param opts Overrides for port, token, tab id, timeout and state-file path.
   * @throws Error when `resolveBrowseAuth` cannot find a port + token.
   */
  constructor(opts: BrowseClientOptions = {}) {
    const auth = resolveBrowseAuth(opts);
    this.port = auth.port;
    this.token = auth.token;
    const envTab = process.env.BROWSE_TAB ? parseInt(process.env.BROWSE_TAB, 10) : undefined;
    const tab = opts.tabId ?? envTab;
    // A non-numeric BROWSE_TAB means "no tab selected" rather than a NaN tab id.
    this.tabId = tab !== undefined && !Number.isNaN(tab) ? tab : undefined;
    this.timeoutMs = opts.timeoutMs ?? 30_000;
  }

  // ─── Low-level dispatch ─────────────────────────────────────────

  /**
   * Send an arbitrary command; returns raw response text.
   *
   * @param cmd Daemon command name.
   * @param args Positional string arguments for the command.
   * @throws BrowseClientError on timeout, connection failure, or a non-2xx
   *   response (in which case `status` and `body` are populated).
   */
  async command(cmd: string, args: string[] = []): Promise<string> {
    const body = JSON.stringify({
      command: cmd,
      args,
      ...(this.tabId !== undefined ? { tabId: this.tabId } : {}),
    });

    let resp: Response;
    let text: string;
    try {
      resp = await fetch(`http://127.0.0.1:${this.port}/command`, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'Authorization': `Bearer ${this.token}`,
        },
        body,
        signal: AbortSignal.timeout(this.timeoutMs),
      });
      // Read inside the try: the abort signal can also fire while the body streams.
      text = await resp.text();
    } catch (err: unknown) {
      throw toTransportError(err, cmd, this.port, this.timeoutMs);
    }

    if (!resp.ok) {
      let message = `browse-client: command "${cmd}" failed with status ${resp.status}`;
      const detail = describeErrorBody(text);
      if (detail) message += `: ${detail}`;
      throw new BrowseClientError(message, resp.status, text);
    }
    return text;
  }

  // ─── Navigation ─────────────────────────────────────────────────

  async goto(url: string): Promise<string> { return this.command('goto', [url]); }
  async wait(arg: string): Promise<string> { return this.command('wait', [arg]); }

  // ─── Reading ────────────────────────────────────────────────────

  async text(selector?: string): Promise<string> {
    return this.command('text', selector ? [selector] : []);
  }
  async html(selector?: string): Promise<string> {
    return this.command('html', selector ? [selector] : []);
  }
  async links(): Promise<string> { return this.command('links'); }
  async forms(): Promise<string> { return this.command('forms'); }
  async accessibility(): Promise<string> { return this.command('accessibility'); }
  async attrs(selector: string): Promise<string> { return this.command('attrs', [selector]); }
  async media(...flags: string[]): Promise<string> { return this.command('media', flags); }
  async data(...flags: string[]): Promise<string> { return this.command('data', flags); }

  // ─── Interaction ────────────────────────────────────────────────

  async click(selector: string): Promise<string> { return this.command('click', [selector]); }
  async fill(selector: string, value: string): Promise<string> { return this.command('fill', [selector, value]); }
  async select(selector: string, value: string): Promise<string> { return this.command('select', [selector, value]); }
  async hover(selector: string): Promise<string> { return this.command('hover', [selector]); }
  async type(text: string): Promise<string> { return this.command('type', [text]); }
  async press(key: string): Promise<string> { return this.command('press', [key]); }
  async scroll(selector?: string): Promise<string> {
    return this.command('scroll', selector ? [selector] : []);
  }

  // ─── Snapshot + screenshot ──────────────────────────────────────

  /** Snapshot returns the ARIA tree. Pass flags like '-i' (interactive only), '-c' (compact). */
  async snapshot(...flags: string[]): Promise<string> { return this.command('snapshot', flags); }
  async screenshot(...args: string[]): Promise<string> { return this.command('screenshot', args); }
}
/**
 * Default singleton. The underlying `BrowseClient` (and therefore auth
 * resolution) is created on the first method call and reused afterwards, so a
 * script can import `browse` and immediately `await browse.goto(...)` without
 * constructing anything itself.
 */
class LazyBrowseClient {
  private client: BrowseClient | null = null;

  /** Return the cached client, constructing it on first use. */
  private resolve(): BrowseClient {
    this.client ??= new BrowseClient();
    return this.client;
  }

  // Mirror of the BrowseClient surface; every method forwards to the cached instance.

  command(cmd: string, args: string[] = []): Promise<string> {
    return this.resolve().command(cmd, args);
  }

  // Navigation
  goto(url: string): Promise<string> { return this.resolve().goto(url); }
  wait(arg: string): Promise<string> { return this.resolve().wait(arg); }

  // Reading
  text(selector?: string): Promise<string> { return this.resolve().text(selector); }
  html(selector?: string): Promise<string> { return this.resolve().html(selector); }
  links(): Promise<string> { return this.resolve().links(); }
  forms(): Promise<string> { return this.resolve().forms(); }
  accessibility(): Promise<string> { return this.resolve().accessibility(); }
  attrs(selector: string): Promise<string> { return this.resolve().attrs(selector); }
  media(...flags: string[]): Promise<string> { return this.resolve().media(...flags); }
  data(...flags: string[]): Promise<string> { return this.resolve().data(...flags); }

  // Interaction
  click(selector: string): Promise<string> { return this.resolve().click(selector); }
  fill(selector: string, value: string): Promise<string> { return this.resolve().fill(selector, value); }
  select(selector: string, value: string): Promise<string> { return this.resolve().select(selector, value); }
  hover(selector: string): Promise<string> { return this.resolve().hover(selector); }
  type(text: string): Promise<string> { return this.resolve().type(text); }
  press(key: string): Promise<string> { return this.resolve().press(key); }
  scroll(selector?: string): Promise<string> { return this.resolve().scroll(selector); }

  // Snapshot + screenshot
  snapshot(...flags: string[]): Promise<string> { return this.resolve().snapshot(...flags); }
  screenshot(...args: string[]): Promise<string> { return this.resolve().screenshot(...args); }
}

export const browse = new LazyBrowseClient();
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
1.Show HN: A toy compiler in 200 lines (example.com)
+ 412 points by alice 3 hours ago | hide | 87 comments
2.Database internals: writing an LSM tree (example.org)
+ 298 points by bob 4 hours ago | hide | 152 comments
3.Acme (YC W26) is hiring senior engineers (remote) (example.com)
+ 5 hours ago
4.Ask HN: What's your most underrated tool?
+ 156 points by carol 6 hours ago | hide | discuss
5.Why quantum & chess engines disagree (example.io)
+ 73 points by dave 7 hours ago | hide | 12 comments
+
/**
 * hackernews-frontpage script tests — exercise parseStoriesFromHtml against
 * the bundled HN fixture. No daemon, no network: the parser is a pure function
 * over HTML, so we test it directly.
 */

import { describe, it, expect } from 'bun:test';
import * as fs from 'fs';
import * as path from 'path';
import { parseStoriesFromHtml } from './script';

const FIXTURE = fs.readFileSync(
  path.join(__dirname, 'fixtures', 'hn-2026-04-26.html'),
  'utf-8',
);

describe('parseStoriesFromHtml against bundled HN fixture', () => {
  it('returns 5 stories (matching the fixture)', () => {
    const stories = parseStoriesFromHtml(FIXTURE);
    expect(stories).toHaveLength(5);
  });

  it('assigns 1-based ranks in document order', () => {
    const stories = parseStoriesFromHtml(FIXTURE);
    expect(stories.map(s => s.rank)).toEqual([1, 2, 3, 4, 5]);
  });

  it('extracts ids matching the tr.athing[id] attribute', () => {
    const stories = parseStoriesFromHtml(FIXTURE);
    expect(stories.map(s => s.id)).toEqual([
      '40000001', '40000002', '40000003', '40000004', '40000005',
    ]);
  });

  it('extracts titles and decodes HTML entities', () => {
    const stories = parseStoriesFromHtml(FIXTURE);
    expect(stories[0].title).toBe('Show HN: A toy compiler in 200 lines');
    expect(stories[1].title).toBe('Database internals: writing an LSM tree');
    expect(stories[3].title).toBe("Ask HN: What's your most underrated tool?");
    expect(stories[4].title).toBe('Why quantum & chess engines disagree');
  });

  it('extracts URLs and decodes ampersands', () => {
    const stories = parseStoriesFromHtml(FIXTURE);
    expect(stories[0].url).toBe('https://example.com/blog-post-1');
    expect(stories[1].url).toBe('https://example.org/database-internals');
    expect(stories[4].url).toBe('https://example.io/quantum&chess');
  });

  it('parses point counts as numbers', () => {
    const stories = parseStoriesFromHtml(FIXTURE);
    expect(stories[0].points).toBe(412);
    expect(stories[1].points).toBe(298);
    expect(stories[3].points).toBe(156);
    expect(stories[4].points).toBe(73);
  });

  it('parses comment counts as numbers', () => {
    const stories = parseStoriesFromHtml(FIXTURE);
    expect(stories[0].comments).toBe(87);
    expect(stories[1].comments).toBe(152);
    expect(stories[4].comments).toBe(12);
  });

  it('treats "discuss" links as 0 comments', () => {
    const stories = parseStoriesFromHtml(FIXTURE);
    expect(stories[3].comments).toBe(0);
  });

  it('returns null points + null comments for job postings', () => {
    const stories = parseStoriesFromHtml(FIXTURE);
    // Story #3 is the YC-hiring row in the fixture.
    expect(stories[2].title).toContain('YC W26');
    expect(stories[2].points).toBeNull();
    expect(stories[2].comments).toBeNull();
  });

  it('returns [] for empty HTML', () => {
    expect(parseStoriesFromHtml('')).toEqual([]);
  });

  it('returns [] for HTML with no story rows', () => {
    expect(parseStoriesFromHtml('<html><body>nothing here</body></html>')).toEqual([]);
  });

  it('does not fabricate stories from arbitrary tr.athing rows missing titleline', () => {
    // The row matches the tr.athing shape but carries no span.titleline link.
    const html = '<tr class="athing" id="1"><td>nothing</td></tr>';
    expect(parseStoriesFromHtml(html)).toEqual([]);
  });
});

describe('output shape', () => {
  it('every story has all required keys', () => {
    const stories = parseStoriesFromHtml(FIXTURE);
    for (const s of stories) {
      expect(typeof s.rank).toBe('number');
      expect(typeof s.id).toBe('string');
      expect(typeof s.title).toBe('string');
      expect(typeof s.url).toBe('string');
      // points/comments may be null for job rows
      expect(s.points === null || typeof s.points === 'number').toBe(true);
      expect(s.comments === null || typeof s.comments === 'number').toBe(true);
    }
  });
});
/** One front-page entry as emitted in the skill's JSON output. */
export interface Story {
  /** 1-based position among the stories returned, in document order. */
  rank: number;
  /** HN item id (the integer in `tr.athing[id]`). */
  id: string;
  title: string;
  /** Outbound URL the title links to. */
  url: string;
  /** null when the row has no score (job postings). */
  points: number | null;
  /** null when the row has no comments link (job postings). */
  comments: number | null;
}

/** Shape of the single JSON document written to stdout. */
export interface Output {
  stories: Story[];
  count: number;
}

const FRONT_PAGE_URL = 'https://news.ycombinator.com/';

// `<tr class="athing …" id="N"> … </tr>` — captures the id and the row body.
const ROW_RE = /<tr\b[^>]*\bclass="athing[^"]*"[^>]*\bid="(\d+)"[^>]*>([\s\S]*?)<\/tr>/g;
// `<span class="titleline"><a href="URL">title</a>` — captures href and inner HTML.
const TITLE_RE = /<span\b[^>]*\bclass="titleline"[^>]*>\s*<a\b[^>]*\bhref="([^"]+)"[^>]*>([\s\S]*?)<\/a>/;
// `<span class="score" …>N points</span>`
const SCORE_RE = /<span\b[^>]*\bclass="score"[^>]*>(\d+)\s*points?<\/span>/;
// `<a href="item?id=N">N comments</a>`; the separator may be a space or `&nbsp;`.
const COMMENTS_RE = /<a\b[^>]*\bhref="item\?id=\d+"[^>]*>(\d+)(?:&nbsp;|\s)*comments?<\/a>/;
const DISCUSS_RE = />\s*discuss\s*<\/a>/;

/**
 * Parse HN front-page HTML into Story[].
 *
 * Each story is a pair of rows:
 *
 *   <tr class="athing" id="N">
 *     ... <span class="titleline"><a href="URL">title</a> ...
 *   </tr>
 *   <tr>
 *     <td class="subtext"> <span class="score">N points</span>
 *       ... <a href="item?id=N">N comments</a>
 *   </tr>
 *
 * Job postings ("Foo (YC X25) is hiring...") omit the score and comments —
 * those fields come back as null. A "discuss" link counts as 0 comments.
 *
 * Rows without a title link are skipped and do not consume a rank, so ranks
 * are always the contiguous sequence 1..n.
 *
 * @param html Full page HTML (attributes are expected in double quotes).
 * @returns Stories in document order; an empty array when nothing matches.
 */
export function parseStoriesFromHtml(html: string): Story[] {
  const stories: Story[] = [];
  // First `tr.spacer` or `tr.athing` after a story row marks the end of its subtext.
  const boundaryRe = /<tr\b[^>]*\bclass="(?:spacer|athing)\b/g;

  for (const row of html.matchAll(ROW_RE)) {
    const id = row[1];
    const rowBody = row[2];

    const titleMatch = TITLE_RE.exec(rowBody);
    if (!titleMatch) continue;
    const url = decodeHtmlEntities(titleMatch[1]);
    // Strip markup BEFORE decoding: decoding first would turn escaped text
    // such as `&lt;canvas&gt;` into a tag and then delete it.
    const title = decodeHtmlEntities(stripTags(titleMatch[2])).trim();

    // Bound the lookahead to this story's own subtext row. Without the bound,
    // the score of story N+1 leaks into story N when N is a job posting.
    const subtextStart = (row.index ?? 0) + row[0].length;
    boundaryRe.lastIndex = subtextStart;
    const boundary = boundaryRe.exec(html);
    const subtext = html.slice(subtextStart, boundary ? boundary.index : html.length);

    let points: number | null = null;
    let comments: number | null = null;

    const scoreMatch = SCORE_RE.exec(subtext);
    if (scoreMatch) points = parseInt(scoreMatch[1], 10);

    const commentsMatch = COMMENTS_RE.exec(subtext);
    if (commentsMatch) {
      comments = parseInt(commentsMatch[1], 10);
    } else if (DISCUSS_RE.test(subtext)) {
      comments = 0;
    }

    stories.push({ rank: stories.length + 1, id, title, url, points, comments });
  }

  return stories;
}

/** Remove anything that looks like an HTML tag. */
function stripTags(s: string): string {
  return s.replace(/<[^>]*>/g, '');
}

// Named references this parser understands. `nbsp` maps to a plain space so
// callers can trim/compare titles without special-casing U+00A0.
const NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
  ['amp', '&'],
  ['quot', '"'],
  ['apos', "'"],
  ['lt', '<'],
  ['gt', '>'],
  ['nbsp', ' '],
]);

const ENTITY_RE = /&(?:#[xX]([0-9a-fA-F]+)|#(\d+)|([a-zA-Z][a-zA-Z0-9]*));/g;

/**
 * Decode HTML character references in a single pass.
 *
 * Handles the named references listed above plus any decimal (`&#39;`) or
 * hexadecimal (`&#x27;`) numeric reference. Unknown names and invalid code
 * points are left untouched. Because each reference is decoded exactly once,
 * `&amp;lt;` yields the text `&lt;`, not `<`.
 */
function decodeHtmlEntities(s: string): string {
  return s.replace(ENTITY_RE, (whole: string, hex?: string, dec?: string, name?: string) => {
    if (name !== undefined) return NAMED_ENTITIES.get(name) ?? whole;
    const codePoint = hex !== undefined ? parseInt(hex, 16) : parseInt(dec ?? '', 10);
    const isScalar =
      codePoint > 0 && codePoint <= 0x10ffff && !(codePoint >= 0xd800 && codePoint <= 0xdfff);
    return isScalar ? String.fromCodePoint(codePoint) : whole;
  });
}
// ─── Main entry (only when run as a script, not when imported by tests) ─

if (import.meta.main) {
  try {
    await main();
  } catch (err: unknown) {
    // Output protocol: nothing on stdout, a one-line reason on stderr, nonzero exit.
    const reason = err instanceof Error ? err.message : String(err);
    process.stderr.write(`hackernews-frontpage: ${reason}\n`);
    process.exitCode = 1;
  }
}

/**
 * Fetch the front page through the daemon, parse it, and write the JSON
 * document `{ stories, count }` followed by a newline to stdout.
 *
 * @throws Error when the page yields zero stories: the front page is expected
 *   to list stories, so an empty parse is treated as a parse failure rather
 *   than reported as a successful `count: 0`.
 * @throws whatever `browse.goto` / `browse.html` throw when the daemon call fails.
 */
async function main(): Promise<void> {
  await browse.goto(FRONT_PAGE_URL);
  const html = await browse.html();
  const stories = parseStoriesFromHtml(html);
  if (stories.length === 0) {
    throw new Error(`parsed 0 stories from ${FRONT_PAGE_URL}; the page markup may have changed`);
  }
  const output: Output = { stories, count: stories.length };
  process.stdout.write(JSON.stringify(output) + '\n');
}