mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-02 03:35:09 +02:00
feat(browser-skills): bundled hackernews-frontpage reference skill
Smallest interesting browser-skill: scrapes HN front page, returns
30 stories as JSON. No auth, stable HTML, fully fixture-tested.
Files:
SKILL.md frontmatter + prose
script.ts exports parseStoriesFromHtml(html)
main: goto + html + parse + JSON.stringify
_lib/browse-client.ts vendored copy of the SDK
fixtures/hn-2026-04-26.html captured front page (5 stories)
script.test.ts 13 assertions against the fixture
The parser is a pure function over HTML so script.test.ts runs
without a daemon (just imports parseStoriesFromHtml and asserts).
This exercises every Phase 1 component end-to-end:
- browse-client SDK (script imports browse from ./_lib/)
- 3-tier lookup (hackernews-frontpage lives in the bundled tier)
- scoped tokens (read+write is enough for goto + html)
- spawn lifecycle (\$B skill run hackernews-frontpage)
- file-fixture testing (\$B skill test hackernews-frontpage)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,52 @@
|
||||
---
|
||||
name: hackernews-frontpage
|
||||
description: Scrape the Hacker News front page (titles, points, comment counts).
|
||||
host: news.ycombinator.com
|
||||
trusted: true
|
||||
source: human
|
||||
version: 1.0.0
|
||||
args: []
|
||||
triggers:
|
||||
- scrape hacker news frontpage
|
||||
- scrape hn frontpage
|
||||
- get hn top stories
|
||||
- latest hacker news stories
|
||||
---
|
||||
|
||||
# Hacker News front-page scraper
|
||||
|
||||
Scrapes the Hacker News (`news.ycombinator.com`) front page and returns the
top 30 stories as JSON. Each story has its rank, HN item id, title, link URL,
point count, and comment count (the last two are `null` for job postings).
|
||||
|
||||
## Usage
|
||||
|
||||
```
|
||||
$ $B skill run hackernews-frontpage
|
||||
{
|
||||
"stories": [
|
||||
{ "rank": 1, "id": "40000001", "title": "...", "url": "...", "points": 412, "comments": 87 },
|
||||
...
|
||||
],
|
||||
"count": 30
|
||||
}
|
||||
```
|
||||
|
||||
## How it works
|
||||
|
||||
1. Navigates to `https://news.ycombinator.com` via the daemon.
|
||||
2. Reads the page HTML.
|
||||
3. Parses each story row (HN's stable `tr.athing` structure) into a typed
|
||||
`Story` record.
|
||||
4. Emits a single JSON document on stdout.
|
||||
|
||||
## Why this is the reference skill
|
||||
|
||||
`hackernews-frontpage` is the smallest interesting browser-skill: no auth,
|
||||
stable HTML, deterministic output, file-fixture-friendly. Every Phase 1
|
||||
component (SDK, scoped tokens, three-tier lookup, spawn lifecycle) is
|
||||
exercised by `$B skill run hackernews-frontpage` and the bundled
|
||||
`script.test.ts`.
|
||||
|
||||
When HN's markup changes and our selectors break, re-capturing the fixture
makes the test fail before users notice. That's the point.
|
||||
@@ -0,0 +1,257 @@
|
||||
/**
|
||||
* browse-client — canonical SDK that browser-skill scripts import to drive the
|
||||
* gstack daemon over loopback HTTP.
|
||||
*
|
||||
* Distribution model:
|
||||
* This file is the canonical source. Each browser-skill ships a sibling
|
||||
* copy at `<skill>/_lib/browse-client.ts` (Phase 2's generator copies it
|
||||
* alongside every generated skill; Phase 1's bundled `hackernews-frontpage`
|
||||
* reference skill ships a hand-copied version). The skill imports the
|
||||
* sibling via relative path: `import { browse } from './_lib/browse-client'`.
|
||||
*
|
||||
* Why per-skill copies and not a single global SDK: each skill is fully
|
||||
* portable (copy the directory anywhere, it runs), version drift is
|
||||
* impossible (the SDK is frozen at the version the skill was authored
|
||||
* against), no npm publish workflow, no fixed-path tilde imports.
|
||||
*
|
||||
* Auth resolution:
|
||||
* 1. GSTACK_PORT + GSTACK_SKILL_TOKEN env vars (set by `$B skill run` when
|
||||
* spawning the script). The token is a per-spawn scoped capability bound
|
||||
* to read+write commands; it expires when the spawn ends.
|
||||
* 2. State file fallback: read `BROWSE_STATE_FILE` env or `<git-root>/.gstack/browse.json`
|
||||
* and use the `port` + `token` (the daemon root token). This path exists
|
||||
* for developers running a skill directly via `bun run script.ts` outside
|
||||
* the harness — your own authority, not an agent's.
|
||||
*
|
||||
* Trust:
|
||||
* The SDK exposes only the daemon's existing HTTP surface (POST /command).
|
||||
* No new capabilities. The token's scopes (read+write for spawned skills,
|
||||
* full root for standalone debug) determine what actually executes.
|
||||
*
|
||||
* Zero side effects on import. Safe to import from tests or plain scripts.
|
||||
*/
|
||||
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as cp from 'child_process';
|
||||
|
||||
/** Options accepted by `resolveBrowseAuth` and the `BrowseClient` constructor. */
export interface BrowseClientOptions {
  /** Override port. Default: GSTACK_PORT env or state file. */
  port?: number;
  /** Override token. Default: GSTACK_SKILL_TOKEN env, then state file root token. */
  token?: string;
  /** Tab id to target (every command can scope to a tab). Default: BROWSE_TAB env or undefined (active tab). */
  tabId?: number;
  /** Per-request timeout in milliseconds. Default: 30_000. */
  timeoutMs?: number;
  /** Override state-file path. Default: BROWSE_STATE_FILE env or <git-root>/.gstack/browse.json. */
  stateFile?: string;
}
|
||||
|
||||
/** Outcome of auth resolution: where the daemon listens and which bearer token to send. */
interface ResolvedAuth {
  /** Loopback port of the daemon. */
  port: number;
  /** Bearer token sent in the `Authorization` header. */
  token: string;
  /** Which resolution path supplied the credentials (explicit overrides report `'env'`). */
  source: 'env' | 'state-file';
}
|
||||
|
||||
/**
 * Resolve the daemon port + token. Throws a clear error if neither path works.
 *
 * Resolution:
 *   1. `port` and `token` are each taken from `opts` first, then from the
 *      `GSTACK_PORT` / `GSTACK_SKILL_TOKEN` env vars (set by `$B skill run`).
 *      The two fields are resolved independently, so an explicit `opts.port`
 *      still pairs with the scoped env token instead of silently falling back
 *      to the state file's root token.
 *   2. If either is still unknown, read the state file (`opts.stateFile`, then
 *      `BROWSE_STATE_FILE`, then the default path) and use its `port` / `token`,
 *      with `opts` values taking precedence.
 *
 * @param opts Optional explicit overrides.
 * @returns The resolved port, token, and which path produced them.
 * @throws Error when no path yields both a port and a token.
 */
export function resolveBrowseAuth(opts: BrowseClientOptions = {}): ResolvedAuth {
  // 1. Explicit overrides, then env vars (set by $B skill run when spawning).
  const envPort = parseEnvPort(process.env.GSTACK_PORT);
  const envToken = process.env.GSTACK_SKILL_TOKEN || undefined;
  const directPort = opts.port ?? envPort;
  const directToken = opts.token ?? envToken;
  if (directPort !== undefined && directToken !== undefined) {
    return { port: directPort, token: directToken, source: 'env' };
  }

  // 2. State file fallback (developer running `bun run script.ts` directly).
  const stateFile = opts.stateFile ?? process.env.BROWSE_STATE_FILE ?? defaultStateFile();
  if (stateFile) {
    const state = readStateFileAuth(stateFile);
    const port = opts.port ?? state.port;
    const token = opts.token ?? state.token;
    if (port !== undefined && token !== undefined) {
      return { port, token, source: 'state-file' };
    }
  }

  throw new Error(
    'browse-client: cannot find daemon port + token. Either spawn via `$B skill run` ' +
      '(sets GSTACK_PORT + GSTACK_SKILL_TOKEN) or run from a project with a live daemon ' +
      '(.gstack/browse.json must exist).'
  );
}

/** Parse a TCP port from an env var; anything that is not a plain integer in 1..65535 is rejected. */
function parseEnvPort(raw: string | undefined): number | undefined {
  if (raw === undefined) return undefined;
  const trimmed = raw.trim();
  if (!/^\d+$/.test(trimmed)) return undefined;
  const port = Number(trimmed);
  return port >= 1 && port <= 65535 ? port : undefined;
}

/** Read `port` / `token` from a state file; a missing, unreadable, or malformed file yields no fields. */
function readStateFileAuth(file: string): { port?: number; token?: string } {
  try {
    const data: unknown = JSON.parse(fs.readFileSync(file, 'utf-8'));
    if (typeof data !== 'object' || data === null) return {};
    const { port, token } = data as Record<string, unknown>;
    return {
      port: typeof port === 'number' ? port : undefined,
      token: typeof token === 'string' ? token : undefined,
    };
  } catch {
    // Best-effort: the caller reports a single clear error if nothing resolves.
    return {};
  }
}
|
||||
|
||||
/**
 * Default state-file location: `<git-root>/.gstack/browse.json`.
 *
 * The git root comes from `git rev-parse --show-toplevel` (2 s timeout). When
 * that fails for any reason (git missing, not a repository, timeout) the
 * current working directory is used as the base instead.
 *
 * The declared return type stays `string | null` for existing callers, although
 * every path through this function yields a string.
 */
function defaultStateFile(): string | null {
  let base = process.cwd();
  try {
    const proc = cp.spawnSync('git', ['rev-parse', '--show-toplevel'], { encoding: 'utf-8', timeout: 2000 });
    const root = proc.status === 0 ? proc.stdout.trim() : '';
    if (root) base = root;
  } catch {
    // Keep the cwd-based default.
  }
  return path.join(base, '.gstack', 'browse.json');
}
|
||||
|
||||
/**
 * Error thrown by `BrowseClient` for transport failures and non-2xx responses.
 *
 * `status` and `body` are populated only when the daemon actually answered;
 * they are undefined for timeouts and connection errors.
 */
export class BrowseClientError extends Error {
  constructor(
    message: string,
    /** HTTP status of the daemon response, when there was one. */
    public readonly status?: number,
    /** Raw response body text, when there was one. */
    public readonly body?: string,
  ) {
    super(message);
    this.name = 'BrowseClientError';
  }
}
|
||||
|
||||
/**
 * Thin client over the daemon's POST /command endpoint.
 *
 * Convenience methods cover the common cases (goto, click, text, snapshot,
 * etc.). For anything not exposed as a method, use `command(cmd, args)`.
 * Every method resolves to the raw response text and rejects with
 * `BrowseClientError` on timeout, connection failure, or a non-2xx status.
 */
export class BrowseClient {
  readonly port: number;
  readonly token: string;
  readonly tabId?: number;
  readonly timeoutMs: number;

  /**
   * @param opts Overrides for auth, tab targeting, and timeout. Auth is resolved
   *   eagerly via `resolveBrowseAuth`, so construction throws when no daemon
   *   credentials can be found.
   */
  constructor(opts: BrowseClientOptions = {}) {
    const auth = resolveBrowseAuth(opts);
    this.port = auth.port;
    this.token = auth.token;
    this.tabId = opts.tabId ?? (process.env.BROWSE_TAB ? parseInt(process.env.BROWSE_TAB, 10) : undefined);
    this.timeoutMs = opts.timeoutMs ?? 30_000;
  }

  // ─── Low-level dispatch ─────────────────────────────────────────

  /**
   * Send an arbitrary command; returns raw response text.
   *
   * @param cmd Daemon command name.
   * @param args Positional string arguments for the command.
   * @throws BrowseClientError on timeout, transport failure, or non-2xx status
   *   (`status` and `body` are set in the non-2xx case).
   */
  async command(cmd: string, args: string[] = []): Promise<string> {
    const payload: { command: string; args: string[]; tabId?: number } = { command: cmd, args };
    // A non-numeric BROWSE_TAB parses to NaN; omit the field rather than send it.
    if (this.tabId !== undefined && !Number.isNaN(this.tabId)) payload.tabId = this.tabId;

    let resp: Response;
    let text: string;
    try {
      resp = await fetch(`http://127.0.0.1:${this.port}/command`, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'Authorization': `Bearer ${this.token}`,
        },
        body: JSON.stringify(payload),
        signal: AbortSignal.timeout(this.timeoutMs),
      });
      // Reading the body can also abort on timeout, so it shares the same error mapping.
      text = await resp.text();
    } catch (err: unknown) {
      throw this.toClientError(cmd, err);
    }

    if (!resp.ok) {
      let message = `browse-client: command "${cmd}" failed with status ${resp.status}`;
      try {
        const parsed: unknown = JSON.parse(text);
        const detail =
          typeof parsed === 'object' && parsed !== null ? (parsed as { error?: unknown }).error : undefined;
        if (detail) message += `: ${typeof detail === 'string' ? detail : JSON.stringify(detail)}`;
      } catch {
        // Body is not JSON: include a short excerpt of the raw text instead.
        if (text) message += `: ${text.slice(0, 200)}`;
      }
      throw new BrowseClientError(message, resp.status, text);
    }
    return text;
  }

  /** Map a fetch/transport failure onto a `BrowseClientError` with an actionable message. */
  private toClientError(cmd: string, err: unknown): BrowseClientError {
    const e = (typeof err === 'object' && err !== null ? err : {}) as {
      name?: unknown;
      code?: unknown;
      message?: unknown;
      cause?: unknown;
    };
    if (e.name === 'TimeoutError' || e.name === 'AbortError') {
      return new BrowseClientError(`browse-client: command "${cmd}" timed out after ${this.timeoutMs}ms`);
    }
    // Node's fetch rejects with `TypeError: fetch failed` and puts the socket
    // error on `cause`, so check both levels for the refusal code.
    // NOTE(review): 'ConnectionRefused' is believed to be Bun's code for the same condition; confirm.
    const cause = (typeof e.cause === 'object' && e.cause !== null ? e.cause : {}) as { code?: unknown };
    const code = e.code ?? cause.code;
    if (code === 'ECONNREFUSED' || code === 'ConnectionRefused') {
      return new BrowseClientError(`browse-client: daemon not running on port ${this.port}`);
    }
    return new BrowseClientError(`browse-client: ${typeof e.message === 'string' ? e.message : String(err)}`);
  }

  // ─── Navigation ─────────────────────────────────────────────────

  async goto(url: string): Promise<string> { return this.command('goto', [url]); }
  async wait(arg: string): Promise<string> { return this.command('wait', [arg]); }

  // ─── Reading ────────────────────────────────────────────────────

  async text(selector?: string): Promise<string> {
    return this.command('text', selector ? [selector] : []);
  }
  async html(selector?: string): Promise<string> {
    return this.command('html', selector ? [selector] : []);
  }
  async links(): Promise<string> { return this.command('links'); }
  async forms(): Promise<string> { return this.command('forms'); }
  async accessibility(): Promise<string> { return this.command('accessibility'); }
  async attrs(selector: string): Promise<string> { return this.command('attrs', [selector]); }
  async media(...flags: string[]): Promise<string> { return this.command('media', flags); }
  async data(...flags: string[]): Promise<string> { return this.command('data', flags); }

  // ─── Interaction ────────────────────────────────────────────────

  async click(selector: string): Promise<string> { return this.command('click', [selector]); }
  async fill(selector: string, value: string): Promise<string> { return this.command('fill', [selector, value]); }
  async select(selector: string, value: string): Promise<string> { return this.command('select', [selector, value]); }
  async hover(selector: string): Promise<string> { return this.command('hover', [selector]); }
  async type(text: string): Promise<string> { return this.command('type', [text]); }
  async press(key: string): Promise<string> { return this.command('press', [key]); }
  async scroll(selector?: string): Promise<string> {
    return this.command('scroll', selector ? [selector] : []);
  }

  // ─── Snapshot + screenshot ──────────────────────────────────────

  /** Snapshot returns the ARIA tree. Pass flags like '-i' (interactive only), '-c' (compact). */
  async snapshot(...flags: string[]): Promise<string> { return this.command('snapshot', flags); }
  async screenshot(...args: string[]): Promise<string> { return this.command('screenshot', args); }
}
|
||||
|
||||
/**
 * Default singleton. Lazily resolves auth on first method call so a script can
 * import `browse` and immediately call `await browse.goto(...)` without
 * threading through a constructor.
 *
 * The underlying `BrowseClient` is created once, on the first call, and reused
 * for every later call. Importing this module therefore never touches env vars
 * or the state file.
 */
class LazyBrowseClient {
  private inner: BrowseClient | null = null;

  /** Create the shared client on first use, then keep returning the same instance. */
  private get(): BrowseClient {
    if (!this.inner) this.inner = new BrowseClient();
    return this.inner;
  }

  // Mirror the BrowseClient surface; each method delegates to the shared, lazily created instance.
  command(cmd: string, args: string[] = []) { return this.get().command(cmd, args); }
  goto(url: string) { return this.get().goto(url); }
  wait(arg: string) { return this.get().wait(arg); }
  text(selector?: string) { return this.get().text(selector); }
  html(selector?: string) { return this.get().html(selector); }
  links() { return this.get().links(); }
  forms() { return this.get().forms(); }
  accessibility() { return this.get().accessibility(); }
  attrs(selector: string) { return this.get().attrs(selector); }
  media(...flags: string[]) { return this.get().media(...flags); }
  data(...flags: string[]) { return this.get().data(...flags); }
  click(selector: string) { return this.get().click(selector); }
  fill(selector: string, value: string) { return this.get().fill(selector, value); }
  select(selector: string, value: string) { return this.get().select(selector, value); }
  hover(selector: string) { return this.get().hover(selector); }
  type(text: string) { return this.get().type(text); }
  press(key: string) { return this.get().press(key); }
  scroll(selector?: string) { return this.get().scroll(selector); }
  snapshot(...flags: string[]) { return this.get().snapshot(...flags); }
  screenshot(...args: string[]) { return this.get().screenshot(...args); }
}

export const browse = new LazyBrowseClient();
|
||||
@@ -0,0 +1,52 @@
|
||||
<!DOCTYPE html><html lang="en" op="news"><head><meta charset="utf-8"><title>Hacker News</title></head>
|
||||
<body><center><table id="hnmain" border="0" cellpadding="0" cellspacing="0" width="85%" bgcolor="#f6f6ef">
|
||||
<tr><td>
|
||||
<table border="0" cellpadding="0" cellspacing="0" class="itemlist">
|
||||
<tr class="athing submission" id="40000001">
|
||||
<td align="right" valign="top" class="title"><span class="rank">1.</span></td>
|
||||
<td valign="top" class="votelinks"><center><a id="up_40000001" href="vote?id=40000001"><div class="votearrow" title="upvote"></div></a></center></td>
|
||||
<td class="title"><span class="titleline"><a href="https://example.com/blog-post-1" rel="noreferrer">Show HN: A toy compiler in 200 lines</a> <span class="sitebit comhead"> (<a href="from?site=example.com"><span class="sitestr">example.com</span></a>)</span></span></td>
|
||||
</tr>
|
||||
<tr><td colspan="2"></td><td class="subtext"><span class="subline">
|
||||
<span class="score" id="score_40000001">412 points</span> by <a href="user?id=alice" class="hnuser">alice</a> <span class="age" title="2026-04-26T08:15:00"><a href="item?id=40000001">3 hours ago</a></span> <span id="unv_40000001"></span> | <a href="hide?id=40000001&goto=news">hide</a> | <a href="item?id=40000001">87 comments</a> </span></td></tr>
|
||||
<tr class="spacer" style="height:5px"></tr>
|
||||
|
||||
<tr class="athing submission" id="40000002">
|
||||
<td align="right" valign="top" class="title"><span class="rank">2.</span></td>
|
||||
<td valign="top" class="votelinks"><center><a id="up_40000002" href="vote?id=40000002"><div class="votearrow" title="upvote"></div></a></center></td>
|
||||
<td class="title"><span class="titleline"><a href="https://example.org/database-internals" rel="noreferrer">Database internals: writing an LSM tree</a> <span class="sitebit comhead"> (<a href="from?site=example.org"><span class="sitestr">example.org</span></a>)</span></span></td>
|
||||
</tr>
|
||||
<tr><td colspan="2"></td><td class="subtext"><span class="subline">
|
||||
<span class="score" id="score_40000002">298 points</span> by <a href="user?id=bob" class="hnuser">bob</a> <span class="age" title="2026-04-26T07:42:00"><a href="item?id=40000002">4 hours ago</a></span> <span id="unv_40000002"></span> | <a href="hide?id=40000002&goto=news">hide</a> | <a href="item?id=40000002">152 comments</a> </span></td></tr>
|
||||
<tr class="spacer" style="height:5px"></tr>
|
||||
|
||||
<tr class="athing submission" id="40000003">
|
||||
<td align="right" valign="top" class="title"><span class="rank">3.</span></td>
|
||||
<td valign="top" class="votelinks"><center><a id="up_40000003" href="vote?id=40000003"><div class="votearrow" title="upvote"></div></a></center></td>
|
||||
<td class="title"><span class="titleline"><a href="https://example.com/yc-w26-startup">Acme (YC W26) is hiring senior engineers (remote)</a> <span class="sitebit comhead"> (<a href="from?site=example.com"><span class="sitestr">example.com</span></a>)</span></span></td>
|
||||
</tr>
|
||||
<tr><td colspan="2"></td><td class="subtext"><span class="subline">
|
||||
<span class="age" title="2026-04-26T06:00:00"><a href="item?id=40000003">5 hours ago</a></span> </span></td></tr>
|
||||
<tr class="spacer" style="height:5px"></tr>
|
||||
|
||||
<tr class="athing submission" id="40000004">
|
||||
<td align="right" valign="top" class="title"><span class="rank">4.</span></td>
|
||||
<td valign="top" class="votelinks"><center><a id="up_40000004" href="vote?id=40000004"><div class="votearrow" title="upvote"></div></a></center></td>
|
||||
<td class="title"><span class="titleline"><a href="https://example.net/ask-hn" rel="noreferrer">Ask HN: What&#x27;s your most underrated tool?</a></span></td>
|
||||
</tr>
|
||||
<tr><td colspan="2"></td><td class="subtext"><span class="subline">
|
||||
<span class="score" id="score_40000004">156 points</span> by <a href="user?id=carol" class="hnuser">carol</a> <span class="age" title="2026-04-26T05:30:00"><a href="item?id=40000004">6 hours ago</a></span> <span id="unv_40000004"></span> | <a href="hide?id=40000004&goto=news">hide</a> | <a href="item?id=40000004">discuss</a> </span></td></tr>
|
||||
<tr class="spacer" style="height:5px"></tr>
|
||||
|
||||
<tr class="athing submission" id="40000005">
|
||||
<td align="right" valign="top" class="title"><span class="rank">5.</span></td>
|
||||
<td valign="top" class="votelinks"><center><a id="up_40000005" href="vote?id=40000005"><div class="votearrow" title="upvote"></div></a></center></td>
|
||||
<td class="title"><span class="titleline"><a href="https://example.io/quantum&amp;chess">Why quantum &amp; chess engines disagree</a> <span class="sitebit comhead"> (<a href="from?site=example.io"><span class="sitestr">example.io</span></a>)</span></span></td>
|
||||
</tr>
|
||||
<tr><td colspan="2"></td><td class="subtext"><span class="subline">
|
||||
<span class="score" id="score_40000005">73 points</span> by <a href="user?id=dave" class="hnuser">dave</a> <span class="age" title="2026-04-26T04:00:00"><a href="item?id=40000005">7 hours ago</a></span> <span id="unv_40000005"></span> | <a href="hide?id=40000005&goto=news">hide</a> | <a href="item?id=40000005">12 comments</a> </span></td></tr>
|
||||
<tr class="spacer" style="height:5px"></tr>
|
||||
|
||||
</table>
|
||||
</td></tr>
|
||||
</table></center></body></html>
|
||||
@@ -0,0 +1,105 @@
|
||||
/**
|
||||
* hackernews-frontpage script tests — exercise parseStoriesFromHtml against
|
||||
* the bundled HN fixture. No daemon, no network: the parser is a pure function
|
||||
* over HTML, so we test it directly.
|
||||
*/
|
||||
|
||||
import { describe, it, expect } from 'bun:test';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import { parseStoriesFromHtml } from './script';
|
||||
|
||||
// Captured HN front page (5 stories), read once at module load and shared by every test.
const FIXTURE = fs.readFileSync(
  path.join(__dirname, 'fixtures', 'hn-2026-04-26.html'),
  'utf-8',
);
|
||||
|
||||
// Parser behaviour against the bundled fixture. Fixture row 3 (index 2) is a
// job posting with no score or comments link; row 4 (index 3) has a "discuss" link.
describe('parseStoriesFromHtml against bundled HN fixture', () => {
  it('returns 5 stories (matching the fixture)', () => {
    const stories = parseStoriesFromHtml(FIXTURE);
    expect(stories).toHaveLength(5);
  });

  it('assigns 1-based ranks in document order', () => {
    const stories = parseStoriesFromHtml(FIXTURE);
    expect(stories.map(s => s.rank)).toEqual([1, 2, 3, 4, 5]);
  });

  it('extracts ids matching the tr.athing[id] attribute', () => {
    const stories = parseStoriesFromHtml(FIXTURE);
    expect(stories.map(s => s.id)).toEqual([
      '40000001', '40000002', '40000003', '40000004', '40000005',
    ]);
  });

  it('extracts titles and decodes HTML entities', () => {
    const stories = parseStoriesFromHtml(FIXTURE);
    expect(stories[0].title).toBe('Show HN: A toy compiler in 200 lines');
    expect(stories[1].title).toBe('Database internals: writing an LSM tree');
    expect(stories[3].title).toBe("Ask HN: What's your most underrated tool?");
    expect(stories[4].title).toBe('Why quantum & chess engines disagree');
  });

  it('extracts URLs and decodes ampersands', () => {
    const stories = parseStoriesFromHtml(FIXTURE);
    expect(stories[0].url).toBe('https://example.com/blog-post-1');
    expect(stories[1].url).toBe('https://example.org/database-internals');
    expect(stories[4].url).toBe('https://example.io/quantum&chess');
  });

  it('parses point counts as numbers', () => {
    const stories = parseStoriesFromHtml(FIXTURE);
    expect(stories[0].points).toBe(412);
    expect(stories[1].points).toBe(298);
    expect(stories[3].points).toBe(156);
    expect(stories[4].points).toBe(73);
  });

  it('parses comment counts as numbers', () => {
    const stories = parseStoriesFromHtml(FIXTURE);
    expect(stories[0].comments).toBe(87);
    expect(stories[1].comments).toBe(152);
    expect(stories[4].comments).toBe(12);
  });

  it('treats "discuss" links as 0 comments', () => {
    const stories = parseStoriesFromHtml(FIXTURE);
    expect(stories[3].comments).toBe(0);
  });

  it('returns null points + null comments for job postings', () => {
    const stories = parseStoriesFromHtml(FIXTURE);
    // Story #3 is the YC-hiring row in the fixture.
    expect(stories[2].title).toContain('YC W26');
    expect(stories[2].points).toBeNull();
    expect(stories[2].comments).toBeNull();
  });

  it('returns [] for empty HTML', () => {
    expect(parseStoriesFromHtml('')).toEqual([]);
  });

  it('returns [] for HTML with no story rows', () => {
    expect(parseStoriesFromHtml('<html><body><p>nothing here</p></body></html>')).toEqual([]);
  });

  it('does not fabricate stories from arbitrary tr.athing rows missing titleline', () => {
    const html = '<tr class="athing" id="999"><td>nothing</td></tr>';
    expect(parseStoriesFromHtml(html)).toEqual([]);
  });
});
|
||||
|
||||
// Shape contract: every parsed story carries all six fields with the expected runtime types.
describe('output shape', () => {
  it('every story has all required keys', () => {
    const stories = parseStoriesFromHtml(FIXTURE);
    for (const s of stories) {
      expect(typeof s.rank).toBe('number');
      expect(typeof s.id).toBe('string');
      expect(typeof s.title).toBe('string');
      expect(typeof s.url).toBe('string');
      // points/comments may be null for job rows
      expect(s.points === null || typeof s.points === 'number').toBe(true);
      expect(s.comments === null || typeof s.comments === 'number').toBe(true);
    }
  });
});
|
||||
@@ -0,0 +1,132 @@
|
||||
/**
|
||||
* hackernews-frontpage — scrape the HN front page and emit JSON.
|
||||
*
|
||||
* Output protocol:
|
||||
* stdout = a single JSON document on success: { stories: Story[], count }
|
||||
* stderr = anything we want logged (currently nothing)
|
||||
* exit 0 on success, nonzero on parse / network failure.
|
||||
*
|
||||
* The parser logic (`parseStoriesFromHtml`) is exported so script.test.ts can
|
||||
* exercise it against bundled HTML fixtures without spinning up the daemon.
|
||||
*/
|
||||
|
||||
import { browse } from './_lib/browse-client';
|
||||
|
||||
/** One front-page entry as emitted in the skill's JSON output. */
export interface Story {
  /** 1-based position of the story among the parsed rows, in document order. */
  rank: number;
  /** HN item id (the integer in `tr.athing[id]`), kept as a string. */
  id: string;
  /** Title text with tags removed and HTML entities decoded. */
  title: string;
  /** Outbound URL the title links to. */
  url: string;
  /** null when the row has no score (job postings). */
  points: number | null;
  /** null when the row has no comments link (job postings). */
  comments: number | null;
}
|
||||
|
||||
/** The single JSON document written to stdout on success. */
export interface Output {
  /** Parsed stories in document order. */
  stories: Story[];
  /** Number of entries in `stories`. */
  count: number;
}
|
||||
|
||||
const FRONT_PAGE_URL = 'https://news.ycombinator.com/';
|
||||
|
||||
// Story row: a <tr> whose class list contains `athing` and which has a numeric
// id. Lookaheads make the match independent of attribute order.
const ATHING_ROW = /<tr\b(?=[^>]*\bclass="[^"]*\bathing\b[^"]*")(?=[^>]*\bid="(\d+)")[^>]*>([\s\S]*?)<\/tr>/g;
// First anchor inside span.titleline: captures href and inner HTML.
const TITLE_LINK = /<span\b[^>]*\bclass="titleline"[^>]*>\s*<a\b[^>]*\bhref="([^"]+)"[^>]*>([\s\S]*?)<\/a>/;
// Where a story's subtext ends: the spacer row or the next story row, whichever comes first.
const SUBTEXT_END = /<tr\b[^>]*\bclass="(?:spacer\b|[^"]*\bathing\b)/;
const SCORE = /<span\b[^>]*\bclass="score"[^>]*>(\d+)\s*points?<\/span>/;
// `N comments`; the separator may be whitespace or a literal `&nbsp;`.
const COMMENT_COUNT = /<a\b[^>]*\bhref="item\?id=\d+"[^>]*>(\d+)(?:&nbsp;|\s)*comments?<\/a>/;
const DISCUSS_LINK = />\s*discuss\s*<\/a>/;

const NAMED_ENTITIES = new Map<string, string>([
  ['amp', '&'],
  ['quot', '"'],
  ['apos', "'"],
  ['lt', '<'],
  ['gt', '>'],
  ['nbsp', ' '],
]);

/**
 * Parse HN front-page HTML into Story[].
 *
 * Each story is a pair of rows:
 *   <tr class="athing submission" id="<itemid>">
 *     <td class="title"><span class="rank">N.</span></td>
 *     <td class="votelinks">...</td>
 *     <td class="title"><span class="titleline"><a href="<url>">title</a> ...</span></td>
 *   </tr>
 *   <tr><td colspan="2"></td><td class="subtext"><span class="subline">
 *     <span class="score" id="score_<itemid>">N points</span>
 *     ... <a href="item?id=<itemid>">N comments</a>
 *   </span></td></tr>
 *
 * Job postings ("Foo (YC X25) is hiring...") omit the score and comments —
 * those fields come back as null. A `discuss` link counts as 0 comments.
 *
 * Rows that match `tr.athing` but contain no title link are skipped and do not
 * consume a rank, so ranks are always 1..n with no gaps.
 *
 * @param html Raw page HTML; may be empty.
 * @returns Stories in document order; `[]` when no story rows are found.
 */
export function parseStoriesFromHtml(html: string): Story[] {
  const stories: Story[] = [];

  for (const row of html.matchAll(ATHING_ROW)) {
    const [whole, id, rowBody] = row;

    const titleMatch = TITLE_LINK.exec(rowBody);
    if (!titleMatch) continue;
    const url = decodeHtmlEntities(titleMatch[1]);
    // Strip markup BEFORE decoding so an escaped "&lt;canvas&gt;" in a title
    // survives as literal text instead of being removed as a tag.
    const title = decodeHtmlEntities(stripTags(titleMatch[2])).trim();

    // The subtext row follows the story row. Bound it at the spacer / next
    // story so a job posting (no score of its own) cannot pick up the score
    // of the story after it.
    const tail = html.slice((row.index ?? 0) + whole.length);
    const end = tail.search(SUBTEXT_END);
    const subtext = end >= 0 ? tail.slice(0, end) : tail;

    const scoreMatch = SCORE.exec(subtext);
    const points = scoreMatch ? parseInt(scoreMatch[1], 10) : null;

    let comments: number | null = null;
    const commentsMatch = COMMENT_COUNT.exec(subtext);
    if (commentsMatch) {
      comments = parseInt(commentsMatch[1], 10);
    } else if (DISCUSS_LINK.test(subtext)) {
      comments = 0;
    }

    stories.push({ rank: stories.length + 1, id, title, url, points, comments });
  }

  return stories;
}

/** Remove anything that looks like an HTML tag. */
function stripTags(s: string): string {
  return s.replace(/<[^>]*>/g, '');
}

/**
 * Decode the named entities HN emits (`&amp;`, `&quot;`, `&apos;`, `&lt;`,
 * `&gt;`, `&nbsp;`) plus decimal/hex numeric references such as `&#39;` and
 * `&#x27;`. Decoding is a single pass, so `&amp;lt;` becomes the text `&lt;`
 * rather than being decoded twice into `<`. Unknown entities are left as-is.
 */
function decodeHtmlEntities(s: string): string {
  return s.replace(
    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([a-zA-Z]+));/g,
    (whole: string, dec?: string, hex?: string, name?: string) => {
      if (dec !== undefined) return codePointOr(parseInt(dec, 10), whole);
      if (hex !== undefined) return codePointOr(parseInt(hex, 16), whole);
      return (name !== undefined ? NAMED_ENTITIES.get(name) : undefined) ?? whole;
    },
  );
}

/** Character for a valid Unicode code point, otherwise the original entity text. */
function codePointOr(code: number, fallback: string): string {
  return code > 0 && code <= 0x10ffff ? String.fromCodePoint(code) : fallback;
}
|
||||
|
||||
// ─── Main entry (only when run as a script, not when imported by tests) ─

if (import.meta.main) {
  await main();
}

/**
 * Navigate to the HN front page, parse it, and write `{ stories, count }` as
 * one JSON line on stdout.
 *
 * Failure handling follows the output protocol (nonzero exit on parse or
 * network failure):
 *   - errors from `browse.goto` / `browse.html` propagate out of the top-level
 *     `await` and are not caught here;
 *   - zero parsed stories means the selectors no longer match the page, so it
 *     is reported on stderr with exit code 1 instead of emitting an empty
 *     "successful" document.
 */
async function main(): Promise<void> {
  await browse.goto(FRONT_PAGE_URL);
  const html = await browse.html();
  const stories = parseStoriesFromHtml(html);
  if (stories.length === 0) {
    process.stderr.write('hackernews-frontpage: parsed 0 stories; the HN markup may have changed\n');
    process.exitCode = 1;
    return;
  }
  const output: Output = { stories, count: stories.length };
  process.stdout.write(JSON.stringify(output) + '\n');
}
|
||||
Reference in New Issue
Block a user