Files
gstack/browser-skills/hackernews-frontpage/script.ts
T
Garry Tan 8a60d99c74 feat(browser-skills): bundled hackernews-frontpage reference skill
Smallest interesting browser-skill: scrapes HN front page, returns
30 stories as JSON. No auth, stable HTML, fully fixture-tested.

Files:
  SKILL.md                          frontmatter + prose
  script.ts                         exports parseStoriesFromHtml(html)
                                    main: goto + html + parse + JSON.stringify
  _lib/browse-client.ts             vendored copy of the SDK
  fixtures/hn-2026-04-26.html       captured front page (5 stories)
  script.test.ts                    13 assertions against the fixture

The parser is a pure function over HTML so script.test.ts runs
without a daemon (just imports parseStoriesFromHtml and asserts).

This exercises every Phase 1 component end-to-end:
  - browse-client SDK (script imports browse from ./_lib/)
  - 3-tier lookup (hackernews-frontpage lives in the bundled tier)
  - scoped tokens (read+write is enough for goto + html)
  - spawn lifecycle ($B skill run hackernews-frontpage)
  - file-fixture testing ($B skill test hackernews-frontpage)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-26 05:06:27 -07:00

133 lines
4.6 KiB
TypeScript

/**
* hackernews-frontpage — scrape the HN front page and emit JSON.
*
* Output protocol:
* stdout = a single JSON document on success: { stories: Story[], count }
* stderr = anything we want logged (currently nothing)
* exit 0 on success, nonzero on parse / network failure.
*
* The parser logic (`parseStoriesFromHtml`) is exported so script.test.ts can
* exercise it against bundled HTML fixtures without spinning up the daemon.
*/
import { browse } from './_lib/browse-client';
/** A single front-page entry as produced by `parseStoriesFromHtml`. */
export interface Story {
  /** Position on the page, counted from 1, matching the number HN shows. */
  rank: number;
  /** HN item id: the digits of the `id` attribute on the `tr.athing` row, kept as a string. */
  id: string;
  /** Headline text of the story. */
  title: string;
  /** Destination of the title link. */
  url: string;
  /** Score in points, or `null` for rows that carry no score (job postings). */
  points: number | null;
  /** Comment count, or `null` for rows that carry no comments link (job postings). */
  comments: number | null;
}
/** Shape of the JSON document the script writes to stdout. */
export interface Output {
  /** Parsed stories, in page order. */
  stories: Story[];
  /** Number of entries in `stories`. */
  count: number;
}
/** Address that `main` navigates to before capturing the page HTML. */
const FRONT_PAGE_URL = 'https://news.ycombinator.com/';
/**
 * Parse HN front-page HTML into Story[].
 *
 * HN's structure is stable: each story is a pair of rows.
 *   <tr class="athing submission" id="<itemid>">
 *     <td class="rank">N.</td>
 *     <td class="title">...</td>
 *     <td class="title"><span class="titleline"><a href="<url>">title</a> ...</span></td>
 *   </tr>
 *   <tr><td colspan="2"></td><td class="subtext"><span class="subline">
 *     <span class="score" id="score_<itemid>">N points</span>
 *     ... <a href="item?id=<itemid>">N comments</a>
 *   </span></td></tr>
 *
 * Job postings ("Foo (YC X25) is hiring...") omit the score and comments —
 * those fields come back as null. A `discuss` link is reported as 0 comments.
 *
 * `rank` counts every `tr.athing` row in document order, including rows that
 * are skipped because no title link could be found in them.
 *
 * @param html Full page markup with double-quoted attributes.
 * @returns One Story per `tr.athing` row that has a title link; an empty
 *   array when nothing matches. Never throws on malformed markup.
 */
export function parseStoriesFromHtml(html: string): Story[] {
  // All patterns are created once per call instead of once per row.
  // Each `tr.athing` row: capture the id attribute and the row body.
  const rowRegex = /<tr\s+[^>]*\bclass="athing[^"]*"[^>]*\bid="(\d+)"[^>]*>([\s\S]*?)<\/tr>/g;
  // Title link: <span class="titleline"><a href="..." ...>title</a>
  const titleRegex = /<span\s+class="titleline"[^>]*>\s*<a\s+href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/;
  // First row that ends the current story's subtext: the spacer or the next story.
  const boundaryRegex = /<tr\b[^>]*\bclass="(?:spacer|athing)\b/g;
  const scoreRegex = /<span\s+class="score"[^>]*>(\d+)\s*points?<\/span>/;
  // `<a href="item?id=...">N comments</a>`; the "hide" / "context" / "from"
  // links and the age link do not fit this shape.
  const commentsRegex = /<a\s+href="item\?id=\d+"[^>]*>(\d+)\s*(?:&nbsp;)?\s*comments?<\/a>/;
  const discussRegex = /discuss<\/a>/;

  const stories: Story[] = [];
  let rank = 0;
  let match: RegExpExecArray | null;

  while ((match = rowRegex.exec(html)) !== null) {
    rank++;
    const id = match[1];
    const rowBody = match[2];

    const titleMatch = titleRegex.exec(rowBody);
    if (!titleMatch) continue;

    const url = decodeHtmlEntities(titleMatch[1]);
    // Strip markup BEFORE decoding entities. Decoding first turns escaped
    // text such as `&lt;div&gt;` into `<div>`, which the tag stripper would
    // then delete from the title.
    const title = decodeHtmlEntities(stripTags(titleMatch[2])).trim();

    // The subtext row follows the story row. Bound the search to before the
    // next story (tr.spacer marks the gap, then tr.athing); otherwise the
    // score of story N+1 leaks into story N when N is a job posting.
    const subtextStart = match.index + match[0].length;
    boundaryRegex.lastIndex = subtextStart;
    const boundaryMatch = boundaryRegex.exec(html);
    const subtextEnd = boundaryMatch ? boundaryMatch.index : html.length;
    const subtext = html.slice(subtextStart, subtextEnd);

    const scoreMatch = scoreRegex.exec(subtext);
    const points = scoreMatch ? parseInt(scoreMatch[1], 10) : null;

    let comments: number | null = null;
    const commentsMatch = commentsRegex.exec(subtext);
    if (commentsMatch) {
      comments = parseInt(commentsMatch[1], 10);
    } else if (discussRegex.test(subtext)) {
      // A story with no comments yet shows a "discuss" link instead of a count.
      comments = 0;
    }

    stories.push({ rank, id, title, url, points, comments });
  }

  return stories;
}
/**
 * Remove every `<...>` run from `s`, keeping the text between them.
 * A `<` with no later `>` is left in place.
 */
function stripTags(s: string): string {
  const textPieces = s.split(/<[^>]*>/g);
  return textPieces.join('');
}
/** Named character references recognised by `decodeHtmlEntities`. */
const NAMED_HTML_ENTITIES: ReadonlyMap<string, string> = new Map([
  ['amp', '&'],
  ['quot', '"'],
  ['apos', "'"],
  ['lt', '<'],
  ['gt', '>'],
  // Deliberately a plain space rather than U+00A0.
  ['nbsp', ' '],
]);

/**
 * Decode HTML character references in `s`.
 *
 * Handles the named references `&amp;`, `&quot;`, `&apos;`, `&lt;`, `&gt;`
 * and `&nbsp;` (decoded to a plain space), plus any decimal (`&#39;`) or
 * hexadecimal (`&#x27;`) numeric reference.
 *
 * Decoding is a single left-to-right pass, so the output of one replacement
 * is never decoded again: `&amp;lt;` yields the text `&lt;`, not `<`.
 *
 * @param s Text that may contain character references.
 * @returns `s` with recognised references replaced. Unknown names and
 *   numeric references outside the valid code point range are left as-is.
 */
function decodeHtmlEntities(s: string): string {
  return s.replace(
    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([a-zA-Z][a-zA-Z0-9]*));/g,
    (
      entity: string,
      dec: string | undefined,
      hex: string | undefined,
      name: string | undefined,
    ): string => {
      if (name !== undefined) {
        return NAMED_HTML_ENTITIES.get(name) ?? entity;
      }
      const codePoint = dec !== undefined ? parseInt(dec, 10) : parseInt(hex ?? '', 16);
      // String.fromCodePoint throws on out-of-range values; lone surrogates
      // and NUL are not meaningful text, so leave such references untouched.
      const isValid =
        Number.isInteger(codePoint) &&
        codePoint > 0 &&
        codePoint <= 0x10ffff &&
        !(codePoint >= 0xd800 && codePoint <= 0xdfff);
      return isValid ? String.fromCodePoint(codePoint) : entity;
    },
  );
}
// ─── Main entry (only when run as a script, not when imported by tests) ─
// `import.meta.main` is supplied by the runtime. NOTE(review): presumably it is
// true only for the entry module (Bun/Deno semantics) — confirm for the target
// runtime. `main` is a function declaration further down, so it is hoisted and
// callable from here.
if (import.meta.main) {
// Awaited at module top level so a rejection from main() is not silently dropped.
await main();
}
/**
 * Load the HN front page through the browse client, parse it, and write a
 * single JSON document (`{ stories, count }`) followed by a newline to stdout.
 *
 * @throws Error when the fetched page yields zero stories. The parser itself
 *   never throws, so without this check an unparseable page (layout change,
 *   error page) would be reported as a successful empty result, contradicting
 *   the "nonzero on parse failure" contract in the file header. Rejections
 *   from the browse client propagate unchanged.
 */
async function main(): Promise<void> {
  await browse.goto(FRONT_PAGE_URL);
  const html = await browse.html();
  const stories = parseStoriesFromHtml(html);

  if (stories.length === 0) {
    // Nothing is written to stdout on failure; the error reaches the
    // top-level `await main()` and is expected to end the run with a
    // nonzero status.
    throw new Error(`hackernews-frontpage: no stories parsed from ${FRONT_PAGE_URL}`);
  }

  const output: Output = { stories, count: stories.length };
  process.stdout.write(JSON.stringify(output) + '\n');
}