/** * hackernews-frontpage — scrape the HN front page and emit JSON. * * Output protocol: * stdout = a single JSON document on success: { stories: Story[], count } * stderr = anything we want logged (currently nothing) * exit 0 on success, nonzero on parse / network failure. * * The parser logic (`parseStoriesFromHtml`) is exported so script.test.ts can * exercise it against bundled HTML fixtures without spinning up the daemon. */ import { browse } from './_lib/browse-client'; export interface Story { /** 1-based rank as displayed on HN. */ rank: number; /** HN item id (the integer in `tr.athing[id]`). */ id: string; title: string; /** Outbound URL the title links to. */ url: string; /** null when the row has no score (job postings). */ points: number | null; /** null when the row has no comments link (job postings). */ comments: number | null; } export interface Output { stories: Story[]; count: number; } const FRONT_PAGE_URL = 'https://news.ycombinator.com/'; /** * Parse HN front-page HTML into Story[]. * * HN's structure is stable: each story is a pair of rows. * * N. * ... * title ... * * * N points * ... N comments * * * Job postings ("Foo (YC X25) is hiring...") omit the score and comments — * those fields come back as null. */ export function parseStoriesFromHtml(html: string): Story[] { const stories: Story[] = []; // Match each `tr.athing` row, capturing the id attribute and the row body. const rowRegex = /]*\bclass="athing[^"]*"[^>]*\bid="(\d+)"[^>]*>([\s\S]*?)<\/tr>/g; let match: RegExpExecArray | null; let rank = 0; while ((match = rowRegex.exec(html)) !== null) { rank++; const id = match[1]; const rowBody = match[2]; // Title link: title const titleMatch = rowBody.match(/]*>\s*]*>([\s\S]*?)<\/a>/); if (!titleMatch) continue; const url = decodeHtmlEntities(titleMatch[1]); const title = stripTags(decodeHtmlEntities(titleMatch[2])).trim(); // The next sibling tr should hold the subtext row. Bound the lookahead // to before the next story (tr.spacer marks the gap, then tr.athing). // Bug if we don't bound: the score from story N+1 leaks into story N // when story N is a job posting (no score of its own). const subtextStart = match.index + match[0].length; const tail = html.slice(subtextStart); const spacerIdx = tail.search(/]*\bclass="spacer\b/); const nextAthingIdx = tail.search(/]*\bclass="athing\b/); const candidates = [spacerIdx, nextAthingIdx].filter(i => i >= 0); const boundary = candidates.length > 0 ? Math.min(...candidates) : tail.length; const subtextSlice = tail.slice(0, boundary); let points: number | null = null; let comments: number | null = null; const scoreMatch = subtextSlice.match(/]*>(\d+)\s*points?<\/span>/); if (scoreMatch) points = parseInt(scoreMatch[1], 10); // Comment count: an anchor like `N comments`, // or `discuss` (treated as 0). Skip "hide" / "context" / "from" links. const commentsMatch = subtextSlice.match(/]*>(\d+)\s*(?: )?\s*comments?<\/a>/); if (commentsMatch) { comments = parseInt(commentsMatch[1], 10); } else if (/discuss<\/a>/.test(subtextSlice)) { comments = 0; } stories.push({ rank, id, title, url, points, comments }); } return stories; } function stripTags(s: string): string { return s.replace(/<[^>]*>/g, ''); } function decodeHtmlEntities(s: string): string { return s .replace(/&/g, '&') .replace(/"/g, '"') .replace(/'/g, "'") .replace(/'/g, "'") .replace(/</g, '<') .replace(/>/g, '>') .replace(/ /g, ' '); } // ─── Main entry (only when run as a script, not when imported by tests) ─ if (import.meta.main) { await main(); } async function main(): Promise { await browse.goto(FRONT_PAGE_URL); const html = await browse.html(); const stories = parseStoriesFromHtml(html); const output: Output = { stories, count: stories.length }; process.stdout.write(JSON.stringify(output) + '\n'); }