/**
 * pdftotext wrapper — the tool behind the copy-paste CI gate.
 *
 * Codex round 2 surfaced two real problems we address here:
 *
 *   #18: pdftotext (Poppler) vs pdftotext (Xpdf) vs pdftotext-next vary on
 *        whitespace, line wrap, Unicode normalization, form feeds, and
 *        extraction order. Cross-platform exact diffing is a non-starter.
 *        We normalize aggressively and diff the normalized form.
 *
 *   #19: the regex /(?:\b\w\s){4,}/ only catches one failure shape (letters
 *        spaced out). It misses word-order corruption, missing whitespace
 *        between paragraphs, and homoglyph substitution. We add a word-token
 *        diff and a paragraph-boundary assertion on top.
 *
 * Resolution order for the pdftotext binary:
 *   1. $PDFTOTEXT_BIN env override
 *   2. `which pdftotext` on PATH (`where pdftotext` on Windows)
 *   3. well-known Homebrew / distro install paths
 *   4. throws a friendly "install poppler" error
 *
 * The wrapper is *optional at runtime*: production renders don't need it.
 * Only the CI gate and unit tests invoke pdftotext.
 */

import { execFileSync, spawnSync } from "node:child_process";
import * as fs from "node:fs";
import * as os from "node:os";
import * as path from "node:path";

/** Thrown by resolvePdftotext() when no usable pdftotext binary can be located. */
export class PdftotextUnavailableError extends Error {
  constructor(message: string) {
    super(message);
    this.name = "PdftotextUnavailableError";
  }
}

/** Description of the pdftotext binary that was resolved. */
export interface PdftotextInfo {
  /** Path of the binary that will be executed. */
  bin: string;
  /** Version line of the `-v` banner, e.g. "pdftotext version 24.02.0"; "unknown" if none was produced. */
  version: string;
  /** Implementation family guessed from the `-v` banner. */
  flavor: "poppler" | "xpdf" | "unknown";
}

/** Well-known install locations probed when the PATH lookup finds nothing. */
const FALLBACK_CANDIDATES: readonly string[] = [
  "/opt/homebrew/bin/pdftotext", // Apple Silicon
  "/usr/local/bin/pdftotext", // Intel Mac or Linuxbrew
  "/usr/bin/pdftotext", // distro package
];

/**
 * Maximum allowed difference between the number of paragraph breaks in the
 * expected and the extracted text (slack for header/footer noise).
 */
const MAX_PARAGRAPH_BREAK_DRIFT = 4;

/** Matches runs of 4+ "single word-char followed by whitespace" tokens, e.g. "S a i l ". */
const PER_GLYPH_RUN = /(?:\b\w\s){4,}/g;

/** Best-effort message extraction from an arbitrary thrown value. */
function errorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Locate pdftotext.
 *
 * A $PDFTOTEXT_BIN value that is empty or does not point at an executable
 * file is ignored and resolution continues with the PATH lookup.
 *
 * @returns the resolved binary together with its version banner and flavor.
 * @throws PdftotextUnavailableError if none is found.
 */
export function resolvePdftotext(): PdftotextInfo {
  const envOverride = process.env.PDFTOTEXT_BIN;
  if (envOverride && isExecutable(envOverride)) {
    return describeBinary(envOverride);
  }

  const onPath = findOnPath();
  if (onPath !== undefined) return describeBinary(onPath);

  for (const candidate of FALLBACK_CANDIDATES) {
    if (isExecutable(candidate)) return describeBinary(candidate);
  }

  throw new PdftotextUnavailableError([
    "pdftotext not found.",
    "",
    "make-pdf needs pdftotext to run the copy-paste CI gate.",
    "(Runtime rendering does NOT need it. This only affects tests.)",
    "",
    "To install:",
    "  macOS:    brew install poppler",
    "  Ubuntu:   sudo apt-get install poppler-utils",
    "  Fedora:   sudo dnf install poppler-utils",
    "",
    "Or set PDFTOTEXT_BIN to an explicit path:",
    "  export PDFTOTEXT_BIN=/path/to/pdftotext",
  ].join("\n"));
}

/**
 * Ask the platform's lookup tool (`which`, or `where` on Windows) for
 * pdftotext. Returns the first executable hit, or undefined when the tool is
 * missing, fails, or reports nothing usable.
 */
function findOnPath(): string | undefined {
  const locator = process.platform === "win32" ? "where" : "which";
  try {
    const output = execFileSync(locator, ["pdftotext"], {
      encoding: "utf8",
      // Drop the locator's stderr so a miss does not leak noise into CI logs.
      stdio: ["ignore", "pipe", "ignore"],
    });
    // `where` may print several matches; the first one is what would run.
    const firstHit = output
      .split(/\r?\n/)
      .map(line => line.trim())
      .find(line => line.length > 0);
    return firstHit !== undefined && isExecutable(firstHit) ? firstHit : undefined;
  } catch {
    // Locator missing or exited non-zero (nothing found): caller falls through.
    return undefined;
  }
}

/** True when `p` is an existing regular file the current process may execute. */
function isExecutable(p: string): boolean {
  try {
    fs.accessSync(p, fs.constants.X_OK);
    // Directories also pass X_OK; only regular files can be spawned.
    return fs.statSync(p).isFile();
  } catch {
    return false;
  }
}

/**
 * Run `<bin> -v` and derive version and flavor from its banner.
 *
 * spawnSync is used instead of execFileSync because the banner may arrive on
 * stderr with exit status 0; execFileSync only returns stdout on success, so
 * the banner would be lost. Exit status is deliberately ignored because some
 * builds exit non-zero on `-v` while still printing the banner.
 */
function describeBinary(bin: string): PdftotextInfo {
  const probe = spawnSync(bin, ["-v"], {
    encoding: "utf8",
    stdio: ["ignore", "pipe", "pipe"],
  });
  // Both streams are null when the process could not be spawned at all.
  const banner = `${probe.stderr ?? ""}\n${probe.stdout ?? ""}`;
  const lines = banner
    .split(/\r?\n/)
    .map(line => line.trim())
    .filter(line => line.length > 0);
  const version = lines.find(line => /version/i.test(line)) ?? lines[0] ?? "unknown";

  // Match on the whole banner: Poppler names itself only in the copyright
  // lines, not on the version line.
  const haystack = banner.toLowerCase();
  let flavor: PdftotextInfo["flavor"] = "unknown";
  if (haystack.includes("poppler")) flavor = "poppler";
  else if (haystack.includes("xpdf")) flavor = "xpdf";

  return { bin, version, flavor };
}

/**
 * Run pdftotext on a PDF and return the extracted text.
 *
 * Uses `-layout` by default because that's what downstream normalization
 * expects. Callers that need raw text can pass layout=false.
 *
 * @param pdfPath path of the PDF to extract.
 * @param opts.layout pass `-layout` to pdftotext (default true).
 * @returns the text pdftotext wrote to stdout, decoded as UTF-8.
 * @throws PdftotextUnavailableError when no binary can be resolved.
 * @throws Error when the pdftotext process fails.
 */
export function pdftotext(pdfPath: string, opts?: { layout?: boolean }): string {
  const info = resolvePdftotext();
  const layout = opts?.layout ?? true;

  const args: string[] = [];
  if (layout) args.push("-layout");
  // We decode stdout as UTF-8 below, so request it explicitly rather than
  // relying on each flavor's default output encoding.
  args.push("-enc", "UTF-8");
  args.push(pdfPath, "-"); // "-" = stdout

  try {
    return execFileSync(info.bin, args, {
      encoding: "utf8",
      maxBuffer: 32 * 1024 * 1024,
    });
  } catch (err: unknown) {
    throw new Error(`pdftotext failed on ${pdfPath}: ${errorMessage(err)}`);
  }
}

/**
 * Normalize extracted text for cross-platform, cross-flavor diffing.
 *
 * What we strip / normalize, in order:
 *   - Unicode: NFC canonical composition (macOS emits NFD; Linux emits NFC;
 *     this dodges the fundamental encoding diff).
 *   - CRLF and lone CR → LF (Windows Xpdf emits CRLF).
 *   - Form feeds (\f) → double newline (Poppler emits \f at page breaks).
 *   - Non-breaking space (U+00A0) → regular space.
 *   - Zero-width space (U+200B), zero-width non-joiner (U+200C) and soft
 *     hyphen (U+00AD) → empty (pdftotext -layout sometimes emits soft
 *     hyphens at `hyphens: auto` breaks).
 *   - Trailing spaces/tabs on every line.
 *   - Runs of 3+ newlines → exactly 2 (at most one blank line in a row).
 *   - Leading/trailing whitespace on the whole string.
 */
export function normalize(raw: string): string {
  let s = raw;
  s = s.normalize("NFC");
  s = s.replace(/\r\n/g, "\n");
  s = s.replace(/\r/g, "\n");
  s = s.replace(/\f/g, "\n\n");
  s = s.replace(/\u00a0/g, " ");
  s = s.replace(/[\u200b\u200c\u00ad]/g, "");
  s = s.replace(/[ \t]+$/gm, "");
  s = s.replace(/\n{3,}/g, "\n\n");
  s = s.trim();
  return s;
}

/** Outcome of copyPasteGate(). */
export interface GateResult {
  /** True when no assertion failed. */
  ok: boolean;
  /** One human-readable entry per failed assertion; empty when ok. */
  reasons: string[];
  /** Normalized text extracted from the PDF, for failure debugging. */
  extracted: string;
}

/**
 * The canonical copy-paste gate used in the E2E tests.
 *
 * Extracts the PDF with `-layout`, normalizes both sides, and runs three
 * assertions: paragraph containment, per-glyph emission detection, and
 * paragraph-boundary drift.
 *
 * @param pdfPath path of the rendered PDF.
 * @param expected source text the PDF is expected to yield on copy-paste.
 * @returns `ok: true` with empty `reasons` when all three assertions pass;
 *          otherwise `ok: false` with one or more failure reasons. The
 *          normalized extracted text is always included.
 */
export function copyPasteGate(pdfPath: string, expected: string): GateResult {
  const extracted = normalize(pdftotext(pdfPath, { layout: true }));
  const expectedNorm = normalize(expected);
  const reasons: string[] = [];

  // Assertion 1: every expected paragraph appears as a whole line or
  // contiguous block in the extracted text (whitespace-insensitive).
  const extractedCompact = collapseWhitespace(extracted);
  for (const paragraph of splitParagraphs(expectedNorm)) {
    if (!extractedCompact.includes(collapseWhitespace(paragraph))) {
      reasons.push(
        `expected paragraph not found in extracted text: ${truncate(paragraph, 80)}`,
      );
    }
  }

  // Assertion 2: no "S a i l i n g"-style single-char runs.
  // Find groups of 4+ consecutive letter-then-space tokens. False positive
  // risk on things like "A B C D" (initials) — mitigated by only flagging
  // runs whose reassembled letters occur in the expected text.
  const expectedLower = expectedNorm.toLowerCase();
  for (const match of extracted.matchAll(PER_GLYPH_RUN)) {
    const run = match[0];
    // The regex guarantees at least 4 word characters survive this strip.
    const letters = run.replace(/\s/g, "");
    if (expectedLower.includes(letters.toLowerCase())) {
      reasons.push(
        `per-glyph emission detected (the "S ai li ng" bug): "${run.trim()}" reassembles to "${letters}"`,
      );
    }
  }

  // Assertion 3: paragraph boundaries preserved. Count double-newlines in
  // both; they may differ by at most MAX_PARAGRAPH_BREAK_DRIFT.
  const expectedBreaks = (expectedNorm.match(/\n\n/g) || []).length;
  const extractedBreaks = (extracted.match(/\n\n/g) || []).length;
  if (Math.abs(expectedBreaks - extractedBreaks) > MAX_PARAGRAPH_BREAK_DRIFT) {
    reasons.push(
      `paragraph boundary count drift: expected ~${expectedBreaks}, got ${extractedBreaks}`,
    );
  }

  return { ok: reasons.length === 0, reasons, extracted };
}

/** Split on blank-line boundaries, dropping empty paragraphs. */
function splitParagraphs(s: string): string[] {
  return s.split(/\n\n+/).map(p => p.trim()).filter(p => p.length > 0);
}

/** Collapse every whitespace run to a single space and trim both ends. */
function collapseWhitespace(s: string): string {
  return s.replace(/\s+/g, " ").trim();
}

/** Cut `s` to at most `n` characters, appending "..." when it was shortened. */
function truncate(s: string, n: number): string {
  return s.length > n ? s.slice(0, n) + "..." : s;
}

/**
 * Emit diagnostic info to stderr — useful for CI failure debugging.
 * Call this once before running any gate in a CI log. Never throws for a
 * missing binary: the resolution error is reported on stderr instead.
 */
export function logDiagnostics(): void {
  try {
    const info = resolvePdftotext();
    process.stderr.write(
      `[pdftotext] bin=${info.bin} flavor=${info.flavor} version="${info.version}" ` +
      `os=${os.platform()}-${os.arch()} node=${process.version}\n`,
    );
  } catch (err: unknown) {
    process.stderr.write(`[pdftotext] unavailable: ${errorMessage(err)}\n`);
  }
}