Files
gstack/make-pdf/test/pdftotext.test.ts
T
Garry Tan 3af86348f6 feat(make-pdf): new /make-pdf skill + orchestrator binary
Turn markdown into publication-quality PDFs. $P generate input.md out.pdf
produces a PDF with 1in margins, intelligent page breaks, page numbers,
running header, CONFIDENTIAL footer, and curly quotes/em dashes — all on
Helvetica so copy-paste extraction works ("S ai li ng" bug avoided).

Architecture (per Codex round 2):
  markdown → render.ts (marked + sanitize + smartypants) → orchestrator
    → $B newtab --json → $B load-html --tab-id → $B js (poll Paged.js)
    → $B pdf --tab-id → $B closetab

browseClient.ts shells out to the compiled browse CLI rather than
duplicating Playwright. --tab-id isolation per render means parallel
$P generate calls don't race on the active tab. try/finally tab cleanup
survives Paged.js timeouts, browser crashes, and output-path failures.

Features in v1:
  --cover              left-aligned cover page (eyebrow + title + hairline rule)
  --toc                clickable static TOC (Paged.js page numbers deferred)
  --watermark <text>   diagonal DRAFT/CONFIDENTIAL layer
  --no-chapter-breaks  opt out of H1-starts-new-page
  --page-numbers       "N of M" footer (default on)
  --tagged --outline   accessible PDF + bookmark outline (default on)
  --allow-network      opt in to external image loading (default off for privacy)
  --quiet --verbose    stderr control

Design decisions locked from the /plan-design-review pass:
  - Helvetica everywhere (Chromium emits single-word Tj operators for
    system fonts; bundled webfonts emit per-glyph and break extraction).
  - Left-aligned body, flush-left paragraphs, no text-indent, 12pt gap.
  - Cover shares 1in margins with body pages; no flexbox-center, no
    inset padding.
  - The reference HTMLs at .context/designs/*.html are the implementation
    source of truth for print-css.ts.

Tests (56 unit + 1 E2E combined-features gate):
  - smartypants: code/URL-safe, verified against 10 fixtures
  - sanitizer: strips <script>/<iframe>/on*/javascript: URLs
  - render: HTML assembly, CJK fallback, cover/TOC/chapter wrap
  - print-css: all @page rules, margin variants, watermark
  - pdftotext: normalize()+copyPasteGate() cross-OS tolerance
  - browseClient: binary resolution + typed error propagation
  - combined-features gate (P0): 2-chapter fixture with smartypants +
    hyphens + ligatures + bold/italic + inline code + lists + blockquote
    passes through PDF → pdftotext → expected.txt diff

Deferred to Phase 4 (future PR): Paged.js vendored for accurate TOC page
numbers, highlight.js for syntax highlighting, drop caps, pull quotes,
two-column, CMYK, watermark visual-diff acceptance.

Plan: .context/ceo-plans/2026-04-19-perfect-pdf-generator.md
References: .context/designs/make-pdf-*.html

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 05:34:05 +08:00

107 lines
4.1 KiB
TypeScript

/**
* pdftotext unit tests — normalize() and copyPasteGate() assertions.
*
* These tests are pure unit tests of the normalization + assertion logic.
* They do NOT require pdftotext to be installed (the actual binary is
* mocked by manipulating strings directly).
*/
import { describe, expect, test } from "bun:test";
import { normalize, copyPasteGate } from "../src/pdftotext";
describe("normalize", () => {
test("strips trailing spaces", () => {
expect(normalize("hello \nworld")).toBe("hello\nworld");
});
test("collapses runs of 3+ blank lines to 2", () => {
expect(normalize("a\n\n\n\nb")).toBe("a\n\nb");
});
test("converts form feeds to double newlines (page break boundary)", () => {
expect(normalize("page1\fpage2")).toBe("page1\n\npage2");
});
test("normalizes CRLF and CR to LF (Windows Xpdf)", () => {
expect(normalize("a\r\nb\rc")).toBe("a\nb\nc");
});
test("removes soft hyphens (hyphens: auto artifact)", () => {
expect(normalize("extra\u00adordinary")).toBe("extraordinary");
});
test("replaces non-breaking space with regular space", () => {
expect(normalize("hello\u00a0world")).toBe("hello world");
});
test("strips zero-width characters", () => {
expect(normalize("a\u200bb\u200cc")).toBe("abc");
});
test("NFC-normalizes composed glyphs (macOS NFD → Linux NFC)", () => {
// "é" composed vs decomposed
const decomposed = "e\u0301";
const composed = "\u00e9";
expect(normalize(decomposed)).toBe(composed);
});
test("trims leading/trailing whitespace on whole string", () => {
expect(normalize("\n\n hello \n\n")).toBe("hello");
});
});
describe("copyPasteGate — assertion logic", () => {
// These tests exercise the gate's internal assertions by mocking the
// pdftotext step. We can't easily run the real binary in every test
// env, so we verify the assertion logic directly via fake inputs.
//
// The gate takes a PDF path — but assertion #1 (paragraph presence) and
// #2 (per-glyph emission) are string operations we can validate here.
test("flags 'S ai li ng' per-glyph emission when reassembled letters appear in source", () => {
// Build expected/extracted strings that would trip the gate.
const expected = "Sailing on the open sea.";
const extracted = "S a i l i n g on the open sea.";
// Simulate by running normalize + assertion manually; the regex is
// looked at in the gate.
const fragRegex = /((?:\b\w\s){4,})/g;
const match = fragRegex.exec(extracted);
expect(match).not.toBeNull();
if (match) {
const letters = match[1].replace(/\s/g, "");
expect(letters.toLowerCase()).toBe("sailing");
expect(expected.toLowerCase().includes(letters.toLowerCase())).toBe(true);
}
});
test("does NOT flag 'A B C D' as per-glyph when letters don't appear in source", () => {
const expected = "The quick brown fox.";
const extracted = "The quick A B C D brown fox.";
const fragRegex = /((?:\b\w\s){4,})/g;
const match = fragRegex.exec(extracted);
if (match) {
const letters = match[1].replace(/\s/g, "");
// "ABCD" is not a substring of expected
expect(expected.toLowerCase().includes(letters.toLowerCase())).toBe(false);
}
});
test("paragraph boundary count drift calculation", () => {
const expected = "para1\n\npara2\n\npara3";
const extractedOk = "para1\n\npara2\n\npara3";
const extractedTooFew = "para1 para2 para3";
const extractedTooMany = "para1\n\n\n\npara2\n\n\n\npara3\n\n\n\npara4\n\n\n\npara5";
const expectedBreaks = (expected.match(/\n\n/g) || []).length;
const okBreaks = (extractedOk.match(/\n\n/g) || []).length;
const tooFewBreaks = (extractedTooFew.match(/\n\n/g) || []).length;
const tooManyBreaksNormalized = (normalize(extractedTooMany).match(/\n\n/g) || []).length;
expect(Math.abs(expectedBreaks - okBreaks)).toBeLessThanOrEqual(4);
expect(Math.abs(expectedBreaks - tooFewBreaks)).toBeGreaterThan(1);
// After normalize, 3+ newlines become 2, so the count matches
expect(Math.abs(expectedBreaks - tooManyBreaksNormalized)).toBeLessThanOrEqual(4);
});
});