fix(make-pdf): pre-landing review wave — fence fidelity, injection hardening, Windows paths, transport rework

Review army (6 specialists + red team) findings, all fixed:

- Indented fences replay byte-for-byte and indented diagram fences are NOT
  extracted (red-team conf-9: the pre-pass reconstructed fences at column 0,
  splitting any list containing fenced code — every ordinary document).
- String.replace $-pattern injection killed at every seam: substituteSlots,
  mergeStyle, img/src rewrites all use function replacements (a diagram label
  containing $' duplicated the document tail).
- Big-expression transport reworked: browse `eval <file>` (one spawn, any
  size, Windows-safe) replaces the 64KB chunked window-buffer eval — fixes
  the per-chunk spawn cost, the char-vs-byte argv units, AND the Windows
  32,767-char command-line ceiling in one move.
- Staged-bundle trust: content verified by hash even when the file exists,
  and the rename-failure path re-hashes the survivor (sticky-bit /tmp EPERM
  would otherwise ride a pre-planted file past the check).
- Windows drive-letter img srcs (C:/x.png) reach the local-path branch
  instead of being swallowed as unknown URL schemes.
- DOCX rasterize-failure now embeds the decoded source as visible text —
  returning the figure made diagrams vanish silently (converter drops svg).
- Fence source preserved as base64 data-gstack-source attribute (the comment
  encoding corrupted every '-->' arrow); decodeFigureSource() round-trips.
- inlineLocalImages memoizes per path; file:// uses fileURLToPath; preview
  prints a divergence note for fences/local images; --to docx strips the
  watermark div and warns about print-only flags; TOC links resolve in
  html/docx (heading ids assigned); waitForExpression sleeps instead of
  busy-spinning; escapeHtml/svg-dims deduped to single definitions;
  typography stragglers (blockquote 12pt, footnotes 10pt, 42em screen
  measure); bundle BUILD_INFO gains srcSha256 for no-node_modules drift
  detection; MAX_TARGET_PX shared guard.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-06-12 07:57:42 -07:00
parent 0b7b5ee0f7
commit 9db479a38d
11 changed files with 625 additions and 475 deletions
+3 -2
View File
@@ -1,7 +1,8 @@
{
"name": "gstack-diagram-render",
"sha256": "0ee91aef5a8da85c8941c26ebf2991bbeba82412644bb070d5c5dd2e23538b81",
"bytes": 9645503,
"sha256": "da9c363071afbe79e06807bd1e67dbacc1123187db7b99e2608dd4a1a9567e94",
"srcSha256": "07238fae312bc0444f62b0a0a3404a8a38c45cef505aa1528c60a0ded17cbe06",
"bytes": 9645479,
"bunVersion": "1.3.13",
"deps": {
"@excalidraw/excalidraw": "0.18.0",
File diff suppressed because one or more lines are too long
+8
View File
@@ -78,9 +78,17 @@ const html = head + inlineJs + tail;
await Bun.write(DIST_HTML, html);
const sha256 = createHash("sha256").update(html).digest("hex");
// Source fingerprint: lets the drift test catch "edited src, forgot to
// rebuild dist" WITHOUT needing node_modules for a full rebuild (the deep
// rebuild check only runs where deps are installed).
const srcSha256 = createHash("sha256")
.update(await Bun.file(ENTRY).text())
.update(await Bun.file(import.meta.path).text())
.digest("hex");
const info = {
name: "gstack-diagram-render",
sha256,
srcSha256,
bytes: Buffer.byteLength(html),
bunVersion: Bun.version,
deps,
+10 -6
View File
@@ -100,10 +100,16 @@ window.__excalidrawToSvg = async (sceneJson: string): Promise<string> => {
* targetWidthPx = placed physical width (in) × 300dpi (eng-review D6.5) —
* the bundle never guesses a viewport.
*/
window.__rasterize = async (svgText: string, targetWidthPx: number): Promise<string> => {
if (!(targetWidthPx > 0 && targetWidthPx <= 10000)) {
throw new Error(`targetWidthPx out of range: ${targetWidthPx}`);
/** Upper bound, in pixels, accepted for a rasterization target width. */
const MAX_TARGET_PX = 10_000;

/**
 * Guard for rasterization target widths.
 *
 * @param px Requested target width in pixels.
 * @throws Error when `px` is not a number in the range (0, MAX_TARGET_PX].
 *   NaN fails the comparison and is therefore rejected too.
 */
function assertTargetWidth(px: number): void {
  const inRange = px > 0 && px <= MAX_TARGET_PX;
  if (inRange) return;
  throw new Error(`targetWidthPx out of range: ${px}`);
}
window.__rasterize = async (svgText: string, targetWidthPx: number): Promise<string> => {
assertTargetWidth(targetWidthPx);
const blob = new Blob([svgText], { type: "image/svg+xml;charset=utf-8" });
const url = URL.createObjectURL(blob);
try {
@@ -164,9 +170,7 @@ window.__downscaleRaster = async (
targetWidthPx: number,
mime: string,
): Promise<string> => {
if (!(targetWidthPx > 0 && targetWidthPx <= 10000)) {
throw new Error(`targetWidthPx out of range: ${targetWidthPx}`);
}
assertTargetWidth(targetWidthPx);
const img = new Image();
await new Promise<void>((resolve, reject) => {
img.onload = () => resolve();
+18 -3
View File
@@ -290,6 +290,19 @@ export function js(opts: JsOptions): string {
]).trim();
}
/**
 * Run `browse eval <file>` against a tab and return its trimmed output.
 *
 * This is the transport for expressions too large to pass as a single
 * command-line element: the expression travels as a file path instead.
 * The file must live under browse's safe dirs (/tmp or cwd).
 *
 * @param opts.file  Path of the JS file to evaluate.
 * @param opts.tabId Tab to evaluate it in.
 * @returns Whatever `runBrowse` returns for the call, whitespace-trimmed.
 */
export function evalFile(opts: { file: string; tabId: number }): string {
  const { file, tabId } = opts;
  const output = runBrowse(["eval", file, "--tab-id", String(tabId)]);
  return output.trim();
}
/**
* Poll a boolean JS expression until it evaluates to true, or timeout.
* Returns true if it succeeded, false if timed out.
@@ -311,9 +324,11 @@ export function waitForExpression(opts: {
}
const wait = Math.min(poll, Math.max(0, deadline - Date.now()));
if (wait <= 0) break;
// Synchronous sleep is fine — this only runs once per PDF render
const end = Date.now() + wait;
while (Date.now() < end) { /* busy wait */ }
// Real sleep, not a busy-wait: this poll now runs on every diagram-render
// bundle load (and after every fence render error), exactly while Chromium
// is parsing a 9MB page on the same machine — spinning a core competes
// with the work being awaited.
Bun.sleepSync(wait);
}
return false;
}
+151 -86
View File
@@ -28,9 +28,10 @@ import * as fs from "node:fs";
import * as os from "node:os";
import * as path from "node:path";
import * as crypto from "node:crypto";
import { fileURLToPath } from "node:url";
import * as browseClient from "./browseClient";
import { sanitizeUntrustedHtml } from "./render";
import { escapeHtml, sanitizeUntrustedHtml } from "./render";
import { imageDims } from "./image-size";
// ─── Types ────────────────────────────────────────────────────────────
@@ -92,10 +93,17 @@ export class StrictModeError extends Error {
const DIAGRAM_LANGS = new Set(["mermaid", "excalidraw"]);
/**
* Extract top-level ```mermaid / ```excalidraw fences, replacing each with a
* Extract column-0 ```mermaid / ```excalidraw fences, replacing each with a
* unique placeholder token paragraph. Backtick and tilde fences, any length
* >= 3; closers must be at least as long as the opener (CommonMark). Fences
* with `render=false` in the info string are left untouched.
* with `render=false` are left untouched.
*
* Two deliberate conservatisms (red-team finding — the original version
* reconstructed fences at column 0 and restructured lists):
* - Non-diagram fences replay as their ORIGINAL raw lines, byte-for-byte
* (only a render=false flag is removed, in place, preserving indent).
* - INDENTED diagram fences (inside lists/quotes) are NOT extracted — a
* column-0 placeholder would split the list. They replay verbatim as code.
*/
export function extractDiagramFences(markdown: string): FenceExtraction {
const lines = markdown.split("\n");
@@ -104,7 +112,10 @@ export function extractDiagramFences(markdown: string): FenceExtraction {
const runId = crypto.randomBytes(4).toString("hex");
let i = 0;
let openFence: { char: string; len: number; info: string; body: string[] } | null = null;
let openFence: {
char: string; len: number; indent: number; info: string;
rawOpener: string; body: string[];
} | null = null;
let ordinal = 0;
while (i < lines.length) {
@@ -114,7 +125,7 @@ export function extractDiagramFences(markdown: string): FenceExtraction {
const close = matchFenceLine(line);
if (close && close.char === openFence.char && close.len >= openFence.len && close.info === "") {
const info = parseInfoString(openFence.info);
if (DIAGRAM_LANGS.has(info.lang) && info.render) {
if (DIAGRAM_LANGS.has(info.lang) && info.render && openFence.indent === 0) {
ordinal++;
const token = `gstack-diagram-slot-${runId}-${ordinal}`;
fences.push({
@@ -128,10 +139,9 @@ export function extractDiagramFences(markdown: string): FenceExtraction {
});
out.push("", token, "");
} else {
// Not a diagram fence (or render=false): replay verbatim, but strip
// the render=false flag so it never leaks into highlighted output.
const infoOut = info.render ? openFence.info : info.lang;
out.push(`${openFence.char.repeat(openFence.len)}${infoOut}`);
// Not extracted (other language, render=false, or indented): replay
// the ORIGINAL lines verbatim; only strip a render=false flag.
out.push(stripRenderFalse(openFence.rawOpener));
out.push(...openFence.body);
out.push(line);
}
@@ -146,7 +156,7 @@ export function extractDiagramFences(markdown: string): FenceExtraction {
const open = matchFenceLine(line);
if (open && open.info !== "") {
openFence = { char: open.char, len: open.len, info: open.info, body: [] };
openFence = { ...open, rawOpener: line, body: [] };
i++;
continue;
}
@@ -171,17 +181,22 @@ export function extractDiagramFences(markdown: string): FenceExtraction {
// Unclosed fence at EOF: replay verbatim (CommonMark treats it as code to EOF).
if (openFence) {
out.push(`${openFence.char.repeat(openFence.len)}${openFence.info}`);
out.push(openFence.rawOpener);
out.push(...openFence.body);
}
return { markdown: out.join("\n"), fences };
}
/**
 * Recognize a fenced-code delimiter line (opener or closer).
 *
 * Accepts 0-3 leading spaces, then a run of >= 3 backticks or tildes, then an
 * optional info string. Returns the indent width, the fence character, the
 * fence length and the trimmed info string, or null when the line is not a
 * fence delimiter.
 *
 * Two details matter for correctness:
 * - Callers split the document on "\n", so CRLF input leaves a trailing "\r"
 *   on every line. `.` never matches "\r", so without the explicit `\r?` an
 *   opener such as "```mermaid\r" would fail to match and the fence would be
 *   missed entirely.
 * - Per CommonMark, the info string of a BACKTICK fence may not contain a
 *   backtick. Without this rule a line that merely starts with an inline code
 *   span ("```x``` text") is mistaken for an opener and mispairs every fence
 *   that follows it.
 */
function matchFenceLine(line: string): { char: string; len: number; indent: number; info: string } | null {
  const m = line.match(/^( {0,3})(`{3,}|~{3,})\s*(.*?)\r?$/);
  if (!m) return null;
  const [, indentRun = "", fenceRun = "", rest = ""] = m;
  const char = fenceRun.charAt(0);
  const info = rest.trim();
  if (char === "`" && info.includes("`")) return null;
  return { indent: indentRun.length, char, len: fenceRun.length, info };
}
/**
 * Drop the first `render=false` flag (case-insensitive, optional spaces around
 * `=`) from a raw fence opener line, together with the whitespace immediately
 * before it. Every other character of the line, including its indent, is kept.
 */
function stripRenderFalse(rawOpener: string): string {
  const hit = /\s*\brender\s*=\s*false\b/i.exec(rawOpener);
  if (hit === null) return rawOpener;
  const before = rawOpener.slice(0, hit.index);
  const after = rawOpener.slice(hit.index + hit[0].length);
  return before + after;
}
/** Parse a fence info string: `mermaid`, `mermaid render=false`,
@@ -208,12 +223,12 @@ export function parseInfoString(info: string): {
/**
 * Replace every placeholder token in `html` with its slot HTML.
 *
 * A token wrapped alone in a paragraph (`<p>token</p>`) is replaced together
 * with the wrapper; any other occurrence of the token is replaced in place.
 *
 * @param html  Rendered document containing placeholder tokens.
 * @param slots Map of token -> replacement HTML.
 * @returns The document with all tokens substituted.
 */
export function substituteSlots(html: string, slots: Map<string, string>): string {
  let s = html;
  for (const [token, slotHtml] of slots) {
    // Tokens are matched literally, whatever characters they contain.
    const literal = token.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    // One pass, two alternatives. The bare form refuses a following digit:
    // tokens end in an ordinal, so "...-1" is a prefix of "...-10" and a plain
    // split/join on the shorter token would corrupt the longer one.
    const pattern = new RegExp(`<p>\\s*${literal}\\s*</p>|${literal}(?!\\d)`, "g");
    // Function replacement is load-bearing: slot HTML carries user/LLM-authored
    // diagram label text, and string-form replace() expands $&, $', $` patterns
    // inside it — a label containing "$'" would duplicate the document tail.
    s = s.replace(pattern, () => slotHtml);
  }
  return s;
}
@@ -227,14 +242,19 @@ export function buildDiagnosticBlock(fence: DiagramFence, errorMessage: string):
const excerpt = fence.source.split("\n").slice(0, 8).join("\n");
const truncated = fence.source.split("\n").length > 8 ? "\n…" : "";
return [
`<figure class="diagram diagram-error" role="img" aria-label="${escapeAttr(diagramLabel(fence))} (failed to render)">`,
`<figure class="diagram diagram-error" role="img" aria-label="${escapeHtml(diagramLabel(fence))} (failed to render)">`,
`<figcaption class="diagram-error-title">Diagram failed to render (${escapeHtml(fence.lang)})</figcaption>`,
`<pre class="diagram-error-detail">${escapeHtml(errorMessage.trim())}\n\n${escapeHtml(excerpt + truncated)}</pre>`,
`</figure>`,
].join("\n");
}
/** Wrap a rendered SVG in an accessible figure (D6.4). */
/**
* Wrap a rendered SVG in an accessible figure (D6.4). The raw fence source is
* preserved base64-encoded in a data attribute — an HTML comment would need
* `--` escaping, which corrupts every mermaid arrow (`-->`) and breaks
* round-trip recovery.
*/
export function buildDiagramFigure(fence: DiagramFence, svg: string): string {
const label = diagramLabel(fence);
const cleanSvg = sanitizeUntrustedHtml(svg);
@@ -242,17 +262,27 @@ export function buildDiagramFigure(fence: DiagramFence, svg: string): string {
? `\n<figcaption class="diagram-caption">${escapeHtml(fence.title)}</figcaption>`
: "";
const pageAttr = fence.page ? ` data-gstack-page="${fence.page}"` : "";
const sourceB64 = Buffer.from(fence.source, "utf8").toString("base64");
return [
`<figure class="diagram" role="img" aria-label="${escapeAttr(label)}"${pageAttr}>`,
`<!-- gstack-diagram-source lang=${escapeAttr(fence.lang)}`,
escapeHtmlComment(fence.source),
`-->`,
`<figure class="diagram" role="img" aria-label="${escapeHtml(label)}"${pageAttr}` +
` data-gstack-lang="${escapeHtml(fence.lang)}" data-gstack-source="${sourceB64}">`,
cleanSvg,
captioned,
`</figure>`,
].join("\n");
}
/**
 * Round-trip helper: read the base64 `data-gstack-source` attribute out of a
 * rendered figure and decode it back to the UTF-8 fence source.
 *
 * @param figureHtml Markup to search; the first matching attribute wins.
 * @returns The decoded source, or null when the attribute is absent or
 *   decoding throws.
 */
export function decodeFigureSource(figureHtml: string): string | null {
  const encoded = /\bdata-gstack-source="([A-Za-z0-9+/=]*)"/.exec(figureHtml)?.[1];
  if (encoded === undefined) return null;
  try {
    const bytes = Buffer.from(encoded, "base64");
    return bytes.toString("utf8");
  } catch {
    return null;
  }
}
function diagramLabel(fence: DiagramFence): string {
return fence.title ?? `diagram ${fence.ordinal}`;
}
@@ -261,6 +291,11 @@ function diagramLabel(fence: DiagramFence): string {
const PAYLOAD_TMP_DIR = process.platform === "win32" ? os.tmpdir() : "/tmp";
const READY_TIMEOUT_MS = 20_000;
// Expressions bigger than this ship via `browse eval <file>` instead of argv.
// 8KB is safe on every platform (Windows CreateProcess caps the WHOLE command
// line at 32,767 chars; Linux MAX_ARG_STRLEN is ~128KiB) and the tmp-file
// round-trip costs microseconds — one spawn regardless of payload size.
const MAX_ARGV_EXPR_BYTES = 8_000;
export class RenderTab {
private constructor(
@@ -279,14 +314,37 @@ export class RenderTab {
const html = fs.readFileSync(bundleSrc);
const sha = crypto.createHash("sha256").update(html).digest("hex").slice(0, 16);
const staged = path.join(PAYLOAD_TMP_DIR, `gstack-diagram-render-${sha}.html`);
if (!fs.existsSync(staged)) {
// Never trust an existing file at the predictable shared-/tmp name: verify
// its content hash and re-stage on mismatch (a pre-planted file would
// otherwise be loaded into the render tab as the bundle).
let needsWrite = true;
if (fs.existsSync(staged)) {
try {
const existing = crypto.createHash("sha256").update(fs.readFileSync(staged)).digest("hex").slice(0, 16);
needsWrite = existing !== sha;
} catch {
needsWrite = true;
}
}
if (needsWrite) {
// Concurrent-safe: write to a unique temp name, then atomic rename.
const tmp = `${staged}.${process.pid}.${crypto.randomBytes(4).toString("hex")}`;
fs.writeFileSync(tmp, html);
try {
fs.renameSync(tmp, staged);
} catch {
fs.unlinkSync(tmp); // another process won the race — theirs is identical
} catch (renameErr) {
try { fs.unlinkSync(tmp); } catch { /* best-effort tmp cleanup */ }
// Only swallow the rename failure when the surviving file HASHES to
// the expected bundle (a concurrent writer won an OS-level race).
// Sticky-bit /tmp makes rename-over-foreign-file fail EPERM — if the
// survivor were trusted on existence alone, a pre-planted file would
// ride through the exact check added to stop it.
let survivorOk = false;
try {
const survivor = crypto.createHash("sha256").update(fs.readFileSync(staged)).digest("hex").slice(0, 16);
survivorOk = survivor === sha;
} catch { /* unreadable survivor = not ok */ }
if (!survivorOk) throw renameErr;
}
}
const tabId = browseClient.newtab();
@@ -330,36 +388,29 @@ export class RenderTab {
private js(expression: string): string {
// Large payloads (scene JSON, SVG text, data URIs) blow past argv limits —
// browseClient.js shells out with the expression as an argv element, so
// stage anything big through a tmp file the page can fetch? No: file URLs
// are unreachable from the page. Instead, chunk through a window buffer.
if (expression.length <= 100_000) {
// browseClient.js shells out with the expression as an argv element. The
// limit is BYTES, not chars (CJK content is 3x its char count in UTF-8),
// and Windows caps the whole command line at 32,767 chars — so anything
// big ships via `browse eval <file>` instead: one spawn, any size.
if (Buffer.byteLength(expression, "utf8") <= MAX_ARGV_EXPR_BYTES) {
return browseClient.js({ expression, tabId: this.tabId });
}
return this.jsViaBuffer(expression);
return this.jsViaFile(expression);
}
/**
* argv-safe path for big expressions: ship the expression into the page in
* 64KB chunks (window.__exprBuf), then eval it there. Used for multi-MB
* data URIs (photo downscaling) where a single argv would exceed OS limits.
*/
private jsViaBuffer(expression: string): string {
browseClient.js({ expression: "window.__exprBuf = ''", tabId: this.tabId });
const CHUNK = 64_000;
for (let i = 0; i < expression.length; i += CHUNK) {
const chunk = expression.slice(i, i + CHUNK);
browseClient.js({
expression: `window.__exprBuf += ${JSON.stringify(chunk)}, window.__exprBuf.length`,
tabId: this.tabId,
});
/**
 * argv-safe path for big expressions: stage the expression to a tmp file
 * under browse's safe dirs and run `browse eval <file>` (one spawn regardless
 * of size). The staged file is removed afterwards, whether or not the eval
 * succeeded.
 *
 * @param expression JS source to evaluate in this tab.
 * @returns The trimmed output of `browse eval`.
 * @throws When the staging file cannot be created or the eval itself fails.
 */
private jsViaFile(expression: string): string {
  const file = path.join(
    PAYLOAD_TMP_DIR,
    `gstack-diagram-expr-${process.pid}-${crypto.randomBytes(4).toString("hex")}.js`,
  );
  // The directory is shared: create the file exclusively ("wx" fails if the
  // path already exists, including as a pre-planted symlink) and owner-only,
  // since the payload carries document content (scene JSON, SVG, data URIs).
  // The write sits outside the try so a refused create never unlinks a file
  // this call did not make.
  fs.writeFileSync(file, expression, { encoding: "utf8", flag: "wx", mode: 0o600 });
  try {
    return browseClient.evalFile({ file, tabId: this.tabId });
  } finally {
    try { fs.unlinkSync(file); } catch { /* best-effort tmp cleanup */ }
  }
}
// Eval the buffer as a single expression so the resulting promise is the
// statement value browse awaits. The buffer resets at the next call.
return browseClient.js({
expression: `(0, eval)(window.__exprBuf)`,
tabId: this.tabId,
});
}
close(): void {
@@ -466,21 +517,31 @@ export function rasterizeDiagramFigures(
const png = tab.call("__rasterize", svgMatch[0], targetPx);
return `<p><img src="${png}" alt="${label}"></p>`;
} catch (err: any) {
warn(`docx: diagram rasterization failed (${firstLine(err?.message ?? String(err))}); keeping source text`);
return figure;
const reason = firstLine(err?.message ?? String(err));
warn(`docx: diagram rasterization failed (${reason}); embedding source text instead`);
// The converter drops <figure>/<svg> entirely, so returning the figure
// would make the diagram vanish without a trace — the exact invisible
// failure the diagnostic contract forbids. Surface the source.
const source = decodeFigureSource(figure) ?? "(source unavailable)";
return [
`<p><strong>Diagram could not be rasterized for DOCX (${escapeHtml(reason)}) — source:</strong></p>`,
`<pre>${escapeHtml(source)}</pre>`,
].join("\n");
}
},
);
// 2. SVG data-URI images (inlined .svg files) → PNG.
out = out.replace(/<img\b[^>]*>/gi, (tag) => {
const src = tag.match(SRC_RE)?.[2] ?? tag.match(SRC_RE)?.[3] ?? "";
const m = tag.match(SRC_RE);
const src = m?.[2] ?? m?.[3] ?? "";
if (!src.startsWith("data:image/svg+xml")) return tag;
try {
const b64 = src.slice(src.indexOf(",") + 1);
const svgText = Buffer.from(b64, "base64").toString("utf8");
const png = tab.call("__rasterize", svgText, targetPx);
return tag.replace(SRC_RE, `src="${png}"`);
// Function replacement: data URIs can contain $-patterns.
return tag.replace(SRC_RE, () => `src="${png}"`);
} catch (err: any) {
warn(`docx: svg image rasterization failed (${firstLine(err?.message ?? String(err))})`);
return tag;
@@ -521,6 +582,9 @@ const SRC_RE = /\bsrc\s*=\s*("([^"]*)"|'([^']*)')/i;
export function inlineLocalImages(html: string, opts: PrepassImageOptions): string {
const maxPx = Math.round(opts.contentWidthIn * PRINT_DPI * DOWNSCALE_FACTOR);
const targetPx = Math.round(opts.contentWidthIn * PRINT_DPI);
// An image referenced N times is read/probed/downscaled once; the same data
// URI string is reused (also dedupes memory until the final join).
const memo = new Map<string, string>();
return html.replace(IMG_TAG_RE, (tag) => {
const srcMatch = tag.match(SRC_RE);
@@ -529,7 +593,11 @@ export function inlineLocalImages(html: string, opts: PrepassImageOptions): stri
if (src.startsWith("data:")) return annotateFromDataUri(tag, src);
if (/^[a-z][a-z0-9+.-]*:/i.test(src)) {
// Windows drive-letter paths (C:/x.png, C:\x.png) look like single-letter
// URL schemes — they are local paths, not URLs.
const isDrivePath = /^[a-zA-Z]:[\\/]/.test(src);
if (!isDrivePath && /^[a-z][a-z0-9+.-]*:/i.test(src)) {
// Absolute URL with a scheme (http, https, file, …)
if (opts.allowNetwork && /^https?:/i.test(src)) return tag;
if (/^https?:/i.test(src)) {
@@ -543,8 +611,13 @@ export function inlineLocalImages(html: string, opts: PrepassImageOptions): stri
}
const filePath = src.startsWith("file:")
? decodeURIComponent(new URL(src).pathname)
: path.resolve(opts.inputDir, decodeURIComponent(src));
? fileURLToPath(src)
: isDrivePath
? path.resolve(src)
: path.resolve(opts.inputDir, decodeURIComponent(src));
const cached = memo.get(filePath);
if (cached !== undefined) return rewriteImgTag(tag, cached);
if (!fs.existsSync(filePath)) {
const msg = `image not found: ${src} (resolved to ${filePath})`;
@@ -579,17 +652,26 @@ export function inlineLocalImages(html: string, opts: PrepassImageOptions): stri
}
const dataUri = `data:${mime};base64,${buf.toString("base64")}`;
let newTag = tag.replace(SRC_RE, `src="${dataUri}"`);
if (dims) {
newTag = newTag.replace(
/^<img\b/i,
`<img data-gstack-px-width="${Math.round(dims.width)}" data-gstack-px-height="${Math.round(dims.height)}"`,
);
}
return newTag;
const attrs = dims
? ` data-gstack-px-width="${Math.round(dims.width)}" data-gstack-px-height="${Math.round(dims.height)}"`
: "";
memo.set(filePath, `${dataUri}${attrs}`);
return rewriteImgTag(tag, memo.get(filePath)!);
});
}
/**
 * Apply a memoized payload to an img tag: point `src` at the data URI and,
 * when the payload carries dimension attributes, add them to the tag.
 *
 * The payload is the data URI followed by an optional attribute string that
 * begins with ` data-gstack-px-width=`. The boundary is located by that marker
 * rather than by `indexOf("")`, which always returns 0 and would yield an
 * empty data URI. A control-character separator between the two parts is
 * tolerated and discarded.
 *
 * @param tag       The original `<img …>` tag.
 * @param memoEntry Memoized payload for the image's resolved path.
 * @returns The rewritten tag.
 */
function rewriteImgTag(tag: string, memoEntry: string): string {
  const attrsAt = memoEntry.indexOf(" data-gstack-px-width=");
  const uriPart = attrsAt === -1 ? memoEntry : memoEntry.slice(0, attrsAt);
  // A base64 data URI never ends in a control character, so trailing ones can
  // only be a separator.
  const dataUri = uriPart.replace(/[\u0000-\u001f]+$/, "");
  const attrs = attrsAt === -1 ? "" : memoEntry.slice(attrsAt);
  // Function replacement: data URIs are user-content-derived; string-form
  // replace() would expand $-patterns inside them.
  let out = tag.replace(SRC_RE, () => `src="${dataUri}"`);
  if (attrs) out = out.replace(/^<img\b/i, () => `<img${attrs}`);
  return out;
}
function annotateFromDataUri(tag: string, src: string): string {
try {
const b64 = src.slice(src.indexOf(",") + 1);
@@ -695,24 +777,7 @@ export function landscapeContentBox(opts: {
}
// ─── tiny helpers ─────────────────────────────────────────────────────
function escapeHtml(s: string): string {
return s
.replace(/&/g, "&amp;")
.replace(/</g, "&lt;")
.replace(/>/g, "&gt;")
.replace(/"/g, "&quot;")
.replace(/'/g, "&#39;");
}
function escapeAttr(s: string): string {
return escapeHtml(s);
}
/** Comments may not contain `--`; encode it so the raw source survives. */
function escapeHtmlComment(s: string): string {
return s.replace(/--/g, "-");
}
// escapeHtml is imported from ./render — single definition, no drift.
function firstLine(s: string): string {
return s.split("\n")[0].slice(0, 200);
+9 -20
View File
@@ -34,6 +34,8 @@
* passes preferCSSPageSize — the orchestrator sets it when hasLandscape.
*/
import { svgTagDims } from "./image-size";
export interface ImagePolicyOptions {
/** Physical content-box width in inches (page width minus margins). */
contentWidthIn: number;
@@ -207,24 +209,8 @@ function decideDiagramPromotion(figure: string, widthThresholdPx: number): Promo
return { promote: true, reason: `wide diagram (${Math.round(dims.width)}px)` };
}
/**
* Best-effort CSS-px dimensions of the first <svg> in a figure: explicit
* width/height attributes (px or unitless) first, else viewBox.
*/
function svgCssDims(figure: string): { width: number; height: number } | null {
const tag = figure.match(/<svg\b[^>]*>/i)?.[0];
if (!tag) return null;
const attrNum = (name: string): number | null => {
const m = tag.match(new RegExp(`\\b${name}\\s*=\\s*["']\\s*([0-9.]+)(px)?\\s*["']`, "i"));
return m ? parseFloat(m[1]) : null;
};
const w = attrNum("width");
const h = attrNum("height");
if (w && h) return { width: w, height: h };
const vb = tag.match(/\bviewBox\s*=\s*["']\s*[-0-9.]+[\s,]+[-0-9.]+[\s,]+([0-9.]+)[\s,]+([0-9.]+)\s*["']/i);
if (vb) return { width: parseFloat(vb[1]), height: parseFloat(vb[2]) };
return null;
}
/** SVG dimension probing is shared with the byte prober — see image-size.ts. */
const svgCssDims = svgTagDims;
function attrValue(tag: string, name: string): string | null {
const m = tag.match(new RegExp(`\\b${name}\\s*=\\s*"([^"]*)"`, "i"))
@@ -241,7 +227,10 @@ function num(s: string | null): number | null {
/**
 * Add `css` declarations to a tag's inline style.
 *
 * When the tag already has a style attribute, the new declarations are
 * appended after the existing ones and the attribute is re-emitted in double
 * quotes. Otherwise a style attribute is inserted right after `<img`.
 *
 * @param tag An `<img …>` tag.
 * @param css Declarations to add (no surrounding quotes).
 * @returns The tag with the merged style attribute.
 */
function mergeStyle(tag: string, css: string): string {
  const existing = attrValue(tag, "style");
  if (existing !== null) {
    // The attribute is rewritten with double quotes. If the original was
    // single-quoted (the pattern below accepts both), its value may contain a
    // literal `"` (e.g. a quoted font name) that would terminate the new
    // attribute early — encode it instead of dropping or passing it through.
    const safeExisting = existing.replace(/"/g, "&quot;");
    // Function replacement: no $-pattern expansion from user-controlled
    // style values.
    return tag.replace(/\bstyle\s*=\s*(".*?"|'.*?')/i, () => `style="${safeExisting}; ${css}"`);
  }
  return tag.replace(/^<img\b/i, () => `<img style="${css}"`);
}
+14 -3
View File
@@ -91,7 +91,18 @@ function webpDims(b: Buffer): ImageDims | null {
*/
function svgDims(b: Buffer): ImageDims | null {
const head = b.toString("utf8", 0, Math.min(b.length, 4096));
const tag = head.match(/<svg\b[^>]*>/i)?.[0];
const dims = svgTagDims(head);
return dims ? { ...dims, mime: "image/svg+xml" } : null;
}
/**
* CSS-px dimensions of the first <svg> element in a markup string: explicit
* width/height attributes (px or unitless) first, else viewBox. Shared by the
* byte prober above and image-policy's diagram-figure measurements — one
* regex, no drift.
*/
export function svgTagDims(markup: string): { width: number; height: number } | null {
const tag = markup.match(/<svg\b[^>]*>/i)?.[0];
if (!tag) return null;
const attr = (name: string): number | null => {
const m = tag.match(new RegExp(`\\b${name}\\s*=\\s*["']\\s*([0-9.]+)(px)?\\s*["']`, "i"));
@@ -99,8 +110,8 @@ function svgDims(b: Buffer): ImageDims | null {
};
const w = attr("width");
const h = attr("height");
if (w && h) return { width: w, height: h, mime: "image/svg+xml" };
if (w && h) return { width: w, height: h };
const vb = tag.match(/\bviewBox\s*=\s*["']\s*[-0-9.]+[\s,]+[-0-9.]+[\s,]+([0-9.]+)[\s,]+([0-9.]+)\s*["']/i);
if (vb) return { width: parseFloat(vb[1]), height: parseFloat(vb[2]), mime: "image/svg+xml" };
if (vb) return { width: parseFloat(vb[1]), height: parseFloat(vb[2]) };
return null;
}
+29 -1
View File
@@ -21,6 +21,7 @@ import * as crypto from "node:crypto";
import { spawn } from "node:child_process";
import { render } from "./render";
import { screenCss } from "./print-css";
import type { GenerateOptions, PreviewOptions } from "./types";
import { ExitCode } from "./types";
import * as browseClient from "./browseClient";
@@ -201,7 +202,6 @@ export async function generate(opts: GenerateOptions): Promise<string> {
// ─── --to html: write the self-contained document, no print round-trip ──
if (to === "html") {
const { screenCss } = await import("./print-css");
const withScreenLayer = finalHtml.replace(
"</style>",
`</style>\n<style>\n${screenCss()}\n</style>`,
@@ -214,6 +214,19 @@ export async function generate(opts: GenerateOptions): Promise<string> {
// ─── --to docx: content-fidelity conversion (eng-review P8) ────────────
if (to === "docx") {
// Print-only surfaces don't survive the conversion. The watermark div
// would degrade to a literal body paragraph reading "DRAFT" (worse than
// absent) — strip it. Warn once about print-only flags that were set.
finalHtml = finalHtml.replace(/<div class="watermark">[\s\S]*?<\/div>/, "");
const printOnly: string[] = [];
if (opts.watermark) printOnly.push("--watermark");
if (opts.headerTemplate) printOnly.push("--header-template");
if (opts.footerTemplate) printOnly.push("--footer-template");
if (opts.pageSize) printOnly.push("--page-size");
if (opts.margins || opts.marginTop || opts.marginRight || opts.marginBottom || opts.marginLeft) printOnly.push("--margins");
if (printOnly.length > 0) {
warn(`docx is content-fidelity: ${printOnly.join(", ")} do not apply to Word output`);
}
progress.begin("Converting to DOCX");
const { default: HTMLtoDOCX } = await import("html-to-docx");
const buf = await HTMLtoDOCX(finalHtml, null, {
@@ -311,6 +324,21 @@ export async function preview(opts: PreviewOptions): Promise<string> {
progress.begin("Rendering HTML");
const markdown = fs.readFileSync(input, "utf8");
// Preview deliberately skips the diagram/image pre-pass (no browse daemon
// round-trip — preview is the fast loop). Be loud about the divergence so
// nobody signs off on a preview that lacks what the PDF will have.
if (!opts.quiet) {
  const fenceCount = extractDiagramFences(markdown).fences.length;
  const hasLocalImages = /!\[[^\]]*\]\((?!https?:|data:)[^)]+\)/.test(markdown);
  // Collect the applicable notes, then join them, so the separators and the
  // closing sentence are always punctuated correctly (the two parts used to
  // run together with no ". " before "`generate`").
  const notes: string[] = [];
  if (fenceCount > 0) notes.push(`${fenceCount} diagram fence(s) shown as code`);
  if (hasLocalImages) notes.push("local images may not resolve from the preview location");
  if (notes.length > 0) {
    process.stderr.write(
      `[make-pdf] preview note: ${notes.join("; ")}. \`generate\` renders them fully.\n`,
    );
  }
}
const rendered = render({
markdown,
title: opts.title,
+18 -9
View File
@@ -12,9 +12,11 @@
* breaks copy-paste extraction.
* - All paragraphs flush-left. No first-line indent, no justify, no
* p+p indent. text-align: left everywhere. 12pt margin-bottom.
* - Cover page has the same 1in margins as every other page. No flexbox
* center, no inset padding, no vertical centering. Distinction comes
* from eyebrow + larger title + hairline rule, not from centering.
* - Cover page (v1.58.0.0 poster revision, user-directed): 56pt title,
* 13pt meta, padding-top 1.4in for poster placement. Still no flexbox
* and no vertical centering; the inset is a deliberate top-third drop.
* (Supersedes the original "no inset padding" lock from the first
* /plan-design-review — the 32pt cover read as too small in print.)
* - `@page :first` suppresses running header/footer but does NOT override
* the 1in margin.
* - No <link>, no external CSS/fonts — everything inlined.
@@ -122,8 +124,9 @@ function pageRules(size: string, margin: string, opts: PrintCssOptions): string
// Landscape named page for promoted wide diagrams/images (image-policy).
// Chromium-only — exactly the engine this pipeline always prints with.
// Honored only when the print call passes preferCSSPageSize (orchestrator
// sets it when a promotion exists). The block is flex-centered: a diagram
// alone on a rotated page should sit in the middle, not hug the header.
// sets it when a promotion exists). Vertical centering is NOT done here —
// image-policy emits a computed inline margin-top instead (see the
// .page-wide comment below for why).
`@page wide {`,
` size: ${size} landscape;`,
` margin: ${margin};`,
@@ -139,6 +142,9 @@ function pageRules(size: string, margin: string, opts: PrintCssOptions): string
` page: wide;`,
` text-align: center;`,
`}`,
// width: 100% stretch is intentional for promoted content: auto-promoted
// rasters are >=~1600px (≈190dpi at the 9in landscape box — prints fine),
// and a directive-forced small image is the user's explicit call.
`.page-wide img, .page-wide svg { width: 100%; height: auto; max-width: none; }`,
`.page-wide figure.diagram > svg { max-width: none; }`,
].filter(line => line !== "").join("\n");
@@ -153,10 +159,13 @@ function pageRules(size: string, margin: string, opts: PrintCssOptions): string
export function screenCss(): string {
return [
`@media screen {`,
` body { max-width: 52em; margin: 0 auto; padding: 2.5em 1.5em; }`,
// ~42em at 12pt ≈ 70-75 characters per line — the readable ceiling.
` body { max-width: 42em; margin: 0 auto; padding: 2.5em 1.5em; }`,
` .chapter { break-before: auto; }`,
` .watermark { display: none; }`,
` figure.diagram { overflow-x: auto; }`,
// Page numbers only exist in print; hide the empty spans + dot leaders.
` .toc li .toc-page, .toc li .toc-dots { display: none; }`,
`}`,
].join("\n");
}
@@ -362,11 +371,11 @@ function quoteRules(): string {
` padding: 0 0 0 18pt;`,
` border-left: 2pt solid #111;`,
` color: #333;`,
` font-size: 11pt;`,
` font-size: 12pt;`,
` line-height: 1.5;`,
`}`,
`blockquote p { margin-bottom: 6pt; text-align: left; }`,
`blockquote cite { display: block; margin-top: 6pt; font-style: normal; font-size: 9.5pt; color: #666; letter-spacing: 0.02em; }`,
`blockquote cite { display: block; margin-top: 6pt; font-style: normal; font-size: 10pt; color: #666; letter-spacing: 0.02em; }`,
`blockquote cite::before { content: "— "; }`,
].join("\n");
}
@@ -410,7 +419,7 @@ function listRules(): string {
function footnoteRules(): string {
return [
`.footnote-ref { font-size: 0.75em; vertical-align: super; line-height: 0; text-decoration: none; color: #0055cc; }`,
`.footnotes { margin-top: 24pt; padding-top: 12pt; border-top: 0.5pt solid #ccc; font-size: 9.5pt; line-height: 1.4; }`,
`.footnotes { margin-top: 24pt; padding-top: 12pt; border-top: 0.5pt solid #ccc; font-size: 10pt; line-height: 1.4; }`,
`.footnotes ol { padding-left: 18pt; }`,
].join("\n");
}
+24 -4
View File
@@ -112,14 +112,19 @@ export function render(opts: RenderOptions): RenderResult {
})
: "";
// TOC anchors must resolve: assign id="toc-N" to each H1-H3 in the same
// order buildTocBlock scans them, or every TOC link is a dead href (masked
// in PDFs by Chromium outline bookmarks, glaring in --to html).
const anchoredHtml = opts.toc ? addHeadingIds(typographicHtml) : typographicHtml;
const tocBlock = opts.toc
? buildTocBlock(typographicHtml)
? buildTocBlock(anchoredHtml)
: "";
// Wrap body in .chapter sections at H1 boundaries if chapter breaks are on.
const chapterHtml = opts.noChapterBreaks
? `<section class="chapter">${typographicHtml}</section>`
: wrapChaptersByH1(typographicHtml);
? `<section class="chapter">${anchoredHtml}</section>`
: wrapChaptersByH1(anchoredHtml);
const watermarkBlock = opts.watermark
? `<div class="watermark">${escapeHtml(opts.watermark)}</div>`
@@ -288,6 +293,21 @@ function buildTocBlock(html: string): string {
].join("\n");
}
/**
 * Give every H1-H3 a `toc-N` anchor target, numbered in document order — the
 * same order extractHeadings/buildTocBlock use, so anchors and entries line up
 * by index.
 *
 * - A heading without an id gains `id="toc-N"`.
 * - A heading that already has its own id keeps it; an empty
 *   `<a id="toc-N"></a>` is placed at the start of its content instead, so the
 *   TOC link still resolves without putting two ids on one element.
 * - A heading that already carries exactly `id="toc-N"` is left untouched, so
 *   running the function twice does not create duplicate ids.
 *
 * Only a real `id` attribute counts as "has an id"; `data-id="…"` does not.
 */
function addHeadingIds(html: string): string {
  let i = 0;
  return html.replace(/<(h[1-3])([^>]*)>/gi, (full: string, tag: string, attrs: string) => {
    const id = `toc-${i++}`;
    if (attrs.includes(`id="${id}"`)) return full;
    const hasOwnId = /(?:^|\s)id\s*=/i.test(attrs);
    if (hasOwnId) return `${full}<a id="${id}"></a>`;
    return `<${tag}${attrs} id="${id}">`;
  });
}
function extractHeadings(html: string): Array<{ level: number; text: string }> {
const re = /<(h[1-3])[^>]*>([\s\S]*?)<\/\1>/gi;
const headings: Array<{ level: number; text: string }> = [];
@@ -362,7 +382,7 @@ function stripTags(html: string): string {
return html.replace(/<[^>]+>/g, "");
}
function escapeHtml(s: string): string {
export function escapeHtml(s: string): string {
return s
.replace(/&/g, "&amp;")
.replace(/</g, "&lt;")