fix(make-pdf): adversarial-review wave — offline posture enforced, symlink-aware confinement, bounded reads

Codex adversarial + structured review findings:

- Remote images are now BLOCKED with a visible placeholder instead of
  warn-and-keep — leaving the tag meant Chromium fetched the URL at print
  time anyway, so the offline posture was a lie (tracking pixels and
  internal-URL probes ran without --allow-network).
- The out-of-tree read check compares REAL paths: a symlink inside the input
  dir pointing at ~/.ssh/... passed the string-prefix check, including under
  --strict. Ordered after the existence check (realpath of a missing file
  false-positives on macOS /var → /private/var).
- Image reads are bounded BEFORE reading: statSync first, non-regular files
  (fifo/device/dir) and >64MB files degrade to placeholders instead of
  hanging or exhausting memory; malformed percent-encoding (foo%zz.png)
  degrades to missing-image instead of crashing decodeURIComponent.
- browse shell-outs get a 120s timeout — a wedged daemon or hostile mermaid
  source fails the run instead of hanging it.
- TOC entries link to the heading's ACTUAL id (pre-id'd raw-HTML headings
  previously got dead #toc-N links); per-side margins compose into the CSS
  @page shorthand so a landscape promotion flipping preferCSSPageSize no
  longer silently reverts --margin-left/right to defaults (Codex P2).
- The image memo is a typed object — literal NUL-byte separators had made
  diagram-prepass.ts register as binary to text tooling.

Codex structured review GATE: PASS (no P1).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-06-12 10:01:48 -07:00
parent 1bd984f86d
commit 26a7cab26d
6 changed files with 168 additions and 32 deletions
+3
View File
@@ -176,6 +176,9 @@ function runBrowse(args: string[]): string {
encoding: "utf8",
maxBuffer: 16 * 1024 * 1024, // 16MB; tab content can be large
stdio: ["ignore", "pipe", "pipe"],
// A wedged daemon (or a hostile mermaid source spinning the renderer)
// must fail the run, not hang it forever.
timeout: 120_000,
});
} catch (err: any) {
const exitCode = typeof err.status === "number" ? err.status : 1;
+69 -17
View File
@@ -80,6 +80,8 @@ export interface PrepassImageOptions {
* 2 × contentWidth × 300dpi down to contentWidth × 300dpi. */
const PRINT_DPI = 300;
const DOWNSCALE_FACTOR = 2;
/** Per-image read ceiling — bounds memory before any policy runs. */
const MAX_IMAGE_BYTES = 64 * 1024 * 1024;
export class StrictModeError extends Error {
constructor(msg: string) {
@@ -584,7 +586,7 @@ export function inlineLocalImages(html: string, opts: PrepassImageOptions): stri
const targetPx = Math.round(opts.contentWidthIn * PRINT_DPI);
// An image referenced N times is read/probed/downscaled once; the same data
// URI string is reused (also dedupes memory until the final join).
const memo = new Map<string, string>();
const memo = new Map<string, { dataUri: string; attrs: string }>();
return html.replace(IMG_TAG_RE, (tag) => {
const srcMatch = tag.match(SRC_RE);
@@ -601,36 +603,73 @@ export function inlineLocalImages(html: string, opts: PrepassImageOptions): stri
// Absolute URL with a scheme (http, https, file, …)
if (opts.allowNetwork && /^https?:/i.test(src)) return tag;
if (/^https?:/i.test(src)) {
const msg = `remote image not fetched (offline posture): ${src}`;
const msg = `remote image blocked (offline posture): ${src}`;
if (opts.strict) throw new StrictModeError(msg + " — re-run without --strict or pass --allow-network");
opts.warn(msg);
return tag;
// Leaving the tag would make Chromium fetch it at print time anyway —
// the warn would be a lie. Replace with a visible placeholder.
return buildBlockedRemotePlaceholder(src);
}
// file:// and friends fall through to the local path branch
if (!src.startsWith("file:")) return tag;
}
// decodeURIComponent throws on malformed escapes (foo%zz.png) — a broken
// URL must degrade to the missing-image path, not crash the run.
let decodedSrc = src;
try {
decodedSrc = decodeURIComponent(src);
} catch { /* keep raw src */ }
const filePath = src.startsWith("file:")
? fileURLToPath(src)
: isDrivePath
? path.resolve(src)
: path.resolve(opts.inputDir, decodeURIComponent(src));
: path.resolve(opts.inputDir, decodedSrc);
const cached = memo.get(filePath);
if (cached !== undefined) return rewriteImgTag(tag, cached);
if (!fs.existsSync(filePath)) {
const msg = `image not found: ${src} (resolved to ${filePath})`;
if (opts.strict) throw new StrictModeError(msg);
opts.warn(msg);
return buildMissingImagePlaceholder(src);
}
// Out-of-tree reads are legal (local CLI semantics — like pandoc) but
// never silent: an agent PDF-ing untrusted markdown should not quietly
// embed ~/.ssh/config into a shareable document. --strict makes it fatal.
const inputRoot = path.resolve(opts.inputDir) + path.sep;
if (!filePath.startsWith(inputRoot)) {
const msg = `image resolves OUTSIDE the input directory: ${src}${filePath}`;
// Compare REAL paths — a symlink inside the input dir pointing outside
// would otherwise pass a string-prefix check (Codex adversarial finding).
// Runs after the existence check: realpath of a missing file can't
// resolve, and on macOS /var vs /private/var would false-positive.
const inputRoot = safeRealpath(path.resolve(opts.inputDir)) + path.sep;
const realFilePath = safeRealpath(filePath);
if (!realFilePath.startsWith(inputRoot)) {
const msg = `image resolves OUTSIDE the input directory: ${src}${realFilePath}`;
if (opts.strict) throw new StrictModeError(msg + " — move it under the markdown's directory or drop --strict");
opts.warn(msg);
}
if (!fs.existsSync(filePath)) {
const msg = `image not found: ${src} (resolved to ${filePath})`;
// Bound the read BEFORE reading: a markdown image pointing at a special
// file (fifo, device) would hang readFileSync, and a multi-GB file would
// exhaust memory before any policy ran.
let stat: fs.Stats;
try {
stat = fs.statSync(filePath);
} catch {
opts.warn(`image unreadable: ${src}`);
return buildMissingImagePlaceholder(src);
}
if (!stat.isFile()) {
const msg = `image is not a regular file: ${src}`;
if (opts.strict) throw new StrictModeError(msg);
opts.warn(msg);
return buildMissingImagePlaceholder(src);
}
if (stat.size > MAX_IMAGE_BYTES) {
const msg = `image exceeds ${Math.round(MAX_IMAGE_BYTES / 1024 / 1024)}MB cap: ${src} (${Math.round(stat.size / 1024 / 1024)}MB)`;
if (opts.strict) throw new StrictModeError(msg);
opts.warn(msg);
return buildMissingImagePlaceholder(src);
@@ -665,20 +704,17 @@ export function inlineLocalImages(html: string, opts: PrepassImageOptions): stri
const attrs = dims
? ` data-gstack-px-width="${Math.round(dims.width)}" data-gstack-px-height="${Math.round(dims.height)}"`
: "";
memo.set(filePath, `${dataUri}${attrs}`);
memo.set(filePath, { dataUri, attrs });
return rewriteImgTag(tag, memo.get(filePath)!);
});
}
/** Apply a memoized `dataUriattrs` payload to an img tag. */
function rewriteImgTag(tag: string, memoEntry: string): string {
const sep = memoEntry.indexOf("");
const dataUri = memoEntry.slice(0, sep);
const attrs = memoEntry.slice(sep + 1);
/** Apply a memoized inline result to an img tag. */
function rewriteImgTag(tag: string, entry: { dataUri: string; attrs: string }): string {
// Function replacement: data URIs are user-content-derived; string-form
// replace() would expand $-patterns inside them.
let out = tag.replace(SRC_RE, () => `src="${dataUri}"`);
if (attrs) out = out.replace(/^<img\b/i, () => `<img${attrs}`);
let out = tag.replace(SRC_RE, () => `src="${entry.dataUri}"`);
if (entry.attrs) out = out.replace(/^<img\b/i, () => `<img${entry.attrs}`);
return out;
}
@@ -704,6 +740,22 @@ function buildMissingImagePlaceholder(src: string): string {
);
}
function buildBlockedRemotePlaceholder(src: string): string {
return (
`<span class="image-missing" role="img" aria-label="remote image blocked">` +
`[remote image blocked (use --allow-network): ${escapeHtml(src)}]</span>`
);
}
/** realpath that degrades to the input path when resolution fails. */
function safeRealpath(p: string): string {
try {
return fs.realpathSync(p);
} catch {
return p;
}
}
function mimeFromExtension(p: string): string {
switch (path.extname(p).toLowerCase()) {
case ".png": return "image/png";
+4
View File
@@ -112,6 +112,10 @@ export async function generate(opts: GenerateOptions): Promise<string> {
confidential: opts.confidential,
pageSize: opts.pageSize,
margins: opts.margins,
marginTop: opts.marginTop,
marginRight: opts.marginRight,
marginBottom: opts.marginBottom,
marginLeft: opts.marginLeft,
pageNumbers: opts.pageNumbers,
footerTemplate: opts.footerTemplate,
});
+50 -13
View File
@@ -35,6 +35,14 @@ export interface RenderOptions {
// Page layout
pageSize?: "letter" | "a4" | "legal" | "tabloid";
margins?: string;
// Per-side margins (override `margins`). Must reach the CSS @page rule:
// when a landscape promotion flips preferCSSPageSize on, the CSS margins
// are the ones Chromium honors — dropping per-side flags there would
// silently change the whole document's layout (Codex P2).
marginTop?: string;
marginRight?: string;
marginBottom?: string;
marginLeft?: string;
// Footer behavior. pageNumbers defaults to true. When footerTemplate is set,
// CSS page numbers are suppressed so the custom Chromium footer wins cleanly.
@@ -97,7 +105,9 @@ export function render(opts: RenderOptions): RenderResult {
confidential: opts.confidential !== false,
runningHeader: derivedTitle,
pageSize: opts.pageSize,
margins: opts.margins,
// Compose per-side margins into the CSS shorthand so @page stays the
// single source of truth even under preferCSSPageSize.
margins: composeMargins(opts),
pageNumbers: showPageNumbers,
};
const css = printCss(cssOptions);
@@ -114,11 +124,14 @@ export function render(opts: RenderOptions): RenderResult {
// TOC anchors must resolve: assign id="toc-N" to each H1-H3 in the same
// order buildTocBlock scans them, or every TOC link is a dead href (masked
// in PDFs by Chromium outline bookmarks, glaring in --to html).
const anchoredHtml = opts.toc ? addHeadingIds(typographicHtml) : typographicHtml;
// in PDFs by Chromium outline bookmarks, glaring in --to html). Headings
// that already carry an id keep it — the ids array records the ACTUAL id
// per heading so TOC entries always link to something real.
const anchored = opts.toc ? addHeadingIds(typographicHtml) : { html: typographicHtml, ids: [] };
const anchoredHtml = anchored.html;
const tocBlock = opts.toc
? buildTocBlock(anchoredHtml)
? buildTocBlock(anchoredHtml, anchored.ids)
: "";
// Wrap body in .chapter sections at H1 boundaries if chapter breaks are on.
@@ -267,13 +280,13 @@ function buildCoverBlock(opts: {
* Page numbers are filled in by Paged.js (when --toc is passed and Paged.js
* polyfill is injected).
*/
function buildTocBlock(html: string): string {
function buildTocBlock(html: string, ids: string[] = []): string {
const headings = extractHeadings(html);
if (headings.length === 0) return "";
const items = headings.map((h, i) => {
const level = h.level >= 2 ? "level-2" : "level-1";
const id = `toc-${i}`;
const id = ids[i] ?? `toc-${i}`;
return [
` <li class="${level}">`,
` <span class="toc-title"><a href="#${id}">${escapeHtml(h.text)}</a></span>`,
@@ -296,16 +309,23 @@ function buildTocBlock(html: string): string {
/**
* Assign id="toc-N" to every H1-H3 in document order — the same order
* extractHeadings/buildTocBlock use, so anchors and entries line up by index.
* Headings that already carry an id keep it AND gain nothing (the TOC link
* targets toc-N, so we only skip tagging when one exists to avoid dupes).
* A heading that already carries an id keeps it, and the returned ids array
* records the actual id for that slot so the TOC links to the real anchor
* instead of a nonexistent toc-N.
*/
function addHeadingIds(html: string): string {
let i = 0;
return html.replace(/<(h[1-3])([^>]*)>/gi, (full, tag: string, attrs: string) => {
const id = `toc-${i++}`;
if (/\bid\s*=/i.test(attrs)) return full;
function addHeadingIds(html: string): { html: string; ids: string[] } {
const ids: string[] = [];
const out = html.replace(/<(h[1-3])([^>]*)>/gi, (full, tag: string, attrs: string) => {
const existing = attrs.match(/\bid\s*=\s*["']([^"']*)["']/i)?.[1];
if (existing) {
ids.push(existing);
return full;
}
const id = `toc-${ids.length}`;
ids.push(id);
return `<${tag}${attrs} id="${id}">`;
});
return { html: out, ids };
}
function extractHeadings(html: string): Array<{ level: number; text: string }> {
@@ -378,6 +398,23 @@ function decodeTextEntities(s: string): string {
.replace(/&amp;/g, "&");
}
/** Compose `margin: top right bottom left` from per-side overrides + base. */
function composeMargins(opts: {
margins?: string; marginTop?: string; marginRight?: string;
marginBottom?: string; marginLeft?: string;
}): string | undefined {
const base = opts.margins ?? "1in";
if (!opts.marginTop && !opts.marginRight && !opts.marginBottom && !opts.marginLeft) {
return opts.margins;
}
return [
opts.marginTop ?? base,
opts.marginRight ?? base,
opts.marginBottom ?? base,
opts.marginLeft ?? base,
].join(" ");
}
function stripTags(html: string): string {
return html.replace(/<[^>]+>/g, "");
}
+35 -2
View File
@@ -277,14 +277,47 @@ describe("inlineLocalImages", () => {
).toThrow(StrictModeError);
});
test("remote image warns and is left untouched (offline posture)", () => {
test("remote image is BLOCKED with a visible placeholder (offline posture)", () => {
// Leaving the tag would make Chromium fetch it at print time anyway —
// the offline posture must remove the src, not just warn about it.
const warnings: string[] = [];
const tag = `<img src="https://example.com/x.png">`;
const out = inlineLocalImages(tag, { ...base, warn: (m) => warnings.push(m) });
expect(out).toBe(tag);
expect(out).not.toContain("https://example.com/x.png\"");
expect(out).toContain("remote image blocked");
expect(warnings[0]).toContain("offline");
});
test("symlink escaping the input dir is caught by the realpath check", () => {
const outside = fs.mkdtempSync(path.join(os.tmpdir(), "prepass-symlink-"));
fs.writeFileSync(path.join(outside, "secret.png"), tinyPng(5, 5));
const link = path.join(dir, "innocent.png");
try {
fs.symlinkSync(path.join(outside, "secret.png"), link);
const warnings: string[] = [];
inlineLocalImages(`<img src="innocent.png">`, { ...base, warn: (m) => warnings.push(m) });
expect(warnings.some((w) => w.includes("OUTSIDE the input directory"))).toBe(true);
} finally {
try { fs.unlinkSync(link); } catch { /* ignore */ }
fs.rmSync(outside, { recursive: true, force: true });
}
});
test("special files and oversized images degrade to placeholders, never hang", () => {
// Directory masquerading as an image — not a regular file.
fs.mkdirSync(path.join(dir, "dir.png"), { recursive: true });
const warnings: string[] = [];
const out = inlineLocalImages(`<img src="dir.png">`, { ...base, warn: (m) => warnings.push(m) });
expect(out).toContain("image-missing");
expect(warnings.some((w) => w.includes("not a regular file"))).toBe(true);
});
test("malformed percent-encoding degrades to missing-image, never throws", () => {
const warnings: string[] = [];
const out = inlineLocalImages(`<img src="foo%zz.png">`, { ...base, warn: (m) => warnings.push(m) });
expect(out).toContain("image-missing");
});
test("remote image + --allow-network passes silently", () => {
const warnings: string[] = [];
const tag = `<img src="https://example.com/x.png">`;
+7
View File
@@ -264,6 +264,13 @@ describe("printCss", () => {
expect(css).toContain("margin: 72pt");
});
test("per-side margins reach the CSS @page rule (preferCSSPageSize parity)", () => {
// Under a landscape promotion Chromium honors the CSS margins, not the
// CDP per-side options — render() must compose them into the shorthand.
const r = render({ markdown: "# T", marginLeft: "0.5in", marginRight: "0.5in" });
expect(r.printCss).toContain("margin: 1in 0.5in 1in 0.5in");
});
test("emits letter page size by default", () => {
const css = printCss();
expect(css).toContain("size: letter");