feat(make-pdf): --to html|docx output formats

--to html writes the assembled self-contained document directly (no print
round-trip): inline vector diagrams, data-URI images, zero network
references, plus an @media screen layer for browser reading. --to docx is
the content-fidelity export (eng-review P8): html-to-docx@1.8.0 (exact pin;
pure JS, bun-compile-verified) maps headings/tables/code/lists; diagrams and
SVG images rasterize at 300dpi of the content-box width via the render tab;
diagnostic figures convert to plain p/pre so the converter can't silently
drop an error. --format keeps its page-size-alias meaning; --to selects the
output format, and the CLI says so when --to is given an invalid value.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-06-12 00:12:04 -07:00
parent a2c1eae16e
commit aec7e2b72b
7 changed files with 308 additions and 5 deletions
+13 -1
View File
@@ -64,9 +64,14 @@ function printUsage(): void {
lines.push(` ${info.description}`);
}
lines.push("");
lines.push("Output format:");
lines.push(" --to pdf|html|docx What to produce (default: pdf).");
lines.push(" html = single self-contained file, no network refs.");
lines.push(" docx = content fidelity, diagrams as PNG.");
lines.push("");
lines.push("Page layout:");
lines.push(" --margins <dim> All four margins (default: 1in). in, pt, cm, mm.");
lines.push(" --page-size letter|a4|legal (aliases: --format)");
lines.push(" --page-size letter|a4|legal (aliases: --format — page SIZE, not output format)");
lines.push("");
lines.push("Document structure:");
lines.push(" --cover Add a cover page.");
@@ -118,9 +123,16 @@ function generateOptionsFromFlags(parsed: ParsedArgs): GenerateOptions {
if (f[`no-${key}`] === true) return false;
return def;
};
const to = typeof f.to === "string" ? f.to.toLowerCase() : "pdf";
if (to !== "pdf" && to !== "html" && to !== "docx") {
console.error(`$P generate: invalid --to '${f.to}'. Expected pdf, html, or docx.`);
console.error("(--format is a --page-size alias, not the output format.)");
process.exit(ExitCode.BadArgs);
}
return {
input: p[0],
output: p[1],
to: to as GenerateOptions["to"],
margins: f.margins as string | undefined,
marginTop: f["margin-top"] as string | undefined,
marginRight: f["margin-right"] as string | undefined,
+67
View File
@@ -439,6 +439,73 @@ export function renderFenceSlots(
return slots;
}
// ─── DOCX rasterization (eng-review D6.5, P8) ─────────────────────────
/**
 * Replace inline diagram SVGs (and SVG data-URI images) with PNG `<img>` tags
 * for the DOCX export. Word's SVG support is unreliable, so the content-
 * fidelity contract embeds rasters sized to the placed width (the content
 * box) at print resolution (300dpi).
 *
 * Only figures whose class attribute is exactly `diagram` are touched;
 * diagnostic figures carry extra classes, do not match, and keep their text
 * form. Any rasterization failure is reported through `warn` and the original
 * markup is left in place, so a failure never removes content.
 *
 * @param html           Assembled document HTML.
 * @param tab            Render tab used to rasterize. Assumes
 *                       `tab.call("__rasterize", svg, px)` synchronously
 *                       returns a value usable as an `<img src>` — TODO confirm.
 * @param contentWidthIn Content-box width in inches.
 * @param warn           Sink for non-fatal rasterization warnings.
 * @returns The HTML with rasterizable SVG content replaced by PNG images.
 */
export function rasterizeDiagramFigures(
  html: string,
  tab: RenderTab,
  contentWidthIn: number,
  warn: (msg: string) => void,
): string {
  const targetPx = Math.round(contentWidthIn * PRINT_DPI);

  // Thrown values are not guaranteed to be Error instances; prefer a string
  // `message` when one exists, otherwise stringify the value itself.
  const reason = (err: unknown): string => {
    const message = (err as { message?: unknown } | null | undefined)?.message;
    return firstLine(typeof message === "string" ? message : String(err));
  };

  // 1. Rendered diagram figures → <img> with the figure's aria-label as alt.
  let out = html.replace(
    /<figure class="diagram"[^>]*>[\s\S]*?<\/figure>/gi,
    (figure) => {
      // Greedy on purpose: spans from the first <svg to the last </svg>
      // inside this one figure, so nested <svg> elements stay intact.
      const svgMatch = figure.match(/<svg\b[\s\S]*<\/svg>/i);
      if (!svgMatch) return figure;
      const label = figure.match(/\baria-label\s*=\s*"([^"]*)"/i)?.[1] ?? "diagram";
      try {
        const png = tab.call("__rasterize", svgMatch[0], targetPx);
        return `<p><img src="${png}" alt="${label}"></p>`;
      } catch (err: unknown) {
        warn(`docx: diagram rasterization failed (${reason(err)}); keeping source text`);
        return figure;
      }
    },
  );

  // 2. SVG data-URI images (inlined .svg files) → PNG.
  out = out.replace(/<img\b[^>]*>/gi, (tag) => {
    const srcMatch = tag.match(SRC_RE);
    const src = srcMatch?.[2] ?? srcMatch?.[3] ?? "";
    if (!src.startsWith("data:image/svg+xml")) return tag;
    try {
      const comma = src.indexOf(",");
      if (comma < 0) throw new Error("malformed data URI: no payload");
      const header = src.slice(0, comma);
      const payload = src.slice(comma + 1);
      // Data URIs are base64 only when the header says so; otherwise the
      // payload is percent-encoded text and must not be base64-decoded.
      const svgText = /;base64$/i.test(header)
        ? Buffer.from(payload, "base64").toString("utf8")
        : decodeURIComponent(payload);
      const png = tab.call("__rasterize", svgText, targetPx);
      // Function replacer: the PNG value is inserted verbatim, with no
      // `$`-pattern expansion of the replacement text.
      return tag.replace(SRC_RE, () => `src="${png}"`);
    } catch (err: unknown) {
      warn(`docx: svg image rasterization failed (${reason(err)})`);
      return tag;
    }
  });
  return out;
}
/**
 * Flatten diagnostic figures into a bold title paragraph followed by a
 * `<pre>` block, so the DOCX converter — which drops `<figure>` elements it
 * can't map — cannot make an error invisible. The title comes from the
 * figure's `<figcaption>` (with a generic fallback) and the detail from its
 * first `<pre>` (empty when absent). Pure string transform; no render tab.
 *
 * @param html Document HTML that may contain diagnostic figures.
 * @returns The HTML with every diagnostic figure replaced by `<p>` + `<pre>`.
 */
export function convertDiagnosticsForDocx(html: string): string {
  // First capture group of `pattern` in `text`, or undefined when no match.
  const captureFirst = (text: string, pattern: RegExp): string | undefined => {
    const hit = pattern.exec(text);
    return hit === null ? undefined : hit[1];
  };

  const flatten = (_figure: string, inner: string): string => {
    const heading =
      captureFirst(inner, /<figcaption[^>]*>([\s\S]*?)<\/figcaption>/i) ?? "Diagram failed to render";
    const trace = captureFirst(inner, /<pre[^>]*>([\s\S]*?)<\/pre>/i) ?? "";
    return ["<p><strong>", heading, "</strong></p>\n<pre>", trace, "</pre>"].join("");
  };

  return html.replace(
    /<figure class="diagram diagram-error"[^>]*>([\s\S]*?)<\/figure>/gi,
    flatten,
  );
}
// ─── Image inlining (eng-review D1 + D4 + D6.1) ───────────────────────
const IMG_TAG_RE = /<img\b[^>]*>/gi;
+50 -1
View File
@@ -27,8 +27,10 @@ import * as browseClient from "./browseClient";
import {
RenderTab,
contentWidthInches,
convertDiagnosticsForDocx,
extractDiagramFences,
inlineLocalImages,
rasterizeDiagramFigures,
renderFenceSlots,
substituteSlots,
} from "./diagram-prepass";
@@ -80,8 +82,9 @@ export async function generate(opts: GenerateOptions): Promise<string> {
throw new Error(`input file not found: ${input}`);
}
const to = opts.to ?? "pdf";
const outputPath = path.resolve(
opts.output ?? path.join(os.tmpdir(), `${deriveSlug(input)}.pdf`),
opts.output ?? path.join(os.tmpdir(), `${deriveSlug(input)}.${to}`),
);
// Stage 1: read markdown
@@ -170,10 +173,56 @@ export async function generate(opts: GenerateOptions): Promise<string> {
const policy = applyImagePolicy(finalHtml, { contentWidthIn, warn });
finalHtml = policy.html;
hasLandscape = policy.hasLandscape;
// DOCX needs rasters, not inline SVG (Word's SVG support is unreliable) —
// do it while the render tab is still open.
if (to === "docx") {
const needsRaster = /<figure class="diagram"|data:image\/svg\+xml/.test(finalHtml);
if (needsRaster) {
progress.begin("Rasterizing diagrams for DOCX");
const tab = getRenderTab();
if (tab) {
finalHtml = rasterizeDiagramFigures(finalHtml, tab, contentWidthIn, warn);
} else {
warn("docx: no render tab — diagrams keep their source text form");
}
progress.end("Rasterizing diagrams for DOCX");
}
finalHtml = convertDiagnosticsForDocx(finalHtml);
}
} finally {
renderTab?.close();
}
// ─── --to html: write the self-contained document, no print round-trip ──
if (to === "html") {
const { screenCss } = await import("./print-css");
const withScreenLayer = finalHtml.replace(
"</style>",
`</style>\n<style>\n${screenCss()}\n</style>`,
);
fs.writeFileSync(outputPath, withScreenLayer, "utf8");
const kb = Math.round(fs.statSync(outputPath).size / 1024);
progress.done(`${rendered.meta.wordCount} words · ${kb}KB · ${outputPath}`);
return outputPath;
}
// ─── --to docx: content-fidelity conversion (eng-review P8) ────────────
if (to === "docx") {
progress.begin("Converting to DOCX");
const { default: HTMLtoDOCX } = await import("html-to-docx");
const buf = await HTMLtoDOCX(finalHtml, null, {
title: rendered.meta.title,
creator: rendered.meta.author || undefined,
});
const bytes: Uint8Array = buf instanceof Uint8Array ? buf : new Uint8Array(await (buf as Blob).arrayBuffer());
fs.writeFileSync(outputPath, bytes);
progress.end("Converting to DOCX");
const kb = Math.round(fs.statSync(outputPath).size / 1024);
progress.done(`${rendered.meta.wordCount} words · ${kb}KB · ${outputPath} (content fidelity — layout is Word's)`);
return outputPath;
}
// Stage 3: write HTML to a tmp file browse can read
// (We don't actually write it; we pass inline via --from-file JSON.)
// But for preview mode and debugging, we still write to tmp.
+17
View File
@@ -137,6 +137,23 @@ function pageRules(size: string, margin: string, opts: PrintCssOptions): string
].filter(line => line !== "").join("\n");
}
/**
 * Build the `@media screen` layer appended to `--to html` exports. The print
 * CSS remains the source of truth; these rules only make the same document
 * comfortable to read in a browser (centered measure with padding, no forced
 * chapter breaks, watermark hidden, wide diagrams scroll horizontally).
 * Everything is scoped to `@media screen`, so print output is unaffected.
 *
 * @returns The stylesheet text, one rule per line, without `<style>` tags.
 */
export function screenCss(): string {
  const rules = [
    "body { max-width: 52em; margin: 0 auto; padding: 2.5em 1.5em; }",
    ".chapter { break-before: auto; }",
    ".watermark { display: none; }",
    "figure.diagram { overflow-x: auto; }",
  ];
  const indented = rules.map((rule) => ` ${rule}`);
  return ["@media screen {", ...indented, "}"].join("\n");
}
function rootTypography(): string {
return [
`html { lang: en; }`,
+9 -1
View File
@@ -11,9 +11,17 @@ export type FontMode = "sans"; // v1: Helvetica only. Future: "serif" | "custom"
* Options for `$P generate` — the public CLI contract.
* Matches the flag set documented in the CEO plan.
*/
// Output formats selectable with --to. Distinct from --format, which is a --page-size alias.
export type OutputFormat = "pdf" | "html" | "docx";
export interface GenerateOptions {
input: string; // markdown input path
output?: string; // PDF output path (default: /tmp/<slug>.pdf)
output?: string; // output path (default: /tmp/<slug>.<ext>)
// Output format (NOT --format, which is a --page-size alias):
// pdf — print-quality PDF via Chromium (default)
// html — single self-contained file, zero network references
// docx — content-fidelity Word document (diagrams embedded as PNG)
to?: OutputFormat;
// Page layout
margins?: string; // "1in" | "72pt" | "25mm" | "2.54cm"