mirror of
https://github.com/garrytan/gstack.git
synced 2026-06-17 23:30:09 +02:00
feat(make-pdf): --to html|docx output formats
--to html writes the assembled self-contained document directly (no print round-trip): inline vector diagrams, data-URI images, zero network references, plus an @media screen layer for browser reading. --to docx is the content-fidelity export (eng-review P8): html-to-docx@1.8.0 (exact pin; pure JS, bun-compile-verified) maps headings/tables/code/lists; diagrams and SVG images rasterize at 300dpi of the content-box width via the render tab; diagnostic figures convert to plain p/pre so the converter can't silently drop an error. --format keeps its page-size-alias meaning; --to is the output format, and the CLI says so when confused. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
+13
-1
@@ -64,9 +64,14 @@ function printUsage(): void {
|
||||
lines.push(` ${info.description}`);
|
||||
}
|
||||
lines.push("");
|
||||
lines.push("Output format:");
|
||||
lines.push(" --to pdf|html|docx What to produce (default: pdf).");
|
||||
lines.push(" html = single self-contained file, no network refs.");
|
||||
lines.push(" docx = content fidelity, diagrams as PNG.");
|
||||
lines.push("");
|
||||
lines.push("Page layout:");
|
||||
lines.push(" --margins <dim> All four margins (default: 1in). in, pt, cm, mm.");
|
||||
lines.push(" --page-size letter|a4|legal (aliases: --format)");
|
||||
lines.push(" --page-size letter|a4|legal (aliases: --format — page SIZE, not output format)");
|
||||
lines.push("");
|
||||
lines.push("Document structure:");
|
||||
lines.push(" --cover Add a cover page.");
|
||||
@@ -118,9 +123,16 @@ function generateOptionsFromFlags(parsed: ParsedArgs): GenerateOptions {
|
||||
if (f[`no-${key}`] === true) return false;
|
||||
return def;
|
||||
};
|
||||
const to = typeof f.to === "string" ? f.to.toLowerCase() : "pdf";
|
||||
if (to !== "pdf" && to !== "html" && to !== "docx") {
|
||||
console.error(`$P generate: invalid --to '${f.to}'. Expected pdf, html, or docx.`);
|
||||
console.error("(--format is a --page-size alias, not the output format.)");
|
||||
process.exit(ExitCode.BadArgs);
|
||||
}
|
||||
return {
|
||||
input: p[0],
|
||||
output: p[1],
|
||||
to: to as GenerateOptions["to"],
|
||||
margins: f.margins as string | undefined,
|
||||
marginTop: f["margin-top"] as string | undefined,
|
||||
marginRight: f["margin-right"] as string | undefined,
|
||||
|
||||
@@ -439,6 +439,73 @@ export function renderFenceSlots(
|
||||
return slots;
|
||||
}
|
||||
|
||||
// ─── DOCX rasterization (eng-review D6.5, P8) ─────────────────────────
|
||||
|
||||
/**
 * Decode the payload of an SVG `data:` URI into SVG markup.
 *
 * A data URI is `data:<mediatype>[;base64],<data>`. Only when the header ends
 * in `;base64` is the payload base64; otherwise it is percent-encoded text.
 * Throws `URIError` on a malformed percent-escape — callers treat that as a
 * rasterization failure.
 */
function decodeSvgDataUriPayload(src: string): string {
  const comma = src.indexOf(",");
  const header = comma === -1 ? "" : src.slice(0, comma);
  const payload = src.slice(comma + 1);
  if (/;\s*base64\s*$/i.test(header)) {
    return Buffer.from(payload, "base64").toString("utf8");
  }
  return decodeURIComponent(payload);
}

/**
 * Replace inline diagram SVGs (and svg data-URI images) with PNG <img> tags
 * for the DOCX export — Word's SVG support is unreliable, so the content-
 * fidelity contract embeds rasters at 300dpi of the placed width (the
 * content box). Diagnostic blocks keep their text form: the first pass only
 * matches figures whose class attribute is exactly `diagram`.
 *
 * @param html           Assembled document HTML.
 * @param tab            Render tab; its `__rasterize` call receives the SVG
 *                       markup and the target pixel width.
 * @param contentWidthIn Content-box width in inches; multiplied by PRINT_DPI
 *                       to get the raster width in pixels.
 * @param warn           Sink for non-fatal failures. A figure or image that
 *                       fails to rasterize is left untouched.
 * @returns The HTML with every successfully rasterized SVG replaced.
 */
export function rasterizeDiagramFigures(
  html: string,
  tab: RenderTab,
  contentWidthIn: number,
  warn: (msg: string) => void,
): string {
  const targetPx = Math.round(contentWidthIn * PRINT_DPI);

  // 1. Rendered diagram figures → <img> with the figure's aria-label as alt.
  let out = html.replace(
    /<figure class="diagram"[^>]*>[\s\S]*?<\/figure>/gi,
    (figure) => {
      // Greedy on purpose: spans from the first <svg to the last </svg> so a
      // nested <svg> inside the diagram stays part of the same raster.
      const svgMatch = figure.match(/<svg\b[\s\S]*<\/svg>/i);
      if (!svgMatch) return figure;
      const label = figure.match(/\baria-label\s*=\s*"([^"]*)"/i)?.[1] ?? "diagram";
      try {
        const png = tab.call("__rasterize", svgMatch[0], targetPx);
        return `<p><img src="${png}" alt="${label}"></p>`;
      } catch (err: any) {
        warn(`docx: diagram rasterization failed (${firstLine(err?.message ?? String(err))}); keeping source text`);
        return figure;
      }
    },
  );

  // 2. SVG data-URI images (inlined .svg files) → PNG.
  out = out.replace(/<img\b[^>]*>/gi, (tag) => {
    const srcMatch = tag.match(SRC_RE);
    const src = srcMatch?.[2] ?? srcMatch?.[3] ?? "";
    if (!src.startsWith("data:image/svg+xml")) return tag;
    try {
      // Honour the data-URI encoding instead of assuming base64, so a
      // percent-encoded SVG is rasterized rather than decoded into garbage.
      const svgText = decodeSvgDataUriPayload(src);
      const png = tab.call("__rasterize", svgText, targetPx);
      // Function replacer: the replacement is inserted literally, so no `$`
      // sequence in the raster string can be read as a substitution pattern.
      return tag.replace(SRC_RE, () => `src="${png}"`);
    } catch (err: any) {
      warn(`docx: svg image rasterization failed (${firstLine(err?.message ?? String(err))})`);
      return tag;
    }
  });

  return out;
}
|
||||
|
||||
/**
 * Flatten diagnostic figures into a bold title paragraph plus a <pre> block
 * so the DOCX converter — which drops <figure> elements it can't map — still
 * shows the error. An invisible error is the one thing the diagnostic
 * contract forbids. Pure string transform; no render tab involved.
 *
 * @param html Document HTML that may contain
 *             `<figure class="diagram diagram-error">` blocks.
 * @returns The HTML with each such figure replaced by
 *          `<p><strong>title</strong></p>` and `<pre>detail</pre>`. The title
 *          comes from the figure's <figcaption> (default: "Diagram failed to
 *          render"), the detail from its first <pre> (default: empty).
 */
export function convertDiagnosticsForDocx(html: string): string {
  const diagnosticFigure = /<figure class="diagram diagram-error"[^>]*>([\s\S]*?)<\/figure>/gi;

  // First capture group of `pattern` in `text`, or `fallback` when there is no match.
  const capture = (text: string, pattern: RegExp, fallback: string): string => {
    const found = pattern.exec(text);
    return found?.[1] ?? fallback;
  };

  return html.replace(diagnosticFigure, (_whole: string, inner: string) => {
    const heading = capture(
      inner,
      /<figcaption[^>]*>([\s\S]*?)<\/figcaption>/i,
      "Diagram failed to render",
    );
    const message = capture(inner, /<pre[^>]*>([\s\S]*?)<\/pre>/i, "");
    return `<p><strong>${heading}</strong></p>\n<pre>${message}</pre>`;
  });
}
|
||||
|
||||
// ─── Image inlining (eng-review D1 + D4 + D6.1) ───────────────────────
|
||||
|
||||
const IMG_TAG_RE = /<img\b[^>]*>/gi;
|
||||
|
||||
@@ -27,8 +27,10 @@ import * as browseClient from "./browseClient";
|
||||
import {
|
||||
RenderTab,
|
||||
contentWidthInches,
|
||||
convertDiagnosticsForDocx,
|
||||
extractDiagramFences,
|
||||
inlineLocalImages,
|
||||
rasterizeDiagramFigures,
|
||||
renderFenceSlots,
|
||||
substituteSlots,
|
||||
} from "./diagram-prepass";
|
||||
@@ -80,8 +82,9 @@ export async function generate(opts: GenerateOptions): Promise<string> {
|
||||
throw new Error(`input file not found: ${input}`);
|
||||
}
|
||||
|
||||
const to = opts.to ?? "pdf";
|
||||
const outputPath = path.resolve(
|
||||
opts.output ?? path.join(os.tmpdir(), `${deriveSlug(input)}.pdf`),
|
||||
opts.output ?? path.join(os.tmpdir(), `${deriveSlug(input)}.${to}`),
|
||||
);
|
||||
|
||||
// Stage 1: read markdown
|
||||
@@ -170,10 +173,56 @@ export async function generate(opts: GenerateOptions): Promise<string> {
|
||||
const policy = applyImagePolicy(finalHtml, { contentWidthIn, warn });
|
||||
finalHtml = policy.html;
|
||||
hasLandscape = policy.hasLandscape;
|
||||
|
||||
// DOCX needs rasters, not inline SVG (Word's SVG support is unreliable) —
|
||||
// do it while the render tab is still open.
|
||||
if (to === "docx") {
|
||||
const needsRaster = /<figure class="diagram"|data:image\/svg\+xml/.test(finalHtml);
|
||||
if (needsRaster) {
|
||||
progress.begin("Rasterizing diagrams for DOCX");
|
||||
const tab = getRenderTab();
|
||||
if (tab) {
|
||||
finalHtml = rasterizeDiagramFigures(finalHtml, tab, contentWidthIn, warn);
|
||||
} else {
|
||||
warn("docx: no render tab — diagrams keep their source text form");
|
||||
}
|
||||
progress.end("Rasterizing diagrams for DOCX");
|
||||
}
|
||||
finalHtml = convertDiagnosticsForDocx(finalHtml);
|
||||
}
|
||||
} finally {
|
||||
renderTab?.close();
|
||||
}
|
||||
|
||||
// ─── --to html: write the self-contained document, no print round-trip ──
|
||||
if (to === "html") {
|
||||
const { screenCss } = await import("./print-css");
|
||||
const withScreenLayer = finalHtml.replace(
|
||||
"</style>",
|
||||
`</style>\n<style>\n${screenCss()}\n</style>`,
|
||||
);
|
||||
fs.writeFileSync(outputPath, withScreenLayer, "utf8");
|
||||
const kb = Math.round(fs.statSync(outputPath).size / 1024);
|
||||
progress.done(`${rendered.meta.wordCount} words · ${kb}KB · ${outputPath}`);
|
||||
return outputPath;
|
||||
}
|
||||
|
||||
// ─── --to docx: content-fidelity conversion (eng-review P8) ────────────
|
||||
if (to === "docx") {
|
||||
progress.begin("Converting to DOCX");
|
||||
const { default: HTMLtoDOCX } = await import("html-to-docx");
|
||||
const buf = await HTMLtoDOCX(finalHtml, null, {
|
||||
title: rendered.meta.title,
|
||||
creator: rendered.meta.author || undefined,
|
||||
});
|
||||
const bytes: Uint8Array = buf instanceof Uint8Array ? buf : new Uint8Array(await (buf as Blob).arrayBuffer());
|
||||
fs.writeFileSync(outputPath, bytes);
|
||||
progress.end("Converting to DOCX");
|
||||
const kb = Math.round(fs.statSync(outputPath).size / 1024);
|
||||
progress.done(`${rendered.meta.wordCount} words · ${kb}KB · ${outputPath} (content fidelity — layout is Word's)`);
|
||||
return outputPath;
|
||||
}
|
||||
|
||||
// Stage 3: write HTML to a tmp file browse can read
|
||||
// (We don't actually write it; we pass inline via --from-file JSON.)
|
||||
// But for preview mode and debugging, we still write to tmp.
|
||||
|
||||
@@ -137,6 +137,23 @@ function pageRules(size: string, margin: string, opts: PrintCssOptions): string
|
||||
].filter(line => line !== "").join("\n");
|
||||
}
|
||||
|
||||
/**
 * Browser-reading layer appended to `--to html` exports. Print CSS remains
 * the source of truth; these rules only make the same document comfortable
 * on screen (centered measure, padding, no print-only chapter breaks forcing
 * scroll gaps). Everything sits inside `@media screen`, so print output is
 * unaffected.
 *
 * @returns A single `@media screen { … }` block, one rule per line.
 */
export function screenCss(): string {
  const screenRules = [
    "body { max-width: 52em; margin: 0 auto; padding: 2.5em 1.5em; }",
    ".chapter { break-before: auto; }",
    ".watermark { display: none; }",
    "figure.diagram { overflow-x: auto; }",
  ];
  const indented = screenRules.map((rule) => ` ${rule}`).join("\n");
  return `@media screen {\n${indented}\n}`;
}
|
||||
|
||||
function rootTypography(): string {
|
||||
return [
|
||||
`html { lang: en; }`,
|
||||
|
||||
@@ -11,9 +11,17 @@ export type FontMode = "sans"; // v1: Helvetica only. Future: "serif" | "custom"
|
||||
* Options for `$P generate` — the public CLI contract.
|
||||
* Matches the flag set documented in the CEO plan.
|
||||
*/
|
||||
// Output formats selectable with `--to`; this is the type of `GenerateOptions.to`.
// Distinct from `--format`, which is a `--page-size` alias.
// NOTE(review): the JSDoc block directly above this line describes
// GenerateOptions, not this alias — consider moving this declaration above
// that comment so the doc attaches to the interface again.
export type OutputFormat = "pdf" | "html" | "docx";
|
||||
|
||||
export interface GenerateOptions {
|
||||
input: string; // markdown input path
|
||||
output?: string; // PDF output path (default: /tmp/<slug>.pdf)
|
||||
output?: string; // output path (default: /tmp/<slug>.<ext>)
|
||||
|
||||
// Output format (NOT --format, which is a --page-size alias):
|
||||
// pdf — print-quality PDF via Chromium (default)
|
||||
// html — single self-contained file, zero network references
|
||||
// docx — content-fidelity Word document (diagrams embedded as PNG)
|
||||
to?: OutputFormat;
|
||||
|
||||
// Page layout
|
||||
margins?: string; // "1in" | "72pt" | "25mm" | "2.54cm"
|
||||
|
||||
Reference in New Issue
Block a user