/**
 * Return the plain-text content of the first `<h1>` element in `html`.
 *
 * The heading's inner HTML is passed through `stripTags`, trimmed, and then
 * through `decodeTextEntities`, so the result is decoded text rather than
 * HTML. Decoding runs after tag stripping, so an escaped `&lt;` in the
 * heading is only turned back into `<` once tags have been removed.
 *
 * @param html Rendered HTML to search.
 * @returns The decoded heading text, or `null` when no `<h1>` is present.
 */
function extractFirstHeading(html: string): string | null {
  // Match the opening tag with optional attributes; `\b` keeps tag names that
  // merely start with "h1" from matching. The body is captured lazily so only
  // the first heading is taken.
  const m = html.match(/<h1\b[^>]*>([\s\S]*?)<\/h1>/i);
  return m ? decodeTextEntities(stripTags(m[1]).trim()) : null;
}
/**
 * Decode HTML entities in plain text extracted from rendered HTML.
 *
 * Distinct from decodeTypographicEntities (which runs on in-pipeline HTML and
 * preserves `&amp;` because it can be legitimate there). This runs on text
 * destined for `<title>`, cover, and TOC entries, where `&amp;` MUST become
 * `&`; otherwise a later HTML-escaping step would emit `&amp;amp;`.
 *
 * Handled references: `&lt;`, `&gt;`, `&quot;`, `&apos;`, `&amp;`, decimal
 * (`&#169;`) and hexadecimal (`&#xA9;`) numeric references.
 *
 * Decoding is done in a single pass, so each reference in the input is decoded
 * exactly once: `&amp;lt;` yields the text `&lt;` (not `<`), and `&#38;amp;`
 * yields `&amp;` (not `&`). Chained per-entity replacements cannot guarantee
 * this, because the output of one pass can form a new entity for a later pass.
 *
 * @param s Plain text that may contain HTML character references.
 * @returns The text with the supported references replaced by their characters.
 *   Numeric references outside the Unicode range (above U+10FFFF) are left
 *   verbatim instead of throwing.
 */
function decodeTextEntities(s: string): string {
  const named = new Map<string, string>([
    ["lt", "<"],
    ["gt", ">"],
    ["quot", '"'],
    ["apos", "'"],
    ["amp", "&"],
  ]);

  return s.replace(
    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(lt|gt|quot|apos|amp));/g,
    (
      whole: string,
      dec: string | undefined,
      hex: string | undefined,
      name: string | undefined,
    ): string => {
      if (name !== undefined) {
        const ch = named.get(name);
        return ch === undefined ? whole : ch;
      }

      const codePoint =
        dec !== undefined ? parseInt(dec, 10) : parseInt(hex === undefined ? "" : hex, 16);

      // String.fromCodePoint throws RangeError for values outside
      // 0..0x10FFFF (including NaN/Infinity from absurdly long digit runs);
      // keep the reference as written rather than aborting the render.
      if (!Number.isInteger(codePoint) || codePoint < 0 || codePoint > 0x10ffff) {
        return whole;
      }
      return String.fromCodePoint(codePoint);
    },
  );
}
+ expect(result.html).not.toMatch(/[^<]*&amp;/); + expect(result.html).not.toMatch(/<title>[^<]*&lt;/); + expect(result.html).not.toMatch(/<title>[^<]*&gt;/); + expect(result.html).not.toMatch(/<title>[^<]*&#\d+;/); + expect(result.html).not.toMatch(/<title>[^<]*&#x[0-9a-fA-F]+;/); + // Cover block also single-escape. + expect(result.html).not.toMatch(/class="cover-title"[^>]*>[^<]*&amp;/); + }); + } + + test('ampersand in <title> renders as exactly one "&"', () => { + const result = render({ markdown: `# Herbert & Garry\n\nBody.` }); + expect(result.html).toContain("<title>Herbert & Garry"); + expect(result.html).not.toContain("&amp;"); + }); + + test("TOC entries have no double-escape when a heading contains '&'", () => { + const result = render({ + markdown: `# Doc\n\n## Herbert & Garry\n\nBody.\n\n## Other\n\nMore.`, + toc: true, + }); + // TOC renders the heading text through escapeHtml; must be single-escaped. + expect(result.html).toContain("Herbert & Garry"); + expect(result.html).not.toContain("&amp;"); + }); + + test('numeric entity in H1 (e.g. "©") decodes cleanly to ', () => { + // Marked passes through numeric entities verbatim in the HTML output, + // so the decoder must handle them. + const result = render({ markdown: `# A © B\n\nBody.` }); + expect(result.meta.title).toBe("A © B"); + expect(result.html).toContain("<title>A © B"); + }); + + test("smartypants converts raw quotes in title BEFORE extraction (contract)", () => { + // We do NOT assert raw `"` survives — smartypants is expected to convert it. + // The contract is: no double-escape of the encoded form. + const result = render({ markdown: `# Say "hi"\n\nBody.` }); + expect(result.html).not.toContain("&quot;"); + expect(result.html).not.toContain("&#39;"); + // And contains exactly one level of escaping. + const titleMatch = result.html.match(/<title>([^<]*)<\/title>/); + expect(titleMatch).toBeTruthy(); + if (titleMatch) { + // Never contains a double-encoded entity. 
+ expect(titleMatch[1]).not.toMatch(/&(amp|lt|gt|quot|#\d+);/); + } + }); +});