fix(make-pdf): decode HTML entities in titles and TOC to prevent double-escape

A markdown title like "# Herbert & Garry" rendered as "Herbert &amp;amp; Garry"
in <title>, cover block, and TOC entries. marked emits "&amp;" (correct HTML),
but extractFirstHeading and extractHeadings only call stripTags, leaving the
entity intact. That string then flows through escapeHtml, producing the
double-encode.

- render.ts: new decodeTextEntities helper, distinct from decodeTypographicEntities
  (which runs on in-pipeline HTML and intentionally preserves &amp;). Covers
  named entities (lt/gt/quot/apos/39/x27/amp) AND numeric (decimal + hex) so
  inputs like "&#169;" or "&#x2014;" don't create the same partial-fix bug.
  Amp-last ordering prevents double-decode on "&amp;lt;" et al.
- Apply in both extractFirstHeading and extractHeadings. extractHeadings feeds
  buildTocBlock → escapeHtml, so the TOC site had the same bug.
- render.test.ts: 9 tests covering the contract — 5 parameterized across &, <, >,
  ©, — chars; single-escape in <title>/cover; TOC double-escape check; numeric
  entity decode; smartypants-interacts-with-quotes contract (no raw equality).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-04-20 21:41:32 +08:00
parent cf875d1e41
commit 70b59ec91b
2 changed files with 103 additions and 2 deletions
+27 -2
View File
@@ -288,7 +288,7 @@ function extractHeadings(html: string): Array<{ level: number; text: string }> {
let match;
while ((match = re.exec(html)) !== null) {
const level = parseInt(match[1].slice(1), 10);
const text = stripTags(match[2]).trim();
const text = decodeTextEntities(stripTags(match[2]).trim());
if (text) headings.push({ level, text });
}
return headings;
@@ -324,7 +324,32 @@ function wrapChaptersByH1(html: string): string {
/**
 * Return the text of the first <h1> element found in `html`, or null when the
 * input contains no <h1>. The heading's inner markup is passed through
 * stripTags and then trimmed.
 */
function extractFirstHeading(html: string): string | null {
// Case-insensitive, non-greedy match; [\s\S] lets the heading span newlines.
const m = html.match(/<h1\b[^>]*>([\s\S]*?)<\/h1>/i);
// NOTE(review): the two return lines below look like the removed and added
// sides of a diff hunk whose +/- markers were stripped. As literally written
// the first return wins and the decodeTextEntities variant is unreachable —
// confirm that only the second line exists in the real file.
return m ? stripTags(m[1]).trim() : null;
return m ? decodeTextEntities(stripTags(m[1]).trim()) : null;
}
/** Named entities understood by decodeTextEntities, keyed without `&` and `;`. */
const TEXT_ENTITY_CHARS: Readonly<Record<string, string>> = {
  lt: "<",
  gt: ">",
  quot: '"',
  apos: "'",
  amp: "&",
};

/**
 * Decode HTML entities in plain text extracted from rendered HTML. Distinct
 * from decodeTypographicEntities (which runs on in-pipeline HTML and preserves
 * &amp; because &amp;amp; can be legitimate there). This runs on text destined
 * for <title>, cover, and TOC entries where &amp; MUST become & or escapeHtml
 * produces &amp;amp;.
 *
 * Supported forms: the named entities lt, gt, quot, apos and amp, plus decimal
 * (&#169;) and hexadecimal (&#x2014; / &#X2014;) numeric references. Anything
 * else (e.g. &nbsp;) is left untouched.
 *
 * All forms are handled by ONE regex in a single left-to-right pass, so each
 * entity is decoded exactly once and the output of one replacement is never
 * re-scanned. Chaining separate passes cannot guarantee that: with a numeric
 * pass followed by an amp pass, "&#38;amp;" would first become "&amp;" and
 * then "&" — one level too many. Here "&amp;lt;" yields "&lt;", "&amp;#169;"
 * yields "&#169;", and "&#38;amp;" yields "&amp;".
 *
 * Numeric references that do not name a usable character (zero, a surrogate,
 * or a value above U+10FFFF) decode to U+FFFD, mirroring what HTML parsers
 * do, instead of letting String.fromCodePoint throw a RangeError.
 *
 * @param s - Plain text (tags already stripped) that may contain entities.
 * @returns The text with every supported entity replaced by its character.
 */
function decodeTextEntities(s: string): string {
  return s.replace(
    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(lt|gt|quot|apos|amp));/g,
    (
      match: string,
      dec: string | undefined,
      hex: string | undefined,
      named: string | undefined,
    ): string => {
      if (named !== undefined) {
        // The alternation only admits keys present in the table; fall back to
        // the raw match purely to keep the lookup total for the type checker.
        return TEXT_ENTITY_CHARS[named] ?? match;
      }

      // Exactly one of dec/hex is set when the named group did not match.
      // Overlong digit runs parse to a huge number or Infinity, which the
      // range check below rejects.
      const codePoint = dec !== undefined ? parseInt(dec, 10) : parseInt(hex ?? "", 16);
      const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
      const isUsable = codePoint > 0 && codePoint <= 0x10ffff && !isSurrogate;
      return isUsable ? String.fromCodePoint(codePoint) : "\uFFFD";
    },
  );
}
function stripTags(html: string): string {
+76
View File
@@ -354,3 +354,79 @@ describe("render() — pageNumbers data flow", () => {
expect(result.printCss).toMatch(/@bottom-center\s*\{\s*content:\s*counter\(page\)/);
});
});
// ─── render(): entity handling for <title>, cover block and TOC ─────────────
describe("render() — no double HTML entity escaping", () => {
  type Case = { char: string; inTitle: string; expectedTitleMeta: string };

  // Characters expected to survive the pipeline untouched. Straight quotes are
  // deliberately absent: smartypants rewrites them to curly quotes before the
  // heading is extracted, so they are covered by a dedicated test further down.
  const passthroughChars = ["&", "<", ">", "©", "—"];
  const cases: Case[] = passthroughChars.map((char) => ({
    char,
    inTitle: `A ${char} B`,
    expectedTitleMeta: `A ${char} B`,
  }));

  // Double-encoded forms that must never appear inside <title>…</title>.
  const doubleEscapedInTitle: RegExp[] = [
    /<title>[^<]*&amp;amp;/,
    /<title>[^<]*&amp;lt;/,
    /<title>[^<]*&amp;gt;/,
    /<title>[^<]*&amp;#\d+;/,
    /<title>[^<]*&amp;#x[0-9a-fA-F]+;/,
  ];

  cases.forEach(({ char, inTitle, expectedTitleMeta }) => {
    test(`"${char}" in H1 has no double-escape in <title> or cover`, () => {
      const rendered = render({
        markdown: `# ${inTitle}\n\nBody.`,
        cover: true,
        author: "A",
      });

      // The metadata title is the decoded, plain-text heading.
      expect(rendered.meta.title).toBe(expectedTitleMeta);

      // The <title> element carries a single level of escaping only.
      for (const pattern of doubleEscapedInTitle) {
        expect(rendered.html).not.toMatch(pattern);
      }

      // Same requirement for the cover title.
      expect(rendered.html).not.toMatch(/class="cover-title"[^>]*>[^<]*&amp;amp;/);
    });
  });

  test('ampersand in <title> renders as exactly one "&amp;"', () => {
    const { html } = render({ markdown: `# Herbert & Garry\n\nBody.` });

    expect(html).toContain("<title>Herbert &amp; Garry</title>");
    expect(html).not.toContain("&amp;amp;");
  });

  test("TOC entries have no double-escape when a heading contains '&'", () => {
    const { html } = render({
      markdown: `# Doc\n\n## Herbert & Garry\n\nBody.\n\n## Other\n\nMore.`,
      toc: true,
    });

    // Heading text reaches the TOC via escapeHtml, so one "&amp;" is expected
    // and a second level of encoding is a regression.
    expect(html).toContain("Herbert &amp; Garry");
    expect(html).not.toContain("&amp;amp;");
  });

  test('numeric entity in H1 (e.g. "&#169;") decodes cleanly to <title>', () => {
    // marked leaves numeric references as-is in its HTML output, which is why
    // the heading decoder has to understand them.
    const rendered = render({ markdown: `# A &#169; B\n\nBody.` });

    expect(rendered.meta.title).toBe("A © B");
    expect(rendered.html).toContain("<title>A © B</title>");
  });

  test("smartypants converts raw quotes in title BEFORE extraction (contract)", () => {
    // No assertion that a raw `"` survives: smartypants is allowed to replace
    // it. What is pinned here is that the encoded form is never re-encoded.
    const { html } = render({ markdown: `# Say "hi"\n\nBody.` });

    expect(html).not.toContain("&amp;quot;");
    expect(html).not.toContain("&amp;#39;");

    // Inspect the <title> text itself for any doubly-encoded entity.
    const titleMatch = html.match(/<title>([^<]*)<\/title>/);
    expect(titleMatch).toBeTruthy();
    if (!titleMatch) return;
    expect(titleMatch[1]).not.toMatch(/&amp;(amp|lt|gt|quot|#\d+);/);
  });
});