diff --git a/make-pdf/src/render.ts b/make-pdf/src/render.ts
index eafe6fc8..ae5228f4 100644
--- a/make-pdf/src/render.ts
+++ b/make-pdf/src/render.ts
@@ -288,7 +288,7 @@ function extractHeadings(html: string): Array<{ level: number; text: string }> {
let match;
while ((match = re.exec(html)) !== null) {
const level = parseInt(match[1].slice(1), 10);
- const text = stripTags(match[2]).trim();
+ const text = decodeTextEntities(stripTags(match[2]).trim());
if (text) headings.push({ level, text });
}
return headings;
@@ -324,7 +324,32 @@ function wrapChaptersByH1(html: string): string {
function extractFirstHeading(html: string): string | null {
  const m = html.match(/<h1[^>]*>([\s\S]*?)<\/h1>/i);
-  return m ? stripTags(m[1]).trim() : null;
+  // Decode entities after stripping tags so callers receive plain text
+  // ("&", not "&amp;") and a later escapeHtml pass escapes it exactly once.
+  return m ? decodeTextEntities(stripTags(m[1]).trim()) : null;
+}
+
+/**
+ * Decode HTML entities in plain text extracted from rendered HTML. Distinct
+ * from decodeTypographicEntities (which runs on in-pipeline HTML and preserves
+ * &amp; because &amp; can be legitimate there). This runs on text destined
+ * for <title>, cover, and TOC entries, where &amp; MUST become & or the later
+ * escapeHtml pass produces &amp;amp;.
+ *
+ * Decoding is a single left-to-right pass, so every entity is decoded exactly
+ * once: "&amp;lt;" yields the text "&lt;" and "&#38;amp;" yields "&amp;",
+ * never "<" or "&". Numeric references that are not valid Unicode scalar
+ * values (zero, surrogates, above U+10FFFF) are left untouched instead of
+ * letting String.fromCodePoint throw a RangeError.
+ */
+function decodeTextEntities(s: string): string {
+  const named = { lt: "<", gt: ">", quot: '"', apos: "'", amp: "&" } as const;
+  // One alternation, one pass: text produced by a replacement is never rescanned.
+  return s.replace(/&(?:(lt|gt|quot|apos|amp)|#(\d+)|#[xX]([0-9a-fA-F]+));/g, (entity, name, dec, hex) => {
+    if (name) return named[name as keyof typeof named];
+    const code = dec ? parseInt(dec, 10) : parseInt(hex, 16);
+    // Very long digit runs parse to huge values or Infinity; the range check rejects them.
+    const ok = code > 0 && code <= 0x10ffff && (code < 0xd800 || code > 0xdfff);
+    return ok ? String.fromCodePoint(code) : entity;
+  });
}
function stripTags(html: string): string {
diff --git a/make-pdf/test/render.test.ts b/make-pdf/test/render.test.ts
index 0e64f871..89a92537 100644
--- a/make-pdf/test/render.test.ts
+++ b/make-pdf/test/render.test.ts
@@ -354,3 +354,79 @@ describe("render() — pageNumbers data flow", () => {
expect(result.printCss).toMatch(/@bottom-center\s*\{\s*content:\s*counter\(page\)/);
});
});
+
+// ─── render() — HTML entity handling in titles, cover, TOC ───────────
+
+describe("render() — no double HTML entity escaping", () => {
+  type Case = { char: string; inTitle: string; expectedTitleMeta: string };
+
+  // Only characters that should flow through unchanged. `"` and `'` are
+  // omitted from this set because smartypants converts them to curly quotes
+  // before heading extraction — asserted separately below.
+  const cases: Case[] = [
+    { char: "&", inTitle: "A & B", expectedTitleMeta: "A & B" },
+    { char: "<", inTitle: "A < B", expectedTitleMeta: "A < B" },
+    { char: ">", inTitle: "A > B", expectedTitleMeta: "A > B" },
+    { char: "©", inTitle: "A © B", expectedTitleMeta: "A © B" },
+    { char: "—", inTitle: "A — B", expectedTitleMeta: "A — B" },
+  ];
+
+  for (const { char, inTitle, expectedTitleMeta } of cases) {
+    test(`"${char}" in H1 has no double-escape in <title> or cover`, () => {
+      const result = render({
+        markdown: `# ${inTitle}\n\nBody.`,
+        cover: true,
+        author: "A",
+      });
+      // Meta: decoded plain text.
+      expect(result.meta.title).toBe(expectedTitleMeta);
+      // HTML: <title>...</title> never contains double-escape patterns,
+      // i.e. an escaped ampersand followed by the rest of an entity.
+      expect(result.html).not.toMatch(/<title>[^<]*&amp;amp;/);
+      expect(result.html).not.toMatch(/<title>[^<]*&amp;lt;/);
+      expect(result.html).not.toMatch(/<title>[^<]*&amp;gt;/);
+      expect(result.html).not.toMatch(/<title>[^<]*&amp;#\d+;/);
+      expect(result.html).not.toMatch(/<title>[^<]*&amp;#x[0-9a-fA-F]+;/);
+      // Cover block also single-escape.
+      expect(result.html).not.toMatch(/class="cover-title"[^>]*>[^<]*&amp;amp;/);
+    });
+  }
+
+  test('ampersand in <title> renders as exactly one "&amp;"', () => {
+    const result = render({ markdown: `# Herbert & Garry\n\nBody.` });
+    expect(result.html).toContain("<title>Herbert &amp; Garry</title>");
+    expect(result.html).not.toContain("&amp;amp;");
+  });
+
+  test("TOC entries have no double-escape when a heading contains '&'", () => {
+    const result = render({
+      markdown: `# Doc\n\n## Herbert & Garry\n\nBody.\n\n## Other\n\nMore.`,
+      toc: true,
+    });
+    // TOC renders the heading text through escapeHtml; must be single-escaped.
+    expect(result.html).toContain("Herbert &amp; Garry");
+    expect(result.html).not.toContain("&amp;amp;");
+  });
+
+  test('numeric entity in H1 (e.g. "&#169;") decodes cleanly to <title>', () => {
+    // Marked passes through numeric entities verbatim in the HTML output,
+    // so the decoder must handle them.
+    const result = render({ markdown: `# A &#169; B\n\nBody.` });
+    expect(result.meta.title).toBe("A © B");
+    expect(result.html).toContain("<title>A © B</title>");
+  });
+
+  test("smartypants converts raw quotes in title BEFORE extraction (contract)", () => {
+    // We do NOT assert raw `"` survives — smartypants is expected to convert it.
+    // The contract is: no double-escape of the encoded form.
+    const result = render({ markdown: `# Say "hi"\n\nBody.` });
+    expect(result.html).not.toContain("&amp;quot;");
+    expect(result.html).not.toContain("&amp;#39;");
+    // And contains exactly one level of escaping.
+    const titleMatch = result.html.match(/<title>([^<]*)<\/title>/);
+    expect(titleMatch).toBeTruthy();
+    if (titleMatch) {
+      // Never contains a double-encoded entity.
+      expect(titleMatch[1]).not.toMatch(/&amp;(amp|lt|gt|quot|#\d+);/);
+    }
+  });
+});