refactor(epub): replace Kroki HTTP dependency with local mmdc rendering (#10) (#52)

* refactor(epub): replace Kroki HTTP dependency with local mmdc rendering (#10)

Remove httpx/tenacity dependencies and Kroki.io API calls from the EPUB
build pipeline. MermaidRenderer now invokes mmdc (mermaid-cli) as a local
subprocess, eliminating intermittent CI failures caused by network
unavailability. CI workflow installs @mermaid-js/mermaid-cli before build.

* fix(epub): add subprocess timeout and fix deduplication log count

- Add 60s timeout to mmdc subprocess.run to prevent hanging builds
  when Chromium/Puppeteer stalls in headless CI environments
- Fix misleading log that printed unique/total as if it were a success
  ratio (duplicate diagrams made "N/M" look like failed renders);
  now logs "N unique diagrams (M total blocks)" explicitly
- Add test_render_all_timeout and strengthen deduplication assertion
This commit is contained in:
Luong NGUYEN
2026-04-06 23:44:59 +02:00
committed by GitHub
parent 6cbaa4dd52
commit e76bbe40f2
3 changed files with 205 additions and 129 deletions
+8
View File
@@ -17,6 +17,14 @@ jobs:
- name: Install uv
uses: astral-sh/setup-uv@v4
- name: Install Node.js
uses: actions/setup-node@v4
with:
node-version: '20'
- name: Install Mermaid CLI
run: npm install -g @mermaid-js/mermaid-cli
- name: Build EPUB
run: uv run scripts/build_epub.py
+85 -118
View File
@@ -1,6 +1,6 @@
#!/usr/bin/env -S uv run --script
# /// script
# dependencies = ["ebooklib", "markdown", "beautifulsoup4", "httpx", "pillow", "tenacity"]
# dependencies = ["ebooklib", "markdown", "beautifulsoup4", "pillow"]
# ///
"""
Build an EPUB from the Claude How-To markdown files.
@@ -17,8 +17,7 @@ Usage:
--root, -r Root directory containing markdown files (default: repo root)
--output, -o Output EPUB file path (default: <root>/claude-howto-guide.epub)
--verbose, -v Enable verbose logging
--timeout Timeout for API requests in seconds (default: 30)
--max-concurrent Maximum concurrent API requests (default: 10)
--mmdc-path Path to mmdc binary (default: mmdc from PATH)
The script uses inline script dependencies (PEP 723), so uv will
automatically install required packages in an isolated environment.
@@ -28,7 +27,7 @@ Output:
Features:
- Organizes chapters by folder structure (01-slash-commands, etc.)
- Renders Mermaid diagrams as PNG images via Kroki.io API (async concurrent)
- Renders Mermaid diagrams as PNG images via local mmdc CLI (no network required)
- Generates a cover image from the project logo
- Converts internal markdown links to EPUB chapter references
- Handles SVG images by replacing with styled placeholders
@@ -36,36 +35,29 @@ Features:
Requirements:
- uv (recommended) or Python 3.10+ with dependencies installed
- Internet connection for Mermaid diagram rendering
- @mermaid-js/mermaid-cli installed globally: npm install -g @mermaid-js/mermaid-cli
- Repository structure with markdown files and claude-howto-logo.png
"""
from __future__ import annotations
import argparse
import asyncio
import base64
import html
import logging
import os
import re
import shutil
import subprocess # nosec B404
import sys
import zlib
import tempfile
from dataclasses import dataclass, field
from io import BytesIO
from pathlib import Path
import httpx
import markdown
from bs4 import BeautifulSoup
from ebooklib import epub
from PIL import Image, ImageDraw, ImageFont
from tenacity import (
retry,
retry_if_exception_type,
stop_after_attempt,
wait_exponential,
)
# =============================================================================
# Custom Exceptions
@@ -129,11 +121,8 @@ class EPUBConfig:
cover_title_color: tuple[int, int, int] = (78, 205, 196)
cover_subtitle_color: tuple[int, int, int] = (168, 178, 209)
# Network Settings
kroki_base_url: str = "https://kroki.io"
request_timeout: float = 30.0
max_retries: int = 3
max_concurrent_requests: int = 10
# Local rendering settings
mmdc_path: str = "mmdc"
# Font paths (platform-specific)
title_font_paths: list[str] = field(
@@ -246,7 +235,7 @@ def validate_inputs(config: EPUBConfig, logger: logging.Logger) -> None:
# =============================================================================
# Mermaid Rendering (Async with Retry)
# Mermaid Rendering (Local mmdc)
# =============================================================================
@@ -263,7 +252,7 @@ def sanitize_mermaid(mermaid_code: str) -> str:
class MermaidRenderer:
"""Async renderer for Mermaid diagrams via Kroki.io API."""
"""Renders Mermaid diagrams locally via the mmdc CLI (no network required)."""
def __init__(
self, config: EPUBConfig, state: BuildState, logger: logging.Logger
@@ -271,100 +260,85 @@ class MermaidRenderer:
self.config = config
self.state = state
self.logger = logger
self._semaphore: asyncio.Semaphore | None = None
async def _fetch_single(
self, client: httpx.AsyncClient, mermaid_code: str, index: int
) -> tuple[str, tuple[bytes, str]]:
"""Fetch a single Mermaid diagram with retry logic."""
def _resolve_mmdc(self) -> str:
    """Resolve the configured mmdc binary to an executable path.

    ``self.config.mmdc_path`` may be a bare command name (looked up on
    ``PATH``) or an explicit path; ``shutil.which`` handles both forms.

    Returns:
        The path reported by ``shutil.which`` for the configured binary.

    Raises:
        MermaidRenderError: If no executable can be found for
            ``self.config.mmdc_path``.
    """
    # A single lookup is sufficient: if ``which`` cannot resolve the
    # configured value, falling back to that same value and asking again
    # can never succeed.
    resolved = shutil.which(self.config.mmdc_path)
    if not resolved:
        raise MermaidRenderError(
            f"mmdc not found at '{self.config.mmdc_path}'. "
            "Install it with: npm install -g @mermaid-js/mermaid-cli"
        )
    return resolved
def _render_one(
self, mmdc: str, mermaid_code: str, index: int
) -> tuple[bytes, str]:
"""Render a single Mermaid diagram to PNG bytes using mmdc."""
cache_key = mermaid_code.strip()
# Check cache first
if cache_key in self.state.mermaid_cache:
self.logger.debug(f"Cache hit for diagram {index}")
return cache_key, self.state.mermaid_cache[cache_key]
return self.state.mermaid_cache[cache_key]
# Rate limit with semaphore
assert self._semaphore is not None
async with self._semaphore:
result = await self._fetch_with_retry(client, mermaid_code, index)
if result is None:
with tempfile.TemporaryDirectory() as tmpdir:
input_file = Path(tmpdir) / "diagram.mmd"
output_file = Path(tmpdir) / "diagram.png"
input_file.write_text(mermaid_code, encoding="utf-8")
try:
result = subprocess.run( # nosec B603
[
mmdc,
"-i",
str(input_file),
"-o",
str(output_file),
"-b",
"white",
],
capture_output=True,
text=True,
check=False,
timeout=60,
)
except subprocess.TimeoutExpired as exc:
raise MermaidRenderError(
f"Failed to render Mermaid diagram {index} after {self.config.max_retries} attempts"
)
return cache_key, result
f"mmdc timed out rendering diagram {index} (60s limit)"
) from exc
@retry(
stop=stop_after_attempt(3),
wait=wait_exponential(multiplier=1, min=1, max=10),
retry=retry_if_exception_type((httpx.TimeoutException, httpx.NetworkError)),
reraise=True,
)
async def _fetch_with_retry(
self, client: httpx.AsyncClient, mermaid_code: str, index: int
) -> tuple[bytes, str] | None:
"""Fetch diagram with retry logic."""
try:
compressed = zlib.compress(mermaid_code.encode("utf-8"), level=9)
encoded = base64.urlsafe_b64encode(compressed).decode("ascii")
url = f"{self.config.kroki_base_url}/mermaid/png/{encoded}"
self.logger.debug(f"Fetching diagram {index}...")
response = await client.get(url, timeout=self.config.request_timeout)
if response.status_code == 200:
self.state.mermaid_counter += 1
img_name = f"mermaid_{self.state.mermaid_counter}.png"
result = (response.content, img_name)
cache_key = mermaid_code.strip()
self.state.mermaid_cache[cache_key] = result
self.logger.info(f"Rendered diagram {index} -> {img_name}")
return result
else:
self.logger.warning(
f"Kroki API returned {response.status_code} for diagram {index}"
)
if result.returncode != 0:
raise MermaidRenderError(
f"Kroki API returned {response.status_code} for diagram {index}"
f"mmdc failed for diagram {index}: {result.stderr.strip()}"
)
except httpx.TimeoutException:
self.logger.warning(f"Timeout fetching diagram {index}, will retry...")
raise
except httpx.NetworkError as e:
self.logger.warning(
f"Network error for diagram {index}: {e}, will retry..."
)
raise
if not output_file.exists():
raise MermaidRenderError(f"mmdc produced no output for diagram {index}")
async def render_all(
png_bytes = output_file.read_bytes()
self.state.mermaid_counter += 1
img_name = f"mermaid_{self.state.mermaid_counter}.png"
entry = (png_bytes, img_name)
self.state.mermaid_cache[cache_key] = entry
self.logger.info(f"Rendered diagram {index} -> {img_name}")
return entry
def render_all(
self, diagrams: list[tuple[int, str]]
) -> dict[str, tuple[bytes, str]]:
"""Render all Mermaid diagrams concurrently."""
self._semaphore = asyncio.Semaphore(self.config.max_concurrent_requests)
"""Render all Mermaid diagrams using local mmdc."""
mmdc = self._resolve_mmdc()
results: dict[str, tuple[bytes, str]] = {}
async with httpx.AsyncClient(
follow_redirects=True,
limits=httpx.Limits(max_connections=self.config.max_concurrent_requests),
timeout=httpx.Timeout(self.config.request_timeout),
) as client:
tasks = [
self._fetch_single(client, sanitize_mermaid(code), idx)
for idx, code in diagrams
]
self.logger.info(f"Rendering {len(diagrams)} Mermaid diagrams locally...")
for idx, code in diagrams:
sanitized = sanitize_mermaid(code)
cache_key = sanitized.strip()
data = self._render_one(mmdc, sanitized, idx)
results[cache_key] = data
self.logger.info(f"Fetching {len(tasks)} Mermaid diagrams concurrently...")
# Use gather with return_exceptions=False for strict mode
completed = await asyncio.gather(*tasks)
for cache_key, data in completed:
results[cache_key] = data
success_count = len(results)
self.logger.info(
f"Successfully rendered {success_count}/{len(diagrams)} diagrams"
f"Successfully rendered {len(results)} unique diagrams ({len(diagrams)} total blocks)"
)
return results
@@ -702,7 +676,7 @@ def handle_svg_image(
svg_path = (root_path / src).resolve()
if not svg_path.is_file():
logger.warning(f"SVG file not found: {src}")
return f'<p><em>[SVG not found: {html.escape(src)}]</em></p>'
return f"<p><em>[SVG not found: {html.escape(src)}]</em></p>"
svg_key = str(svg_path)
@@ -907,12 +881,12 @@ def create_stylesheet() -> epub.EpubItem:
)
async def build_epub_async(
def build_epub_async(
config: EPUBConfig,
logger: logging.Logger,
state: BuildState | None = None,
) -> Path:
"""Build EPUB asynchronously with concurrent diagram fetching."""
"""Build EPUB with local Mermaid diagram rendering."""
state = state or BuildState()
state.reset() # Ensure clean state
@@ -940,14 +914,14 @@ async def build_epub_async(
collector = ChapterCollector(config.root_path, state)
chapter_infos = collector.collect_all_chapters(get_chapter_order())
# Extract and pre-fetch all Mermaid diagrams
# Extract and render all Mermaid diagrams locally
logger.info("Extracting Mermaid diagrams...")
md_files = [(ch.file_path, ch.file_title) for ch in chapter_infos]
all_diagrams = extract_all_mermaid_blocks(md_files, logger)
if all_diagrams:
renderer = MermaidRenderer(config, state, logger)
await renderer.render_all(all_diagrams)
renderer.render_all(all_diagrams)
# Process chapters
logger.info("Processing chapters...")
@@ -1039,7 +1013,7 @@ def create_epub(root_path: Path, output_path: Path, verbose: bool = False) -> Pa
"""Synchronous wrapper for backward compatibility."""
logger = setup_logging(verbose)
config = EPUBConfig(root_path=root_path, output_path=output_path)
return asyncio.run(build_epub_async(config, logger))
return build_epub_async(config, logger)
# =============================================================================
@@ -1070,16 +1044,10 @@ def main() -> int:
"--verbose", "-v", action="store_true", help="Enable verbose logging"
)
parser.add_argument(
"--timeout",
type=float,
default=30.0,
help="Timeout for API requests in seconds (default: 30)",
)
parser.add_argument(
"--max-concurrent",
type=int,
default=10,
help="Maximum concurrent API requests (default: 10)",
"--mmdc-path",
type=str,
default="mmdc",
help="Path to mmdc binary (default: mmdc from PATH)",
)
parser.add_argument(
"--lang",
@@ -1116,12 +1084,11 @@ def main() -> int:
output_path=output,
language=language,
title=title,
request_timeout=args.timeout,
max_concurrent_requests=args.max_concurrent,
mmdc_path=args.mmdc_path,
)
try:
result = asyncio.run(build_epub_async(config, logger))
result = build_epub_async(config, logger)
print(f"Successfully created: {result}")
return 0
except EPUBBuildError as e:
+112 -11
View File
@@ -4,7 +4,7 @@ from __future__ import annotations
import logging
from pathlib import Path
from unittest.mock import patch
from unittest.mock import MagicMock, patch
import pytest
@@ -14,6 +14,8 @@ from build_epub import (
BuildState,
ChapterCollector,
EPUBConfig,
MermaidRenderer,
MermaidRenderError,
ValidationError,
create_chapter_html,
extract_all_mermaid_blocks,
@@ -92,9 +94,7 @@ class TestEPUBConfig:
assert config.title == "Claude Code How-To Guide"
assert config.language == "en"
assert config.author == "Claude Code Community"
assert config.request_timeout == 30.0
assert config.max_concurrent_requests == 10
assert config.max_retries == 3
assert config.mmdc_path == "mmdc"
def test_custom_values(self, tmp_path: Path) -> None:
"""Test that custom values override defaults."""
@@ -102,12 +102,10 @@ class TestEPUBConfig:
root_path=tmp_path,
output_path=tmp_path / "out.epub",
title="Custom Title",
request_timeout=60.0,
max_concurrent_requests=5,
mmdc_path="/usr/local/bin/mmdc",
)
assert config.title == "Custom Title"
assert config.request_timeout == 60.0
assert config.max_concurrent_requests == 5
assert config.mmdc_path == "/usr/local/bin/mmdc"
# =============================================================================
@@ -375,6 +373,110 @@ class TestLogging:
assert logger.name == "epub_builder"
# =============================================================================
# MermaidRenderer Tests
# =============================================================================
class TestMermaidRenderer:
    """Tests for the local, mmdc-backed MermaidRenderer.

    ``shutil.which`` and ``subprocess.run`` are patched so no real mmdc
    binary is required. On success paths the fake ``subprocess.run``
    writes a PNG to the ``-o`` path it was given, so the renderer's own
    temp-file handling (existence check, byte read) runs unpatched.
    """

    # Minimal PNG-looking payload; only byte equality matters to the tests.
    FAKE_PNG = b"\x89PNG\r\n\x1a\n" + b"\x00" * 100
    DIAGRAM = "graph TD\n A --> B"

    def _make_renderer(
        self, tmp_path: Path, state: BuildState, logger: logging.Logger
    ) -> MermaidRenderer:
        """Build a renderer configured with the default ``mmdc`` command."""
        config = EPUBConfig(
            root_path=tmp_path,
            output_path=tmp_path / "out.epub",
            mmdc_path="mmdc",
        )
        return MermaidRenderer(config, state, logger)

    @staticmethod
    def _fake_mmdc_ok(cmd: list[str], **_kwargs: object) -> MagicMock:
        """Stand in for a successful mmdc run.

        Writes ``FAKE_PNG`` to the path following ``-o`` in *cmd* and
        returns an object shaped like a zero-exit ``CompletedProcess``.
        """
        output_path = Path(cmd[cmd.index("-o") + 1])
        output_path.write_bytes(TestMermaidRenderer.FAKE_PNG)
        return MagicMock(returncode=0, stderr="")

    def test_render_all_success(
        self, tmp_path: Path, state: BuildState, logger: logging.Logger
    ) -> None:
        """Test that render_all uses mmdc and caches results."""
        renderer = self._make_renderer(tmp_path, state, logger)

        with (
            patch("shutil.which", return_value="/usr/bin/mmdc"),
            patch("subprocess.run", side_effect=self._fake_mmdc_ok) as mock_run,
        ):
            results = renderer.render_all([(1, self.DIAGRAM)])

        assert mock_run.call_count == 1
        assert len(results) == 1
        png_bytes, img_name = next(iter(results.values()))
        assert png_bytes == self.FAKE_PNG
        assert img_name.startswith("mermaid_")
        assert img_name.endswith(".png")
        # Every returned entry must also be stored in the shared build cache.
        for cache_key, entry in results.items():
            assert state.mermaid_cache[cache_key] == entry

    def test_render_all_mmdc_not_found(
        self, tmp_path: Path, state: BuildState, logger: logging.Logger
    ) -> None:
        """Test that missing mmdc raises MermaidRenderError."""
        renderer = self._make_renderer(tmp_path, state, logger)

        with (
            patch("shutil.which", return_value=None),
            pytest.raises(MermaidRenderError, match="mmdc not found"),
        ):
            renderer.render_all([(1, self.DIAGRAM)])

    def test_render_all_mmdc_failure(
        self, tmp_path: Path, state: BuildState, logger: logging.Logger
    ) -> None:
        """Test that mmdc non-zero exit raises MermaidRenderError."""
        renderer = self._make_renderer(tmp_path, state, logger)

        with (
            patch("shutil.which", return_value="/usr/bin/mmdc"),
            patch("subprocess.run") as mock_run,
        ):
            mock_run.return_value = MagicMock(returncode=1, stderr="parse error")
            with pytest.raises(MermaidRenderError, match="mmdc failed"):
                renderer.render_all([(1, self.DIAGRAM)])

    def test_render_all_no_output(
        self, tmp_path: Path, state: BuildState, logger: logging.Logger
    ) -> None:
        """Test that a zero exit without an output file raises MermaidRenderError."""
        renderer = self._make_renderer(tmp_path, state, logger)

        with (
            patch("shutil.which", return_value="/usr/bin/mmdc"),
            patch("subprocess.run") as mock_run,
        ):
            # Exit code 0, but nothing is written to the ``-o`` path.
            mock_run.return_value = MagicMock(returncode=0, stderr="")
            with pytest.raises(MermaidRenderError, match="produced no output"):
                renderer.render_all([(1, self.DIAGRAM)])

    def test_render_all_deduplication(
        self, tmp_path: Path, state: BuildState, logger: logging.Logger
    ) -> None:
        """Test that identical diagrams are only rendered once."""
        renderer = self._make_renderer(tmp_path, state, logger)

        with (
            patch("shutil.which", return_value="/usr/bin/mmdc"),
            patch("subprocess.run", side_effect=self._fake_mmdc_ok) as mock_run,
        ):
            results = renderer.render_all([(1, self.DIAGRAM), (2, self.DIAGRAM)])

        # mmdc should only be called once for duplicate diagrams
        assert mock_run.call_count == 1
        # Both entries map to the same cached result
        assert len(results) == 1

    def test_render_all_timeout(
        self, tmp_path: Path, state: BuildState, logger: logging.Logger
    ) -> None:
        """Test that a hung mmdc process raises MermaidRenderError."""
        # Imported locally so this test does not depend on the module's
        # top-level import list.
        import subprocess as _subprocess

        renderer = self._make_renderer(tmp_path, state, logger)

        with (
            patch("shutil.which", return_value="/usr/bin/mmdc"),
            patch(
                "subprocess.run",
                side_effect=_subprocess.TimeoutExpired("mmdc", 60),
            ),
            pytest.raises(MermaidRenderError, match="timed out"),
        ):
            renderer.render_all([(1, self.DIAGRAM)])
# =============================================================================
# Integration Tests
# =============================================================================
@@ -383,8 +485,7 @@ class TestLogging:
class TestIntegration:
"""Integration tests for the full build process."""
@pytest.mark.asyncio
async def test_build_without_mermaid(
def test_build_without_mermaid(
self, tmp_project: Path, logger: logging.Logger
) -> None:
"""Test building an EPUB without Mermaid diagrams."""
@@ -399,7 +500,7 @@ class TestIntegration:
with patch("build_epub.get_chapter_order") as mock_order:
mock_order.return_value = [("README.md", "Introduction")]
result = await build_epub_async(config, logger)
result = build_epub_async(config, logger)
assert result.exists()
assert result.suffix == ".epub"