Files
claude-howto/scripts/check_markdown_rendering.py
T
Luong NGUYEN 553a319934 feat(scripts): add markdown rendering validator (#119) (#120)
Catches mechanical rendering bugs that look fine in a diff but render
wrong on GitHub / in the EPUB build: inner backticks inside inline-code
spans (PR #114), unescaped pipes in table cells (PR #115), stray
`$ARGUMENTS` outside code, and unmatched fences.

Scope: `**/README.md` across tutorial modules + translations (ja, uk,
vi, zh). Excludes dev tooling dirs (`.claude`, `.venv`, `openspec`, …).

Rule registry: each rule is `(Path, content) -> list[str]`. Adding a
new rule = one function + one registry entry + one test fixture.

- scripts/check_markdown_rendering.py: 4 rules, CommonMark-aware code-
  span consumption, blockquote-fence-aware masking
- scripts/tests/test_check_markdown_rendering.py: 20 unit tests,
  positive + negative per rule, currency-false-positive guard
- .pre-commit-config.yaml: new global `markdown-rendering` hook
2026-05-11 23:49:39 +02:00

256 lines
8.3 KiB
Python

#!/usr/bin/env python3
"""Validate Markdown rendering correctness in tutorial README files.
Catches mechanical rendering bugs that look fine in a diff but render wrong
on GitHub / in the EPUB build: inner backticks in inline code, unescaped
pipes in table cells, stray $ARGUMENTS / $N outside code, mismatched fences.
Scope: `**/README.md` across the repo (tutorial modules + ja/uk/vi/zh
translations). Excludes `.venv`, `node_modules`, `.git`, `blog-posts`,
`.agents`, `.claude`, and `openspec` — these are not tutorial output.
Each rule is a function `(file_path, content) -> list[str]`. To add a new
rule: write the function, append `(name, fn)` to `RULES`, add fixtures in
`tests/test_check_markdown_rendering.py`.
"""
from __future__ import annotations
import re
import sys
from collections.abc import Callable
from pathlib import Path
IGNORE_DIRS = {
".venv",
"node_modules",
".git",
"blog-posts",
"openspec",
"prompts",
".agents",
".claude",
}
Rule = Callable[[Path, str], list[str]]
def iter_readme_files() -> list[Path]:
return [
f
for f in Path().rglob("README.md")
if not any(part in IGNORE_DIRS for part in f.parts)
]
_FENCE_RE = re.compile(r"^ {0,3}(?:>\s*)*```")
def _mask_fenced_blocks(content: str) -> str:
"""Replace fenced code blocks with blank lines so line numbers survive.
Triple-backtick fences only. Recognises fences inside blockquotes
(``` > ``` ``` ```), matching how GitHub renders them. Lines inside
fences become empty so subsequent rules see no code content but still
report accurate line numbers.
"""
out: list[str] = []
in_fence = False
for line in content.split("\n"):
if _FENCE_RE.match(line):
in_fence = not in_fence
out.append("")
continue
out.append("" if in_fence else line)
return "\n".join(out)
def _strip_inline_code(line: str) -> str:
"""Remove inline code spans from a single line.
Strips double-backtick spans first (they may contain single backticks),
then single-backtick spans. HTML comments are also stripped so rules
that scan prose ignore explanatory comments.
"""
line = re.sub(r"<!--.*?-->", "", line)
line = re.sub(r"``[^`]+``", "", line)
line = re.sub(r"`[^`\n]+`", "", line)
return line
def _consume_code_spans(line: str) -> str:
"""Remove well-formed inline code spans from a single line.
Follows CommonMark inline-code semantics: a run of N backticks opens a
span; the span closes at the next matching run of N backticks. Spans
may nest different run-lengths. Anything left after consumption is a
structural mismatch (an unmatched opening run).
"""
out: list[str] = []
i = 0
n = len(line)
while i < n:
if line[i] != "`":
out.append(line[i])
i += 1
continue
run_start = i
while i < n and line[i] == "`":
i += 1
run_len = i - run_start
scan = i
while scan < n:
if line[scan] == "`":
close_start = scan
while scan < n and line[scan] == "`":
scan += 1
if scan - close_start == run_len:
i = scan
break
else:
scan += 1
else:
out.append(line[run_start:i])
return "".join(out)
def rule_backtick_in_inline_code(file_path: Path, content: str) -> list[str]:
"""Flag inline code spans that contain a literal backtick.
Bug pattern shipped in PR #114: `` `!`command`` `` — a single-backtick
span containing an inner backtick. The canonical fix is the
``` `` `text` `` ``` idiom (double-backticks + space padding).
Detection: consume well-formed code spans left-to-right per line. Any
leftover backtick is an unmatched opener — a structural mismatch that
renders wrong on GitHub.
Skips fenced code blocks.
"""
masked = _mask_fenced_blocks(content)
errors: list[str] = []
for lineno, line in enumerate(masked.split("\n"), start=1):
if "`" in _consume_code_spans(line):
errors.append(
f"{file_path}:{lineno}: backtick-in-inline-code — "
"use `` `text` `` (double-backtick + space) to display a literal backtick"
)
return errors
def rule_unescaped_pipe_in_table(file_path: Path, content: str) -> list[str]:
"""Flag table rows whose unescaped pipe count differs from the header.
A table row is `|...|...|`. The header row sets the column count; body
rows must have the same count. A bare `|` in a cell (e.g. `[color|default]`)
inflates the count and breaks rendering. The fix is `\\|`.
Pipes inside inline-code spans are ignored (they need escaping too, but
that's a separate idiom). The separator row `|---|---|` is ignored.
"""
masked = _mask_fenced_blocks(content)
errors: list[str] = []
lines = masked.split("\n")
def cell_count(line: str) -> int:
scannable = _strip_inline_code(line.strip())
return len(re.findall(r"(?<!\\)\|", scannable))
i = 0
while i < len(lines):
line = lines[i].strip()
if (
line.startswith("|")
and line.endswith("|")
and i + 1 < len(lines)
and re.match(r"^\s*\|[\s\-:|]+\|\s*$", lines[i + 1])
):
header_pipes = cell_count(line)
j = i + 2
while j < len(lines):
row = lines[j].strip()
if not (row.startswith("|") and row.endswith("|")):
break
if cell_count(row) != header_pipes:
errors.append(
f"{file_path}:{j + 1}: unescaped-pipe-in-table — "
"row cell count differs from header; escape literal `|` as `\\|`"
)
j += 1
i = j
else:
i += 1
return errors
def rule_stray_argument_placeholder(file_path: Path, content: str) -> list[str]:
"""Flag `$ARGUMENTS` appearing in prose outside code.
Tutorial snippets reference these inside fenced code blocks or inline
code (`` `$ARGUMENTS` ``). When `$ARGUMENTS` appears in bare prose it
usually means a tutorial snippet bled into the surrounding paragraph.
Single-digit `$N` was considered but dropped: shell-arg `$0`..`$9` is
syntactically indistinguishable from prose currency (`$5 today`,
`$1,000`, `$5/month`) without semantic context, and currency is
legitimate translator content. `$ARGUMENTS` is unambiguous.
"""
masked = _mask_fenced_blocks(content)
errors: list[str] = []
pattern = re.compile(r"\$ARGUMENTS\b")
for lineno, line in enumerate(masked.split("\n"), start=1):
stripped = _strip_inline_code(line)
if pattern.search(stripped):
errors.append(
f"{file_path}:{lineno}: stray-argument-placeholder — "
"wrap `$ARGUMENTS` in backticks or move into a code fence"
)
return errors
def rule_unmatched_fence(file_path: Path, content: str) -> list[str]:
"""Flag files with an odd number of triple-backtick fences.
Redundant safety net — `scripts/check_cross_references.py` also
enforces this. Kept here so the validator stays self-contained.
"""
fences = re.findall(r"^\s*```", content, re.MULTILINE)
if len(fences) % 2 != 0:
return [
f"{file_path}:1: unmatched-fence — "
"odd number of triple-backtick fences in file"
]
return []
RULES: list[tuple[str, Rule]] = [
("backtick-in-inline-code", rule_backtick_in_inline_code),
("unescaped-pipe-in-table", rule_unescaped_pipe_in_table),
("stray-argument-placeholder", rule_stray_argument_placeholder),
("unmatched-fence", rule_unmatched_fence),
]
def main() -> int:
errors: list[str] = []
files = iter_readme_files()
for file_path in files:
content = file_path.read_text(encoding="utf-8")
for _name, rule in RULES:
errors.extend(rule(file_path, content))
if errors:
print("❌ Markdown rendering errors:")
for e in errors:
print(f" - {e}")
return 1
print(
f"✅ Markdown rendering clean ({len(files)} README files, {len(RULES)} rules)"
)
return 0
if __name__ == "__main__":
sys.exit(main())