Files
claude-howto/scripts/check_cross_references.py
xiaolai 0bf754673a fix: constrain cross-reference resolution to repo root (#91)
`(file_path.parent / link_path).resolve()` could silently follow a
crafted `../../outside/path` link that escapes the repository root.
Paths that resolve outside the repo root are now skipped rather than
checked for existence.

Co-authored-by: Claude Code <noreply@anthropic.com>
2026-04-25 00:28:03 +02:00

116 lines
3.9 KiB
Python

#!/usr/bin/env python3
"""Validate cross-references, anchors, and code fences in Markdown files."""
import re
import sys
from pathlib import Path
IGNORE_DIRS = {
".venv",
"node_modules",
".git",
"blog-posts",
"openspec",
"prompts",
".agents",
}
IGNORE_FILES = {"README.backup.md"}
def iter_md_files():
for f in Path().rglob("*.md"):
if (
not any(part in IGNORE_DIRS for part in f.parts)
and f.name not in IGNORE_FILES
):
yield f
def heading_to_anchor(heading: str) -> str:
# Match GitHub's anchor generation: strip emoji and special punctuation,
# keep Unicode letters (including Vietnamese diacritics), lowercase,
# replace spaces with hyphens, strip leading/trailing hyphens.
# 1. Remove emoji (characters outside BMP or in known emoji ranges)
heading = re.sub(
r"[\U0001F000-\U0001FFFF" # Supplementary Multilingual Plane symbols
r"\U00002702-\U000027B0" # Dingbats
r"\U0000FE00-\U0000FE0F" # Variation selectors
r"\U0000200D" # Zero-width joiner
r"\U000000A9\U000000AE" # (C) (R)
r"\U00002000-\U0000206F" # General punctuation (some emoji-adjacent)
r"]",
"",
heading,
)
# 2. Remove punctuation but keep Unicode word chars, spaces, and hyphens
anchor = re.sub(r"[^\w\s-]", "", heading.lower(), flags=re.UNICODE)
# 3. Replace spaces with hyphens
anchor = anchor.replace(" ", "-")
# 4. Strip trailing hyphens (keep leading hyphens for emoji-prefixed headings)
return anchor.rstrip("-")
def strip_code_blocks(content: str) -> str:
"""Remove fenced code blocks and inline code spans to avoid scanning example links."""
# Strip fenced code blocks (``` ... ```)
content = re.sub(r"```[^\n]*\n.*?```", "", content, flags=re.DOTALL)
# Strip inline code spans (` ... `)
content = re.sub(r"`[^`\n]+`", "", content)
return content
def main() -> int:
errors: list[str] = []
repo_root = Path().resolve()
for file_path in iter_md_files():
content = file_path.read_text(encoding="utf-8")
# Strip code blocks before scanning for links/anchors to avoid false positives
# from documentation examples inside code fences.
scannable = strip_code_blocks(content)
# Relative .md links must resolve and must stay within the repo root
for link_path in re.findall(r"\[[^\]]+\]\(([^)#]+\.md)[^)]*\)", scannable):
resolved = (file_path.parent / link_path).resolve()
if not resolved.is_relative_to(repo_root):
continue
if not resolved.exists():
errors.append(f"{file_path}: broken cross-reference → '{link_path}'")
# In-page anchors must match a real heading
anchors = re.findall(r"\[[^\]]+\]\(#([^)]+)\)", scannable)
if anchors:
headings = re.findall(r"^#{1,6}\s+(.+)$", content, re.MULTILINE)
valid_anchors = {heading_to_anchor(h) for h in headings}
errors.extend(
f"{file_path}: broken anchor → '#{anchor}'"
for anchor in anchors
if anchor not in valid_anchors
)
# Unmatched code fences (only count fences at start of line)
if len(re.findall(r"^```", content, re.MULTILINE)) % 2 != 0:
errors.append(f"{file_path}: unmatched code fences")
# All numbered lesson dirs must have README.md
for i in range(1, 11):
errors.extend(
f"{d}: missing README.md"
for d in Path().glob(f"{i:02d}-*")
if d.is_dir() and not (d / "README.md").exists()
)
if errors:
print("❌ Cross-reference errors:")
for e in errors:
print(f" - {e}")
return 1
md_count = sum(1 for _ in iter_md_files())
print(f"✅ All cross-references valid ({md_count} files checked)")
return 0
if __name__ == "__main__":
sys.exit(main())