Files
claude-howto/scripts/check_cross_references.py
T
Luong NGUYEN 89e89d4aa3 feat(zh): add Chinese translations in zh/ directory
Add Chinese (Simplified) translations for all documentation, organized
under a dedicated zh/ directory that mirrors the English folder structure.

Co-authored-by: tanqingkuang <tanqingkuang@users.noreply.github.com>

Translations originally contributed by @tanqingkuang in #45.
Restructured from *-CN.md suffix pattern into zh/ directory to prevent
the EPUB builder (scripts/build_epub.py collect_folder_files) from
picking up Chinese files via glob("*.md") inside module folders.
2026-04-06 23:08:54 +02:00

115 lines
3.8 KiB
Python

#!/usr/bin/env python3
"""Validate cross-references, anchors, and code fences in Markdown files."""
import re
import sys
import unicodedata
from pathlib import Path
# Directory names that exclude a Markdown file from validation when they
# appear anywhere in its path.
IGNORE_DIRS = {
    ".git", ".venv", ".agents",
    "node_modules",
    "blog-posts", "openspec", "prompts",
}

# File names that are never validated, wherever they live.
IGNORE_FILES = {"README.backup.md"}
def iter_md_files():
    """Yield each ``*.md`` path found recursively under the current directory.

    A path is skipped when its file name is listed in ``IGNORE_FILES`` or when
    any component of the path is listed in ``IGNORE_DIRS``.
    """
    for candidate in Path().rglob("*.md"):
        if candidate.name in IGNORE_FILES:
            continue
        if not IGNORE_DIRS.isdisjoint(candidate.parts):
            continue
        yield candidate
def heading_to_anchor(heading: str) -> str:
    """Convert Markdown heading text into a GitHub-style anchor slug.

    The steps are applied in this order:

    1. Delete emoji and emoji-adjacent code points (listed in the character
       class below).
    2. Lowercase the text and delete every character that is not a Unicode
       word character, whitespace, or a hyphen.
    3. Turn each space into a hyphen.
    4. Drop trailing hyphens only; a leading hyphen is kept, so a heading
       that starts with an emoji followed by a space yields a slug that
       starts with ``-``.
    """
    emoji_like = re.compile(
        r"[\U0001F000-\U0001FFFF"  # Supplementary Multilingual Plane symbols
        r"\U00002702-\U000027B0"  # Dingbats
        r"\U0000FE00-\U0000FE0F"  # Variation selectors
        r"\U0000200D"  # Zero-width joiner
        r"\U000000A9\U000000AE"  # (C) (R)
        r"\U00002000-\U0000206F"  # General punctuation (some emoji-adjacent)
        r"]"
    )
    without_emoji = emoji_like.sub("", heading)
    lowered = without_emoji.lower()
    # Letters from any script survive because \w is Unicode-aware here.
    word_chars_only = re.sub(r"[^\w\s-]", "", lowered, flags=re.UNICODE)
    # Splitting on a single space keeps empty pieces, so runs of spaces become
    # runs of hyphens, exactly one hyphen per space.
    slug = "-".join(word_chars_only.split(" "))
    return slug.rstrip("-")
def strip_code_blocks(content: str) -> str:
    """Return *content* with code removed so example links inside it are not scanned.

    Triple-backtick fenced blocks are deleted first (the opening fence line
    through the next triple backtick), then single-line inline spans wrapped
    in single backticks.
    """
    fenced_block = re.compile(r"```[^\n]*\n.*?```", re.DOTALL)
    inline_span = re.compile(r"`[^`\n]+`")
    return inline_span.sub("", fenced_block.sub("", content))
def _is_external_link(target: str) -> bool:
    """Return True when *target* is a URL (``scheme://…`` or ``//host/…``)."""
    return re.match(r"(?:[a-z][a-z0-9+.-]*:)?//", target, re.IGNORECASE) is not None


def _valid_anchor_ids(headings) -> set:
    """Return the anchor ids available for *headings*, given in document order.

    The first heading producing a slug keeps the plain slug; each later
    heading producing the same slug gets ``-1``, ``-2``, … appended, which is
    how GitHub disambiguates repeated headings.
    """
    occurrences = {}
    ids = set()
    for heading in headings:
        slug = heading_to_anchor(heading)
        seen_before = occurrences.get(slug, 0)
        occurrences[slug] = seen_before + 1
        ids.add(slug if seen_before == 0 else f"{slug}-{seen_before}")
    return ids


def main() -> int:
    """Validate every Markdown file yielded by ``iter_md_files``.

    Checks performed:

    * links to ``.md`` files must point at an existing path, resolved relative
      to the linking file's directory (URLs are skipped, since they are not
      repository paths);
    * in-page ``#anchor`` links must match a heading of the same file;
    * the number of lines starting with a triple backtick must be even;
    * every directory matching ``01-*`` … ``10-*`` in the current directory
      must contain a ``README.md``.

    Returns:
        1 after printing the collected errors, or 0 after printing a success
        summary when there are none.
    """
    errors = []
    md_count = 0  # counted during the scan so the tree is walked only once
    for file_path in iter_md_files():
        md_count += 1
        content = file_path.read_text(encoding="utf-8")
        # Links are searched in a copy without code, so documentation examples
        # inside code fences or inline spans are not treated as real links.
        scannable = strip_code_blocks(content)

        # Relative .md links must resolve
        for link_path in re.findall(r"\[[^\]]+\]\(([^)#]+\.md)[^)]*\)", scannable):
            if _is_external_link(link_path):
                # A URL ending in .md would never exist on disk.
                continue
            if not (file_path.parent / link_path).resolve().exists():
                errors.append(f"{file_path}: broken cross-reference → '{link_path}'")

        # In-page anchors must match a real heading
        anchors = re.findall(r"\[[^\]]+\]\(#([^)]+)\)", scannable)
        if anchors:
            # Headings come from the unstripped text so inline code inside a
            # heading still contributes to its slug.
            headings = re.findall(r"^#{1,6}\s+(.+)$", content, re.MULTILINE)
            valid_anchors = _valid_anchor_ids(headings)
            errors.extend(
                f"{file_path}: broken anchor → '#{anchor}'"
                for anchor in anchors
                if anchor not in valid_anchors
            )

        # Unmatched code fences (only count fences at start of line)
        if len(re.findall(r"^```", content, re.MULTILINE)) % 2 != 0:
            errors.append(f"{file_path}: unmatched code fences")

    # All numbered lesson dirs must have README.md
    for i in range(1, 11):
        errors.extend(
            f"{d}: missing README.md"
            for d in Path().glob(f"{i:02d}-*")
            if d.is_dir() and not (d / "README.md").exists()
        )

    if errors:
        print("❌ Cross-reference errors:")
        for e in errors:
            print(f" - {e}")
        return 1

    print(f"✅ All cross-references valid ({md_count} files checked)")
    return 0
# Run the validator only when executed as a script; the value returned by
# main() becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())