Files
claude-howto/scripts/check_cross_references.py
Luong NGUYEN 699fb39a46 ci: shift-left quality gates — add mypy to pre-commit, fix CI failures (#53)
* ci: shift-left quality gates — add mypy to pre-commit, fix CI failures

- Add mypy pre-commit hook (mirrors-mypy v1.13.0) so type checks run locally
- Add [tool.mypy] config to scripts/pyproject.toml with overrides for untyped libs (ebooklib, sync_translations)
- Add mypy>=1.8.0 to requirements-dev.txt
- Fix CI test.yml: remove continue-on-error: true from lint/security/type-check jobs (was silently swallowing failures)
- Fix CI bandit -c path: pyproject.toml → scripts/pyproject.toml
- Fix CI mypy command: use --config-file scripts/pyproject.toml
- Fix CI build-epub: add type-check to needs, fix if: success() → !failure() && !cancelled()
- Fix ruff errors in sync_translations.py (RUF013 implicit Optional, SIM102 nested if)
- Fix mypy errors: add list[str] annotations to errors vars in check_cross_references.py and check_links.py

* fix(ci): install mmdc in build-epub job and correct return type annotation

- Add npm install step for @mermaid-js/mermaid-cli before Build EPUB
  to fix CI failure (mmdc not found error)
- Fix check_translation_status() return type from list[dict] to
  tuple[list[dict], list[dict]] to match the actual return value

* fix(ci): pass --no-sandbox to Puppeteer in build-epub CI job

mmdc (Mermaid CLI) uses Puppeteer/Chromium which requires --no-sandbox
in the GitHub Actions sandboxed environment. Add --puppeteer-config flag
to build_epub.py that passes a Puppeteer JSON config file to mmdc via -p,
and use it in the CI workflow to inject the no-sandbox args.
2026-04-07 00:51:44 +02:00

114 lines
3.8 KiB
Python

#!/usr/bin/env python3
"""Validate cross-references, anchors, and code fences in Markdown files."""
import re
import sys
from pathlib import Path
# Directory names excluded from the Markdown scan at any depth
# (virtualenvs, VCS metadata, and content trees that are not lesson docs).
IGNORE_DIRS = {
    ".venv",
    "node_modules",
    ".git",
    "blog-posts",
    "openspec",
    "prompts",
    ".agents",
}
# Individual filenames skipped regardless of their location.
IGNORE_FILES = {"README.backup.md"}
def iter_md_files():
    """Yield every Markdown file under the current directory, skipping ignored ones."""
    for candidate in Path().rglob("*.md"):
        # Skip explicitly ignored filenames.
        if candidate.name in IGNORE_FILES:
            continue
        # Skip anything living under an ignored directory, at any depth.
        if any(part in IGNORE_DIRS for part in candidate.parts):
            continue
        yield candidate
def heading_to_anchor(heading: str) -> str:
    """Convert a Markdown heading to its GitHub-style anchor slug.

    Mirrors GitHub's anchor generation: drop emoji and special symbols,
    keep Unicode word characters (so Vietnamese diacritics survive),
    lowercase, turn spaces into hyphens, and trim only trailing hyphens
    (leading hyphens are kept so emoji-prefixed headings still resolve).
    """
    # Step 1: strip emoji and emoji-adjacent characters.
    emoji_pattern = (
        r"[\U0001F000-\U0001FFFF"  # Supplementary Multilingual Plane symbols
        r"\U00002702-\U000027B0"  # Dingbats
        r"\U0000FE00-\U0000FE0F"  # Variation selectors
        r"\U0000200D"  # Zero-width joiner
        r"\U000000A9\U000000AE"  # (C) (R)
        r"\U00002000-\U0000206F"  # General punctuation (some emoji-adjacent)
        r"]"
    )
    de_emojied = re.sub(emoji_pattern, "", heading)
    # Step 2: lowercase, then drop punctuation while keeping word chars,
    # whitespace, and hyphens.
    slug = re.sub(r"[^\w\s-]", "", de_emojied.lower(), flags=re.UNICODE)
    # Steps 3-4: spaces become hyphens; only trailing hyphens are stripped.
    return slug.replace(" ", "-").rstrip("-")
def strip_code_blocks(content: str) -> str:
    """Remove fenced code blocks and inline code spans to avoid scanning example links."""
    # Fenced blocks first (``` ... ```), so their inline backticks never
    # confuse the span pass below.
    without_fences = re.sub(r"```[^\n]*\n.*?```", "", content, flags=re.DOTALL)
    # Then inline code spans (` ... `) on what remains.
    return re.sub(r"`[^`\n]+`", "", without_fences)
def main() -> int:
    """Validate all Markdown cross-references; return 0 on success, 1 on failure.

    Per file: relative .md links must resolve on disk, in-page anchors must
    match a real heading, and ``` fences must be balanced. Additionally,
    every numbered lesson directory (01-* .. 10-*) must contain a README.md.
    """
    errors: list[str] = []
    # Count files during the single walk instead of re-running iter_md_files()
    # at the end — avoids a second filesystem scan and guarantees the reported
    # count matches the set that was actually validated.
    md_count = 0
    for file_path in iter_md_files():
        md_count += 1
        content = file_path.read_text(encoding="utf-8")
        # Strip code blocks before scanning for links/anchors to avoid false
        # positives from documentation examples inside code fences.
        scannable = strip_code_blocks(content)
        # Relative .md links must resolve relative to the linking file.
        errors.extend(
            f"{file_path}: broken cross-reference → '{link_path}'"
            for link_path in re.findall(r"\[[^\]]+\]\(([^)#]+\.md)[^)]*\)", scannable)
            if not (file_path.parent / link_path).resolve().exists()
        )
        # In-page anchors must match a real heading in the same file.
        anchors = re.findall(r"\[[^\]]+\]\(#([^)]+)\)", scannable)
        if anchors:
            # Headings are collected from the raw content, not the stripped text.
            headings = re.findall(r"^#{1,6}\s+(.+)$", content, re.MULTILINE)
            valid_anchors = {heading_to_anchor(h) for h in headings}
            errors.extend(
                f"{file_path}: broken anchor → '#{anchor}'"
                for anchor in anchors
                if anchor not in valid_anchors
            )
        # Unmatched code fences (only fences at the start of a line count).
        if len(re.findall(r"^```", content, re.MULTILINE)) % 2 != 0:
            errors.append(f"{file_path}: unmatched code fences")
    # All numbered lesson dirs (01-* .. 10-*) must have a README.md.
    for i in range(1, 11):
        errors.extend(
            f"{d}: missing README.md"
            for d in Path().glob(f"{i:02d}-*")
            if d.is_dir() and not (d / "README.md").exists()
        )
    if errors:
        print("❌ Cross-reference errors:")
        for e in errors:
            print(f" - {e}")
        return 1
    print(f"✅ All cross-references valid ({md_count} files checked)")
    return 0
if __name__ == "__main__":
    # Propagate the validator's status code so CI can fail the job on errors.
    sys.exit(main())