mirror of
https://github.com/luongnv89/claude-howto.git
synced 2026-06-05 22:36:34 +02:00
refactor(ci): shift quality checks to pre-commit, CI as 2nd pass (#34)
* refactor(ci): shift quality checks to pre-commit, CI as 2nd pass
- Remove ci.yml (lint, security, pytest were only for EPUB scripts)
- Move EPUB build to pre-commit local hook (runs on .md changes)
- Add check_cross_references.py, check_mermaid.py, check_links.py scripts
- Add markdown-lint, cross-references, mermaid-syntax, link-check as
pre-commit hooks — mirrors all 4 CI doc-check jobs locally
- Remove spell check job from docs-check.yml (breaks on translations)
- Refactor docs-check.yml to reuse scripts/ instead of inline Python
- Add .markdownlint.json config shared by pre-commit and CI
- Update CONTRIBUTING.md with required dependencies and hook table
* fix(ci): resolve all CI check failures in docs-check workflow
- fix(check_cross_references): skip code blocks and inline code spans
to avoid false positives from documentation examples; fix emoji
heading anchor generation (rstrip not strip); add blog-posts,
openspec, prompts, .agents to IGNORE_DIRS; ignore README.backup.md
- fix(check_links): strip trailing Markdown punctuation from captured
URLs; add wikipedia, api.github.com to SKIP_DOMAINS; add placeholder
URL patterns to SKIP_URL_PATTERNS; add .agents/.claude to IGNORE_DIRS
- fix(check_mermaid): add --no-sandbox puppeteer config support via
MERMAID_PUPPETEER_NO_SANDBOX env var for GitHub Actions Linux runners
- fix(docs-check.yml): pass MERMAID_PUPPETEER_NO_SANDBOX=true to mermaid job
- fix(content): repair broken anchors in README.md, 09-advanced-features;
fix #plugins -> #claude-code-plugins in claude_concepts_guide.md;
remove non-existent ./docs/performance.md placeholder links; fix
dependabot alerts URL in SECURITY_REPORTING.md; update auto-mode URL
in resources.md; use placeholder pattern for 07-plugins example URL
- remove README.backup.md (stale file)
* fix(check-scripts): fix strip_code_blocks regex and URL fragment handling
- fix regex in strip_code_blocks to avoid conflicting MULTILINE+DOTALL
flags that could fail to strip indented code fences; use DOTALL only
- strip URL fragments (#section) before dispatching link checks to avoid
false-positive 404s on valid URLs with anchor fragments
* fix(check-scripts): fix anchor stripping, cross-ref enforcement, and mermaid temp file cleanup
- heading_to_anchor: use .strip("-") instead of .rstrip("-") to also strip leading hyphens
produced by emoji-prefixed headings, preventing false-positive anchor errors
- check_cross_references: always exit with main()'s return code — filesystem checks
should block pre-commit unconditionally, not silently pass on errors
- check_mermaid: wrap file-processing loop in try/finally so the puppeteer config
temp file is cleaned up even if an unexpected exception (e.g. UnicodeDecodeError) occurs
- docs-check.yml: remove now-unused CROSS_REF_STRICT env var
* fix(scripts): fix anchor stripping and mermaid output path
- Replace .strip('-') with .rstrip('-') in heading_to_anchor() so leading
hyphens from emoji-prefixed headings are preserved, matching GitHub's
anchor generation behaviour.
- Use Path.with_suffix('.svg') in check_mermaid.py instead of
str.replace('.mmd', '.svg') to avoid replacing all occurrences of .mmd
in the full temp path.
This commit is contained in:
@@ -0,0 +1,96 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Validate cross-references, anchors, and code fences in Markdown files."""
|
||||
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Directory names that exclude a Markdown file from checking when they appear
# as any component of its path.
IGNORE_DIRS = {
    # tooling / VCS
    ".git",
    ".venv",
    "node_modules",
    ".agents",
    # content trees that are not validated
    "blog-posts",
    "openspec",
    "prompts",
}

# File names that are skipped wherever they live.
IGNORE_FILES = {"README.backup.md"}
|
||||
|
||||
|
||||
def iter_md_files():
    """Yield each ``*.md`` file below the current directory that is not ignored.

    A file is skipped when its name is listed in ``IGNORE_FILES`` or when any
    component of its path is listed in ``IGNORE_DIRS``.
    """
    for md_path in Path().rglob("*.md"):
        if md_path.name in IGNORE_FILES:
            continue
        if not IGNORE_DIRS.isdisjoint(md_path.parts):
            continue
        yield md_path
|
||||
|
||||
|
||||
def heading_to_anchor(heading: str) -> str:
    """Turn a Markdown heading's text into the slug used for ``#anchor`` links.

    Steps, in order: drop non-ASCII characters (e.g. emoji), lowercase, remove
    every character that is not a word character, whitespace, or a hyphen,
    replace each space with a hyphen, then strip *trailing* hyphens only.
    A leading hyphen (such as the one left by an emoji prefix followed by a
    space) is kept.
    """
    ascii_only = heading.encode("ascii", "ignore").decode()
    without_punctuation = re.sub(r"[^\w\s-]", "", ascii_only.lower())
    slug = without_punctuation.replace(" ", "-")
    return slug.rstrip("-")
|
||||
|
||||
|
||||
def strip_code_blocks(content: str) -> str:
    """Return *content* with fenced code blocks and inline code spans removed.

    This keeps example links that appear inside code from being scanned.
    """
    # Fenced blocks first: an opening ``` line through the next ```.
    # DOTALL lets ".*?" run across line breaks.
    fenced_block = re.compile(r"```[^\n]*\n.*?```", flags=re.DOTALL)
    without_fences = fenced_block.sub("", content)

    # Then inline spans: backtick-delimited text that stays on a single line.
    inline_span = re.compile(r"`[^`\n]+`")
    return inline_span.sub("", without_fences)
|
||||
|
||||
|
||||
def main() -> int:
    """Validate Markdown cross-references and report problems.

    For every file yielded by ``iter_md_files()`` this checks that:

    * relative ``.md`` links resolve to an existing file,
    * in-page ``#anchor`` links match a heading of the same file,
    * the number of fence lines (lines starting with three backticks) is even.

    It then checks that every directory matching ``01-*`` .. ``10-*`` in the
    current directory contains a ``README.md``.

    Returns:
        1 after printing the list of problems when any were found, else 0.
    """
    errors = []
    md_count = 0  # counted during the scan so the tree is only walked once

    for file_path in iter_md_files():
        md_count += 1
        # Decode explicitly as UTF-8 rather than with the platform's locale
        # encoding, which may not be able to decode emoji or translated text.
        content = file_path.read_text(encoding="utf-8")
        # Strip code blocks before scanning for links/anchors to avoid false
        # positives from documentation examples inside code fences.
        scannable = strip_code_blocks(content)

        # Relative .md links must resolve. Targets containing "://" are
        # absolute URLs (e.g. https://…/README.md), not paths on disk, so they
        # are not checked against the filesystem.
        errors.extend(
            f"{file_path}: broken cross-reference → '{link_path}'"
            for link_path in re.findall(r"\[[^\]]+\]\(([^)#]+\.md)[^)]*\)", scannable)
            if "://" not in link_path
            and not (file_path.parent / link_path).resolve().exists()
        )

        # In-page anchors must match a real heading.
        anchors = re.findall(r"\[[^\]]+\]\(#([^)]+)\)", scannable)
        if anchors:
            # NOTE: headings are read from the unstripped content, so lines
            # starting with "#" inside code fences also count as headings.
            headings = re.findall(r"^#{1,6}\s+(.+)$", content, re.MULTILINE)
            valid_anchors = {heading_to_anchor(h) for h in headings}
            errors.extend(
                f"{file_path}: broken anchor → '#{anchor}'"
                for anchor in anchors
                if anchor not in valid_anchors
            )

        # Unmatched code fences (only count fences at start of line).
        if len(re.findall(r"^```", content, re.MULTILINE)) % 2 != 0:
            errors.append(f"{file_path}: unmatched code fences")

    # All numbered lesson dirs must have README.md.
    for i in range(1, 11):
        errors.extend(
            f"{d}: missing README.md"
            for d in Path().glob(f"{i:02d}-*")
            if d.is_dir() and not (d / "README.md").exists()
        )

    if errors:
        print("❌ Cross-reference errors:")
        for e in errors:
            print(f" - {e}")
        return 1

    print(f"✅ All cross-references valid ({md_count} files checked)")
    return 0
|
||||
|
||||
|
||||
# Script entry point: the process exit status is main()'s return value.
if __name__ == "__main__":
    exit_code = main()
    sys.exit(exit_code)
|
||||
@@ -0,0 +1,125 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Check external URLs in Markdown files are reachable."""
|
||||
|
||||
import re
|
||||
import sys
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from pathlib import Path
|
||||
|
||||
# Directory names that exclude a Markdown file from link checking when they
# appear as any component of its path.
IGNORE_DIRS = {
    ".git",
    ".venv",
    "node_modules",
    ".agents",
    ".claude",
    "blog-posts",
    "openspec",
    "prompts",
}

# Timeout passed to urlopen() for each request.
TIMEOUT = 10

# Domains/patterns to skip: badges, placeholders, and bot-blocking hosts.
SKIP_DOMAINS = {
    # Badge / widget hosts
    "shields.io",
    "img.shields.io",
    "star-history.com",
    "api.star-history.com",
    # Placeholder and local hosts
    "example.com",
    "my-webhook.example.com",
    "localhost",
    "127.0.0.1",
    "git.internal",
    # Wikipedia blocks HEAD requests — GET also unreliable in CI without network
    "wikipedia.org",
    "en.wikipedia.org",
    # GitHub API requires auth — unauthenticated requests return 404 for protected endpoints
    "api.github.com",
}

# Any host ending in one of these suffixes is skipped as well.
SKIP_DOMAIN_SUFFIXES = (".example.com", ".example.org", ".internal")

# Placeholder/template URLs that are intentionally non-resolvable; matched as
# substrings of the whole URL.
SKIP_URL_PATTERNS = {
    "github.com/org/",
    "github.com/user/",
    "github.com/your-org/",
    "docs.example.com",
}

# Matches http(s) URLs. The character class includes Markdown punctuation such
# as ")" and "*", so a match can carry trailing characters that are not part
# of the URL.
URL_RE = re.compile(r"https?://[a-zA-Z0-9][a-zA-Z0-9\-._~:/?#\[\]@!$&'()*+,;=%]+")
|
||||
|
||||
|
||||
def is_skipped(url: str) -> bool:
    """Return True when *url* should not be fetched by the link checker.

    A URL is skipped when:

    * it has no parseable host (treated as malformed),
    * its host equals, or is a subdomain of, an entry in ``SKIP_DOMAINS``,
    * its host ends with one of ``SKIP_DOMAIN_SUFFIXES``, or
    * the URL contains one of the ``SKIP_URL_PATTERNS`` substrings.

    The host is taken from ``urlsplit(url).hostname``, which excludes any
    ``user@`` prefix, ``:port`` suffix, query string or fragment. This lets
    URLs such as ``http://localhost:3000/`` match the ``localhost`` entry.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urlsplit

    try:
        host = urlsplit(url).hostname
    except ValueError:
        return True  # malformed URL (e.g. unbalanced "[" / "]" in the host)
    if not host:
        return True  # malformed URL

    if any(host == skip or host.endswith("." + skip) for skip in SKIP_DOMAINS):
        return True
    if any(host.endswith(suffix) for suffix in SKIP_DOMAIN_SUFFIXES):
        return True
    return any(pattern in url for pattern in SKIP_URL_PATTERNS)
|
||||
|
||||
|
||||
def check_url(url: str) -> tuple[str, bool, str]:
    """Probe *url* with a HEAD request and report whether it counts as reachable.

    Returns:
        A ``(url, ok, reason)`` tuple. ``ok`` is True when the URL is skipped
        by ``is_skipped()``, when the request succeeds, or when the server
        answers 401, 403, 405 or 429. Any other HTTP error, and any other
        exception, yields ``ok`` False with the failure described in
        ``reason``.
    """
    if is_skipped(url):
        return url, True, "skipped"

    # Statuses treated as "server is up": 403/429 often mean the server
    # blocks bots; 401 and 405 are tolerated in the same way.
    tolerated_statuses = (401, 403, 405, 429)

    try:
        head_request = urllib.request.Request(
            url, headers={"User-Agent": "Mozilla/5.0"}, method="HEAD"
        )
        with urllib.request.urlopen(head_request, timeout=TIMEOUT):  # nosec B310
            pass
    except urllib.error.HTTPError as http_err:
        status = http_err.code
        if status not in tolerated_statuses:
            return url, False, f"HTTP {status}"
        return url, True, f"http {status} (ignored)"
    except Exception as exc:
        return url, False, str(exc)
    else:
        return url, True, "ok"
|
||||
|
||||
|
||||
def main(strict: bool = False) -> int:
    """Collect external URLs from Markdown files and check each one once.

    Every ``*.md`` file below the current directory is scanned, except those
    with a path component listed in ``IGNORE_DIRS``. Each distinct URL is
    checked with ``check_url()`` on a thread pool.

    Args:
        strict: When True, dead links make the function return 1. When False
            they are printed but the function still returns 0.

    Returns:
        1 if dead links were found and ``strict`` is True, otherwise 0.
    """
    urls: dict[str, list[str]] = {}  # url -> [file, ...]

    md_files = [
        f
        for f in Path().rglob("*.md")
        if not any(part in IGNORE_DIRS for part in f.parts)
    ]

    for file_path in md_files:
        # Decode explicitly as UTF-8 rather than with the platform's locale
        # encoding, which may not be able to decode emoji or translated text.
        content = file_path.read_text(encoding="utf-8")
        source = str(file_path)
        for raw_url in URL_RE.findall(content):
            # Strip trailing Markdown/punctuation characters the regex may
            # over-capture from link syntax like [text](https://url/) or
            # **https://url)**, then drop any #fragment.
            clean_url = raw_url.rstrip(")>*_`':.,;").split("#")[0]
            sources = urls.setdefault(clean_url, [])
            # Files are processed one at a time, so comparing with the last
            # entry is enough to record each file once per URL and avoid
            # duplicate error lines.
            if not sources or sources[-1] != source:
                sources.append(source)

    if not urls:
        print("✅ No external URLs found")
        return 0

    errors = []
    with ThreadPoolExecutor(max_workers=10) as pool:
        futures = {pool.submit(check_url, url): url for url in urls}
        for future in as_completed(futures):
            url, ok, reason = future.result()
            if not ok:
                errors.extend(f"{f}: dead link → {url} ({reason})" for f in urls[url])

    if errors:
        print("❌ Dead links found:")
        for e in sorted(errors):
            print(f" - {e}")
        # In non-strict mode (pre-commit), report but don't block the commit.
        # Set LINK_CHECK_STRICT=1 (as CI does) to enforce failures.
        return 1 if strict else 0

    print(f"✅ All external URLs reachable ({len(urls)} checked)")
    return 0
|
||||
|
||||
|
||||
# Script entry point: strict mode is enabled only when the LINK_CHECK_STRICT
# environment variable is exactly "1".
if __name__ == "__main__":
    import os

    strict_mode = os.environ.get("LINK_CHECK_STRICT") == "1"
    sys.exit(main(strict=strict_mode))
|
||||
@@ -0,0 +1,86 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Validate Mermaid diagram syntax in Markdown files using mmdc."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess # nosec B404
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
# Directory names that exclude a Markdown file from Mermaid validation when
# they appear as any component of its path.
IGNORE_DIRS = {
    ".git",
    ".venv",
    "node_modules",
    ".agents",
    "blog-posts",
}
|
||||
|
||||
|
||||
def main() -> int:
    """Render every ```` ```mermaid ```` block with ``mmdc`` and report failures.

    Each block found in a ``*.md`` file below the current directory (except
    files with a path component listed in ``IGNORE_DIRS``) is written to a
    temporary ``.mmd`` file and passed to ``mmdc``. A non-zero exit status is
    recorded as an error for that block.

    When the ``MERMAID_PUPPETEER_NO_SANDBOX`` environment variable equals
    ``"true"``, a temporary puppeteer config with ``--no-sandbox`` arguments
    is written and passed to ``mmdc`` via ``-p``.

    Returns:
        0 when ``mmdc`` is not installed (validation is skipped) or every
        block rendered; 1 when at least one block failed.
    """
    if not shutil.which("mmdc"):
        print(
            "⚠ mmdc not found — skipping Mermaid validation (install @mermaid-js/mermaid-cli)"
        )
        return 0

    errors = []
    checked = 0

    # Discover files before creating any temp file so a failure here cannot
    # leave the puppeteer config behind.
    md_files = [
        f
        for f in Path().rglob("*.md")
        if not any(part in IGNORE_DIRS for part in f.parts)
    ]

    puppeteer_config_path = None
    extra_args: list[str] = []

    try:
        # On GitHub Actions Linux runners, Chrome/Puppeteer requires --no-sandbox.
        # Write a temporary puppeteer config when MERMAID_PUPPETEER_NO_SANDBOX is set.
        if os.environ.get("MERMAID_PUPPETEER_NO_SANDBOX") == "true":
            with tempfile.NamedTemporaryFile(
                suffix=".json", mode="w", delete=False, encoding="utf-8"
            ) as pcfg:
                # Record the path first so the outer finally removes the file
                # even if writing the JSON fails.
                puppeteer_config_path = pcfg.name
                json.dump({"args": ["--no-sandbox", "--disable-setuid-sandbox"]}, pcfg)
            extra_args = ["-p", puppeteer_config_path]

        for file_path in md_files:
            # Decode/encode explicitly as UTF-8 rather than with the locale
            # encoding, which may not handle non-ASCII diagram text.
            content = file_path.read_text(encoding="utf-8")
            blocks = re.findall(r"```mermaid\n(.*?)```", content, re.DOTALL)
            for i, block in enumerate(blocks):
                with tempfile.NamedTemporaryFile(
                    suffix=".mmd", mode="w", delete=False, encoding="utf-8"
                ) as tmp:
                    tmp.write(block)
                    tmp_path = tmp.name
                out_path = str(Path(tmp_path).with_suffix(".svg"))
                try:
                    result = subprocess.run(  # nosec B603 B607
                        ["mmdc", "-i", tmp_path, "-o", out_path, *extra_args],
                        capture_output=True,
                        text=True,
                        # Undecodable bytes in mmdc's output must not crash
                        # the checker.
                        errors="replace",
                        check=False,
                    )
                    if result.returncode != 0:
                        # Fall back to stdout when stderr is empty so the
                        # report is not blank.
                        detail = (result.stderr or result.stdout).strip()
                        errors.append(f"{file_path} (block {i + 1}): {detail}")
                    else:
                        checked += 1
                finally:
                    Path(tmp_path).unlink(missing_ok=True)
                    Path(out_path).unlink(missing_ok=True)
    finally:
        if puppeteer_config_path:
            Path(puppeteer_config_path).unlink(missing_ok=True)

    print(f"✅ Checked {checked} Mermaid diagram(s)")
    if errors:
        print("\n❌ Mermaid errors:")
        for e in errors:
            print(f" - {e}")
        return 1

    return 0
|
||||
|
||||
|
||||
# Script entry point: the process exit status is main()'s return value.
if __name__ == "__main__":
    exit_code = main()
    sys.exit(exit_code)
|
||||
Reference in New Issue
Block a user