# CVEs-PoC/scripts/build_site.py
from __future__ import annotations

import argparse
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import urlparse

from jinja2 import Environment, FileSystemLoader, select_autoescape

from utils import (
    DOCS_DIR,
    TEMPLATES_DIR,
    ensure_dirs,
    load_blacklist,
    parse_trending_from_readme,
    is_blacklisted_repo,
)

ROOT = DOCS_DIR.parent
README_PATH = ROOT / "README.md"
CVE_OUTPUT = DOCS_DIR / "CVE_list.json"
REMOVED_OUTPUT = DOCS_DIR / "CVE_blacklist_removed.json"
TRENDING_OUTPUT = DOCS_DIR / "trending_poc.json"


def build_env() -> Environment:
    """Create the Jinja2 environment used to render the HTML pages."""
    loader = FileSystemLoader(str(TEMPLATES_DIR))
    env = Environment(loader=loader, autoescape=select_autoescape(["html", "xml"]))
    env.trim_blocks = True
    env.lstrip_blocks = True
    return env


def render(env: Environment, template_name: str, context: Dict, output_path: Path) -> None:
    """Render template_name with the given context and write the result to output_path."""
    html = env.get_template(template_name).render(**context)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(html, encoding="utf-8")


def normalise_block(text: str) -> str:
    """Collapse a markdown block into one trimmed, non-empty line per entry."""
    text = text.replace("\r\n", "\n")
    text = re.sub(r"\n{2,}", "\n", text.strip())
    # lstrip("- ") strips any leading run of '-' and ' ' characters, which
    # removes markdown list markers (and would also eat a dash-led value).
    lines = [line.lstrip("- ").rstrip() for line in text.split("\n")]
    return "\n".join(line for line in lines if line)
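
# Illustrative example (hypothetical input, not part of the build):
#   normalise_block("- first link\n\n- second link\r\n")
#   -> "first link\nsecond link"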


def parse_sections(content: str) -> Dict[str, str]:
    """Split markdown content into {header line: body text} for ### and #### headers."""
    sections: Dict[str, str] = {}
    current_header: Optional[str] = None
    buffer: List[str] = []
    for line in content.splitlines():
        header = line.strip()
        if header.startswith("### ") or header.startswith("#### "):
            if current_header is not None:
                sections[current_header] = "\n".join(buffer).strip()
            current_header = header
            buffer = []
        else:
            buffer.append(line)
    if current_header is not None:
        sections[current_header] = "\n".join(buffer).strip()
    return sections
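
# Illustrative example (hypothetical input): parsing
# "### Description\ntext\n#### Github\n- url" yields
# {"### Description": "text", "#### Github": "- url"}; any text before the
# first header is discarded.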


def repo_from_url(url: str) -> str:
    """Return the lower-cased repository name from a GitHub URL ("" for non-GitHub hosts)."""
    try:
        parsed = urlparse(url)
        host = (parsed.netloc or "").lower()
        if host and "github" not in host:
            return ""
        path = parsed.path or url
    except Exception:
        path = url
    parts = path.strip("/").split("/")
    if len(parts) >= 2:
        return parts[1].lower()
    return (parts[-1] if parts else "").lower()
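
# Illustrative examples (hypothetical URLs):
#   repo_from_url("https://github.com/owner/CVE-2024-0001-poc") -> "cve-2024-0001-poc"
#   repo_from_url("https://example.com/owner/repo") -> "" (non-GitHub host)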


def is_blacklisted(url: str, blacklist: List[str]) -> bool:
    """Check a URL's repo name against the blacklist; a trailing '*' is a prefix wildcard."""
    repo = repo_from_url(url)
    if not repo:
        return False
    for entry in blacklist:
        slug = entry.lower()
        if not slug:
            continue
        if slug.endswith("*"):
            if repo.startswith(slug[:-1]):
                return True
        elif repo == slug:
            return True
    return False
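
# Illustrative example (hypothetical blacklist): with ["bad-repo", "spam-*"],
# URLs ending in /owner/bad-repo or /owner/spam-anything match, while
# /owner/good-repo does not.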


def collect_links(block: str, *, blacklist: Optional[List[str]] = None, removed: Optional[List[str]] = None) -> List[str]:
    """Collect unique, non-blacklisted links from a markdown block.

    Blacklisted links are appended to ``removed`` when a list is supplied.
    """
    links: List[str] = []
    blacklist = blacklist or []
    if removed is None:
        removed = []
    for raw in block.splitlines():
        entry = raw.strip()
        if not entry or "No PoCs" in entry:
            continue
        if entry.startswith("- "):
            entry = entry[2:].strip()
        if not entry:
            continue
        if is_blacklisted(entry, blacklist):
            removed.append(entry)
            continue
        if entry not in links:
            links.append(entry)
    return links
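
# Illustrative example (hypothetical block): given
# "- https://a\n- https://a\nNo PoCs found", the duplicate link is kept once
# and the "No PoCs" placeholder line is skipped.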


def build_cve_list(blacklist: List[str]) -> Dict[str, object]:
    """Scan the per-year CVE markdown files and build the site payload."""
    cve_entries = []
    removed_by_cve: Dict[str, List[str]] = {}
    removed_seen: set[str] = set()
    for md_path in sorted(ROOT.glob("[12][0-9][0-9][0-9]/CVE-*.md")):
        content = md_path.read_text(encoding="utf-8")
        sections = parse_sections(content)
        description = normalise_block(sections.get("### Description", ""))
        removed_links: List[str] = []
        references = collect_links(sections.get("#### Reference", ""), blacklist=blacklist, removed=removed_links)
        github_links = collect_links(sections.get("#### Github", ""), blacklist=blacklist, removed=removed_links)
        # Merge references and GitHub links, preserving order and dropping duplicates.
        poc_entries: List[str] = []
        seen = set()
        for link in references + github_links:
            if link not in seen:
                poc_entries.append(link)
                seen.add(link)
        cve_id = md_path.stem
        if removed_links:
            removed_by_cve[cve_id] = sorted(set(removed_links))
            removed_seen.update(removed_links)
        if not poc_entries:
            continue
        cve_entries.append({
            "cve": cve_id,
            "desc": description,
            "poc": poc_entries,
        })
    return {
        "entries": cve_entries,
        "removed": {
            "removed": sorted(removed_seen),
            "by_cve": removed_by_cve,
        },
    }
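
# The returned payload has this shape (values illustrative):
# {
#     "entries": [{"cve": "CVE-...", "desc": "...", "poc": ["https://..."]}],
#     "removed": {"removed": [...], "by_cve": {"CVE-...": [...]}},
# }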


def build_trending(blacklist: List[str]) -> List[Dict[str, object]]:
    """Parse trending PoC rows from the README and return a single year's items."""
    rows = parse_trending_from_readme(README_PATH)
    if not rows:
        return []
    by_year: Dict[int, List[Dict[str, object]]] = {}
    for row in rows:
        year_text = row.get("year") or ""
        if not str(year_text).isdigit():
            continue
        year = int(year_text)
        url = (row.get("url") or "").strip()
        if url and is_blacklisted_repo(url, blacklist):
            continue
        # Keep only the digits of the star count (e.g. drop separators).
        stars_text = str(row.get("stars") or "").strip()
        stars = int(re.sub(r"\D", "", stars_text) or 0)
        item = {
            "year": year,
            "stars": stars,
            "updated": (row.get("updated") or "").strip(),
            "name": (row.get("name") or "").strip(),
            "url": url,
            "desc": (row.get("desc") or "").strip(),
        }
        by_year.setdefault(year, []).append(item)
    if not by_year:
        return []
    # Prefer the current UTC year; fall back to the latest year present.
    current_year = datetime.now(timezone.utc).year
    target_year = current_year if current_year in by_year else max(by_year)
    return by_year.get(target_year, [])
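
# Illustrative: if the README carries rows for 2023 and 2024 but the current
# UTC year is 2025, the 2024 rows are returned.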


def write_json(path: Path, data, *, indent: Optional[int] = None) -> None:
    """Serialise data as UTF-8 JSON, creating parent directories as needed."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        json.dump(data, handle, ensure_ascii=False, indent=indent)


def main() -> int:
    """Generate the JSON payloads and, optionally, the HTML pages under docs/."""
    parser = argparse.ArgumentParser(description="Build CVE PoC site")
    parser.add_argument(
        "--html-mode",
        choices=["none", "summary", "all"],
        default="summary",
        help="Render HTML output; 'none' skips rendering.",
    )
    args = parser.parse_args()
    ensure_dirs(DOCS_DIR)
    blacklist = load_blacklist()

    cve_payload = build_cve_list(blacklist)
    write_json(CVE_OUTPUT, cve_payload["entries"])
    write_json(REMOVED_OUTPUT, cve_payload["removed"], indent=2)

    trending_items = build_trending(blacklist)
    write_json(
        TRENDING_OUTPUT,
        {
            "generated": datetime.now(timezone.utc).isoformat(),
            "items": trending_items,
        },
        indent=2,
    )

    # Note: "summary" and "all" currently render the same index page.
    if args.html_mode != "none":
        env = build_env()
        render(env, "index.html", {"trending": trending_items}, DOCS_DIR / "index.html")
    print("Site generated under docs/")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
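
# Typical invocation (the scripts/ path is assumed from the repo layout):
#   python scripts/build_site.py --html-mode summary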