# CVEs-PoC/scripts/build_site.py
from __future__ import annotations

import argparse
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import urlparse

from jinja2 import Environment, FileSystemLoader, select_autoescape

from utils import (
    DOCS_DIR,
    TEMPLATES_DIR,
    ensure_dirs,
    load_blacklist,
    parse_trending_from_readme,
    is_blacklisted_repo,
)

ROOT = DOCS_DIR.parent
README_PATH = ROOT / "README.md"
CVE_OUTPUT = DOCS_DIR / "CVE_list.json"
REMOVED_OUTPUT = DOCS_DIR / "CVE_blacklist_removed.json"
TRENDING_OUTPUT = DOCS_DIR / "trending_poc.json"


def build_env() -> Environment:
    """Create the Jinja2 environment used to render the HTML pages."""
    loader = FileSystemLoader(str(TEMPLATES_DIR))
    env = Environment(loader=loader, autoescape=select_autoescape(["html", "xml"]))
    env.trim_blocks = True
    env.lstrip_blocks = True
    return env


def render(env: Environment, template_name: str, context: Dict, output_path: Path) -> None:
    """Render template_name with the given context and write the result to output_path."""
    html = env.get_template(template_name).render(**context)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(html, encoding="utf-8")


def normalise_block(text: str) -> str:
    """Collapse a markdown block into one trimmed, non-empty line per entry."""
    text = text.replace("\r\n", "\n")
    text = re.sub(r"\n{2,}", "\n", text.strip())
    # lstrip("- ") strips any leading run of '-' and ' ' characters, which
    # removes markdown list markers (and would also eat a dash-led value).
    lines = [line.lstrip("- ").rstrip() for line in text.split("\n")]
    return "\n".join(line for line in lines if line)
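
# Illustrative example (hypothetical input, not part of the build):
#   normalise_block("- first link\n\n- second link\r\n")
#   -> "first link\nsecond link"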


def parse_sections(content: str) -> Dict[str, str]:
    """Split markdown content into {header line: body text} for ### and #### headers."""
    sections: Dict[str, str] = {}
    current_header: Optional[str] = None
    buffer: List[str] = []
    for line in content.splitlines():
        header = line.strip()
        if header.startswith("### ") or header.startswith("#### "):
            if current_header is not None:
                sections[current_header] = "\n".join(buffer).strip()
            current_header = header
            buffer = []
        else:
            buffer.append(line)
    if current_header is not None:
        sections[current_header] = "\n".join(buffer).strip()
    return sections
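
# Illustrative example (hypothetical input): parsing
# "### Description\ntext\n#### Github\n- url" yields
# {"### Description": "text", "#### Github": "- url"}; any text before the
# first header is discarded.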


def repo_from_url(url: str) -> str:
    """Return the lower-cased repository name from a GitHub URL ("" for non-GitHub hosts)."""
    try:
        parsed = urlparse(url)
        host = (parsed.netloc or "").lower()
        if host and "github" not in host:
            return ""
        path = parsed.path or url
    except Exception:
        path = url
    parts = path.strip("/").split("/")
    if len(parts) >= 2:
        return parts[1].lower()
    return (parts[-1] if parts else "").lower()
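
# Illustrative examples (hypothetical URLs):
#   repo_from_url("https://github.com/owner/CVE-2024-0001-poc") -> "cve-2024-0001-poc"
#   repo_from_url("https://example.com/owner/repo") -> "" (non-GitHub host)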


def is_blacklisted(url: str, blacklist: List[str]) -> bool:
    """Check a URL's repo name against the blacklist; a trailing '*' is a prefix wildcard."""
    repo = repo_from_url(url)
    if not repo:
        return False
    for entry in blacklist:
        slug = entry.lower()
        if not slug:
            continue
        if slug.endswith("*"):
            if repo.startswith(slug[:-1]):
                return True
        elif repo == slug:
            return True
    return False
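
# Illustrative example (hypothetical blacklist): with ["bad-repo", "spam-*"],
# URLs ending in /owner/bad-repo or /owner/spam-anything match, while
# /owner/good-repo does not.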


def collect_links(block: str, *, blacklist: Optional[List[str]] = None, removed: Optional[List[str]] = None) -> List[str]:
    """Collect unique, non-blacklisted links from a markdown block.

    Blacklisted links are appended to ``removed`` when a list is supplied.
    """
    links: List[str] = []
    blacklist = blacklist or []
    if removed is None:
        removed = []
    for raw in block.splitlines():
        entry = raw.strip()
        if not entry or "No PoCs" in entry:
            continue
        if entry.startswith("- "):
            entry = entry[2:].strip()
        if not entry:
            continue
        if is_blacklisted(entry, blacklist):
            removed.append(entry)
            continue
        if entry not in links:
            links.append(entry)
    return links
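
# Illustrative example (hypothetical block): given
# "- https://a\n- https://a\nNo PoCs found", the duplicate link is kept once
# and the "No PoCs" placeholder line is skipped.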


def build_cve_list(blacklist: List[str]) -> Dict[str, object]:
    """Scan the per-year CVE markdown files and build the site payload."""
    cve_entries = []
    removed_by_cve: Dict[str, List[str]] = {}
    removed_seen: set[str] = set()
    for md_path in sorted(ROOT.glob("[12][0-9][0-9][0-9]/CVE-*.md")):
        content = md_path.read_text(encoding="utf-8")
        sections = parse_sections(content)
        description = normalise_block(sections.get("### Description", ""))
        removed_links: List[str] = []
        references = collect_links(sections.get("#### Reference", ""), blacklist=blacklist, removed=removed_links)
        github_links = collect_links(sections.get("#### Github", ""), blacklist=blacklist, removed=removed_links)
        # Merge references and GitHub links, preserving order and dropping duplicates.
        poc_entries: List[str] = []
        seen = set()
        for link in references + github_links:
            if link not in seen:
                poc_entries.append(link)
                seen.add(link)
        cve_id = md_path.stem
        if removed_links:
            removed_by_cve[cve_id] = sorted(set(removed_links))
            removed_seen.update(removed_links)
        if not poc_entries:
            continue
        cve_entries.append({
            "cve": cve_id,
            "desc": description,
            "poc": poc_entries,
        })
    return {
        "entries": cve_entries,
        "removed": {
            "removed": sorted(removed_seen),
            "by_cve": removed_by_cve,
        },
    }
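
# The returned payload has this shape (values illustrative):
# {
#     "entries": [{"cve": "CVE-...", "desc": "...", "poc": ["https://..."]}],
#     "removed": {"removed": [...], "by_cve": {"CVE-...": [...]}},
# }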


def build_trending(blacklist: List[str]) -> List[Dict[str, object]]:
    """Parse trending PoC rows from the README and return a single year's items."""
    rows = parse_trending_from_readme(README_PATH)
    if not rows:
        return []
    by_year: Dict[int, List[Dict[str, object]]] = {}
    for row in rows:
        year_text = row.get("year") or ""
        if not str(year_text).isdigit():
            continue
        year = int(year_text)
        url = (row.get("url") or "").strip()
        if url and is_blacklisted_repo(url, blacklist):
            continue
        # Keep only the digits of the star count (e.g. drop separators).
        stars_text = str(row.get("stars") or "").strip()
        stars = int(re.sub(r"\D", "", stars_text) or 0)
        item = {
            "year": year,
            "stars": stars,
            "updated": (row.get("updated") or "").strip(),
            "name": (row.get("name") or "").strip(),
            "url": url,
            "desc": (row.get("desc") or "").strip(),
        }
        by_year.setdefault(year, []).append(item)
    if not by_year:
        return []
    # Prefer the current UTC year; fall back to the latest year present.
    current_year = datetime.now(timezone.utc).year
    target_year = current_year if current_year in by_year else max(by_year)
    return by_year.get(target_year, [])
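
# Illustrative: if the README carries rows for 2023 and 2024 but the current
# UTC year is 2025, the 2024 rows are returned.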


def write_json(path: Path, data, *, indent: Optional[int] = None) -> None:
    """Serialise data as UTF-8 JSON, creating parent directories as needed."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        json.dump(data, handle, ensure_ascii=False, indent=indent)


def main() -> int:
    """Generate the JSON payloads and, optionally, the HTML pages under docs/."""
    parser = argparse.ArgumentParser(description="Build CVE PoC site")
    parser.add_argument(
        "--html-mode",
        choices=["none", "summary", "all"],
        default="summary",
        help="Render HTML output; 'none' skips rendering.",
    )
    args = parser.parse_args()
    ensure_dirs(DOCS_DIR)
    blacklist = load_blacklist()

    cve_payload = build_cve_list(blacklist)
    write_json(CVE_OUTPUT, cve_payload["entries"])
    write_json(REMOVED_OUTPUT, cve_payload["removed"], indent=2)

    trending_items = build_trending(blacklist)
    write_json(
        TRENDING_OUTPUT,
        {
            "generated": datetime.now(timezone.utc).isoformat(),
            "items": trending_items,
        },
        indent=2,
    )

    # Note: "summary" and "all" currently render the same index page.
    if args.html_mode != "none":
        env = build_env()
        render(env, "index.html", {"trending": trending_items}, DOCS_DIR / "index.html")
    print("Site generated under docs/")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
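
# Typical invocation (the scripts/ path is assumed from the repo layout):
#   python scripts/build_site.py --html-mode summary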