Files
CVEs-PoC/docs/generate_cve_list.py

169 lines
5.1 KiB
Python

#!/usr/bin/python3
import json
import os
import re
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from urllib.parse import urlparse
# Repository root: the parent of the directory that contains this script.
ROOT = Path(__file__).resolve().parent.parent
# Generated lookup file, written next to this script.
OUTPUT = Path(__file__).resolve().parent / "CVE_list.json"
# Report of the links that were dropped because of the blacklist, written
# next to this script as well.
REMOVED_OUTPUT = Path(__file__).resolve().parent / "CVE_blacklist_removed.json"
# Optional blacklist file, looked up in the repository root.
BLACKLIST = ROOT.joinpath("blacklist.txt")
def load_blacklist(path: Path = BLACKLIST) -> List[str]:
    """Read the blacklist entries stored in *path*.

    Each line is stripped of surrounding whitespace; blank lines and lines
    starting with ``#`` are ignored.

    Args:
        path: Text file (UTF-8) holding one entry per line.

    Returns:
        The remaining entries in file order, or an empty list when *path*
        does not exist.
    """
    if not path.exists():
        return []
    stripped_lines = (
        line.strip() for line in path.read_text(encoding="utf-8").splitlines()
    )
    return [line for line in stripped_lines if line and not line.startswith("#")]
def repo_from_url(url: str) -> str:
    """Return the lower-cased repository name referenced by *url*.

    The second path segment is taken as the repository name
    (``/owner/repo/...`` -> ``repo``). When the path has a single segment,
    that segment is returned instead.

    Args:
        url: A link or a bare ``owner/repo`` style string.

    Returns:
        The lower-cased name, or ``""`` when the URL has a host that does
        not contain ``"github"`` (or when no name can be extracted).
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse rejects malformed authorities (e.g. an unbalanced "[" in
        # the host); fall back to treating the raw text as a path.
        path = url
    else:
        host = (parsed.netloc or "").lower()
        if host and "github" not in host:
            return ""
        path = parsed.path or url
    # str.split always yields at least one element, so parts[0] is safe.
    parts = path.strip("/").split("/")
    if len(parts) >= 2:
        return parts[1].lower()
    return parts[0].lower()
def is_blacklisted(url: str, blacklist: List[str]) -> bool:
    """Tell whether the repository named in *url* matches a blacklist entry.

    Matching is case-insensitive. An entry ending in ``*`` matches every
    repository name that starts with the text before the ``*``; any other
    entry must equal the repository name exactly. Empty entries are ignored.

    Args:
        url: Link whose repository name is extracted with ``repo_from_url``.
        blacklist: Entries to compare against.

    Returns:
        ``True`` on the first matching entry; ``False`` when nothing matches
        or when no repository name could be extracted from *url*.
    """
    repo = repo_from_url(url)
    if not repo:
        return False

    def matches(pattern: str) -> bool:
        # A trailing "*" turns the entry into a prefix pattern.
        if pattern.endswith("*"):
            return repo.startswith(pattern[:-1])
        return repo == pattern

    lowered = (item.lower() for item in blacklist)
    return any(matches(pattern) for pattern in lowered if pattern)
def normalise_block(text: str) -> str:
    """Flatten a markdown block into clean, newline-separated lines.

    ``\\r\\n`` line endings are converted to ``\\n``, every line is stripped
    of surrounding whitespace, a single leading ``"- "`` list marker is
    removed, and blank lines (including bare ``-`` markers) are dropped.

    Only the bullet marker itself is removed: text that merely begins with a
    hyphen (``-1 is returned``, ``--option``) is kept intact.

    Args:
        text: Raw section text.

    Returns:
        The cleaned lines joined with ``\\n``; ``""`` for blank input.
    """
    cleaned: List[str] = []
    for raw in text.replace("\r\n", "\n").split("\n"):
        line = raw.strip()
        if line == "-":
            # Bullet marker without any content.
            continue
        if line.startswith("- "):
            # Slice off the marker as a prefix; str.lstrip("- ") would treat
            # the argument as a character set and eat leading hyphens that
            # belong to the content.
            line = line[2:].lstrip()
        if line:
            cleaned.append(line)
    return "\n".join(cleaned)
def parse_sections(content: str) -> Dict[str, str]:
    """Split markdown *content* into sections keyed by their header line.

    A line whose stripped form starts with ``"### "`` or ``"#### "`` opens a
    new section; the stripped header line is the key. The value is the text
    up to the next header, joined with ``\\n`` and stripped. Lines before the
    first header are discarded, and a repeated header keeps the last body.

    Args:
        content: Full markdown text.

    Returns:
        Mapping from header line to section body.
    """
    chunks: List[Tuple[str, List[str]]] = []
    for raw in content.splitlines():
        stripped = raw.strip()
        if stripped.startswith(("### ", "#### ")):
            chunks.append((stripped, []))
        elif chunks:
            # Body lines are stored unstripped; only the joined body is
            # stripped below.
            chunks[-1][1].append(raw)
    return {title: "\n".join(body).strip() for title, body in chunks}
def collect_links(block: str, *, blacklist: Optional[List[str]] = None, removed: Optional[List[str]] = None) -> List[str]:
    """Extract the unique, non-blacklisted entries listed in *block*.

    Every line is stripped and a leading ``"- "`` bullet is removed. Empty
    lines and lines containing ``"No PoCs"`` are skipped.

    Args:
        block: Section text with one entry per line.
        blacklist: Entries passed to ``is_blacklisted``; ``None`` or an empty
            list disables filtering.
        removed: Optional list that receives every blacklisted entry (one
            append per occurrence). It is mutated in place.

    Returns:
        The kept entries in first-seen order, without duplicates.
    """
    deny_list = blacklist if blacklist else []
    dropped = [] if removed is None else removed
    # A dict keeps insertion order and gives de-duplication for free.
    kept: Dict[str, None] = {}
    for line in block.splitlines():
        candidate = line.strip()
        if candidate.startswith("- "):
            candidate = candidate[2:].strip()
        if not candidate or "No PoCs" in candidate:
            continue
        if is_blacklisted(candidate, deny_list):
            dropped.append(candidate)
        else:
            kept.setdefault(candidate, None)
    return list(kept)
def main() -> None:
    """Build the CVE lookup JSON and the blacklist-removal report.

    Walks every digit-named directory under ``ROOT`` (newest first), reads
    each ``*.md`` file in it, and extracts the ``### Description``,
    ``#### Reference`` and ``#### Github`` sections. Links matching the
    blacklist are filtered out and recorded separately.

    Writes:
        ``OUTPUT``: a JSON list of ``{"cve", "desc", "poc"}`` objects, one
        per CVE file that still has at least one link.
        ``REMOVED_OUTPUT``: a JSON object with the sorted set of removed
        links (``"removed"``) and the removed links per CVE (``"by_cve"``).
    """
    blacklist = load_blacklist()
    cve_entries = []
    removed_by_cve: Dict[str, List[str]] = {}
    removed_seen = set()

    # Only digit-named *directories* are year folders; a stray regular file
    # with a numeric name would otherwise make os.listdir() raise.
    years = sorted(
        (
            name
            for name in os.listdir(ROOT)
            if name.isdigit() and (ROOT / name).is_dir()
        ),
        reverse=True,
    )

    for year in years:
        year_dir = ROOT / year
        for filename in sorted(os.listdir(year_dir)):
            if not filename.endswith(".md"):
                continue

            content = (year_dir / filename).read_text(encoding="utf-8")
            sections = parse_sections(content)
            description = normalise_block(sections.get("### Description", ""))

            removed_links: List[str] = []
            references = collect_links(
                sections.get("#### Reference", ""),
                blacklist=blacklist,
                removed=removed_links,
            )
            github_links = collect_links(
                sections.get("#### Github", ""),
                blacklist=blacklist,
                removed=removed_links,
            )
            # Merge both sections, keeping first-seen order without repeats.
            poc_entries = list(dict.fromkeys(references + github_links))

            # Drop only the trailing extension; str.replace() would also
            # remove ".md" occurring elsewhere in the file name.
            cve_id = filename[: -len(".md")]

            if removed_links:
                removed_by_cve[cve_id] = sorted(set(removed_links))
                removed_seen.update(removed_links)

            # Skip CVEs with zero PoCs (both sections empty) to keep lookup clean
            if not poc_entries:
                continue

            cve_entries.append({
                "cve": cve_id,
                "desc": description,
                "poc": poc_entries,
            })

    with open(OUTPUT, "w", encoding="utf-8") as outfile:
        json.dump(cve_entries, outfile, ensure_ascii=False)

    with open(REMOVED_OUTPUT, "w", encoding="utf-8") as removed_file:
        json.dump(
            {
                "removed": sorted(removed_seen),
                "by_cve": removed_by_cve,
            },
            removed_file,
            ensure_ascii=False,
            indent=2,
        )

    print("CVE list saved to CVE_list.json")
# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()