Add PoC pipeline with blacklist filtering and Pages build

This commit is contained in:
0xMarcio
2025-12-17 15:53:37 +01:00
parent b1085c10f5
commit 1f0cd8e78b
20 changed files with 188921 additions and 56 deletions

.github/workflows/build.yml (vendored, new file, 69 lines)

@@ -0,0 +1,69 @@
name: Build pipeline + Pages
on:
schedule:
- cron: "15 5 * * *"
workflow_dispatch:
permissions:
contents: read
pages: write
id-token: write
concurrency:
group: pages
cancel-in-progress: false
jobs:
build:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.12"
- name: Cache dependencies and API cache
uses: actions/cache@v4
with:
path: |
~/.cache/pip
data/cache
key: ${{ runner.os }}-cve-pipeline-${{ hashFiles('requirements.txt') }}
restore-keys: |
${{ runner.os }}-cve-pipeline-
- name: Install requirements
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Build pipeline outputs + site
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: python scripts/build_all.py --days 7
- name: Validate JSON index
run: python -m json.tool docs/api/v1/index.json > /dev/null
- name: Configure Pages
uses: actions/configure-pages@v5
- name: Upload artifact
uses: actions/upload-pages-artifact@v3
with:
path: docs
deploy:
needs: build
runs-on: ubuntu-latest
environment:
name: github-pages
url: ${{ steps.deploy.outputs.page_url }}
steps:
- name: Deploy to GitHub Pages
id: deploy
uses: actions/deploy-pages@v4


@@ -1,5 +1,7 @@
<h1 align="center">Recently updated Proof-of-Concepts</h1>
> Live API + site: `/api/v1/index.json` and `/api/v1/top/today.json`, plus the GitHub Pages site, are generated by `python scripts/build_all.py`, the entry point for the new GitHub PoC discovery and scoring pipeline.
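As a rough consumer-side sketch of the published index (the Pages base URL below is a placeholder; the field names follow `build_index` in `scripts/pipeline_outputs.py`):

```python
import json
from urllib.request import urlopen

# Placeholder base URL: substitute the real GitHub Pages origin for this repo.
BASE = "https://example.github.io/cve"

with urlopen(f"{BASE}/api/v1/index.json") as resp:
    index = json.load(resp)

for item in index["items"][:10]:
    print(item["cve_id"], item["poc_count"], item["high_confidence"], item.get("top_languages"))
```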
## 2025
@@ -138,4 +140,4 @@
| 312⭐ | 4 days ago | [CVE-2021-26084_Confluence](https://github.com/hev0x/CVE-2021-26084_Confluence) | Confluence Server Webwork OGNL injection |
| 328⭐ | 6 days ago | [CVE-2021-1675-LPE](https://github.com/hlldz/CVE-2021-1675-LPE) | Local Privilege Escalation Edition for CVE-2021-1675/CVE-2021-34527 |
| 233⭐ | 92 days ago | [CVE-2021-38647](https://github.com/horizon3ai/CVE-2021-38647) | Proof on Concept Exploit for CVE-2021-38647 (OMIGOD) |
| 235⭐ | 15 days ago | [CVE-2021-24086](https://github.com/0vercl0k/CVE-2021-24086) | Proof of concept for CVE-2021-24086, a NULL dereference in tcpip.sys triggered remotely. |

docs/CVE_blacklist_removed.json (new file, 187191 lines)

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

docs/assets/app.js (new file, 54 lines)

@@ -0,0 +1,54 @@
(function() {
const qs = (sel) => document.querySelector(sel);
const initIndexSearch = () => {
const input = document.querySelector("[data-index-search]");
if (!input) return;
const targetSel = input.getAttribute("data-target");
const target = targetSel ? qs(targetSel) : null;
const indexUrl = input.getAttribute("data-index-url");
if (!target || !indexUrl) return;
let cached = [];
fetch(indexUrl)
.then((resp) => resp.json())
.then((data) => { cached = data.items || []; })
.catch(() => { target.innerHTML = "<p class='muted small'>Index unavailable.</p>"; });
const render = (term) => {
if (!cached.length) return;
const value = term.trim().toLowerCase();
const results = cached.filter((row) => {
if (!value) return false;
return row.cve_id.toLowerCase().includes(value) ||
(row.top_languages || []).join(" ").toLowerCase().includes(value) ||
String(row.max_score || "").includes(value);
}).slice(0, 40);
if (!results.length) {
target.innerHTML = "<p class='muted small'>No matches yet.</p>";
return;
}
target.innerHTML = results.map((row) => {
const langs = (row.top_languages || []).map((lang) => `<span class="pill tiny">${lang}</span>`).join(" ");
return `<article class="card">
<div class="card-title"><a href="/cve/${row.cve_id}.html">${row.cve_id}</a></div>
<div class="meta-row">
<span class="pill tier-high">${row.high_confidence} high</span>
<span class="pill tier-medium">${row.medium_confidence} med</span>
<span class="pill">${row.poc_count} PoCs</span>
</div>
<div class="muted small">Max score ${row.max_score || 0}</div>
<div class="pill-row">${langs}</div>
</article>`;
}).join("");
};
input.addEventListener("input", (e) => render(e.target.value));
};
document.addEventListener("DOMContentLoaded", () => {
initIndexSearch();
});
})();


@@ -1,54 +1,97 @@
:root {
--bg: #0b0c10;
--panel: #11131a;
--text: #e5e8f0;
--muted: #9aa3b5;
--accent: #5ad4e6;
--warn: #f6c177;
--border: #1f2430;
--shadow: 0 10px 30px rgba(0,0,0,0.35);
font-family: "Inter", system-ui, -apple-system, sans-serif;
--bg: #05070d;
--panel: #0d1020;
--panel-2: #11162b;
--text: #f3f4ff;
--muted: #8fa2c8;
--accent: #7ef1d3;
--accent-2: #5bc0eb;
--warn: #ffb86c;
--success: #6ef2a6;
--border: #1f2742;
--shadow: 0 18px 45px rgba(0,0,0,0.35);
font-family: "Space Grotesk", "Inter", "Helvetica Neue", system-ui, sans-serif;
line-height: 1.55;
}
* { box-sizing: border-box; }
body { margin:0; background: var(--bg); color: var(--text); }
body {
margin: 0;
background: radial-gradient(circle at 20% 20%, rgba(91,192,235,0.08), transparent 25%), radial-gradient(circle at 80% 0%, rgba(126,241,211,0.08), transparent 23%), var(--bg);
color: var(--text);
}
a { color: var(--accent); text-decoration: none; }
a:hover { text-decoration: underline; }
code { background: rgba(255,255,255,0.04); padding: 2px 6px; border-radius: 6px; color: var(--accent-2); }
.wrap { width: min(1100px, 95vw); margin: 0 auto; padding: 1.5rem 0; }
.site-header { background: var(--panel); border-bottom: 1px solid var(--border); position: sticky; top:0; z-index:10; box-shadow: var(--shadow); }
.site-header .wrap { display:flex; align-items:center; justify-content: space-between; padding: 0.9rem 0; }
.brand a { font-weight: 700; letter-spacing: 0.5px; }
nav a { margin-left: 1rem; color: var(--text); opacity: 0.85; }
nav a:hover { opacity: 1; }
.wrap { width: min(1200px, 94vw); margin: 0 auto; padding: 1.5rem 0; }
h1, h2, h3 { margin: 0 0 0.5rem; }
section { margin-bottom: 2rem; }
.lead { color: var(--muted); line-height: 1.5; }
.topbar { position: sticky; top: 0; z-index: 10; background: rgba(13,16,32,0.85); backdrop-filter: blur(10px); border-bottom: 1px solid var(--border); }
.topbar .wrap { display: flex; justify-content: space-between; align-items: center; padding: 1rem 0; }
.brand a { font-weight: 700; letter-spacing: 0.5px; color: var(--text); }
.brand .dot { color: var(--accent); margin-right: 4px; }
nav a { margin-left: 1rem; color: var(--muted); font-weight: 600; }
nav a:hover { color: var(--accent); }
h1, h2, h3, h4 { margin: 0 0 0.5rem; line-height: 1.25; }
p { margin: 0 0 0.75rem; }
.muted { color: var(--muted); }
.small { font-size: 0.9rem; }
.eyebrow { text-transform: uppercase; letter-spacing: 0.15em; font-size: 0.8rem; color: var(--accent); margin-bottom: 0.35rem; }
.lede { color: var(--muted); max-width: 60ch; }
.hero { display: grid; grid-template-columns: 2fr 1fr; gap: 1.5rem; align-items: center; padding: 1rem 0 2rem; }
.hero-panel { background: var(--panel); border: 1px solid var(--border); border-radius: 16px; padding: 1rem; box-shadow: var(--shadow); display: grid; grid-template-columns: repeat(auto-fit, minmax(140px, 1fr)); gap: 0.75rem; }
.stat .label { color: var(--muted); font-size: 0.9rem; }
.stat .value { font-size: 1.9rem; font-weight: 700; }
.cta-row { display: flex; gap: 0.75rem; flex-wrap: wrap; margin-top: 1rem; }
.btn { background: linear-gradient(90deg, var(--accent), var(--accent-2)); color: #041019; padding: 0.75rem 1rem; border-radius: 12px; font-weight: 700; border: none; display: inline-block; }
.btn.ghost { background: transparent; color: var(--text); border: 1px solid var(--border); }
.text-link { color: var(--accent); font-weight: 600; }
.section-header { display: flex; align-items: center; justify-content: space-between; gap: 1rem; margin-bottom: 0.75rem; }
.card-grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(240px, 1fr)); gap: 1rem; }
.card { background: var(--panel); padding: 1rem; border: 1px solid var(--border); border-radius: 10px; box-shadow: var(--shadow); }
.card-title { font-weight: 700; margin-bottom: 0.2rem; }
.card-meta { color: var(--muted); font-size: 0.9rem; margin-bottom: 0.5rem; }
.badge { display: inline-block; background: rgba(90,212,230,0.12); color: var(--accent); padding: 0.15rem 0.5rem; border-radius: 999px; font-size: 0.8rem; margin-right: 0.25rem; }
.card { background: var(--panel); border: 1px solid var(--border); border-radius: 14px; padding: 1rem; box-shadow: var(--shadow); }
.card-title { font-weight: 700; margin-bottom: 0.4rem; }
.meta-row { display: flex; flex-wrap: wrap; gap: 0.35rem; align-items: center; margin-bottom: 0.35rem; }
.filter { width: 100%; padding: 0.65rem 0.75rem; margin: 0 0 0.75rem; border-radius: 8px; border: 1px solid var(--border); background: #0f1320; color: var(--text); }
.pill { display: inline-flex; align-items: center; gap: 4px; padding: 0.25rem 0.6rem; border-radius: 999px; background: rgba(255,255,255,0.04); border: 1px solid var(--border); color: var(--text); font-size: 0.85rem; }
.pill.ghost { background: transparent; color: var(--muted); }
.pill.warn { border-color: var(--warn); color: var(--warn); }
.pill.tier-high { border-color: var(--success); color: var(--success); }
.pill.tier-medium { border-color: var(--accent-2); color: var(--accent-2); }
.pill.tier-low { border-color: var(--muted); color: var(--muted); }
.pill.tiny { font-size: 0.75rem; padding: 0.15rem 0.4rem; }
.table-responsive { overflow-x: auto; border: 1px solid var(--border); border-radius: 10px; box-shadow: var(--shadow); }
.table-responsive table { width: 100%; border-collapse: collapse; }
.table-responsive th, .table-responsive td { padding: 0.75rem 0.9rem; border-bottom: 1px solid var(--border); text-align: left; }
.table-responsive th { background: #161a22; color: #d6dae6; font-size: 0.9rem; letter-spacing: 0.2px; }
.table-responsive tr:last-child td { border-bottom: none; }
.input { width: 100%; padding: 0.75rem; border-radius: 12px; border: 1px solid var(--border); background: var(--panel-2); color: var(--text); margin: 0.5rem 0 1rem; }
.pill-row { display: flex; flex-wrap: wrap; gap: 0.5rem; margin: 0.8rem 0 1rem; }
.pill { padding: 0.35rem 0.65rem; border-radius: 999px; background: #1b202c; border: 1px solid var(--border); color: var(--text); font-size: 0.9rem; }
.pill-warn { background: rgba(246,193,119,0.15); border-color: #f6c177; color: #f6c177; }
.table-wrap { overflow-x: auto; border: 1px solid var(--border); border-radius: 14px; box-shadow: var(--shadow); background: var(--panel); }
table { width: 100%; border-collapse: collapse; }
th, td { padding: 0.85rem 1rem; border-bottom: 1px solid var(--border); text-align: left; }
th { background: #0f1326; color: var(--muted); font-weight: 600; letter-spacing: 0.02em; }
tr:last-child td { border-bottom: none; }
.site-footer { border-top: 1px solid var(--border); padding: 1rem 0; color: var(--muted); }
.site-footer .wrap { display: flex; gap: 1rem; flex-wrap: wrap; font-size: 0.9rem; }
.matches ul { list-style: none; padding: 0; margin: 0.35rem 0 0; }
.matches li { margin-bottom: 0.25rem; color: var(--muted); }
@media (max-width: 640px) {
nav a { margin-left: 0.6rem; }
.card-grid { grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); }
.table-responsive th, .table-responsive td { padding: 0.6rem; }
.grid-2 { display: grid; grid-template-columns: repeat(auto-fit, minmax(280px, 1fr)); gap: 1rem; }
.list { list-style: none; padding: 0; margin: 0.35rem 0; }
.list li { padding: 0.4rem 0; border-bottom: 1px solid var(--border); }
.list li:last-child { border-bottom: none; }
.pill-row { display: flex; flex-wrap: wrap; gap: 0.5rem; margin: 0.8rem 0; }
.footer { border-top: 1px solid var(--border); margin-top: 2rem; }
.footer-inner { display: flex; flex-wrap: wrap; gap: 1rem; padding: 1rem 0; color: var(--muted); }
@media (max-width: 840px) {
.hero { grid-template-columns: 1fr; }
nav { display: none; }
}
@media (max-width: 620px) {
.card-grid { grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); }
th, td { padding: 0.65rem; }
}


@@ -3,10 +3,55 @@ import json
import os
import re
from pathlib import Path
from typing import Dict, List
from typing import Dict, List, Optional, Tuple
from urllib.parse import urlparse
ROOT = Path(__file__).resolve().parent.parent
OUTPUT = Path(__file__).resolve().with_name("CVE_list.json")
REMOVED_OUTPUT = Path(__file__).resolve().with_name("CVE_blacklist_removed.json")
BLACKLIST = ROOT / "blacklist.txt"
def load_blacklist(path: Path = BLACKLIST) -> List[str]:
if not path.exists():
return []
items: List[str] = []
for raw in path.read_text(encoding="utf-8").splitlines():
entry = raw.strip()
if entry and not entry.startswith("#"):
items.append(entry)
return items
def repo_from_url(url: str) -> str:
try:
parsed = urlparse(url)
host = (parsed.netloc or "").lower()
if host and "github" not in host:
return ""
path = parsed.path or url
except Exception:
path = url
parts = path.strip("/").split("/")
if len(parts) >= 2:
return parts[1].lower()
return (parts[-1] if parts else "").lower()
def is_blacklisted(url: str, blacklist: List[str]) -> bool:
repo = repo_from_url(url)
if not repo:
return False
for entry in blacklist:
slug = entry.lower()
if not slug:
continue
if slug.endswith("*"):
if repo.startswith(slug[:-1]):
return True
elif repo == slug:
return True
return False
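For illustration, a minimal sketch of how these helpers treat blacklist entries; the slugs and URLs are invented:

```python
# Illustration of the blacklist matching above (slugs and URLs invented).
blacklist = ["evil-poc", "spam-*"]

assert repo_from_url("https://github.com/someone/Evil-PoC") == "evil-poc"
assert is_blacklisted("https://github.com/someone/Evil-PoC", blacklist)
# A trailing "*" matches on a repo-name prefix.
assert is_blacklisted("https://github.com/other/spam-collection-2024", blacklist)
# Non-GitHub hosts never match.
assert not is_blacklisted("https://gitlab.com/someone/evil-poc", blacklist)
```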
def normalise_block(text: str) -> str:
@@ -37,21 +82,32 @@ def parse_sections(content: str) -> Dict[str, str]:
return sections
def collect_links(block: str) -> List[str]:
def collect_links(block: str, *, blacklist: Optional[List[str]] = None, removed: Optional[List[str]] = None) -> List[str]:
links: List[str] = []
blacklist = blacklist or []
if removed is None:
removed = []
for raw in block.splitlines():
entry = raw.strip()
if not entry or "No PoCs" in entry:
continue
if entry.startswith("- "):
entry = entry[2:].strip()
if entry and entry not in links:
if not entry:
continue
if is_blacklisted(entry, blacklist):
removed.append(entry)
continue
if entry not in links:
links.append(entry)
return links
def main() -> None:
blacklist = load_blacklist()
cve_entries = []
removed_by_cve: Dict[str, List[str]] = {}
removed_seen: set[str] = set()
years = [entry for entry in os.listdir(ROOT) if entry.isdigit()]
years.sort(reverse=True)
@@ -65,8 +121,9 @@ def main() -> None:
sections = parse_sections(content)
description = normalise_block(sections.get("### Description", ""))
references = collect_links(sections.get("#### Reference", ""))
github_links = collect_links(sections.get("#### Github", ""))
removed_links: List[str] = []
references = collect_links(sections.get("#### Reference", ""), blacklist=blacklist, removed=removed_links)
github_links = collect_links(sections.get("#### Github", ""), blacklist=blacklist, removed=removed_links)
poc_entries: List[str] = []
seen = set()
@@ -75,8 +132,17 @@ def main() -> None:
poc_entries.append(link)
seen.add(link)
cve_id = filename.replace(".md", "")
if removed_links:
removed_by_cve[cve_id] = sorted(set(removed_links))
removed_seen.update(removed_links)
# Skip CVEs with zero PoCs (both sections empty) to keep lookup clean
if not poc_entries:
continue
cve_entries.append({
"cve": filename.replace(".md", ""),
"cve": cve_id,
"desc": description,
"poc": poc_entries,
})
@@ -84,6 +150,17 @@ def main() -> None:
with open(OUTPUT, "w", encoding="utf-8") as outfile:
json.dump(cve_entries, outfile, ensure_ascii=False)
with open(REMOVED_OUTPUT, "w", encoding="utf-8") as removed_file:
json.dump(
{
"removed": sorted(removed_seen),
"by_cve": removed_by_cve,
},
removed_file,
ensure_ascii=False,
indent=2,
)
print("CVE list saved to CVE_list.json")


@@ -7,6 +7,7 @@ pip install -r requirements.txt
python scripts/fetch_kev.py
python scripts/fetch_epss.py
python scripts/build_site.py
python scripts/build_all.py # new PoC discovery + scoring pipeline
```
Outputs land in `docs/` and JSON under `docs/api/v1/`. Snapshots live in `docs/api/v1/snapshots/` (last 14 days) and diffs under `docs/api/v1/diff/`.
Outputs land in `docs/` and JSON under `docs/api/v1/`. Snapshots live in `docs/api/v1/snapshots/` (last 14 days) and diffs under `docs/api/v1/diffs/`.
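A hedged usage sketch of the new entry point, assuming the repository root as working directory and `scripts/` on the import path; the CVE id is illustrative:

```python
# Assumes the scripts/ directory is on PYTHONPATH (build_all.py lives there).
from build_all import main

# Standard daily run (same window the workflow uses).
main(["--days", "7"])

# Other modes, for reference:
#   main(["--skip-discovery"])                          # re-render from existing docs/api/v1/cve/*.json
#   main(["--cve", "CVE-2024-1234", "--check-links"])   # single CVE, plus HEAD checks on repo links
```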

scripts/build_all.py (new file, 118 lines)

@@ -0,0 +1,118 @@
from __future__ import annotations
import argparse
import sys
from pathlib import Path
from typing import Dict, List
import requests
from pipeline_outputs import (
build_diff,
prune_old_diffs,
prune_old_snapshots,
summarise_for_snapshot,
write_cve_outputs,
write_diff,
write_index,
write_snapshot,
write_top,
)
from poc_pipeline import PoCPipeline, build_scope, persist_evidence
from site_renderer import SiteRenderer
from utils import API_DIR, DOCS_DIR, load_json
def load_existing_results(api_dir: Path) -> List[Dict]:
results: List[Dict] = []
if not api_dir.exists():
return results
for path in api_dir.glob("CVE-*.json"):
data = load_json(path, default={}) or {}
if "pocs" in data:
results.append({"cve_id": data.get("cve_id") or path.stem, "pocs": data.get("pocs", []), "last_updated": data.get("last_updated")})
return results
def main(argv: List[str] | None = None) -> int:
parser = argparse.ArgumentParser(description="Build CVE PoC pipeline outputs, snapshots, and static site")
parser.add_argument("--days", type=int, default=7, help="Days window for GitHub discovery windows")
parser.add_argument("--mode", choices=["daily", "weekly"], default="daily", help="Run mode to tune scope")
parser.add_argument("--limit", type=int, default=50, help="Maximum CVEs to scan per run")
parser.add_argument("--cve", action="append", help="Explicit CVE IDs to scan (can be passed multiple times)")
parser.add_argument("--skip-discovery", action="store_true", help="Skip GitHub discovery and reuse existing API outputs")
parser.add_argument("--check-links", action="store_true", help="Optionally HEAD check repo URLs for dead links")
args = parser.parse_args(argv)
pipeline = PoCPipeline()
scope: List[str] = []
discovery_days = args.days
if args.cve:
scope = [cve.upper() for cve in args.cve]
elif not args.skip_discovery:
prefer_recent = True
scan_days = args.days
limit = args.limit
if args.mode == "weekly":
scan_days = max(scan_days, 30)
discovery_days = scan_days
prefer_recent = False
limit = None
scope = build_scope(scan_days, github_list=Path("github.txt"), existing_api=API_DIR / "cve", prefer_recent_years=prefer_recent, max_cves=limit)
results: List[Dict] = []
if args.skip_discovery:
results = load_existing_results(API_DIR / "cve")
else:
for idx, cve_id in enumerate(scope):
try:
results.append(pipeline.discover_for_cve(cve_id, days=discovery_days))
except Exception as exc: # noqa: BLE001
print(f"[warn] Failed to process {cve_id}: {exc}", file=sys.stderr)
persist_evidence(results)
if not results:
print("No results to write; aborting.")
return 1
write_cve_outputs(results)
index_payload = write_index(results)
top_payload = write_top(results)
def maybe_check_links() -> List[Dict]:
if not args.check_links:
return []
urls = []
for result in results:
for poc in result.get("pocs", []):
if poc.get("confidence_tier") in {"high", "medium"} and poc.get("repo_url"):
urls.append(poc["repo_url"])
urls = urls[:25]
dead: List[Dict] = []
for url in urls:
try:
resp = requests.head(url, timeout=5, allow_redirects=True)
if resp.status_code >= 400:
dead.append({"url": url, "status": resp.status_code})
except requests.RequestException as exc: # noqa: BLE001
dead.append({"url": url, "error": str(exc)})
return dead
snapshot_payload = summarise_for_snapshot(results, top=top_payload)
prev_snapshot = load_json(API_DIR / "snapshots" / "latest.json", default={}) or {}
snapshot_path = write_snapshot(snapshot_payload)
diff_payload = build_diff(prev_snapshot, snapshot_payload, dead_links=maybe_check_links())
write_diff(diff_payload)
prune_old_snapshots()
prune_old_diffs()
renderer = SiteRenderer(results=results, index_payload=index_payload, top_payload=top_payload, diff_payload=diff_payload)
renderer.build()
print(f"Generated site under {DOCS_DIR}")
print(f"Wrote latest snapshot to {snapshot_path}")
return 0
if __name__ == "__main__":
raise SystemExit(main())

scripts/github_client.py (new file, 210 lines)

@@ -0,0 +1,210 @@
from __future__ import annotations
import calendar
import json
import os
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple
import requests
from utils import CACHE_DIR, chunked, hash_key, isoformat
TEXT_MATCH_HEADER = "application/vnd.github.text-match+json"
class RateLimiter:
def __init__(self, calls_per_minute: int) -> None:
self.min_interval = 60.0 / max(calls_per_minute, 1)
self.last_call: Dict[str, float] = {}
def wait(self, bucket: str) -> None:
last = self.last_call.get(bucket, 0.0)
elapsed = time.time() - last
if elapsed < self.min_interval:
time.sleep(self.min_interval - elapsed)
self.last_call[bucket] = time.time()
class FileCache:
def __init__(self, base: Path) -> None:
self.base = base
self.base.mkdir(parents=True, exist_ok=True)
def _path_for(self, key: str) -> Path:
digest = hash_key(key)
return self.base / digest[:2] / f"{digest}.json"
def load(self, key: str, *, ttl: int) -> Optional[Dict]:
path = self._path_for(key)
if not path.exists():
return None
try:
with path.open("r", encoding="utf-8") as handle:
data = json.load(handle)
except (OSError, json.JSONDecodeError):
return None
expires_at = data.get("expires_at")
if expires_at:
try:
expires_ts = calendar.timegm(time.strptime(expires_at, "%Y-%m-%dT%H:%M:%S"))  # expires_at was written with gmtime, so parse it as UTC
if time.time() > expires_ts:
return None
except Exception:
return None
return data.get("payload")
def save(self, key: str, payload: Dict, *, ttl: int) -> None:
path = self._path_for(key)
path.parent.mkdir(parents=True, exist_ok=True)
data = {
"fetched_at": isoformat(),
"expires_at": time.strftime("%Y-%m-%dT%H:%M:%S", time.gmtime(time.time() + ttl)),
"payload": payload,
}
with path.open("w", encoding="utf-8") as handle:
json.dump(data, handle, ensure_ascii=False, indent=2)
@dataclass
class SearchResult:
kind: str
query: str
page: int
payload: Dict
class GitHubClient:
def __init__(
self,
token: Optional[str],
*,
cache_dir: Path | None = None,
code_search_rpm: int = 10,
general_rpm: int = 30,
) -> None:
self.session = requests.Session()
self.session.headers.update({"Accept": TEXT_MATCH_HEADER})
if token:
self.session.headers["Authorization"] = f"Bearer {token}"
self.base_url = "https://api.github.com"
self.graphql_url = f"{self.base_url}/graphql"
cache_root = cache_dir or CACHE_DIR / "github"
self.cache = FileCache(cache_root)
self.rate_limiters = {
"code": RateLimiter(code_search_rpm),
"search": RateLimiter(general_rpm),
"graphql": RateLimiter(general_rpm),
}
def _request(self, method: str, url: str, *, bucket: str, **kwargs) -> requests.Response:
self.rate_limiters[bucket].wait(bucket)
attempts = 0
while True:
attempts += 1
try:
response = self.session.request(method, url, timeout=30, **kwargs)
except requests.RequestException:
if attempts >= 3:
raise
time.sleep(2 * attempts)
continue
if response.status_code == 403 and "X-RateLimit-Remaining" in response.headers:
remaining = int(response.headers.get("X-RateLimit-Remaining") or "0")
reset = response.headers.get("X-RateLimit-Reset")
if remaining <= 0 and reset:
try:
reset_ts = int(reset)
wait_for = max(0, reset_ts - int(time.time()) + 1)
time.sleep(wait_for)
continue
except ValueError:
pass
if response.status_code >= 500 and attempts < 3:
time.sleep(1 + attempts)
continue
response.raise_for_status()
return response
def _cached_search(self, kind: str, query: str, page: int, per_page: int, ttl: int) -> Dict:
cache_key = f"{kind}:{query}:p{page}:n{per_page}"
cached = self.cache.load(cache_key, ttl=ttl)
if cached is not None:
return cached
url = f"{self.base_url}/search/{kind}"
params = {"q": query, "page": page, "per_page": per_page}
resp = self._request("GET", url, params=params, bucket="code" if kind == "code" else "search")
payload = resp.json()
self.cache.save(cache_key, payload, ttl=ttl)
return payload
def search_repositories(self, query: str, *, page: int = 1, per_page: int = 100, ttl: int = 3600) -> SearchResult:
return SearchResult("repositories", query, page, self._cached_search("repositories", query, page, per_page, ttl))
def search_code(self, query: str, *, page: int = 1, per_page: int = 100, ttl: int = 3600) -> SearchResult:
return SearchResult("code", query, page, self._cached_search("code", query, page, per_page, ttl))
def search_topics(self, query: str, *, page: int = 1, per_page: int = 100, ttl: int = 3600) -> SearchResult:
return SearchResult("repositories", query, page, self._cached_search("repositories", query, page, per_page, ttl))
def fetch_repo_metadata(self, full_names: Iterable[str], *, ttl: int = 6 * 3600) -> Dict[str, Dict]:
results: Dict[str, Dict] = {}
to_fetch: List[str] = []
for name in full_names:
cache_key = f"repo-meta:{name}"
cached = self.cache.load(cache_key, ttl=ttl)
if cached is not None:
results[name] = cached
else:
to_fetch.append(name)
if not to_fetch:
return results
fields = """
nameWithOwner
url
stargazerCount
description
forkCount
isFork
isArchived
pushedAt
updatedAt
primaryLanguage { name }
parent { nameWithOwner url }
repositoryTopics(first: 20) { nodes { topic { name } } }
"""
for batch in chunked(to_fetch, 12):
parts = []
for idx, full_name in enumerate(batch):
if "/" not in full_name:
continue
owner, name = full_name.split("/", 1)
owner = owner.replace('"', "")
name = name.replace('"', "")
parts.append(f'repo_{idx}: repository(owner: "{owner}", name: "{name}") {{ {fields} }}')
if not parts:
continue
query = "query { " + " ".join(parts) + " }"
resp = self._request("POST", self.graphql_url, json={"query": query}, bucket="graphql")
data = resp.json()
repos = data.get("data", {})
for idx, full_name in enumerate(batch):
key = f"repo_{idx}"
meta = repos.get(key) or {}
cache_key = f"repo-meta:{full_name}"
self.cache.save(cache_key, meta, ttl=ttl)
results[full_name] = meta
return results
def build_client(token_env: str = "GITHUB_TOKEN") -> GitHubClient:
token = os.environ.get(token_env)
return GitHubClient(token, cache_dir=CACHE_DIR / "github")
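A minimal usage sketch of the client, assuming network access and, ideally, a `GITHUB_TOKEN`; the query mirrors `build_query_pack` and the repo slug is a placeholder:

```python
# Minimal sketch: network access assumed; GITHUB_TOKEN (if set) raises the search limits.
from github_client import build_client

client = build_client()
result = client.search_repositories("CVE-2024-1234 in:name,description,readme fork:false")
for item in result.payload.get("items", [])[:5]:
    # full_name / html_url are the same fields parse_repo_from_item reads downstream.
    print(item.get("full_name"), item.get("html_url"))

# GraphQL metadata for specific repos (the "owner/name" slug below is a placeholder).
meta = client.fetch_repo_metadata(["some-owner/some-cve-poc"])
print(meta.get("some-owner/some-cve-poc", {}).get("stargazerCount"))
```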

scripts/pipeline_outputs.py (new file, 220 lines)

@@ -0,0 +1,220 @@
from __future__ import annotations
from collections import Counter
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, Iterable, List, Tuple
from utils import API_DIR, DIFFS_DIR, SNAPSHOT_DIR, TOP_DIR, ensure_dirs, load_json, save_json, today_str
def write_cve_outputs(results: List[Dict], *, base_dir: Path | None = None) -> None:
target_dir = base_dir or API_DIR / "cve"
ensure_dirs(target_dir)
for result in results:
last_updated = result.get("last_updated") or today_str()
output = {
"cve_id": result["cve_id"],
"last_updated": last_updated,
"pocs": [
{
"repo_full_name": poc.get("repo_full_name"),
"repo_url": poc.get("repo_url"),
"is_fork": poc.get("is_fork"),
"parent_repo_url": poc.get("parent_repo_url"),
"stars": poc.get("stars"),
"forks": poc.get("forks"),
"archived": poc.get("archived"),
"pushed_at": poc.get("pushed_at") or poc.get("updated_at"),
"topics": poc.get("topics", []),
"primary_language": poc.get("primary_language"),
"matches": poc.get("matches", []),
"confidence_score": poc.get("confidence_score"),
"confidence_tier": poc.get("confidence_tier"),
}
for poc in result.get("pocs", [])
],
}
save_json(target_dir / f"{result['cve_id']}.json", output)
def build_index(results: List[Dict]) -> Dict:
items: List[Dict] = []
for result in results:
poc_entries = result.get("pocs", [])
high = [p for p in poc_entries if p.get("confidence_tier") == "high"]
medium = [p for p in poc_entries if p.get("confidence_tier") == "medium"]
langs = Counter()
max_score = 0.0
for poc in poc_entries:
lang = poc.get("primary_language")
if lang:
langs[lang] += 1
max_score = max(max_score, poc.get("confidence_score") or 0)
items.append(
{
"cve_id": result["cve_id"],
"poc_count": len(poc_entries),
"high_confidence": len(high),
"medium_confidence": len(medium),
"top_languages": [lang for lang, _ in langs.most_common(3)],
"max_score": max_score,
"last_updated": result.get("last_updated"),
}
)
return {"generated": today_str(), "items": sorted(items, key=lambda r: r["cve_id"], reverse=True)}
def write_index(results: List[Dict]) -> Dict:
ensure_dirs(API_DIR)
payload = build_index(results)
save_json(API_DIR / "index.json", payload)
return payload
def write_top(results: List[Dict], *, limit: int = 100) -> Dict:
ensure_dirs(TOP_DIR)
entries: List[Dict] = []
for result in results:
for poc in result.get("pocs", []):
if poc.get("confidence_tier") not in {"high", "medium"}:
continue
entries.append(
{
"cve_id": result["cve_id"],
"repo_full_name": poc.get("repo_full_name"),
"repo_url": poc.get("repo_url"),
"score": poc.get("confidence_score"),
"tier": poc.get("confidence_tier"),
"stars": poc.get("stars"),
"primary_language": poc.get("primary_language"),
}
)
entries.sort(key=lambda e: (-(e.get("score") or 0), -(e.get("stars") or 0)))
payload = {"generated": today_str(), "items": entries[:limit]}
save_json(TOP_DIR / "today.json", payload)
return payload
def summarise_for_snapshot(results: List[Dict], *, top: Dict | None = None) -> Dict:
summary: Dict[str, Dict[str, Dict]] = {}
for result in results:
repo_map: Dict[str, Dict] = {}
for poc in result.get("pocs", []):
repo_map[poc.get("repo_full_name")] = {
"score": poc.get("confidence_score"),
"tier": poc.get("confidence_tier"),
}
summary[result["cve_id"]] = repo_map
payload = {"generated": today_str(), "entries": summary}
if top:
payload["top"] = top
return payload
def write_snapshot(summary: Dict) -> Path:
ensure_dirs(SNAPSHOT_DIR)
target = SNAPSHOT_DIR / f"{summary['generated']}.json"
save_json(target, summary)
save_json(SNAPSHOT_DIR / "latest.json", summary)
return target
def prune_old_snapshots(days: int = 14) -> None:
if not SNAPSHOT_DIR.exists():
return
cutoff = datetime.utcnow().date() - timedelta(days=days)
for snap in SNAPSHOT_DIR.glob("*.json"):
try:
snap_date = datetime.strptime(snap.stem, "%Y-%m-%d").date()
except ValueError:
continue
if snap_date < cutoff:
snap.unlink(missing_ok=True)
def prune_old_diffs(days: int = 14) -> None:
if not DIFFS_DIR.exists():
return
cutoff = datetime.now().date() - timedelta(days=days)
for diff in DIFFS_DIR.glob("*.json"):
try:
diff_date = datetime.strptime(diff.stem, "%Y-%m-%d").date()
except ValueError:
continue
if diff_date < cutoff:
diff.unlink(missing_ok=True)
def _load_snapshot(path: Path) -> Dict:
return load_json(path, default={}) or {}
def build_diff(prev: Dict, curr: Dict, *, dead_links: List[Dict] | None = None) -> Dict:
prev_entries = prev.get("entries", {})
curr_entries = curr.get("entries", {})
new_high: List[Dict] = []
promoted: List[Dict] = []
demoted: List[Dict] = []
for cve_id, repos in curr_entries.items():
for repo_name, info in repos.items():
tier = info.get("tier")
if tier != "high":
continue
prev_info = (prev_entries.get(cve_id) or {}).get(repo_name)
if not prev_info:
new_high.append({"cve_id": cve_id, "repo_full_name": repo_name, "score": info.get("score")})
elif prev_info.get("tier") != "high":
promoted.append(
{
"cve_id": cve_id,
"repo_full_name": repo_name,
"score": info.get("score"),
"previous_tier": prev_info.get("tier"),
}
)
for cve_id, repos in prev_entries.items():
for repo_name, info in repos.items():
if info.get("tier") != "high":
continue
curr_info = (curr_entries.get(cve_id) or {}).get(repo_name)
if not curr_info or curr_info.get("tier") != "high":
demoted.append(
{
"cve_id": cve_id,
"repo_full_name": repo_name,
"previous_score": info.get("score"),
"previous_tier": info.get("tier"),
"current_tier": curr_info.get("tier") if curr_info else None,
}
)
return {
"generated": curr.get("generated"),
"new_high_conf_pocs": new_high,
"promoted_to_high": promoted,
"demoted_or_removed": demoted,
"dead_links": dead_links or [],
}
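A small worked example of the diff semantics, with invented snapshot contents:

```python
# Worked example of the diff semantics above (snapshot contents invented).
prev = {"entries": {"CVE-2024-0001": {
    "alice/poc": {"score": 55.0, "tier": "medium"},
    "bob/old-exploit": {"score": 90.0, "tier": "high"},
}}}
curr = {"generated": "2025-12-17", "entries": {"CVE-2024-0001": {
    "alice/poc": {"score": 82.0, "tier": "high"},
    "carol/new-exploit": {"score": 88.0, "tier": "high"},
}}}

diff = build_diff(prev, curr)
assert [d["repo_full_name"] for d in diff["new_high_conf_pocs"]] == ["carol/new-exploit"]
assert [d["repo_full_name"] for d in diff["promoted_to_high"]] == ["alice/poc"]
assert [d["repo_full_name"] for d in diff["demoted_or_removed"]] == ["bob/old-exploit"]
```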
def write_diff(diff: Dict) -> Path:
ensure_dirs(DIFFS_DIR)
target = DIFFS_DIR / f"{diff['generated']}.json"
save_json(target, diff)
save_json(DIFFS_DIR / "latest.json", diff)
return target
def latest_snapshots() -> Tuple[Dict, Dict]:
if not SNAPSHOT_DIR.exists():
return {}, {}
snaps = sorted(SNAPSHOT_DIR.glob("*.json"))
if not snaps:
return {}, {}
curr = _load_snapshot(snaps[-1])
prev = _load_snapshot(snaps[-2]) if len(snaps) > 1 else {}
return prev, curr

scripts/poc_pipeline.py (new file, 274 lines)

@@ -0,0 +1,274 @@
from __future__ import annotations
import re
from dataclasses import dataclass, field
from datetime import date, datetime, timedelta
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Set, Tuple
from github_client import GitHubClient, SearchResult, build_client
from poc_scoring import match_score, score_repo
from utils import API_DIR, EVIDENCE_DIR, chunked, cve_year, ensure_dirs, isoformat, load_blacklist, load_json, save_json, today_str
LANG_PARTITIONS = ("python", "go", "c", "shell", "powershell", "java", "ruby", "js")
CVE_RE = re.compile(r"CVE-\d{4}-\d{4,}", re.IGNORECASE)
@dataclass
class MatchEvidence:
path: str
match_type: str
query: str
score: float | None = None
@dataclass
class RepoCandidate:
cve_id: str
repo_full_name: str
repo_url: str
matches: List[MatchEvidence] = field(default_factory=list)
metadata: Dict[str, object] = field(default_factory=dict)
def add_match(self, path: str, match_type: str, query: str) -> None:
key = (path, match_type)
existing = {(m.path, m.match_type) for m in self.matches}
if key in existing:
return
self.matches.append(MatchEvidence(path=path, match_type=match_type, query=query))
def build_created_ranges(days: int, *, window: int = 7) -> List[Tuple[str, str]]:
end = date.today()
start = end - timedelta(days=max(days, 1))
ranges: List[Tuple[str, str]] = []
cursor = start
while cursor <= end:
window_end = min(cursor + timedelta(days=window - 1), end)
ranges.append((cursor.isoformat(), window_end.isoformat()))
cursor = window_end + timedelta(days=1)
return ranges or [(start.isoformat(), end.isoformat())]
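For instance, a 20-day lookback with the default 7-day window splits into three inclusive ranges ending today:

```python
from datetime import date, timedelta

# build_created_ranges is defined just above; a 20-day lookback with the default
# 7-day window yields three inclusive ranges, the last always ending today.
ranges = build_created_ranges(20)
assert len(ranges) == 3
assert ranges[0][0] == (date.today() - timedelta(days=20)).isoformat()
assert ranges[-1][1] == date.today().isoformat()
```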
def build_query_pack(cve_id: str, created_range: Tuple[str, str] | None = None) -> List[Dict[str, str]]:
base_repo = f'{cve_id} in:name,description,readme fork:false'
enriched_repo = f'{cve_id} (poc OR exploit) in:name,description,readme fork:false'
topic_query = f"topic:{cve_id.lower()} fork:false"
created_suffix = ""
if created_range:
created_suffix = f" created:{created_range[0]}..{created_range[1]}"
queries = [
{"kind": "repositories", "query": base_repo + created_suffix, "match_type": "name"},
{"kind": "repositories", "query": enriched_repo + created_suffix, "match_type": "description"},
{"kind": "repositories", "query": topic_query + created_suffix, "match_type": "topic"},
]
for lang in LANG_PARTITIONS:
base_code = f'{cve_id} in:file language:{lang}{created_suffix}'
queries.append({"kind": "code", "query": base_code, "match_type": "code"})
# generic code search without language partition for the most recent window
queries.append({"kind": "code", "query": f"{cve_id} in:file{created_suffix}", "match_type": "code"})
return queries
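Illustrative output of the query pack for a single CVE and window (id and dates invented):

```python
# Queries produced for one CVE and one created-date window (id and dates invented).
for q in build_query_pack("CVE-2024-1234", ("2025-12-01", "2025-12-07")):
    print(q["kind"], q["query"])
# repositories CVE-2024-1234 in:name,description,readme fork:false created:2025-12-01..2025-12-07
# repositories CVE-2024-1234 (poc OR exploit) in:name,description,readme fork:false created:2025-12-01..2025-12-07
# repositories topic:cve-2024-1234 fork:false created:2025-12-01..2025-12-07
# code CVE-2024-1234 in:file language:python created:2025-12-01..2025-12-07
# ...one code query per language partition, then a final unpartitioned code query.
```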
def parse_repo_from_item(item: Dict) -> Tuple[str | None, str | None]:
repo_full_name = item.get("full_name") or item.get("repository", {}).get("full_name")
repo_url = item.get("html_url") or item.get("repository", {}).get("html_url")
if not repo_full_name and "repository" in item:
repo_full_name = item["repository"].get("owner", {}).get("login", "")
if repo_full_name:
repo_full_name = f"{repo_full_name}/{item['repository'].get('name', '')}"
return repo_full_name, repo_url
def extract_matches(item: Dict, default_type: str, query: str) -> List[MatchEvidence]:
matches: List[MatchEvidence] = []
for text_match in item.get("text_matches", []) or []:
prop = text_match.get("property") or text_match.get("object_type") or ""
fragment = text_match.get("fragment") or text_match.get("path") or prop or ""
match_type = prop if prop else default_type
matches.append(MatchEvidence(path=str(fragment), match_type=str(match_type), query=query))
if not matches:
path = item.get("path") or default_type
matches.append(MatchEvidence(path=str(path), match_type=default_type, query=query))
return matches
def normalise_metadata(meta: Dict, fallback_full_name: str, fallback_url: str) -> Dict:
topics = []
if meta.get("repositoryTopics"):
for node in meta["repositoryTopics"].get("nodes", []):
topic = (node.get("topic") or {}).get("name")
if topic:
topics.append(topic)
primary_language = None
if meta.get("primaryLanguage"):
primary_language = meta["primaryLanguage"].get("name")
parent = meta.get("parent") or {}
return {
"repo_full_name": meta.get("nameWithOwner") or fallback_full_name,
"repo_url": meta.get("url") or fallback_url,
"description": meta.get("description") or "",
"is_fork": bool(meta.get("isFork")),
"parent_repo_url": parent.get("url"),
"stars": meta.get("stargazerCount") or 0,
"forks": meta.get("forkCount") or 0,
"archived": bool(meta.get("isArchived")),
"pushed_at": meta.get("pushedAt"),
"updated_at": meta.get("updatedAt"),
"topics": topics,
"primary_language": primary_language,
}
class PoCPipeline:
def __init__(
self,
client: GitHubClient | None = None,
*,
blacklist_path: Path | None = None,
search_ttl: int = 3 * 3600,
) -> None:
self.client = client or build_client()
self.blacklist = load_blacklist(blacklist_path)
self.search_ttl = search_ttl
def _run_query(self, query: Dict, page: int) -> SearchResult:
if query["kind"] == "repositories":
return self.client.search_repositories(query["query"], page=page, per_page=50, ttl=self.search_ttl)
if query["kind"] == "code":
return self.client.search_code(query["query"], page=page, per_page=50, ttl=self.search_ttl)
return self.client.search_topics(query["query"], page=page, per_page=50, ttl=self.search_ttl)
def discover_for_cve(self, cve_id: str, *, days: int, max_pages_repo: int = 2, max_pages_code: int = 2) -> Dict:
ranges = build_created_ranges(days)
candidates: Dict[str, RepoCandidate] = {}
query_log: List[Dict] = []
for created_range in ranges:
query_pack = build_query_pack(cve_id, created_range)
for query in query_pack:
query_log.append({"query": query["query"], "kind": query["kind"], "window": created_range})
page_limit = max_pages_code if query["kind"] == "code" else max_pages_repo
for page in range(1, page_limit + 1):
result = self._run_query(query, page)
items = result.payload.get("items", [])
for item in items:
repo_full_name, repo_url = parse_repo_from_item(item)
if not repo_full_name or not repo_url:
continue
candidate = candidates.setdefault(
repo_full_name,
RepoCandidate(cve_id=cve_id, repo_full_name=repo_full_name, repo_url=repo_url),
)
for match in extract_matches(item, query["match_type"], query["query"]):
candidate.add_match(match.path, match.match_type, match.query)
if len(items) < 50:
break
metadata = self.client.fetch_repo_metadata(candidates.keys())
for repo_full_name, candidate in candidates.items():
meta = metadata.get(repo_full_name, {})
candidate.metadata = normalise_metadata(meta, repo_full_name, candidate.repo_url)
repos: List[Dict] = []
for candidate in candidates.values():
matches_dicts = []
for m in candidate.matches:
m.score = match_score({"path": m.path, "match_type": m.match_type})
matches_dicts.append({"path": m.path, "match_type": m.match_type, "query": m.query, "score": m.score})
score, tier = score_repo(candidate.metadata, matches_dicts, self.blacklist)
repo_entry = {
**candidate.metadata,
"matches": matches_dicts,
"confidence_score": score,
"confidence_tier": tier,
"cve_id": cve_id,
}
repos.append(repo_entry)
repos.sort(key=lambda r: (-r["confidence_score"], -r.get("stars", 0)))
evidence = {
"queries": query_log,
"candidates": [
{
"repo_full_name": r["repo_full_name"],
"matches": r["matches"],
"match_count": len(r["matches"]),
"score": r["confidence_score"],
"tier": r["confidence_tier"],
}
for r in repos
],
}
return {"cve_id": cve_id, "last_updated": isoformat(), "pocs": repos, "evidence": evidence}
def discover_many(self, cve_ids: Iterable[str], *, days: int, limit: Optional[int] = None) -> List[Dict]:
results: List[Dict] = []
for idx, cve_id in enumerate(cve_ids):
if limit and idx >= limit:
break
results.append(self.discover_for_cve(cve_id, days=days))
return results
def persist_evidence(results: List[Dict]) -> None:
ensure_dirs(EVIDENCE_DIR)
for result in results:
cve_id = result["cve_id"]
evidence_path = EVIDENCE_DIR / f"{cve_id}.json"
save_json(evidence_path, result.get("evidence", {}))
def discover_from_github_list(path: Path) -> List[str]:
if not path.exists():
return []
ids: List[str] = []
for line in path.read_text(encoding="utf-8").splitlines():
matches = CVE_RE.findall(line)
for match in matches:
if match.upper() not in ids:
ids.append(match.upper())
return ids
def load_existing_cves(api_dir: Path = API_DIR / "cve") -> List[str]:
if not api_dir.exists():
return []
return sorted({p.stem.upper() for p in api_dir.glob("CVE-*.json") if CVE_RE.match(p.stem)})
def build_scope(
days: int,
*,
github_list: Path,
existing_api: Path,
prefer_recent_years: bool = True,
max_cves: int | None = None,
low_conf_threshold: int = 1,
) -> List[str]:
seeds = discover_from_github_list(github_list)
existing = load_existing_cves(existing_api)
candidates = seeds or existing
if prefer_recent_years:
current_year = date.today().year
candidates = [cve for cve in candidates if cve_year(cve) and cve_year(cve) >= current_year - 2] or candidates
index_path = API_DIR / "index.json"
low_conf: List[str] = []
if index_path.exists():
index_payload = load_json(index_path, default={}) or {}
for item in index_payload.get("items", []):
score = (item.get("high_confidence", 0) or 0) + (item.get("medium_confidence", 0) or 0)
if score <= low_conf_threshold:
low_conf.append(item.get("cve_id"))
scoped = candidates + [cve for cve in low_conf if cve and cve not in candidates]
if max_cves:
scoped = scoped[:max_cves]
return scoped
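A hedged end-to-end sketch of the discovery flow, assuming network access and preferably a `GITHUB_TOKEN`; the CVE id is invented:

```python
# End-to-end sketch (network access assumed, GITHUB_TOKEN recommended; CVE id invented).
pipeline = PoCPipeline()
result = pipeline.discover_for_cve("CVE-2024-1234", days=7)
persist_evidence([result])  # writes data/evidence/CVE-2024-1234.json

for poc in result["pocs"][:5]:
    print(poc["confidence_tier"], round(poc["confidence_score"], 1), poc["repo_full_name"])
```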

scripts/poc_scoring.py (new file, 121 lines)

@@ -0,0 +1,121 @@
from __future__ import annotations
import re
from datetime import datetime, timedelta, timezone
from typing import Dict, Iterable, List, Tuple
from utils import clamp, parse_date
DOC_EXTS = {"md", "txt", "rst", "adoc", "markdown", "mkd", "mdown"}
POSITIVE_KEYWORDS = ("poc", "exploit", "rce", "lpe", "auth bypass", "bypass")
NEGATIVE_KEYWORDS = ("report", "writeup", "advisory", "changelog")
def is_doc_path(path: str) -> bool:
lower = path.lower()
if lower.endswith("/"):
return True
if "." not in lower:
return False
ext = lower.rsplit(".", 1)[-1]
return ext in DOC_EXTS
def match_score(match: Dict) -> float:
path = str(match.get("path", ""))
match_type = str(match.get("match_type", "")).lower()
base = 50 if not is_doc_path(path) else 30
if match_type in ("code",):
base += 10
if "readme" in match_type:
base += 5
if "topic" in match_type:
base -= 5
return clamp(base, 0, 100)
def tier_for_score(score: float) -> str:
if score >= 75:
return "high"
if score >= 45:
return "medium"
return "low"
def keyword_hits(text: str, keywords: Iterable[str]) -> int:
if not text:
return 0
lower = text.lower()
return sum(1 for kw in keywords if kw in lower)
def recency_bonus(pushed_at: str | None) -> float:
if not pushed_at:
return 0.0
dt = parse_date(pushed_at)
if not dt:
return 0.0
if dt.tzinfo is None:
dt = dt.replace(tzinfo=timezone.utc)
delta = datetime.now(timezone.utc) - dt
if delta <= timedelta(days=30):
return 18.0
if delta <= timedelta(days=90):
return 10.0
if delta <= timedelta(days=180):
return 5.0
return 0.0
def score_repo(repo: Dict, matches: List[Dict], blacklist: List[str]) -> Tuple[float, str]:
stars = repo.get("stargazerCount") or repo.get("stars") or 0
forks = repo.get("forkCount") or repo.get("forks") or 0
is_fork = bool(repo.get("isFork"))
archived = bool(repo.get("isArchived"))
topics = [t.lower() for t in repo.get("topics", []) if t]
name = str(repo.get("nameWithOwner") or repo.get("repo_full_name") or "").lower()
description = str(repo.get("description") or "").lower()
non_doc_matches = [m for m in matches if not is_doc_path(str(m.get("path", "")))]
doc_matches = [m for m in matches if is_doc_path(str(m.get("path", "")))]
score = 12.0
if non_doc_matches:
score += 25 + min(len(non_doc_matches) * 2, 10)
if doc_matches and not non_doc_matches:
score -= 20
score += recency_bonus(repo.get("pushed_at") or repo.get("pushedAt") or repo.get("updated_at"))
score += min(stars / 50.0, 25.0)
score += min(forks / 200.0, 5.0)
score += keyword_hits(description, POSITIVE_KEYWORDS) * 4.0
score += keyword_hits(" ".join(topics), POSITIVE_KEYWORDS) * 4.0
negative_bias = keyword_hits(description, NEGATIVE_KEYWORDS)
if negative_bias and not non_doc_matches:
score -= 15
if is_fork:
score -= 12
if archived:
score -= 30
lowered_blacklist = [entry.lower() for entry in blacklist]
for forbidden in lowered_blacklist:
if not forbidden:
continue
if forbidden.endswith("*"):
prefix = forbidden[:-1]
if prefix and name.startswith(prefix):
score -= 40
break
elif forbidden in name:
score -= 40
break
for match in matches:
score += match_score(match) / 25.0
return clamp(score, 0, 100), tier_for_score(score)
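A quick sanity sketch of the tier cutoffs plus a hypothetical repo; the exact score depends on the recency bonus at run time, so only the thresholds are asserted:

```python
# Tier cutoffs are fixed; the repo below is invented and its exact score depends on
# how recent pushed_at is relative to the run date, so only the thresholds are asserted.
assert tier_for_score(80) == "high"
assert tier_for_score(50) == "medium"
assert tier_for_score(20) == "low"

repo = {
    "repo_full_name": "alice/CVE-2024-1234-poc",
    "description": "Working PoC exploit for a remote RCE",
    "stars": 120,
    "forks": 10,
    "topics": ["cve-2024-1234", "exploit"],
    "pushed_at": "2025-12-10T00:00:00Z",
}
matches = [{"path": "exploit.py", "match_type": "code"}]
score, tier = score_repo(repo, matches, [])
print(round(score, 1), tier)
```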

scripts/site_renderer.py (new file, 99 lines)

@@ -0,0 +1,99 @@
from __future__ import annotations
from pathlib import Path
from typing import Dict, List
from jinja2 import Environment, FileSystemLoader, select_autoescape
from utils import DOCS_DIR, TEMPLATES_DIR, ensure_dirs
def build_env() -> Environment:
loader = FileSystemLoader(str(TEMPLATES_DIR))
env = Environment(loader=loader, autoescape=select_autoescape(["html", "xml"]))
env.trim_blocks = True
env.lstrip_blocks = True
return env
class SiteRenderer:
def __init__(
self,
*,
results: List[Dict],
index_payload: Dict,
top_payload: Dict,
diff_payload: Dict | None = None,
) -> None:
self.results = []
for result in results:
visible = [p for p in result.get("pocs", []) if p.get("confidence_tier") in {"high", "medium"}]
if not visible:
visible = result.get("pocs", [])
self.results.append({**result, "visible_pocs": visible})
self.index_payload = index_payload
self.top_payload = top_payload
self.diff_payload = diff_payload or {}
self.env = build_env()
ensure_dirs(
DOCS_DIR,
DOCS_DIR / "pocs",
DOCS_DIR / "cve",
DOCS_DIR / "diffs",
DOCS_DIR / "assets",
)
def render(self, template_name: str, context: Dict, target: Path) -> None:
html = self.env.get_template(template_name).render(**context)
target.parent.mkdir(parents=True, exist_ok=True)
target.write_text(html, encoding="utf-8")
def build(self) -> None:
generated = self.index_payload.get("generated")
summary = {
"generated": generated,
"total_cves": len(self.index_payload.get("items", [])),
"total_pocs": sum(item.get("poc_count", 0) for item in self.index_payload.get("items", [])),
"high_total": sum(item.get("high_confidence", 0) for item in self.index_payload.get("items", [])),
"medium_total": sum(item.get("medium_confidence", 0) for item in self.index_payload.get("items", [])),
}
self.render(
"pipeline_index.html",
{
"summary": summary,
"top": self.top_payload.get("items", [])[:25],
"diff": self.diff_payload or {},
},
DOCS_DIR / "index.html",
)
self.render(
"pipeline_pocs.html",
{
"generated": generated,
"index": self.index_payload.get("items", []),
"top": self.top_payload.get("items", [])[:100],
},
DOCS_DIR / "pocs" / "index.html",
)
for result in self.results:
self.render(
"pipeline_cve.html",
{"cve": result, "generated": generated},
DOCS_DIR / "cve" / f"{result['cve_id']}.html",
)
if self.diff_payload:
diff_date = self.diff_payload.get("generated")
self.render(
"pipeline_diff.html",
{"diff": self.diff_payload, "generated": generated},
DOCS_DIR / "diffs" / "index.html",
)
if diff_date:
self.render(
"pipeline_diff.html",
{"diff": self.diff_payload, "generated": generated},
DOCS_DIR / "diffs" / f"{diff_date}.html",
)


@@ -13,8 +13,13 @@ DATA_DIR = ROOT / "data"
DOCS_DIR = ROOT / "docs"
API_DIR = DOCS_DIR / "api" / "v1"
SNAPSHOT_DIR = API_DIR / "snapshots"
DIFFS_DIR = API_DIR / "diffs"
TOP_DIR = API_DIR / "top"
TEMPLATES_DIR = ROOT / "templates"
ASSETS_DIR = DOCS_DIR / "assets"
CACHE_DIR = DATA_DIR / "cache"
STATE_DIR = DATA_DIR / "state"
EVIDENCE_DIR = DATA_DIR / "evidence"
def ensure_dirs(*paths: Path) -> None:
@@ -45,6 +50,21 @@ def today_str() -> str:
return datetime.now(timezone.utc).date().isoformat()
def now_utc() -> datetime:
return datetime.now(timezone.utc)
def isoformat(dt: datetime | None = None) -> str:
return (dt or now_utc()).isoformat()
def parse_date(value: str) -> datetime | None:
try:
return datetime.fromisoformat(value.replace("Z", "+00:00"))
except ValueError:
return None
def slugify(text: str) -> str:
cleaned = re.sub(r"[^A-Za-z0-9]+", "-", text.strip().lower())
cleaned = cleaned.strip("-")
@@ -79,6 +99,7 @@ CVE_SECTION_RE = re.compile(r"^CVE-\d{4}-\d{4,}$", re.IGNORECASE)
def load_poc_index() -> Dict[str, Dict[str, object]]:
"""Load CVE → {desc, poc} mapping from docs/CVE_list.json or markdown files."""
cve_json = DOCS_DIR / "CVE_list.json"
blacklist = load_blacklist()
if cve_json.exists():
data = load_json(cve_json, default=[]) or []
mapping = {}
@@ -86,32 +107,35 @@ def load_poc_index() -> Dict[str, Dict[str, object]]:
cve = str(entry.get("cve", "")).upper()
if not is_valid_cve(cve):
continue
poc_links = stable_unique(entry.get("poc", []) or [])
poc_links = filter_links_by_blacklist(poc_links, blacklist)
mapping[cve] = {
"desc": entry.get("desc", ""),
"poc": stable_unique(entry.get("poc", []) or []),
"poc": poc_links,
}
return mapping
return build_poc_index_from_markdown()
return build_poc_index_from_markdown(blacklist=blacklist)
def build_poc_index_from_markdown() -> Dict[str, Dict[str, object]]:
def build_poc_index_from_markdown(*, blacklist: Optional[List[str]] = None) -> Dict[str, Dict[str, object]]:
mapping: Dict[str, Dict[str, object]] = {}
for md_path in sorted(ROOT.glob("[12][0-9][0-9][0-9]/CVE-*.md")):
cve = md_path.stem.upper()
if not is_valid_cve(cve):
continue
desc, poc_links = parse_cve_markdown(md_path)
desc, poc_links = parse_cve_markdown(md_path, blacklist=blacklist)
mapping[cve] = {"desc": desc, "poc": poc_links}
return mapping
def parse_cve_markdown(path: Path) -> Tuple[str, List[str]]:
def parse_cve_markdown(path: Path, *, blacklist: Optional[List[str]] = None) -> Tuple[str, List[str]]:
text = path.read_text(encoding="utf-8")
sections = parse_sections(text)
description = normalise_block(sections.get("### Description", ""))
references = collect_links(sections.get("#### Reference", ""))
github_links = collect_links(sections.get("#### Github", ""))
blacklist = blacklist or []
references = collect_links(sections.get("#### Reference", ""), blacklist=blacklist)
github_links = collect_links(sections.get("#### Github", ""), blacklist=blacklist)
poc_links = stable_unique([*references, *github_links])
return description, poc_links
@@ -144,7 +168,7 @@ def parse_sections(content: str) -> Dict[str, str]:
return sections
def collect_links(block: str) -> List[str]:
def collect_links(block: str, *, blacklist: Optional[List[str]] = None) -> List[str]:
links: List[str] = []
for raw in block.splitlines():
entry = raw.strip()
@@ -154,7 +178,7 @@ def collect_links(block: str) -> List[str]:
entry = entry[2:].strip()
if entry and entry not in links:
links.append(entry)
return links
return filter_links_by_blacklist(links, blacklist or [])
def is_valid_cve(cve_id: str) -> bool:
@@ -165,6 +189,15 @@ def is_valid_cve(cve_id: str) -> bool:
return year.isdigit() and parts[2].isdigit()
def cve_year(cve_id: str) -> int | None:
if not is_valid_cve(cve_id):
return None
try:
return int(cve_id.split("-")[1])
except (TypeError, ValueError):
return None
# --- Trending PoCs -------------------------------------------------------
TREND_ROW_RE = re.compile(r"^\|\s*(?P<stars>\d+)\s*⭐\s*\|\s*(?P<updated>[^|]+)\|\s*\[(?P<name>[^\]]+)\]\((?P<url>[^)]+)\)\s*\|\s*(?P<desc>.*)\|$")
@@ -199,3 +232,87 @@ def read_text(path: Path) -> str:
def write_text(path: Path, content: str) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(content, encoding="utf-8")
# --- New helpers for PoC discovery -------------------------------------------------
def clamp(value: float, minimum: float = 0, maximum: float = 100) -> float:
return max(minimum, min(maximum, value))
def chunked(iterable: Iterable, size: int) -> Iterable[List]:
chunk: List = []
for item in iterable:
chunk.append(item)
if len(chunk) >= size:
yield chunk
chunk = []
if chunk:
yield chunk
def hash_key(text: str) -> str:
import hashlib
return hashlib.sha256(text.encode("utf-8")).hexdigest()
def load_blacklist(path: Path | None = None) -> List[str]:
target = path or ROOT / "blacklist.txt"
if not target.exists():
return []
entries: List[str] = []
for raw in target.read_text(encoding="utf-8").splitlines():
line = raw.strip()
if line and not line.startswith("#"):
entries.append(line)
return entries
def extract_repo_from_url(url: str) -> str:
"""Return repository name segment from a URL (best effort)."""
try:
from urllib.parse import urlparse
parsed = urlparse(url)
host = (parsed.netloc or "").lower()
if host and "github" not in host:
return ""
path = parsed.path or url
except Exception:
path = url
parts = path.strip("/").split("/")
if len(parts) >= 2:
return parts[1].lower()
if parts:
return parts[-1].lower()
return ""
def is_blacklisted_repo(url: str, blacklist: List[str]) -> bool:
repo = extract_repo_from_url(url)
if not repo:
return False
for entry in blacklist:
slug = entry.strip().lower()
if not slug:
continue
if slug.endswith("*"):
prefix = slug[:-1]
if prefix and repo.startswith(prefix):
return True
elif repo == slug:
return True
return False
def filter_links_by_blacklist(links: List[str], blacklist: List[str]) -> List[str]:
if not blacklist:
return links
filtered: List[str] = []
for link in links:
if is_blacklisted_repo(link, blacklist):
continue
filtered.append(link)
return filtered
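A few quick checks of the new helpers, with invented values:

```python
# Quick checks for the new helpers (URLs and slugs invented).
assert clamp(135.2) == 100
assert list(chunked(range(5), 2)) == [[0, 1], [2, 3], [4]]
assert extract_repo_from_url("https://github.com/owner/Some-PoC") == "some-poc"
assert filter_links_by_blacklist(
    ["https://github.com/a/keep-me", "https://github.com/b/drop-me"],
    ["drop-me"],
) == ["https://github.com/a/keep-me"]
```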


@@ -0,0 +1,35 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>{% block title %}CVE PoC Radar{% endblock %}</title>
<link rel="stylesheet" href="/assets/style.css">
</head>
<body>
<header class="topbar">
<div class="wrap">
<div class="brand">
<a href="/"><span class="dot"></span> CVE PoC Radar</a>
<div class="muted small">Generated {{ generated or summary.generated }}</div>
</div>
<nav>
<a href="/">Dashboard</a>
<a href="/pocs/">PoC Explorer</a>
<a href="/diffs/">Diffs</a>
<a href="https://github.com/0xMarcio/cve" target="_blank" rel="noreferrer">GitHub</a>
</nav>
</div>
</header>
<main class="wrap">
{% block content %}{% endblock %}
</main>
<footer class="footer">
<div class="wrap footer-inner">
<div>Built daily from GitHub search with scoring + evidence.</div>
<div class="muted">API: <code>/api/v1/</code> · Pages under <code>/docs/</code></div>
</div>
</footer>
<script src="/assets/app.js"></script>
</body>
</html>


@@ -0,0 +1,46 @@
{% extends "pipeline_base.html" %}
{% block title %}{{ cve.cve_id }} PoCs{% endblock %}
{% block content %}
<section>
<div class="section-header">
<div>
<p class="eyebrow">CVE record</p>
<h1>{{ cve.cve_id }}</h1>
<p class="muted small">Last updated {{ cve.last_updated }}</p>
</div>
<a class="text-link" href="/api/v1/cve/{{ cve.cve_id }}.json">JSON</a>
</div>
<div class="card-grid">
{% for poc in cve.visible_pocs %}
<article class="card">
<div class="card-title"><a href="{{ poc.repo_url }}" target="_blank" rel="noreferrer">{{ poc.repo_full_name }}</a></div>
<div class="meta-row">
<span class="pill tier-{{ poc.confidence_tier }}">{{ poc.confidence_tier|capitalize }} ({{ poc.confidence_score|round(1) }})</span>
{% if poc.primary_language %}<span class="pill">{{ poc.primary_language }}</span>{% endif %}
{% if poc.stars %}<span class="pill">{{ poc.stars }}★</span>{% endif %}
{% if poc.is_fork %}<span class="pill ghost">Fork</span>{% endif %}
</div>
<div class="muted small">
{% if poc.pushed_at %}Updated {{ poc.pushed_at }} · {% endif %}
{% if poc.archived %}<span class="pill warn">Archived</span>{% endif %}
{% if poc.parent_repo_url %}Parent: <a href="{{ poc.parent_repo_url }}" target="_blank" rel="noreferrer">{{ poc.parent_repo_url }}</a>{% endif %}
</div>
<div class="pill-row">
{% for topic in poc.topics %}<span class="pill ghost">{{ topic }}</span>{% endfor %}
</div>
<div class="matches">
<div class="muted small">Matches</div>
<ul>
{% for match in poc.matches %}
<li><span class="pill tiny">{{ match.match_type }}</span> {{ match.path }}</li>
{% endfor %}
</ul>
</div>
</article>
{% else %}
<p class="muted">No PoCs found yet for {{ cve.cve_id }}.</p>
{% endfor %}
</div>
</section>
{% endblock %}


@@ -0,0 +1,72 @@
{% extends "pipeline_base.html" %}
{% block title %}Diff {{ diff.generated or generated }}{% endblock %}
{% block content %}
<section>
<div class="section-header">
<div>
<p class="eyebrow">Daily delta</p>
<h1>Diff for {{ diff.generated }}</h1>
</div>
<a class="text-link" href="/api/v1/diffs/{{ diff.generated }}.json">JSON</a>
</div>
<div class="grid-2">
<div>
<h3>New high-confidence PoCs</h3>
<ul class="list">
{% for item in diff.new_high_conf_pocs %}
<li>
<span class="pill">+ High</span>
<a href="/cve/{{ item.cve_id }}.html">{{ item.cve_id }}</a>
<a href="https://github.com/{{ item.repo_full_name }}" target="_blank" rel="noreferrer">{{ item.repo_full_name }}</a>
</li>
{% else %}
<li class="muted">No new high-confidence entries.</li>
{% endfor %}
</ul>
</div>
<div>
<h3>Promoted to high</h3>
<ul class="list">
{% for item in diff.promoted_to_high %}
<li>
<span class="pill"></span>
<a href="/cve/{{ item.cve_id }}.html">{{ item.cve_id }}</a>
<a href="https://github.com/{{ item.repo_full_name }}" target="_blank" rel="noreferrer">{{ item.repo_full_name }}</a>
<span class="muted small">(prev {{ item.previous_tier }})</span>
</li>
{% else %}
<li class="muted">No promotions this run.</li>
{% endfor %}
</ul>
</div>
</div>
<div class="grid-2">
<div>
<h3>Demoted or removed</h3>
<ul class="list">
{% for item in diff.demoted_or_removed %}
<li>
<span class="pill warn"></span>
<a href="/cve/{{ item.cve_id }}.html">{{ item.cve_id }}</a>
<span class="muted small">{{ item.repo_full_name }}</span>
</li>
{% else %}
<li class="muted">No removals.</li>
{% endfor %}
</ul>
</div>
<div>
<h3>Dead links (optional checks)</h3>
<ul class="list">
{% for item in diff.dead_links %}
<li><span class="pill warn">offline</span> <a href="{{ item.url }}">{{ item.url }}</a></li>
{% else %}
<li class="muted">Link checks skipped or none failed.</li>
{% endfor %}
</ul>
</div>
</div>
</section>
{% endblock %}


@@ -0,0 +1,69 @@
{% extends "pipeline_base.html" %}
{% block title %}CVE PoC Radar{% endblock %}
{% block content %}
<section class="hero">
<div>
<p class="eyebrow">Daily GitHub sweep</p>
<h1>CVE PoC Goldmine</h1>
<p class="lede">Incremental discovery, scoring, and diffing for public exploit PoCs. High-confidence hits surface first; low-signal noise stays out of the spotlight.</p>
<div class="cta-row">
<a class="btn" href="/pocs/">Open PoC Explorer</a>
<a class="btn ghost" href="/api/v1/index.json">API index</a>
</div>
</div>
<div class="hero-panel">
<div class="stat">
<div class="label">High confidence</div>
<div class="value">{{ summary.high_total }}</div>
</div>
<div class="stat">
<div class="label">Medium confidence</div>
<div class="value">{{ summary.medium_total }}</div>
</div>
<div class="stat">
<div class="label">Tracked CVEs</div>
<div class="value">{{ summary.total_cves }}</div>
</div>
<div class="label muted small">Generated {{ summary.generated }}</div>
</div>
</section>
<section>
<div class="section-header">
<h2>Top PoCs right now</h2>
<a class="text-link" href="/api/v1/top/today.json">JSON</a>
</div>
<div class="card-grid">
{% for poc in top %}
<article class="card">
<div class="card-title"><a href="{{ poc.repo_url }}" target="_blank" rel="noreferrer">{{ poc.repo_full_name }}</a></div>
<div class="meta-row">
<span class="pill tier-{{ poc.tier }}">{{ poc.tier|capitalize }}</span>
<span class="pill">{{ poc.score|round(1) }} pts</span>
{% if poc.stars %}<span class="pill">{{ poc.stars }}★</span>{% endif %}
{% if poc.primary_language %}<span class="pill">{{ poc.primary_language }}</span>{% endif %}
</div>
<div class="muted small">CVE: <a href="/cve/{{ poc.cve_id }}.html">{{ poc.cve_id }}</a></div>
</article>
{% else %}
<p class="muted">No PoCs available yet.</p>
{% endfor %}
</div>
</section>
<section>
<div class="section-header">
<h2>Latest diff</h2>
<a class="text-link" href="/diffs/">Diffs</a>
</div>
{% if diff and diff.new_high_conf_pocs %}
<div class="pill-row">
{% for item in diff.new_high_conf_pocs %}
<span class="pill">+ {{ item.cve_id }} / {{ item.repo_full_name }}</span>
{% endfor %}
</div>
{% else %}
<p class="muted small">No new high-confidence PoCs in the latest run.</p>
{% endif %}
</section>
{% endblock %}


@@ -0,0 +1,47 @@
{% extends "pipeline_base.html" %}
{% block title %}PoC Explorer{% endblock %}
{% block content %}
<section>
<div class="section-header">
<h1>PoC Explorer</h1>
<div class="muted">Search across the pre-built index JSON. Client-side results stay small and fast.</div>
</div>
<input class="input" type="search" placeholder="Search CVE id, language, tier…" data-index-search data-index-url="/api/v1/index.json" data-target="#search-results">
<div id="search-results" class="card-grid" data-search-results>
<p class="muted small">Type to search recent CVEs. Results stream in from <code>/api/v1/index.json</code>.</p>
</div>
</section>
<section>
<div class="section-header">
<h2>Latest high + medium</h2>
<div class="muted small">Server-side snapshot</div>
</div>
<div class="table-wrap">
<table>
<thead>
<tr>
<th>CVE</th>
<th>High</th>
<th>Medium</th>
<th>Languages</th>
<th>Max score</th>
</tr>
</thead>
<tbody>
{% for item in index %}
<tr>
<td><a href="/cve/{{ item.cve_id }}.html">{{ item.cve_id }}</a></td>
<td>{{ item.high_confidence }}</td>
<td>{{ item.medium_confidence }}</td>
<td>{% for lang in item.top_languages %}<span class="pill">{{ lang }}</span>{% else %}<span class="muted"></span>{% endfor %}</td>
<td>{{ item.max_score|round(1) }}</td>
</tr>
{% else %}
<tr><td colspan="5" class="muted">No entries available.</td></tr>
{% endfor %}
</tbody>
</table>
</div>
</section>
{% endblock %}