mirror of
https://github.com/0xMarcio/cve.git
synced 2026-05-28 11:21:40 +02:00
Add PoC pipeline with blacklist filtering and Pages build
This commit is contained in:
+2
-1
@@ -7,6 +7,7 @@ pip install -r requirements.txt
|
||||
python scripts/fetch_kev.py
|
||||
python scripts/fetch_epss.py
|
||||
python scripts/build_site.py
|
||||
python scripts/build_all.py # new PoC discovery + scoring pipeline
|
||||
```
|
||||
|
||||
Outputs land in `docs/` and JSON under `docs/api/v1/`. Snapshots live in `docs/api/v1/snapshots/` (last 14 days) and diffs under `docs/api/v1/diff/`.
|
||||
Outputs land in `docs/` and JSON under `docs/api/v1/`. Snapshots live in `docs/api/v1/snapshots/` (last 14 days) and diffs under `docs/api/v1/diffs/`.
|
||||
|
||||
@@ -0,0 +1,118 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
|
||||
import requests
|
||||
|
||||
from pipeline_outputs import (
|
||||
build_diff,
|
||||
prune_old_diffs,
|
||||
prune_old_snapshots,
|
||||
summarise_for_snapshot,
|
||||
write_cve_outputs,
|
||||
write_diff,
|
||||
write_index,
|
||||
write_snapshot,
|
||||
write_top,
|
||||
)
|
||||
from poc_pipeline import PoCPipeline, build_scope, persist_evidence
|
||||
from site_renderer import SiteRenderer
|
||||
from utils import API_DIR, DOCS_DIR, load_json
|
||||
|
||||
|
||||
def load_existing_results(api_dir: Path) -> List[Dict]:
    """Rebuild result records from previously written per-CVE JSON files.

    Files lacking a ``pocs`` key are ignored. A missing or empty ``cve_id``
    falls back to the file stem. Returns an empty list when ``api_dir`` does
    not exist.
    """
    if not api_dir.exists():
        return []

    def to_result(path: Path, data: Dict) -> Dict:
        # Keep only the fields the downstream writers read.
        return {"cve_id": data.get("cve_id") or path.stem, "pocs": data.get("pocs", []), "last_updated": data.get("last_updated")}

    loaded = ((path, load_json(path, default={}) or {}) for path in api_dir.glob("CVE-*.json"))
    return [to_result(path, data) for path, data in loaded if "pocs" in data]
|
||||
|
||||
|
||||
def main(argv: List[str] | None = None) -> int:
    """Run PoC discovery (or reuse existing outputs) and regenerate all artifacts.

    Writes per-CVE JSON, the index, the top list, a snapshot, a diff against
    the previous ``latest.json`` snapshot, and finally renders the static site.

    Args:
        argv: Command-line arguments; ``None`` lets argparse read ``sys.argv``.

    Returns:
        0 on success, 1 when there were no results to write.
    """
    parser = argparse.ArgumentParser(description="Build CVE PoC pipeline outputs, snapshots, and static site")
    parser.add_argument("--days", type=int, default=7, help="Days window for GitHub discovery windows")
    parser.add_argument("--mode", choices=["daily", "weekly"], default="daily", help="Run mode to tune scope")
    parser.add_argument("--limit", type=int, default=50, help="Maximum CVEs to scan per run")
    parser.add_argument("--cve", action="append", help="Explicit CVE IDs to scan (can be passed multiple times)")
    parser.add_argument("--skip-discovery", action="store_true", help="Skip GitHub discovery and reuse existing API outputs")
    parser.add_argument("--check-links", action="store_true", help="Optionally HEAD check repo URLs for dead links")
    args = parser.parse_args(argv)

    pipeline = PoCPipeline()
    scope: List[str] = []
    discovery_days = args.days
    if args.cve:
        scope = [cve.upper() for cve in args.cve]
    elif not args.skip_discovery:
        prefer_recent = True
        scan_days = args.days
        limit = args.limit
        if args.mode == "weekly":
            # Weekly runs widen the window to at least 30 days and lift the
            # recent-year preference and the CVE cap.
            scan_days = max(scan_days, 30)
            discovery_days = scan_days
            prefer_recent = False
            limit = None
        scope = build_scope(scan_days, github_list=Path("github.txt"), existing_api=API_DIR / "cve", prefer_recent_years=prefer_recent, max_cves=limit)

    results: List[Dict] = []
    if args.skip_discovery:
        results = load_existing_results(API_DIR / "cve")
    else:
        for cve_id in scope:
            try:
                results.append(pipeline.discover_for_cve(cve_id, days=discovery_days))
            except Exception as exc:  # noqa: BLE001
                # Best effort: one failing CVE must not abort the whole run.
                print(f"[warn] Failed to process {cve_id}: {exc}", file=sys.stderr)
        persist_evidence(results)

    if not results:
        print("No results to write; aborting.")
        return 1

    write_cve_outputs(results)
    index_payload = write_index(results)
    top_payload = write_top(results)

    def maybe_check_links() -> List[Dict]:
        """HEAD-check up to 25 distinct high/medium repo URLs; return the dead ones."""
        if not args.check_links:
            return []
        urls = [
            poc["repo_url"]
            for result in results
            for poc in result.get("pocs", [])
            if poc.get("confidence_tier") in {"high", "medium"} and poc.get("repo_url")
        ]
        # De-duplicate (order-preserving) before capping so a repo listed
        # under several CVEs does not consume the 25-URL budget repeatedly.
        urls = list(dict.fromkeys(urls))[:25]
        dead: List[Dict] = []
        for url in urls:
            try:
                resp = requests.head(url, timeout=5, allow_redirects=True)
                if resp.status_code >= 400:
                    dead.append({"url": url, "status": resp.status_code})
            except requests.RequestException as exc:  # noqa: BLE001
                dead.append({"url": url, "error": str(exc)})
        return dead

    snapshot_payload = summarise_for_snapshot(results, top=top_payload)
    # Read the previous snapshot BEFORE write_snapshot overwrites latest.json.
    prev_snapshot = load_json(API_DIR / "snapshots" / "latest.json", default={}) or {}
    snapshot_path = write_snapshot(snapshot_payload)
    diff_payload = build_diff(prev_snapshot, snapshot_payload, dead_links=maybe_check_links())
    write_diff(diff_payload)
    prune_old_snapshots()
    prune_old_diffs()

    renderer = SiteRenderer(results=results, index_payload=index_payload, top_payload=top_payload, diff_payload=diff_payload)
    renderer.build()

    print(f"Generated site under {DOCS_DIR}")
    print(f"Wrote latest snapshot to {snapshot_path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
@@ -0,0 +1,210 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional, Tuple
|
||||
|
||||
import requests
|
||||
|
||||
from utils import CACHE_DIR, chunked, hash_key, isoformat
|
||||
|
||||
|
||||
TEXT_MATCH_HEADER = "application/vnd.github.text-match+json"
|
||||
|
||||
|
||||
class RateLimiter:
    """Enforce a minimum delay between calls, tracked separately per bucket."""

    def __init__(self, calls_per_minute: int) -> None:
        """Derive the minimum spacing; rates below 1 are treated as 1 per minute."""
        self.min_interval = 60.0 / max(calls_per_minute, 1)
        # bucket name -> time.monotonic() reading taken after the last wait()
        self.last_call: Dict[str, float] = {}

    def wait(self, bucket: str) -> None:
        """Sleep until ``min_interval`` has passed since the last call for ``bucket``.

        The first call for a bucket never sleeps. A monotonic clock is used so
        wall-clock adjustments (NTP, DST) can neither skip nor stretch the delay.
        """
        last = self.last_call.get(bucket)
        if last is not None:
            remaining = self.min_interval - (time.monotonic() - last)
            if remaining > 0:
                time.sleep(remaining)
        self.last_call[bucket] = time.monotonic()
|
||||
|
||||
|
||||
class FileCache:
    """JSON-on-disk cache keyed by hashed strings, with a per-entry expiry time."""

    # Format of the ``expires_at`` field; the value is always expressed in UTC.
    _TS_FORMAT = "%Y-%m-%dT%H:%M:%S"

    def __init__(self, base: Path) -> None:
        """Use ``base`` as the cache root, creating it if necessary."""
        self.base = base
        self.base.mkdir(parents=True, exist_ok=True)

    def _path_for(self, key: str) -> Path:
        """Map a cache key to ``<base>/<first two digest chars>/<digest>.json``."""
        digest = hash_key(key)
        return self.base / digest[:2] / f"{digest}.json"

    @staticmethod
    def _is_expired(expires_at: str) -> bool:
        """Return True when the UTC timestamp ``expires_at`` is past or unparsable."""
        import calendar

        try:
            # save() formats the expiry with time.gmtime(), so it must be read
            # back as UTC. time.mktime() would treat it as local time and shift
            # the expiry by the machine's UTC offset.
            expires_ts = calendar.timegm(time.strptime(expires_at, FileCache._TS_FORMAT))
        except (TypeError, ValueError):
            return True
        return time.time() > expires_ts

    def load(self, key: str, *, ttl: int) -> Optional[Dict]:
        """Return the cached payload for ``key``, or ``None`` on miss/expiry/corruption.

        ``ttl`` is accepted for symmetry with :meth:`save` but is not consulted;
        expiry is decided by the ``expires_at`` stored with the entry.
        """
        path = self._path_for(key)
        if not path.exists():
            return None
        try:
            with path.open("r", encoding="utf-8") as handle:
                data = json.load(handle)
        except (OSError, json.JSONDecodeError):
            return None
        if not isinstance(data, dict):
            # Not an envelope written by save(); treat as a miss.
            return None
        expires_at = data.get("expires_at")
        if expires_at and self._is_expired(expires_at):
            return None
        return data.get("payload")

    def save(self, key: str, payload: Dict, *, ttl: int) -> None:
        """Store ``payload`` under ``key`` with an expiry ``ttl`` seconds from now (UTC)."""
        path = self._path_for(key)
        path.parent.mkdir(parents=True, exist_ok=True)
        data = {
            "fetched_at": isoformat(),
            "expires_at": time.strftime(self._TS_FORMAT, time.gmtime(time.time() + ttl)),
            "payload": payload,
        }
        with path.open("w", encoding="utf-8") as handle:
            json.dump(data, handle, ensure_ascii=False, indent=2)
|
||||
|
||||
|
||||
@dataclass
class SearchResult:
    """One page of raw search output together with the request that produced it."""

    # Search kind label supplied by the caller (e.g. "repositories" or "code").
    kind: str
    # Query string that was sent.
    query: str
    # Page number that was requested.
    page: int
    # Decoded JSON response body.
    payload: Dict
|
||||
|
||||
|
||||
class GitHubClient:
    """GitHub search / GraphQL client with on-disk caching and client-side throttling."""

    # How many rate-limit pauses a single request tolerates before the error is
    # surfaced; prevents an endless sleep/retry loop.
    _MAX_RATE_LIMIT_WAITS = 3

    def __init__(
        self,
        token: Optional[str],
        *,
        cache_dir: Path | None = None,
        code_search_rpm: int = 10,
        general_rpm: int = 30,
    ) -> None:
        """Create a session, a file cache and one rate limiter per request bucket.

        Args:
            token: Bearer token; when falsy, requests are sent unauthenticated.
            cache_dir: Cache root; defaults to ``CACHE_DIR / "github"``.
            code_search_rpm: Calls per minute allowed for the ``code`` bucket.
            general_rpm: Calls per minute for the ``search`` and ``graphql`` buckets.
        """
        self.session = requests.Session()
        self.session.headers.update({"Accept": TEXT_MATCH_HEADER})
        if token:
            self.session.headers["Authorization"] = f"Bearer {token}"
        self.base_url = "https://api.github.com"
        self.graphql_url = f"{self.base_url}/graphql"
        cache_root = cache_dir or CACHE_DIR / "github"
        self.cache = FileCache(cache_root)
        self.rate_limiters = {
            "code": RateLimiter(code_search_rpm),
            "search": RateLimiter(general_rpm),
            "graphql": RateLimiter(general_rpm),
        }

    @staticmethod
    def _rate_limit_delay(response) -> Optional[float]:
        """Return seconds to pause for a rate-limited response, or ``None``.

        ``Retry-After`` takes precedence. Otherwise an exhausted
        ``X-RateLimit-Remaining`` together with ``X-RateLimit-Reset`` yields the
        time until reset plus one second. Malformed headers yield ``None``.
        """
        headers = response.headers
        retry_after = headers.get("Retry-After")
        if retry_after:
            try:
                return max(0.0, float(retry_after))
            except ValueError:
                pass
        if headers.get("X-RateLimit-Remaining") is None:
            return None
        try:
            remaining = int(headers.get("X-RateLimit-Remaining") or "0")
            reset_ts = int(headers.get("X-RateLimit-Reset") or "")
        except ValueError:
            return None
        if remaining > 0:
            return None
        return float(max(0, reset_ts - int(time.time()) + 1))

    def _request(self, method: str, url: str, *, bucket: str, **kwargs) -> requests.Response:
        """Send a throttled request with retries.

        Transport errors and 5xx responses are retried (three attempts in
        total). 403/429 rate-limit responses are retried after the delay the
        server indicates, at most ``_MAX_RATE_LIMIT_WAITS`` times.

        Raises:
            requests.RequestException: on repeated transport failure, or via
                ``raise_for_status`` for a final non-success response.
        """
        failures = 0
        rate_limit_waits = 0
        while True:
            # Throttle every attempt, including retries.
            self.rate_limiters[bucket].wait(bucket)
            try:
                response = self.session.request(method, url, timeout=30, **kwargs)
            except requests.RequestException:
                failures += 1
                if failures >= 3:
                    raise
                time.sleep(2 * failures)
                continue

            if response.status_code in (403, 429) and rate_limit_waits < self._MAX_RATE_LIMIT_WAITS:
                delay = self._rate_limit_delay(response)
                if delay is not None:
                    rate_limit_waits += 1
                    time.sleep(delay)
                    continue
            if response.status_code >= 500 and failures < 2:
                failures += 1
                time.sleep(1 + failures)
                continue
            response.raise_for_status()
            return response

    def _cached_search(self, kind: str, query: str, page: int, per_page: int, ttl: int) -> Dict:
        """Return one page of ``/search/<kind>`` results, served from cache when fresh."""
        cache_key = f"{kind}:{query}:p{page}:n{per_page}"
        cached = self.cache.load(cache_key, ttl=ttl)
        if cached is not None:
            return cached

        url = f"{self.base_url}/search/{kind}"
        params = {"q": query, "page": page, "per_page": per_page}
        resp = self._request("GET", url, params=params, bucket="code" if kind == "code" else "search")
        payload = resp.json()
        self.cache.save(cache_key, payload, ttl=ttl)
        return payload

    def search_repositories(self, query: str, *, page: int = 1, per_page: int = 100, ttl: int = 3600) -> SearchResult:
        """Run a repository search and wrap the page in a SearchResult."""
        return SearchResult("repositories", query, page, self._cached_search("repositories", query, page, per_page, ttl))

    def search_code(self, query: str, *, page: int = 1, per_page: int = 100, ttl: int = 3600) -> SearchResult:
        """Run a code search and wrap the page in a SearchResult."""
        return SearchResult("code", query, page, self._cached_search("code", query, page, per_page, ttl))

    def search_topics(self, query: str, *, page: int = 1, per_page: int = 100, ttl: int = 3600) -> SearchResult:
        """Run a topic-qualified query; it goes through the repository search endpoint."""
        return self.search_repositories(query, page=page, per_page=per_page, ttl=ttl)

    def fetch_repo_metadata(self, full_names: Iterable[str], *, ttl: int = 6 * 3600) -> Dict[str, Dict]:
        """Fetch repository metadata via batched, aliased GraphQL queries.

        Args:
            full_names: ``owner/name`` strings; entries without ``/`` map to ``{}``.
            ttl: Cache lifetime, in seconds, for each repository's metadata.

        Returns:
            Mapping of full name to the raw GraphQL repository node, or ``{}``
            when the repository could not be resolved.
        """
        results: Dict[str, Dict] = {}
        to_fetch: List[str] = []
        for name in full_names:
            cached = self.cache.load(f"repo-meta:{name}", ttl=ttl)
            if cached is not None:
                results[name] = cached
            else:
                to_fetch.append(name)

        if not to_fetch:
            return results

        fields = """
        nameWithOwner
        url
        stargazerCount
        description
        forkCount
        isFork
        isArchived
        pushedAt
        updatedAt
        primaryLanguage { name }
        parent { nameWithOwner url }
        repositoryTopics(first: 20) { nodes { topic { name } } }
        """

        for batch in chunked(to_fetch, 12):
            aliases: Dict[str, str] = {}
            parts = []
            for idx, full_name in enumerate(batch):
                owner, sep, name = full_name.partition("/")
                if not sep:
                    results[full_name] = {}
                    continue
                alias = f"repo_{idx}"
                aliases[alias] = full_name
                # json.dumps yields a quoted, escaped literal that is also a
                # valid GraphQL string, so odd characters cannot break the query.
                parts.append(f"{alias}: repository(owner: {json.dumps(owner)}, name: {json.dumps(name)}) {{ {fields} }}")
            if not parts:
                continue
            query = "query { " + " ".join(parts) + " }"
            resp = self._request("POST", self.graphql_url, json={"query": query}, bucket="graphql")
            # "data" can be null when the whole query fails.
            repos = resp.json().get("data") or {}
            for alias, full_name in aliases.items():
                meta = repos.get(alias) or {}
                if alias in repos:
                    # Cache only answers the server actually gave (including an
                    # explicit null for a missing repo), never a wholesale failure.
                    self.cache.save(f"repo-meta:{full_name}", meta, ttl=ttl)
                results[full_name] = meta

        return results
|
||||
|
||||
|
||||
def build_client(token_env: str = "GITHUB_TOKEN") -> GitHubClient:
    """Create a GitHubClient whose token comes from the ``token_env`` environment variable.

    The variable may be unset, in which case ``None`` is passed as the token.
    """
    return GitHubClient(os.environ.get(token_env), cache_dir=CACHE_DIR / "github")
|
||||
@@ -0,0 +1,220 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import Counter
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Tuple
|
||||
|
||||
from utils import API_DIR, DIFFS_DIR, SNAPSHOT_DIR, TOP_DIR, ensure_dirs, load_json, save_json, today_str
|
||||
|
||||
|
||||
def write_cve_outputs(results: List[Dict], *, base_dir: Path | None = None) -> None:
    """Write one ``<cve_id>.json`` per result into ``base_dir`` (default ``API_DIR / "cve"``).

    Only a fixed subset of each PoC's fields is written; a missing
    ``last_updated`` is replaced with ``today_str()``.
    """
    target_dir = base_dir or API_DIR / "cve"
    ensure_dirs(target_dir)

    def public_view(poc: Dict) -> Dict:
        # Key order is kept stable so the emitted JSON layout does not change.
        return {
            "repo_full_name": poc.get("repo_full_name"),
            "repo_url": poc.get("repo_url"),
            "is_fork": poc.get("is_fork"),
            "parent_repo_url": poc.get("parent_repo_url"),
            "stars": poc.get("stars"),
            "forks": poc.get("forks"),
            "archived": poc.get("archived"),
            "pushed_at": poc.get("pushed_at") or poc.get("updated_at"),
            "topics": poc.get("topics", []),
            "primary_language": poc.get("primary_language"),
            "matches": poc.get("matches", []),
            "confidence_score": poc.get("confidence_score"),
            "confidence_tier": poc.get("confidence_tier"),
        }

    for result in results:
        cve_id = result["cve_id"]
        document = {
            "cve_id": cve_id,
            "last_updated": result.get("last_updated") or today_str(),
            "pocs": [public_view(poc) for poc in result.get("pocs", [])],
        }
        save_json(target_dir / f"{cve_id}.json", document)
|
||||
|
||||
|
||||
def build_index(results: List[Dict]) -> Dict:
    """Summarise each result into an index row, newest CVE first.

    Each row carries the PoC count, the number of high/medium-confidence PoCs,
    the three most common primary languages, the highest confidence score and
    the result's ``last_updated`` value.

    Returns:
        ``{"generated": today_str(), "items": [...]}`` with items sorted by
        CVE year and sequence number, descending.
    """
    items: List[Dict] = []
    for result in results:
        poc_entries = result.get("pocs", [])
        tiers = Counter(poc.get("confidence_tier") for poc in poc_entries)
        langs = Counter(lang for lang in (poc.get("primary_language") for poc in poc_entries) if lang)
        max_score = 0.0
        for poc in poc_entries:
            max_score = max(max_score, poc.get("confidence_score") or 0)
        items.append(
            {
                "cve_id": result["cve_id"],
                "poc_count": len(poc_entries),
                "high_confidence": tiers["high"],
                "medium_confidence": tiers["medium"],
                "top_languages": [lang for lang, _ in langs.most_common(3)],
                "max_score": max_score,
                "last_updated": result.get("last_updated"),
            }
        )
    items.sort(key=lambda row: _cve_sort_key(row["cve_id"]), reverse=True)
    return {"generated": today_str(), "items": items}


def _cve_sort_key(cve_id: str) -> Tuple[int, int, int, str]:
    """Return a key ordering CVE ids by numeric year, then numeric sequence.

    A plain string sort ranks ``CVE-2024-9999`` above ``CVE-2024-10000``.
    Ids that do not look like ``CVE-<year>-<number>`` sort below all valid ids.
    """
    text = str(cve_id)
    parts = text.split("-")
    if len(parts) == 3 and parts[1].isdigit() and parts[2].isdigit():
        return (1, int(parts[1]), int(parts[2]), text)
    return (0, 0, 0, text)
|
||||
|
||||
|
||||
def write_index(results: List[Dict]) -> Dict:
    """Build the index for ``results``, save it as ``API_DIR / "index.json"`` and return it."""
    ensure_dirs(API_DIR)
    index = build_index(results)
    destination = API_DIR / "index.json"
    save_json(destination, index)
    return index
|
||||
|
||||
|
||||
def write_top(results: List[Dict], *, limit: int = 100) -> Dict:
    """Write the best-scoring high/medium PoCs to ``TOP_DIR / "today.json"``.

    Entries are ranked by score, then stars (both descending, missing values
    counted as 0) and truncated to ``limit``. Returns the written payload.
    """
    ensure_dirs(TOP_DIR)
    ranked: List[Dict] = [
        {
            "cve_id": result["cve_id"],
            "repo_full_name": poc.get("repo_full_name"),
            "repo_url": poc.get("repo_url"),
            "score": poc.get("confidence_score"),
            "tier": poc.get("confidence_tier"),
            "stars": poc.get("stars"),
            "primary_language": poc.get("primary_language"),
        }
        for result in results
        for poc in result.get("pocs", [])
        if poc.get("confidence_tier") in {"high", "medium"}
    ]

    def rank(entry: Dict):
        return (-(entry.get("score") or 0), -(entry.get("stars") or 0))

    ranked.sort(key=rank)
    payload = {"generated": today_str(), "items": ranked[:limit]}
    save_json(TOP_DIR / "today.json", payload)
    return payload
|
||||
|
||||
|
||||
def summarise_for_snapshot(results: List[Dict], *, top: Dict | None = None) -> Dict:
    """Reduce results to ``{cve_id: {repo_full_name: {"score", "tier"}}}`` for snapshotting.

    The returned payload also carries ``generated`` (from ``today_str()``)
    and, when ``top`` is truthy, the supplied top list under ``"top"``.
    """
    entries: Dict[str, Dict[str, Dict]] = {
        result["cve_id"]: {
            poc.get("repo_full_name"): {
                "score": poc.get("confidence_score"),
                "tier": poc.get("confidence_tier"),
            }
            for poc in result.get("pocs", [])
        }
        for result in results
    }
    snapshot = {"generated": today_str(), "entries": entries}
    if top:
        snapshot["top"] = top
    return snapshot
|
||||
|
||||
|
||||
def write_snapshot(summary: Dict) -> Path:
    """Save ``summary`` as ``<generated>.json`` and as ``latest.json`` in ``SNAPSHOT_DIR``.

    Returns the path of the dated file.
    """
    ensure_dirs(SNAPSHOT_DIR)
    dated_path = SNAPSHOT_DIR / f"{summary['generated']}.json"
    for destination in (dated_path, SNAPSHOT_DIR / "latest.json"):
        save_json(destination, summary)
    return dated_path
|
||||
|
||||
|
||||
def prune_old_snapshots(days: int = 14) -> None:
    """Delete date-named snapshots in ``SNAPSHOT_DIR`` older than ``days`` days (UTC).

    Files whose stem is not a ``YYYY-MM-DD`` date (such as ``latest.json``)
    are left untouched. Does nothing when the directory does not exist.
    """
    from datetime import timezone

    if not SNAPSHOT_DIR.exists():
        return
    # Timezone-aware equivalent of the deprecated naive datetime.utcnow().
    cutoff = datetime.now(timezone.utc).date() - timedelta(days=days)
    for snap in SNAPSHOT_DIR.glob("*.json"):
        try:
            snap_date = datetime.strptime(snap.stem, "%Y-%m-%d").date()
        except ValueError:
            continue
        if snap_date < cutoff:
            snap.unlink(missing_ok=True)
|
||||
|
||||
|
||||
def prune_old_diffs(days: int = 14) -> None:
    """Delete date-named diff files in ``DIFFS_DIR`` older than ``days`` days.

    Files whose stem is not a ``YYYY-MM-DD`` date are skipped.
    """
    # NOTE(review): the cutoff uses the local date; confirm this matches the
    # clock used to name the diff files.
    if not DIFFS_DIR.exists():
        return
    oldest_kept = datetime.now().date() - timedelta(days=days)

    def stem_date(path: Path):
        try:
            return datetime.strptime(path.stem, "%Y-%m-%d").date()
        except ValueError:
            return None

    for candidate in DIFFS_DIR.glob("*.json"):
        written_on = stem_date(candidate)
        if written_on is not None and written_on < oldest_kept:
            candidate.unlink(missing_ok=True)
|
||||
|
||||
|
||||
def _load_snapshot(path: Path) -> Dict:
    """Load a snapshot file, substituting ``{}`` for any falsy result."""
    loaded = load_json(path, default={})
    return loaded if loaded else {}
|
||||
|
||||
|
||||
def build_diff(prev: Dict, curr: Dict, *, dead_links: List[Dict] | None = None) -> Dict:
    """Compare two snapshots and report changes in high-confidence PoCs.

    Returns a dict with ``generated`` (taken from ``curr``), the PoCs newly
    seen at tier "high", those promoted to "high", those that were "high"
    before but are now lower or absent, and the supplied ``dead_links``.
    """
    before = prev.get("entries", {})
    after = curr.get("entries", {})

    def lookup(entries: Dict, cve_id: str, repo_name: str):
        # A missing CVE and a missing repo both come back as None.
        return (entries.get(cve_id) or {}).get(repo_name)

    new_high: List[Dict] = []
    promoted: List[Dict] = []
    for cve_id, repos in after.items():
        for repo_name, info in repos.items():
            if info.get("tier") != "high":
                continue
            earlier = lookup(before, cve_id, repo_name)
            if not earlier:
                new_high.append({"cve_id": cve_id, "repo_full_name": repo_name, "score": info.get("score")})
                continue
            if earlier.get("tier") != "high":
                promoted.append(
                    {
                        "cve_id": cve_id,
                        "repo_full_name": repo_name,
                        "score": info.get("score"),
                        "previous_tier": earlier.get("tier"),
                    }
                )

    demoted: List[Dict] = []
    for cve_id, repos in before.items():
        for repo_name, info in repos.items():
            if info.get("tier") != "high":
                continue
            current = lookup(after, cve_id, repo_name)
            still_high = bool(current) and current.get("tier") == "high"
            if still_high:
                continue
            demoted.append(
                {
                    "cve_id": cve_id,
                    "repo_full_name": repo_name,
                    "previous_score": info.get("score"),
                    "previous_tier": info.get("tier"),
                    "current_tier": current.get("tier") if current else None,
                }
            )

    return {
        "generated": curr.get("generated"),
        "new_high_conf_pocs": new_high,
        "promoted_to_high": promoted,
        "demoted_or_removed": demoted,
        "dead_links": dead_links or [],
    }
|
||||
|
||||
|
||||
def write_diff(diff: Dict) -> Path:
    """Save ``diff`` as ``<generated>.json`` and as ``latest.json`` in ``DIFFS_DIR``.

    Returns the path of the dated file.
    """
    ensure_dirs(DIFFS_DIR)
    dated_path = DIFFS_DIR / f"{diff['generated']}.json"
    for destination in (dated_path, DIFFS_DIR / "latest.json"):
        save_json(destination, diff)
    return dated_path
|
||||
|
||||
|
||||
def latest_snapshots() -> Tuple[Dict, Dict]:
    """Return ``(previous, current)`` from the two most recent dated snapshots.

    Only files named ``YYYY-MM-DD.json`` are considered. ``latest.json`` is a
    copy of the newest dated snapshot and sorts after every date; including it
    would make "current" and "previous" the same data. Missing snapshots are
    returned as ``{}``.
    """
    if not SNAPSHOT_DIR.exists():
        return {}, {}

    def is_dated(path: Path) -> bool:
        try:
            datetime.strptime(path.stem, "%Y-%m-%d")
        except ValueError:
            return False
        return True

    # ISO dates sort chronologically as plain strings.
    snaps = sorted(path for path in SNAPSHOT_DIR.glob("*.json") if is_dated(path))
    if not snaps:
        return {}, {}
    curr = _load_snapshot(snaps[-1])
    prev = _load_snapshot(snaps[-2]) if len(snaps) > 1 else {}
    return prev, curr
|
||||
@@ -0,0 +1,274 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import date, datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional, Set, Tuple
|
||||
|
||||
from github_client import GitHubClient, SearchResult, build_client
|
||||
from poc_scoring import match_score, score_repo
|
||||
from utils import API_DIR, EVIDENCE_DIR, chunked, cve_year, ensure_dirs, isoformat, load_blacklist, load_json, save_json, today_str
|
||||
|
||||
|
||||
LANG_PARTITIONS = ("python", "go", "c", "shell", "powershell", "java", "ruby", "js")
|
||||
CVE_RE = re.compile(r"CVE-\d{4}-\d{4,}", re.IGNORECASE)
|
||||
|
||||
|
||||
@dataclass
class MatchEvidence:
    """A single piece of search evidence linking a repository to a CVE."""

    # Text identifying where the match was found (a file path or a fragment).
    path: str
    # Label describing the kind of match.
    match_type: str
    # Search query that produced this match.
    query: str
    # Per-match score; None until one is assigned.
    score: float | None = None
|
||||
|
||||
|
||||
@dataclass
class RepoCandidate:
    """A repository found for a CVE, with the evidence collected for it."""

    cve_id: str
    repo_full_name: str
    repo_url: str
    # Evidence entries, unique per (path, match_type).
    matches: List[MatchEvidence] = field(default_factory=list)
    # Repository metadata; empty until the caller fills it in.
    metadata: Dict[str, object] = field(default_factory=dict)

    def add_match(self, path: str, match_type: str, query: str) -> None:
        """Record a match unless one with the same path and match type already exists."""
        already_recorded = any(m.path == path and m.match_type == match_type for m in self.matches)
        if not already_recorded:
            self.matches.append(MatchEvidence(path=path, match_type=match_type, query=query))
|
||||
|
||||
|
||||
def build_created_ranges(days: int, *, window: int = 7) -> List[Tuple[str, str]]:
    """Split the last ``days`` days (through today) into consecutive date windows.

    Args:
        days: Look-back span in days; values below 1 are treated as 1.
        window: Maximum number of days per window; values below 1 are treated
            as 1 (a non-positive window would otherwise never advance).

    Returns:
        ``(start, end)`` ISO-date pairs, inclusive and non-overlapping, in
        chronological order. The final window is clipped to today.
    """
    step = max(window, 1)
    end = date.today()
    cursor = end - timedelta(days=max(days, 1))
    ranges: List[Tuple[str, str]] = []
    # cursor starts at or before ``end``, so at least one range is produced.
    while cursor <= end:
        window_end = min(cursor + timedelta(days=step - 1), end)
        ranges.append((cursor.isoformat(), window_end.isoformat()))
        cursor = window_end + timedelta(days=1)
    return ranges
|
||||
|
||||
|
||||
def build_query_pack(cve_id: str, created_range: Tuple[str, str] | None = None) -> List[Dict[str, str]]:
    """Build the list of search queries used to find PoCs for ``cve_id``.

    Produces three repository queries (plain, "poc OR exploit" and topic),
    one code query per entry in ``LANG_PARTITIONS`` and a final code query
    without a language filter. When ``created_range`` is given, a
    ``created:<start>..<end>`` qualifier is appended to every query.
    """
    # NOTE(review): the created: qualifier is also appended to code-search
    # queries - confirm the code search endpoint honours it.
    created_suffix = f" created:{created_range[0]}..{created_range[1]}" if created_range else ""

    def entry(kind: str, text: str, match_type: str) -> Dict[str, str]:
        return {"kind": kind, "query": text + created_suffix, "match_type": match_type}

    repo_queries = [
        entry("repositories", f'{cve_id} in:name,description,readme fork:false', "name"),
        entry("repositories", f'{cve_id} (poc OR exploit) in:name,description,readme fork:false', "description"),
        entry("repositories", f"topic:{cve_id.lower()} fork:false", "topic"),
    ]
    code_queries = [entry("code", f'{cve_id} in:file language:{lang}', "code") for lang in LANG_PARTITIONS]
    # generic code search without language partition for the most recent window
    code_queries.append(entry("code", f"{cve_id} in:file", "code"))
    return repo_queries + code_queries
|
||||
|
||||
|
||||
def parse_repo_from_item(item: Dict) -> Tuple[str | None, str | None]:
    """Extract ``(repo_full_name, repo_url)`` from a search result item.

    Items that describe a repository directly carry ``full_name`` and
    ``html_url`` themselves. Items with a nested ``repository`` object (code
    search hits) have an ``html_url`` that points at the matched file, so the
    nested repository's URL is preferred for them.

    Returns:
        A pair in which either element is ``None`` when it cannot be determined.
    """
    repository = item.get("repository") or {}
    repo_full_name = item.get("full_name") or repository.get("full_name")
    if not repo_full_name and repository:
        # Fallback: rebuild "owner/name" only when both halves are present.
        owner = (repository.get("owner") or {}).get("login")
        name = repository.get("name")
        if owner and name:
            repo_full_name = f"{owner}/{name}"

    if repository:
        repo_url = repository.get("html_url")
        if not repo_url and repo_full_name:
            # Never fall back to the item's html_url here: it is a file link.
            repo_url = f"https://github.com/{repo_full_name}"
    else:
        repo_url = item.get("html_url")
    return repo_full_name or None, repo_url or None
|
||||
|
||||
|
||||
def extract_matches(item: Dict, default_type: str, query: str) -> List[MatchEvidence]:
    """Turn a search item's ``text_matches`` into MatchEvidence records.

    Each text match uses its ``fragment`` (falling back to ``path``, then the
    property name) as the evidence path, and its ``property`` or
    ``object_type`` as the match type, defaulting to ``default_type``. When
    the item has no text matches, a single record is built from the item's
    ``path`` (or ``default_type``).
    """

    def from_text_match(text_match: Dict) -> MatchEvidence:
        prop = text_match.get("property") or text_match.get("object_type") or ""
        fragment = text_match.get("fragment") or text_match.get("path") or prop or ""
        return MatchEvidence(path=str(fragment), match_type=str(prop or default_type), query=query)

    found = [from_text_match(text_match) for text_match in item.get("text_matches", []) or []]
    if found:
        return found
    fallback_path = item.get("path") or default_type
    return [MatchEvidence(path=str(fallback_path), match_type=default_type, query=query)]
|
||||
|
||||
|
||||
def normalise_metadata(meta: Dict, fallback_full_name: str, fallback_url: str) -> Dict:
    """Flatten a GraphQL repository node into the pipeline's metadata dict.

    Args:
        meta: Raw repository node; may be empty when the lookup failed.
        fallback_full_name: Used when ``meta`` has no ``nameWithOwner``.
        fallback_url: Used when ``meta`` has no ``url``.

    Returns:
        Dict with repo identity, description, fork/archive flags, star and
        fork counts (0 when missing), timestamps, topics and primary language.
    """
    # Any level of the topics structure may be null; treat null as empty
    # instead of raising AttributeError.
    topic_nodes = (meta.get("repositoryTopics") or {}).get("nodes") or []
    topics = []
    for node in topic_nodes:
        topic_name = ((node or {}).get("topic") or {}).get("name")
        if topic_name:
            topics.append(topic_name)
    primary_language = (meta.get("primaryLanguage") or {}).get("name")
    parent = meta.get("parent") or {}
    return {
        "repo_full_name": meta.get("nameWithOwner") or fallback_full_name,
        "repo_url": meta.get("url") or fallback_url,
        "description": meta.get("description") or "",
        "is_fork": bool(meta.get("isFork")),
        "parent_repo_url": parent.get("url"),
        "stars": meta.get("stargazerCount") or 0,
        "forks": meta.get("forkCount") or 0,
        "archived": bool(meta.get("isArchived")),
        "pushed_at": meta.get("pushedAt"),
        "updated_at": meta.get("updatedAt"),
        "topics": topics,
        "primary_language": primary_language,
    }
|
||||
|
||||
|
||||
class PoCPipeline:
    """Discover, enrich and score candidate PoC repositories for CVE ids."""

    def __init__(
        self,
        client: GitHubClient | None = None,
        *,
        blacklist_path: Path | None = None,
        search_ttl: int = 3 * 3600,
    ) -> None:
        """Store the client (built with ``build_client()`` when omitted), blacklist and search TTL."""
        self.client = client or build_client()
        self.blacklist = load_blacklist(blacklist_path)
        self.search_ttl = search_ttl

    def _run_query(self, query: Dict, page: int) -> SearchResult:
        """Dispatch ``query`` to the client method matching its ``kind`` (50 results per page)."""
        kind = query["kind"]
        if kind == "repositories":
            search = self.client.search_repositories
        elif kind == "code":
            search = self.client.search_code
        else:
            search = self.client.search_topics
        return search(query["query"], page=page, per_page=50, ttl=self.search_ttl)

    def discover_for_cve(self, cve_id: str, *, days: int, max_pages_repo: int = 2, max_pages_code: int = 2) -> Dict:
        """Search for, enrich and score repositories mentioning ``cve_id``.

        Returns a dict with ``cve_id``, ``last_updated``, the scored ``pocs``
        (best first) and an ``evidence`` record of the queries and candidates.
        """
        candidates: Dict[str, RepoCandidate] = {}
        query_log: List[Dict] = []

        for created_range in build_created_ranges(days):
            for query in build_query_pack(cve_id, created_range):
                query_log.append({"query": query["query"], "kind": query["kind"], "window": created_range})
                last_page = max_pages_code if query["kind"] == "code" else max_pages_repo
                page = 1
                while page <= last_page:
                    items = self._run_query(query, page).payload.get("items", [])
                    for item in items:
                        repo_full_name, repo_url = parse_repo_from_item(item)
                        if not (repo_full_name and repo_url):
                            continue
                        candidate = candidates.get(repo_full_name)
                        if candidate is None:
                            candidate = RepoCandidate(cve_id=cve_id, repo_full_name=repo_full_name, repo_url=repo_url)
                            candidates[repo_full_name] = candidate
                        for match in extract_matches(item, query["match_type"], query["query"]):
                            candidate.add_match(match.path, match.match_type, match.query)
                    # A short page means there is nothing further to fetch.
                    if len(items) < 50:
                        break
                    page += 1

        metadata = self.client.fetch_repo_metadata(candidates.keys())
        for repo_full_name, candidate in candidates.items():
            raw_meta = metadata.get(repo_full_name, {})
            candidate.metadata = normalise_metadata(raw_meta, repo_full_name, candidate.repo_url)

        repos: List[Dict] = []
        for candidate in candidates.values():
            scored_matches = []
            for evidence_item in candidate.matches:
                evidence_item.score = match_score({"path": evidence_item.path, "match_type": evidence_item.match_type})
                scored_matches.append(
                    {
                        "path": evidence_item.path,
                        "match_type": evidence_item.match_type,
                        "query": evidence_item.query,
                        "score": evidence_item.score,
                    }
                )
            score, tier = score_repo(candidate.metadata, scored_matches, self.blacklist)
            repos.append(
                {
                    **candidate.metadata,
                    "matches": scored_matches,
                    "confidence_score": score,
                    "confidence_tier": tier,
                    "cve_id": cve_id,
                }
            )

        # Highest confidence first; stars break ties.
        repos = sorted(repos, key=lambda r: (-r["confidence_score"], -r.get("stars", 0)))

        candidate_summaries = [
            {
                "repo_full_name": repo["repo_full_name"],
                "matches": repo["matches"],
                "match_count": len(repo["matches"]),
                "score": repo["confidence_score"],
                "tier": repo["confidence_tier"],
            }
            for repo in repos
        ]
        evidence = {"queries": query_log, "candidates": candidate_summaries}
        return {"cve_id": cve_id, "last_updated": isoformat(), "pocs": repos, "evidence": evidence}

    def discover_many(self, cve_ids: Iterable[str], *, days: int, limit: Optional[int] = None) -> List[Dict]:
        """Run ``discover_for_cve`` over ``cve_ids``, stopping after ``limit`` ids when it is truthy."""
        collected: List[Dict] = []
        for position, cve_id in enumerate(cve_ids):
            if limit and position >= limit:
                break
            collected.append(self.discover_for_cve(cve_id, days=days))
        return collected
|
||||
|
||||
|
||||
def persist_evidence(results: List[Dict]) -> None:
    """Write each result's ``evidence`` (or ``{}``) to ``EVIDENCE_DIR/<cve_id>.json``."""
    ensure_dirs(EVIDENCE_DIR)
    for result in results:
        destination = EVIDENCE_DIR / f"{result['cve_id']}.json"
        save_json(destination, result.get("evidence", {}))
|
||||
|
||||
|
||||
def discover_from_github_list(path: Path) -> List[str]:
    """Collect the distinct CVE ids mentioned in the text file at ``path``.

    Ids are upper-cased and returned in order of first appearance. Returns an
    empty list when the file does not exist.
    """
    if not path.exists():
        return []
    text = path.read_text(encoding="utf-8")
    # dict.fromkeys de-duplicates in a single pass while keeping first-seen
    # order, avoiding a list-membership test per match. The pattern cannot
    # match across a newline, so scanning the whole text equals scanning
    # line by line.
    return list(dict.fromkeys(found.group(0).upper() for found in CVE_RE.finditer(text)))
|
||||
|
||||
|
||||
def load_existing_cves(api_dir: Path = API_DIR / "cve") -> List[str]:
    """Return the sorted, upper-cased CVE ids that have a ``CVE-*.json`` file in ``api_dir``.

    Stems that do not start with a CVE id pattern are ignored; a missing
    directory yields an empty list.
    """
    if not api_dir.exists():
        return []
    stems = (entry.stem for entry in api_dir.glob("CVE-*.json"))
    unique_ids = {stem.upper() for stem in stems if CVE_RE.match(stem)}
    return sorted(unique_ids)
|
||||
|
||||
|
||||
def build_scope(
    days: int,
    *,
    github_list: Path,
    existing_api: Path,
    prefer_recent_years: bool = True,
    max_cves: int | None = None,
    low_conf_threshold: int = 1,
) -> List[str]:
    """Choose which CVE ids to scan in this run.

    Candidates come from ``github_list``, or from the existing API directory
    when that list yields nothing. CVEs from ``API_DIR / "index.json"`` whose
    combined high + medium PoC count is at most ``low_conf_threshold`` are
    appended so they get rescanned.

    Args:
        days: Accepted for interface compatibility; not used here.
        github_list: Text file scanned for CVE ids.
        existing_api: Directory of previously written per-CVE JSON files.
        prefer_recent_years: Keep only CVEs from the current year and the two
            before it, unless that would leave no candidates.
        max_cves: Truncate the scope to this many ids when truthy.
        low_conf_threshold: Upper bound on high + medium PoCs for a rescan.

    Returns:
        The ordered list of CVE ids to scan.
    """
    # Glob the existing API directory only when the seed list is empty.
    candidates = discover_from_github_list(github_list) or load_existing_cves(existing_api)

    if prefer_recent_years:
        min_year = date.today().year - 2
        recent = [cve for cve in candidates if (cve_year(cve) or 0) >= min_year]
        candidates = recent or candidates

    scoped = list(candidates)
    index_path = API_DIR / "index.json"
    if index_path.exists():
        # Set membership keeps this merge linear in the number of index rows.
        known = set(candidates)
        index_payload = load_json(index_path, default={}) or {}
        for item in index_payload.get("items", []):
            confident = (item.get("high_confidence", 0) or 0) + (item.get("medium_confidence", 0) or 0)
            cve = item.get("cve_id")
            if confident <= low_conf_threshold and cve and cve not in known:
                known.add(cve)
                scoped.append(cve)

    if max_cves:
        scoped = scoped[:max_cves]
    return scoped
|
||||
@@ -0,0 +1,121 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import Dict, Iterable, List, Tuple
|
||||
|
||||
from utils import clamp, parse_date
|
||||
|
||||
# File extensions (lower-case, without the dot) that is_doc_path() treats as documentation.
DOC_EXTS = {"md", "txt", "rst", "adoc", "markdown", "mkd", "mdown"}
|
||||
# Keywords that raise a repository's score when found in its description or topics (see score_repo).
POSITIVE_KEYWORDS = ("poc", "exploit", "rce", "lpe", "auth bypass", "bypass")
|
||||
# Description keywords that lower the score when the repository has no non-documentation matches (see score_repo).
NEGATIVE_KEYWORDS = ("report", "writeup", "advisory", "changelog")
|
||||
|
||||
|
||||
def is_doc_path(path: str) -> bool:
    """Tell whether *path* should be treated as documentation.

    A path ending in ``/`` always counts as documentation. Otherwise the text
    after the last dot (case-insensitive) must be one of ``DOC_EXTS``; a path
    without any dot is not documentation.
    """
    lowered = path.lower()
    if lowered.endswith("/"):
        return True
    _, dot, extension = lowered.rpartition(".")
    # An empty `dot` means the path contained no "." at all.
    return bool(dot) and extension in DOC_EXTS
|
||||
|
||||
|
||||
def match_score(match: Dict) -> float:
    """Score a single search match on a 0-100 scale.

    Non-documentation paths start at 50 and documentation paths at 30. A
    ``match_type`` of exactly ``"code"`` adds 10, one containing ``"readme"``
    adds 5 and one containing ``"topic"`` subtracts 5. The result is clamped
    to the 0-100 range.
    """
    kind = str(match.get("match_type", "")).lower()
    points = 30 if is_doc_path(str(match.get("path", ""))) else 50
    if kind == "code":
        points += 10
    if "readme" in kind:
        points += 5
    if "topic" in kind:
        points -= 5
    return clamp(points, 0, 100)
|
||||
|
||||
|
||||
def tier_for_score(score: float) -> str:
    """Map a numeric score to its confidence tier.

    Scores of 75 and above are ``"high"``, 45 and above ``"medium"``, and
    everything else ``"low"``.
    """
    # Thresholds are checked from the highest tier down.
    for floor, label in ((75, "high"), (45, "medium")):
        if score >= floor:
            return label
    return "low"
|
||||
|
||||
|
||||
def keyword_hits(text: str, keywords: Iterable[str]) -> int:
    """Count how many of *keywords* occur in *text*, ignoring the text's case.

    A keyword only counts when it starts at a word boundary, i.e. it is not
    immediately preceded by a letter or digit. Plain substring matching
    produced false hits such as "rce" inside "source" or "poc" inside
    "epoch". Suffixes are still allowed, so "exploit" matches "exploits".

    Args:
        text: Text to search; ``None`` or an empty string yields 0.
        keywords: Keywords to look for. They are compared as given against the
            lower-cased text, so they are expected to be lower-case.

    Returns:
        The number of keywords that matched (each keyword counts at most once).
    """
    if not text:
        return 0
    lower = text.lower()
    # `[^\W_]` is "letter or digit"; the negative lookbehind therefore accepts
    # start-of-string, whitespace, punctuation and "_" before the keyword.
    return sum(
        1
        for kw in keywords
        if re.search(r"(?<![^\W_])" + re.escape(kw), lower)
    )
|
||||
|
||||
|
||||
def recency_bonus(pushed_at: str | None) -> float:
    """Return the score bonus earned by how recently a repository was pushed.

    The timestamp is parsed with ``parse_date``; a naive result is taken to be
    UTC. The bonus is 18.0 within 30 days, 10.0 within 90 days, 5.0 within 180
    days and 0.0 otherwise, including when the value is missing or cannot be
    parsed.
    """
    stamp = parse_date(pushed_at) if pushed_at else None
    if not stamp:
        return 0.0
    if stamp.tzinfo is None:
        stamp = stamp.replace(tzinfo=timezone.utc)
    age = datetime.now(timezone.utc) - stamp
    # Windows are ordered from most to least recent; the first one that fits wins.
    for limit_days, bonus in ((30, 18.0), (90, 10.0), (180, 5.0)):
        if age <= timedelta(days=limit_days):
            return bonus
    return 0.0
|
||||
|
||||
|
||||
def score_repo(repo: Dict, matches: List[Dict], blacklist: List[str]) -> Tuple[float, str]:
    """Score a repository as a PoC candidate and assign its confidence tier.

    Starting from a base of 12, the score is adjusted by: code (non-doc)
    matches, a penalty when only documentation matched, push recency, stars,
    forks, positive and negative keywords, fork/archived status, a blacklist
    penalty, and a small per-match bonus.

    Args:
        repo: Repository metadata. Both GraphQL-style keys (``stargazerCount``,
            ``forkCount``, ``nameWithOwner``, ``pushedAt``) and the alternative
            keys read below are accepted.
        matches: Search matches for the repository; each is read for ``path``
            here and passed to ``match_score``.
        blacklist: Name patterns. An entry ending in ``*`` is a prefix match on
            the lower-cased repository name; any other entry is a substring
            match.

    Returns:
        ``(score, tier)`` where the score is clamped to 0-100 and the tier comes
        from ``tier_for_score`` applied to the unclamped score.
    """
    stars = repo.get("stargazerCount") or repo.get("stars") or 0
    forks = repo.get("forkCount") or repo.get("forks") or 0
    is_fork = bool(repo.get("isFork"))
    archived = bool(repo.get("isArchived"))
    # `or []` (rather than a .get default) also covers an explicit null value,
    # matching how the other optional fields are read.
    topics = [t.lower() for t in (repo.get("topics") or []) if t]
    name = str(repo.get("nameWithOwner") or repo.get("repo_full_name") or "").lower()
    description = str(repo.get("description") or "").lower()

    non_doc_matches = [m for m in matches if not is_doc_path(str(m.get("path", "")))]

    score = 12.0
    if non_doc_matches:
        score += 25 + min(len(non_doc_matches) * 2, 10)
    elif matches:
        # Every match is a documentation path: likely a write-up, not a PoC.
        score -= 20

    score += recency_bonus(repo.get("pushed_at") or repo.get("pushedAt") or repo.get("updated_at"))

    # Popularity contributes at most 25 (stars) + 5 (forks) points.
    score += min(stars / 50.0, 25.0)
    score += min(forks / 200.0, 5.0)

    score += keyword_hits(description, POSITIVE_KEYWORDS) * 4.0
    score += keyword_hits(" ".join(topics), POSITIVE_KEYWORDS) * 4.0

    negative_bias = keyword_hits(description, NEGATIVE_KEYWORDS)
    if negative_bias and not non_doc_matches:
        score -= 15

    if is_fork:
        score -= 12
    if archived:
        score -= 30

    # The blacklist penalty is applied at most once, for the first entry that hits.
    for forbidden in (entry.lower() for entry in blacklist):
        if not forbidden:
            continue
        if forbidden.endswith("*"):
            prefix = forbidden[:-1]
            hit = bool(prefix) and name.startswith(prefix)
        else:
            hit = forbidden in name
        if hit:
            score -= 40
            break

    for match in matches:
        score += match_score(match) / 25.0

    return clamp(score, 0, 100), tier_for_score(score)
|
||||
@@ -0,0 +1,99 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
|
||||
from jinja2 import Environment, FileSystemLoader, select_autoescape
|
||||
|
||||
from utils import DOCS_DIR, TEMPLATES_DIR, ensure_dirs
|
||||
|
||||
|
||||
def build_env() -> Environment:
    """Create the Jinja2 environment used to render the site templates.

    Templates are loaded from ``TEMPLATES_DIR``, autoescaping is enabled for
    ``html`` and ``xml`` templates, and both block-whitespace options
    (``trim_blocks`` and ``lstrip_blocks``) are switched on.
    """
    return Environment(
        loader=FileSystemLoader(str(TEMPLATES_DIR)),
        autoescape=select_autoescape(["html", "xml"]),
        trim_blocks=True,
        lstrip_blocks=True,
    )
|
||||
|
||||
|
||||
class SiteRenderer:
    """Render the PoC pipeline's static HTML pages under ``DOCS_DIR``.

    The renderer is given already-built payloads (per-CVE results, the index,
    the "top" list and an optional diff) and writes one HTML file per page
    using the templates loaded by ``build_env``.
    """

    def __init__(
        self,
        *,
        results: List[Dict],
        index_payload: Dict,
        top_payload: Dict,
        diff_payload: Dict | None = None,
    ) -> None:
        """Store the payloads and make sure the output directories exist.

        Each result is copied with an extra ``visible_pocs`` key holding the
        PoCs whose ``confidence_tier`` is ``"high"`` or ``"medium"``; when
        there are none, all of the result's PoCs are shown instead.
        """
        self.results = []
        for result in results:
            pocs = result.get("pocs", [])
            visible = [p for p in pocs if p.get("confidence_tier") in {"high", "medium"}]
            self.results.append({**result, "visible_pocs": visible or pocs})
        self.index_payload = index_payload
        self.top_payload = top_payload
        self.diff_payload = diff_payload or {}
        self.env = build_env()
        ensure_dirs(
            DOCS_DIR,
            DOCS_DIR / "pocs",
            DOCS_DIR / "cve",
            DOCS_DIR / "diffs",
            DOCS_DIR / "assets",
        )

    def render(self, template_name: str, context: Dict, target: Path) -> None:
        """Render *template_name* with *context* and write it to *target* as UTF-8.

        Missing parent directories of *target* are created.
        """
        html = self.env.get_template(template_name).render(**context)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(html, encoding="utf-8")

    def build(self) -> None:
        """Write the landing page, the PoC listing, one page per CVE and the diff pages."""
        generated = self.index_payload.get("generated")
        items = self.index_payload.get("items", [])
        top_items = self.top_payload.get("items", [])

        def total(field: str) -> int:
            # `or 0` tolerates a missing or null count instead of failing in sum().
            return sum((item.get(field) or 0) for item in items)

        summary = {
            "generated": generated,
            "total_cves": len(items),
            "total_pocs": total("poc_count"),
            "high_total": total("high_confidence"),
            "medium_total": total("medium_confidence"),
        }
        self.render(
            "pipeline_index.html",
            {
                "summary": summary,
                "top": top_items[:25],
                "diff": self.diff_payload,
            },
            DOCS_DIR / "index.html",
        )

        self.render(
            "pipeline_pocs.html",
            {
                "generated": generated,
                "index": items,
                "top": top_items[:100],
            },
            DOCS_DIR / "pocs" / "index.html",
        )

        for result in self.results:
            self.render(
                "pipeline_cve.html",
                {"cve": result, "generated": generated},
                DOCS_DIR / "cve" / f"{result['cve_id']}.html",
            )

        if self.diff_payload:
            # The same diff page is written as the "latest" index and, when the
            # diff carries a `generated` value, under a file named after it.
            diff_context = {"diff": self.diff_payload, "generated": generated}
            targets = [DOCS_DIR / "diffs" / "index.html"]
            diff_date = self.diff_payload.get("generated")
            if diff_date:
                targets.append(DOCS_DIR / "diffs" / f"{diff_date}.html")
            for target in targets:
                self.render("pipeline_diff.html", diff_context, target)
|
||||
+126
-9
@@ -13,8 +13,13 @@ DATA_DIR = ROOT / "data"
|
||||
DOCS_DIR = ROOT / "docs"
|
||||
API_DIR = DOCS_DIR / "api" / "v1"
|
||||
SNAPSHOT_DIR = API_DIR / "snapshots"
|
||||
DIFFS_DIR = API_DIR / "diffs"
|
||||
TOP_DIR = API_DIR / "top"
|
||||
TEMPLATES_DIR = ROOT / "templates"
|
||||
ASSETS_DIR = DOCS_DIR / "assets"
|
||||
CACHE_DIR = DATA_DIR / "cache"
|
||||
STATE_DIR = DATA_DIR / "state"
|
||||
EVIDENCE_DIR = DATA_DIR / "evidence"
|
||||
|
||||
|
||||
def ensure_dirs(*paths: Path) -> None:
|
||||
@@ -45,6 +50,21 @@ def today_str() -> str:
|
||||
return datetime.now(timezone.utc).date().isoformat()
|
||||
|
||||
|
||||
def now_utc() -> datetime:
    """Return the current time as a timezone-aware datetime in UTC."""
    current = datetime.now(tz=timezone.utc)
    return current
|
||||
|
||||
|
||||
def isoformat(dt: datetime | None = None) -> str:
    """Return *dt* as an ISO 8601 string, using the current UTC time when *dt* is omitted."""
    moment = dt if dt else now_utc()
    return moment.isoformat()
|
||||
|
||||
|
||||
def parse_date(value: str) -> datetime | None:
    """Parse an ISO 8601 timestamp, accepting a trailing ``Z`` for UTC.

    Args:
        value: Timestamp text such as ``"2024-01-02T03:04:05Z"``.

    Returns:
        The parsed datetime, or ``None`` when *value* is not a string or is
        not in a format ``datetime.fromisoformat`` understands.
    """
    # Values read from JSON may be null or numeric; treat them as unparseable
    # rather than failing on the missing .replace() method.
    if not isinstance(value, str):
        return None
    try:
        # "Z" is rewritten because older fromisoformat() versions reject it.
        return datetime.fromisoformat(value.replace("Z", "+00:00"))
    except ValueError:
        return None
|
||||
|
||||
|
||||
def slugify(text: str) -> str:
|
||||
cleaned = re.sub(r"[^A-Za-z0-9]+", "-", text.strip().lower())
|
||||
cleaned = cleaned.strip("-")
|
||||
@@ -79,6 +99,7 @@ CVE_SECTION_RE = re.compile(r"^CVE-\d{4}-\d{4,}$", re.IGNORECASE)
|
||||
def load_poc_index() -> Dict[str, Dict[str, object]]:
|
||||
"""Load CVE → {desc, poc} mapping from docs/CVE_list.json or markdown files."""
|
||||
cve_json = DOCS_DIR / "CVE_list.json"
|
||||
blacklist = load_blacklist()
|
||||
if cve_json.exists():
|
||||
data = load_json(cve_json, default=[]) or []
|
||||
mapping = {}
|
||||
@@ -86,32 +107,35 @@ def load_poc_index() -> Dict[str, Dict[str, object]]:
|
||||
cve = str(entry.get("cve", "")).upper()
|
||||
if not is_valid_cve(cve):
|
||||
continue
|
||||
poc_links = stable_unique(entry.get("poc", []) or [])
|
||||
poc_links = filter_links_by_blacklist(poc_links, blacklist)
|
||||
mapping[cve] = {
|
||||
"desc": entry.get("desc", ""),
|
||||
"poc": stable_unique(entry.get("poc", []) or []),
|
||||
"poc": poc_links,
|
||||
}
|
||||
return mapping
|
||||
|
||||
return build_poc_index_from_markdown()
|
||||
return build_poc_index_from_markdown(blacklist=blacklist)
|
||||
|
||||
|
||||
def build_poc_index_from_markdown() -> Dict[str, Dict[str, object]]:
|
||||
def build_poc_index_from_markdown(*, blacklist: Optional[List[str]] = None) -> Dict[str, Dict[str, object]]:
|
||||
mapping: Dict[str, Dict[str, object]] = {}
|
||||
for md_path in sorted(ROOT.glob("[12][0-9][0-9][0-9]/CVE-*.md")):
|
||||
cve = md_path.stem.upper()
|
||||
if not is_valid_cve(cve):
|
||||
continue
|
||||
desc, poc_links = parse_cve_markdown(md_path)
|
||||
desc, poc_links = parse_cve_markdown(md_path, blacklist=blacklist)
|
||||
mapping[cve] = {"desc": desc, "poc": poc_links}
|
||||
return mapping
|
||||
|
||||
|
||||
def parse_cve_markdown(path: Path) -> Tuple[str, List[str]]:
|
||||
def parse_cve_markdown(path: Path, *, blacklist: Optional[List[str]] = None) -> Tuple[str, List[str]]:
|
||||
text = path.read_text(encoding="utf-8")
|
||||
sections = parse_sections(text)
|
||||
description = normalise_block(sections.get("### Description", ""))
|
||||
references = collect_links(sections.get("#### Reference", ""))
|
||||
github_links = collect_links(sections.get("#### Github", ""))
|
||||
blacklist = blacklist or []
|
||||
references = collect_links(sections.get("#### Reference", ""), blacklist=blacklist)
|
||||
github_links = collect_links(sections.get("#### Github", ""), blacklist=blacklist)
|
||||
poc_links = stable_unique([*references, *github_links])
|
||||
return description, poc_links
|
||||
|
||||
@@ -144,7 +168,7 @@ def parse_sections(content: str) -> Dict[str, str]:
|
||||
return sections
|
||||
|
||||
|
||||
def collect_links(block: str) -> List[str]:
|
||||
def collect_links(block: str, *, blacklist: Optional[List[str]] = None) -> List[str]:
|
||||
links: List[str] = []
|
||||
for raw in block.splitlines():
|
||||
entry = raw.strip()
|
||||
@@ -154,7 +178,7 @@ def collect_links(block: str) -> List[str]:
|
||||
entry = entry[2:].strip()
|
||||
if entry and entry not in links:
|
||||
links.append(entry)
|
||||
return links
|
||||
return filter_links_by_blacklist(links, blacklist or [])
|
||||
|
||||
|
||||
def is_valid_cve(cve_id: str) -> bool:
|
||||
@@ -165,6 +189,15 @@ def is_valid_cve(cve_id: str) -> bool:
|
||||
return year.isdigit() and parts[2].isdigit()
|
||||
|
||||
|
||||
def cve_year(cve_id: str) -> int | None:
    """Return the year component of a CVE ID as an int.

    ``None`` is returned when ``is_valid_cve`` rejects the ID or the year
    field cannot be converted to an integer.
    """
    if not is_valid_cve(cve_id):
        return None
    year_field = cve_id.split("-")[1]
    try:
        return int(year_field)
    except (TypeError, ValueError):
        return None
|
||||
|
||||
|
||||
# --- Trending PoCs -------------------------------------------------------
|
||||
|
||||
TREND_ROW_RE = re.compile(r"^\|\s*(?P<stars>\d+)\s*⭐\s*\|\s*(?P<updated>[^|]+)\|\s*\[(?P<name>[^\]]+)\]\((?P<url>[^)]+)\)\s*\|\s*(?P<desc>.*)\|$")
|
||||
@@ -199,3 +232,87 @@ def read_text(path: Path) -> str:
|
||||
def write_text(path: Path, content: str) -> None:
    """Write *content* to *path* as UTF-8, creating missing parent directories."""
    folder = path.parent
    folder.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(content)
|
||||
|
||||
|
||||
# --- New helpers for PoC discovery -------------------------------------------------
|
||||
|
||||
|
||||
def clamp(value: float, minimum: float = 0, maximum: float = 100) -> float:
    """Limit *value* to the inclusive range ``[minimum, maximum]``."""
    # Apply the upper bound first, then the lower bound.
    capped = value if value < maximum else maximum
    return capped if capped > minimum else minimum
|
||||
|
||||
|
||||
def chunked(iterable: Iterable, size: int) -> Iterable[List]:
    """Yield consecutive lists of up to *size* items taken from *iterable*.

    Every yielded list has *size* items except possibly the last, which holds
    whatever remains. Nothing is yielded for an empty iterable.
    """
    pending: List = []
    for element in iterable:
        pending.append(element)
        if len(pending) < size:
            continue
        yield pending
        pending = []
    if pending:
        yield pending
|
||||
|
||||
|
||||
def hash_key(text: str) -> str:
    """Return the hexadecimal SHA-256 digest of *text* encoded as UTF-8."""
    import hashlib

    hasher = hashlib.sha256()
    hasher.update(text.encode("utf-8"))
    return hasher.hexdigest()
|
||||
|
||||
|
||||
def load_blacklist(path: Path | None = None) -> List[str]:
    """Read blacklist entries from a text file, one entry per line.

    The file defaults to ``blacklist.txt`` under ``ROOT``. Lines are stripped
    of surrounding whitespace; blank lines and lines starting with ``#`` are
    ignored. A missing file yields an empty list.
    """
    target = path if path else ROOT / "blacklist.txt"
    if not target.exists():
        return []
    stripped = (raw.strip() for raw in target.read_text(encoding="utf-8").splitlines())
    return [line for line in stripped if line and not line.startswith("#")]
|
||||
|
||||
|
||||
def extract_repo_from_url(url: str) -> str:
    """Return the lower-cased repository name segment of a URL (best effort).

    The second path segment is taken as the repository name
    (``/owner/repo/...``); with a single segment that segment is returned. A
    trailing ``.git`` (clone URL) is dropped so that
    ``https://github.com/owner/repo.git`` and ``https://github.com/owner/repo``
    resolve to the same name.

    Args:
        url: URL or ``owner/repo`` style path.

    Returns:
        The repository name, or ``""`` when *url* is empty or not a string, or
        when it names a host that does not contain ``"github"``.
    """
    from urllib.parse import urlparse

    if not url or not isinstance(url, str):
        return ""
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse rejects some malformed URLs (e.g. a broken IPv6 host);
        # fall back to treating the whole string as a path.
        path = url
    else:
        host = (parsed.netloc or "").lower()
        if host and "github" not in host:
            return ""
        path = parsed.path or url
    # str.split always returns at least one element, so parts[0] is safe.
    parts = path.strip("/").split("/")
    repo = (parts[1] if len(parts) >= 2 else parts[0]).lower()
    if repo.endswith(".git"):
        repo = repo[: -len(".git")]
    return repo
|
||||
|
||||
|
||||
def is_blacklisted_repo(url: str, blacklist: List[str]) -> bool:
    """Tell whether the repository named by *url* matches a blacklist entry.

    Entries are stripped and lower-cased before comparison. An entry ending in
    ``*`` matches any repository name starting with the text before the star;
    any other entry must equal the repository name exactly. Empty entries and
    a bare ``*`` never match. URLs from which no repository name can be
    extracted are never blacklisted.
    """
    repo = extract_repo_from_url(url)
    if not repo:
        return False
    for pattern in (entry.strip().lower() for entry in blacklist):
        if pattern.endswith("*"):
            stem = pattern[:-1]
            matched = bool(stem) and repo.startswith(stem)
        else:
            matched = bool(pattern) and repo == pattern
        if matched:
            return True
    return False
|
||||
|
||||
|
||||
def filter_links_by_blacklist(links: List[str], blacklist: List[str]) -> List[str]:
    """Drop the links whose repository is blacklisted, keeping the original order.

    With an empty blacklist the input list itself is returned unchanged;
    otherwise a new list is built.
    """
    if blacklist:
        return [link for link in links if not is_blacklisted_repo(link, blacklist)]
    return links
|
||||
|
||||
Reference in New Issue
Block a user