Add PoC pipeline with blacklist filtering and Pages build

This commit is contained in:
0xMarcio
2025-12-17 15:53:37 +01:00
parent b1085c10f5
commit 1f0cd8e78b
20 changed files with 188921 additions and 56 deletions
+2 -1
View File
@@ -7,6 +7,7 @@ pip install -r requirements.txt
python scripts/fetch_kev.py
python scripts/fetch_epss.py
python scripts/build_site.py
python scripts/build_all.py # new PoC discovery + scoring pipeline
```
Outputs land in `docs/` and JSON under `docs/api/v1/`. Snapshots live in `docs/api/v1/snapshots/` (last 14 days) and diffs under `docs/api/v1/diff/`.
Outputs land in `docs/` and JSON under `docs/api/v1/`. Snapshots live in `docs/api/v1/snapshots/` (last 14 days) and diffs under `docs/api/v1/diffs/`.
+118
View File
@@ -0,0 +1,118 @@
from __future__ import annotations
import argparse
import sys
from pathlib import Path
from typing import Dict, List
import requests
from pipeline_outputs import (
build_diff,
prune_old_diffs,
prune_old_snapshots,
summarise_for_snapshot,
write_cve_outputs,
write_diff,
write_index,
write_snapshot,
write_top,
)
from poc_pipeline import PoCPipeline, build_scope, persist_evidence
from site_renderer import SiteRenderer
from utils import API_DIR, DOCS_DIR, load_json
def load_existing_results(api_dir: Path) -> List[Dict]:
    """Rebuild pipeline results from previously written per-CVE API files.

    Scans ``api_dir`` for ``CVE-*.json`` documents and keeps only those whose
    payload contains a ``pocs`` key.

    Args:
        api_dir: Directory holding the per-CVE JSON documents.

    Returns:
        One dict per usable file with ``cve_id`` (falling back to the file
        stem), ``pocs`` and ``last_updated``. Empty when the directory does
        not exist.
    """
    if not api_dir.exists():
        return []

    loaded: List[Dict] = []
    for json_path in api_dir.glob("CVE-*.json"):
        document = load_json(json_path, default={}) or {}
        if "pocs" not in document:
            continue
        loaded.append(
            {
                "cve_id": document.get("cve_id") or json_path.stem,
                "pocs": document.get("pocs", []),
                "last_updated": document.get("last_updated"),
            }
        )
    return loaded
def main(argv: List[str] | None = None) -> int:
    """Run the PoC pipeline end to end and rebuild the static site.

    Steps: resolve the CVE scope, discover PoCs (or reload existing API
    outputs with ``--skip-discovery``), write per-CVE/index/top JSON, write a
    snapshot and a diff against the previous ``latest.json`` snapshot, prune
    old snapshots/diffs, and render the HTML site.

    Args:
        argv: Command-line arguments; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` on success, ``1`` when there are no results to write.
    """
    parser = argparse.ArgumentParser(description="Build CVE PoC pipeline outputs, snapshots, and static site")
    parser.add_argument("--days", type=int, default=7, help="Days window for GitHub discovery windows")
    parser.add_argument("--mode", choices=["daily", "weekly"], default="daily", help="Run mode to tune scope")
    parser.add_argument("--limit", type=int, default=50, help="Maximum CVEs to scan per run")
    parser.add_argument("--cve", action="append", help="Explicit CVE IDs to scan (can be passed multiple times)")
    parser.add_argument("--skip-discovery", action="store_true", help="Skip GitHub discovery and reuse existing API outputs")
    parser.add_argument("--check-links", action="store_true", help="Optionally HEAD check repo URLs for dead links")
    args = parser.parse_args(argv)

    pipeline = PoCPipeline()

    scope: List[str] = []
    discovery_days = args.days
    if args.cve:
        scope = [cve.upper() for cve in args.cve]
    elif not args.skip_discovery:
        prefer_recent = True
        scan_days = args.days
        limit = args.limit
        if args.mode == "weekly":
            # Weekly runs widen the window and lift the recency/limit restrictions.
            scan_days = max(scan_days, 30)
            discovery_days = scan_days
            prefer_recent = False
            limit = None
        scope = build_scope(
            scan_days,
            github_list=Path("github.txt"),
            existing_api=API_DIR / "cve",
            prefer_recent_years=prefer_recent,
            max_cves=limit,
        )

    results: List[Dict] = []
    if args.skip_discovery:
        results = load_existing_results(API_DIR / "cve")
    else:
        for cve_id in scope:
            try:
                results.append(pipeline.discover_for_cve(cve_id, days=discovery_days))
            except Exception as exc:  # noqa: BLE001
                # Best effort: one failing CVE must not abort the whole run.
                print(f"[warn] Failed to process {cve_id}: {exc}", file=sys.stderr)
        # Only freshly discovered results carry an "evidence" payload; results
        # reloaded from disk do not, and persisting them would write empty
        # evidence files over the existing ones.
        persist_evidence(results)

    if not results:
        print("No results to write; aborting.")
        return 1

    write_cve_outputs(results)
    index_payload = write_index(results)
    top_payload = write_top(results)

    def maybe_check_links() -> List[Dict]:
        """HEAD-check up to 25 distinct high/medium repo URLs; return the failures."""
        if not args.check_links:
            return []
        urls = [
            poc["repo_url"]
            for result in results
            for poc in result.get("pocs", [])
            if poc.get("confidence_tier") in {"high", "medium"} and poc.get("repo_url")
        ]
        # De-duplicate (order-preserving) before capping so that repeated
        # repositories do not consume the request budget.
        urls = list(dict.fromkeys(urls))[:25]
        dead: List[Dict] = []
        for url in urls:
            try:
                resp = requests.head(url, timeout=5, allow_redirects=True)
                if resp.status_code >= 400:
                    dead.append({"url": url, "status": resp.status_code})
            except requests.RequestException as exc:  # noqa: BLE001
                dead.append({"url": url, "error": str(exc)})
        return dead

    snapshot_payload = summarise_for_snapshot(results, top=top_payload)
    # Read the previous snapshot BEFORE write_snapshot replaces latest.json.
    prev_snapshot = load_json(API_DIR / "snapshots" / "latest.json", default={}) or {}
    snapshot_path = write_snapshot(snapshot_payload)

    diff_payload = build_diff(prev_snapshot, snapshot_payload, dead_links=maybe_check_links())
    write_diff(diff_payload)
    prune_old_snapshots()
    prune_old_diffs()

    renderer = SiteRenderer(results=results, index_payload=index_payload, top_payload=top_payload, diff_payload=diff_payload)
    renderer.build()

    print(f"Generated site under {DOCS_DIR}")
    print(f"Wrote latest snapshot to {snapshot_path}")
    return 0
# Script entry point: propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
+210
View File
@@ -0,0 +1,210 @@
from __future__ import annotations
import json
import os
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple
import requests
from utils import CACHE_DIR, chunked, hash_key, isoformat
# Accept media type installed on every GitHubClient session request.
# NOTE(review): presumably GitHub's text-match media type, which adds a
# "text_matches" array to search results - confirm against the REST docs.
TEXT_MATCH_HEADER = "application/vnd.github.text-match+json"
class RateLimiter:
    """Enforce a minimum delay between consecutive calls sharing a bucket name.

    Intervals are measured with ``time.monotonic()`` so that wall-clock
    adjustments (NTP steps, manual changes) can neither trigger an overly long
    sleep nor let calls through too early.
    """

    def __init__(self, calls_per_minute: int) -> None:
        # Clamp to at least one call per minute so the division is always defined.
        self.min_interval = 60.0 / max(calls_per_minute, 1)
        # bucket name -> monotonic timestamp of the most recent call
        self.last_call: Dict[str, float] = {}

    def wait(self, bucket: str) -> None:
        """Sleep until ``min_interval`` has elapsed since the bucket's last call.

        The first call for a bucket never sleeps.
        """
        previous = self.last_call.get(bucket)
        if previous is not None:
            remaining = self.min_interval - (time.monotonic() - previous)
            if remaining > 0:
                time.sleep(remaining)
        self.last_call[bucket] = time.monotonic()
class FileCache:
    """JSON file cache keyed by a hash of the cache key, with an expiry timestamp."""

    def __init__(self, base: Path) -> None:
        self.base = base
        self.base.mkdir(parents=True, exist_ok=True)

    def _path_for(self, key: str) -> Path:
        """Return the cache file for ``key``, sharded by the first two digest characters."""
        digest = hash_key(key)
        return self.base / digest[:2] / f"{digest}.json"

    def load(self, key: str, *, ttl: int) -> Optional[Dict]:
        """Return the cached payload for ``key`` or ``None`` when absent, unreadable or expired.

        Args:
            key: Cache key.
            ttl: Accepted for symmetry with ``save``; expiry is decided solely
                by the ``expires_at`` value stored in the file.
        """
        # Local import: only needed to decode the stored UTC timestamp.
        import calendar

        path = self._path_for(key)
        if not path.exists():
            return None
        try:
            with path.open("r", encoding="utf-8") as handle:
                data = json.load(handle)
        except (OSError, json.JSONDecodeError):
            return None
        if not isinstance(data, dict):
            # Not a cache envelope written by save(); treat it as a miss.
            return None
        expires_at = data.get("expires_at")
        if expires_at:
            try:
                # save() formats expires_at from time.gmtime(), i.e. in UTC, so
                # it must be decoded as UTC as well. time.mktime() would
                # interpret it as local time and shift the expiry by the
                # machine's UTC offset.
                expires_ts = calendar.timegm(time.strptime(expires_at, "%Y-%m-%dT%H:%M:%S"))
            except (TypeError, ValueError):
                return None
            if time.time() > expires_ts:
                return None
        return data.get("payload")

    def save(self, key: str, payload: Dict, *, ttl: int) -> None:
        """Store ``payload`` under ``key`` with an expiry ``ttl`` seconds from now (UTC)."""
        path = self._path_for(key)
        path.parent.mkdir(parents=True, exist_ok=True)
        data = {
            "fetched_at": isoformat(),
            "expires_at": time.strftime("%Y-%m-%dT%H:%M:%S", time.gmtime(time.time() + ttl)),
            "payload": payload,
        }
        with path.open("w", encoding="utf-8") as handle:
            json.dump(data, handle, ensure_ascii=False, indent=2)
@dataclass
class SearchResult:
    """One page of a GitHub search response together with the request that produced it."""

    kind: str  # search endpoint name passed by the client, e.g. "repositories" or "code"
    query: str  # the search query string
    page: int  # page number requested
    payload: Dict  # decoded JSON body returned for that page
class GitHubClient:
    """GitHub REST search / GraphQL client with file caching and per-bucket throttling."""

    def __init__(
        self,
        token: Optional[str],
        *,
        cache_dir: Path | None = None,
        code_search_rpm: int = 10,
        general_rpm: int = 30,
    ) -> None:
        """Create the HTTP session, cache and rate limiters.

        Args:
            token: Bearer token; when falsy, requests are sent unauthenticated.
            cache_dir: Cache root; defaults to ``CACHE_DIR / "github"``.
            code_search_rpm: Requests per minute allowed for the "code" bucket.
            general_rpm: Requests per minute for the "search" and "graphql" buckets.
        """
        self.session = requests.Session()
        self.session.headers.update({"Accept": TEXT_MATCH_HEADER})
        if token:
            self.session.headers["Authorization"] = f"Bearer {token}"
        self.base_url = "https://api.github.com"
        self.graphql_url = f"{self.base_url}/graphql"
        cache_root = cache_dir or CACHE_DIR / "github"
        self.cache = FileCache(cache_root)
        self.rate_limiters = {
            "code": RateLimiter(code_search_rpm),
            "search": RateLimiter(general_rpm),
            "graphql": RateLimiter(general_rpm),
        }

    @staticmethod
    def _rate_limit_wait(response: requests.Response) -> Optional[int]:
        """Return seconds to sleep for an exhausted rate limit, or ``None`` if not applicable.

        ``None`` is also returned when the rate-limit headers cannot be parsed,
        so the caller falls through to normal error handling instead of
        crashing on a malformed header.
        """
        reset = response.headers.get("X-RateLimit-Reset")
        try:
            remaining = int(response.headers.get("X-RateLimit-Remaining") or "0")
            if remaining > 0 or not reset:
                return None
            return max(0, int(reset) - int(time.time()) + 1)
        except ValueError:
            return None

    def _request(self, method: str, url: str, *, bucket: str, **kwargs) -> requests.Response:
        """Send a throttled request with retries.

        Transport errors are retried up to three attempts with linear back-off,
        5xx responses are retried while fewer than three attempts were made,
        and a 403 with an exhausted rate limit sleeps until the advertised
        reset time before retrying.

        Raises:
            requests.RequestException: After the final failed attempt, or for
                any non-retried HTTP error status.
        """
        self.rate_limiters[bucket].wait(bucket)
        attempts = 0
        while True:
            attempts += 1
            try:
                response = self.session.request(method, url, timeout=30, **kwargs)
            except requests.RequestException:
                if attempts >= 3:
                    raise
                time.sleep(2 * attempts)
                continue
            if response.status_code == 403 and "X-RateLimit-Remaining" in response.headers:
                wait_for = self._rate_limit_wait(response)
                if wait_for is not None:
                    time.sleep(wait_for)
                    continue
            if response.status_code >= 500 and attempts < 3:
                time.sleep(1 + attempts)
                continue
            response.raise_for_status()
            return response

    def _cached_search(self, kind: str, query: str, page: int, per_page: int, ttl: int) -> Dict:
        """Return one search page, served from the cache when a fresh entry exists."""
        cache_key = f"{kind}:{query}:p{page}:n{per_page}"
        cached = self.cache.load(cache_key, ttl=ttl)
        if cached is not None:
            return cached
        url = f"{self.base_url}/search/{kind}"
        params = {"q": query, "page": page, "per_page": per_page}
        resp = self._request("GET", url, params=params, bucket="code" if kind == "code" else "search")
        payload = resp.json()
        self.cache.save(cache_key, payload, ttl=ttl)
        return payload

    def search_repositories(self, query: str, *, page: int = 1, per_page: int = 100, ttl: int = 3600) -> SearchResult:
        """Search repositories and wrap the page in a ``SearchResult``."""
        return SearchResult("repositories", query, page, self._cached_search("repositories", query, page, per_page, ttl))

    def search_code(self, query: str, *, page: int = 1, per_page: int = 100, ttl: int = 3600) -> SearchResult:
        """Search code and wrap the page in a ``SearchResult``."""
        return SearchResult("code", query, page, self._cached_search("code", query, page, per_page, ttl))

    def search_topics(self, query: str, *, page: int = 1, per_page: int = 100, ttl: int = 3600) -> SearchResult:
        """Run a topic-style query; this goes through the repositories search endpoint."""
        return SearchResult("repositories", query, page, self._cached_search("repositories", query, page, per_page, ttl))

    def fetch_repo_metadata(self, full_names: Iterable[str], *, ttl: int = 6 * 3600) -> Dict[str, Dict]:
        """Fetch repository metadata via batched GraphQL queries, using the cache first.

        Args:
            full_names: ``owner/name`` identifiers.
            ttl: Cache lifetime in seconds for each repository's metadata.

        Returns:
            Mapping of each requested name to its metadata dict; ``{}`` for
            names that could not be resolved.
        """
        results: Dict[str, Dict] = {}
        to_fetch: List[str] = []
        for name in full_names:
            cached = self.cache.load(f"repo-meta:{name}", ttl=ttl)
            if cached is not None:
                results[name] = cached
            else:
                to_fetch.append(name)

        if not to_fetch:
            return results

        fields = """
        nameWithOwner
        url
        stargazerCount
        description
        forkCount
        isFork
        isArchived
        pushedAt
        updatedAt
        primaryLanguage { name }
        parent { nameWithOwner url }
        repositoryTopics(first: 20) { nodes { topic { name } } }
        """

        for batch in chunked(to_fetch, 12):
            parts = []
            for idx, full_name in enumerate(batch):
                if "/" not in full_name:
                    continue
                owner, name = full_name.split("/", 1)
                # Strip double quotes so the values cannot terminate the GraphQL string literal.
                owner = owner.replace('"', "")
                name = name.replace('"', "")
                parts.append(f'repo_{idx}: repository(owner: "{owner}", name: "{name}") {{ {fields} }}')
            if not parts:
                continue
            query = "query { " + " ".join(parts) + " }"
            resp = self._request("POST", self.graphql_url, json={"query": query}, bucket="graphql")
            data = resp.json()
            repos = data.get("data") if isinstance(data, dict) else None
            # A failed GraphQL request can carry "data": null. Treat that as
            # "nothing resolved" and do not cache it, otherwise one transient
            # error would pin empty metadata for the whole TTL.
            cacheable = isinstance(repos, dict)
            if not cacheable:
                repos = {}
            for idx, full_name in enumerate(batch):
                meta = repos.get(f"repo_{idx}") or {}
                if cacheable:
                    self.cache.save(f"repo-meta:{full_name}", meta, ttl=ttl)
                results[full_name] = meta
        return results
def build_client(token_env: str = "GITHUB_TOKEN") -> GitHubClient:
    """Create a ``GitHubClient`` using the token stored in environment variable ``token_env``.

    A missing variable yields ``None`` as the token.
    """
    return GitHubClient(os.environ.get(token_env), cache_dir=CACHE_DIR / "github")
+220
View File
@@ -0,0 +1,220 @@
from __future__ import annotations
from collections import Counter
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, Iterable, List, Tuple
from utils import API_DIR, DIFFS_DIR, SNAPSHOT_DIR, TOP_DIR, ensure_dirs, load_json, save_json, today_str
def write_cve_outputs(results: List[Dict], *, base_dir: Path | None = None) -> None:
    """Write one ``<CVE-ID>.json`` document per result.

    Args:
        results: Pipeline results; each needs ``cve_id`` and may carry ``pocs``
            and ``last_updated``.
        base_dir: Output directory; defaults to ``API_DIR / "cve"``.
    """
    target_dir = base_dir or API_DIR / "cve"
    ensure_dirs(target_dir)

    for entry in results:
        stamp = entry.get("last_updated") or today_str()
        cve_id = entry["cve_id"]

        exported_pocs = []
        for poc in entry.get("pocs", []):
            exported_pocs.append(
                {
                    "repo_full_name": poc.get("repo_full_name"),
                    "repo_url": poc.get("repo_url"),
                    "is_fork": poc.get("is_fork"),
                    "parent_repo_url": poc.get("parent_repo_url"),
                    "stars": poc.get("stars"),
                    "forks": poc.get("forks"),
                    "archived": poc.get("archived"),
                    # Fall back to updated_at when no push timestamp is known.
                    "pushed_at": poc.get("pushed_at") or poc.get("updated_at"),
                    "topics": poc.get("topics", []),
                    "primary_language": poc.get("primary_language"),
                    "matches": poc.get("matches", []),
                    "confidence_score": poc.get("confidence_score"),
                    "confidence_tier": poc.get("confidence_tier"),
                }
            )

        document = {"cve_id": cve_id, "last_updated": stamp, "pocs": exported_pocs}
        save_json(target_dir / f"{entry['cve_id']}.json", document)
def build_index(results: List[Dict]) -> Dict:
    """Summarise every result into one index row.

    Returns:
        ``{"generated": <today>, "items": [...]}`` where items are sorted by
        ``cve_id`` in descending string order. Each item holds the PoC count,
        high/medium tier counts, up to three most common languages, the
        maximum confidence score and the result's ``last_updated`` value.
    """
    rows: List[Dict] = []
    for entry in results:
        pocs = entry.get("pocs", [])
        tiers = [poc.get("confidence_tier") for poc in pocs]
        language_counts = Counter(poc.get("primary_language") for poc in pocs if poc.get("primary_language"))
        # Seeding with 0.0 keeps the result a float when there are no positive scores.
        best_score = max([0.0, *(poc.get("confidence_score") or 0 for poc in pocs)])
        rows.append(
            {
                "cve_id": entry["cve_id"],
                "poc_count": len(pocs),
                "high_confidence": tiers.count("high"),
                "medium_confidence": tiers.count("medium"),
                "top_languages": [language for language, _ in language_counts.most_common(3)],
                "max_score": best_score,
                "last_updated": entry.get("last_updated"),
            }
        )
    rows.sort(key=lambda row: row["cve_id"], reverse=True)
    return {"generated": today_str(), "items": rows}
def write_index(results: List[Dict]) -> Dict:
    """Build the index payload, save it as ``index.json`` under ``API_DIR`` and return it."""
    ensure_dirs(API_DIR)
    index_document = build_index(results)
    save_json(API_DIR / "index.json", index_document)
    return index_document
def write_top(results: List[Dict], *, limit: int = 100) -> Dict:
    """Write ``today.json`` with the best-scoring high/medium confidence PoCs.

    Entries are ordered by descending score, then descending stars, and the
    list is truncated to ``limit`` items.

    Returns:
        The payload that was written.
    """
    ensure_dirs(TOP_DIR)
    ranked: List[Dict] = [
        {
            "cve_id": result["cve_id"],
            "repo_full_name": poc.get("repo_full_name"),
            "repo_url": poc.get("repo_url"),
            "score": poc.get("confidence_score"),
            "tier": poc.get("confidence_tier"),
            "stars": poc.get("stars"),
            "primary_language": poc.get("primary_language"),
        }
        for result in results
        for poc in result.get("pocs", [])
        if poc.get("confidence_tier") in {"high", "medium"}
    ]
    # Negated keys give a descending order while keeping the sort stable.
    ranked = sorted(ranked, key=lambda item: (-(item.get("score") or 0), -(item.get("stars") or 0)))
    payload = {"generated": today_str(), "items": ranked[:limit]}
    save_json(TOP_DIR / "today.json", payload)
    return payload
def summarise_for_snapshot(results: List[Dict], *, top: Dict | None = None) -> Dict:
    """Reduce results to a ``cve_id -> repo_full_name -> {score, tier}`` snapshot payload.

    Args:
        results: Pipeline results.
        top: Optional top-list payload, embedded under ``"top"`` when truthy.
    """
    entries: Dict[str, Dict[str, Dict]] = {}
    for result in results:
        per_repo: Dict[str, Dict] = {
            poc.get("repo_full_name"): {
                "score": poc.get("confidence_score"),
                "tier": poc.get("confidence_tier"),
            }
            for poc in result.get("pocs", [])
        }
        entries[result["cve_id"]] = per_repo

    snapshot = {"generated": today_str(), "entries": entries}
    if top:
        snapshot["top"] = top
    return snapshot
def write_snapshot(summary: Dict) -> Path:
    """Save the snapshot as ``<generated>.json`` and as ``latest.json``.

    Returns:
        Path of the dated snapshot file.
    """
    ensure_dirs(SNAPSHOT_DIR)
    dated_path = SNAPSHOT_DIR / f"{summary['generated']}.json"
    for destination in (dated_path, SNAPSHOT_DIR / "latest.json"):
        save_json(destination, summary)
    return dated_path
def prune_old_snapshots(days: int = 14) -> None:
    """Delete dated snapshot files older than ``days`` days (UTC calendar dates).

    Files whose stem is not a ``YYYY-MM-DD`` date (for example ``latest.json``)
    are left untouched.
    """
    # Local import: the module-level import only brings in datetime/timedelta.
    from datetime import timezone

    if not SNAPSHOT_DIR.exists():
        return
    # Timezone-aware replacement for the deprecated datetime.utcnow().
    cutoff = datetime.now(timezone.utc).date() - timedelta(days=days)
    for snap in SNAPSHOT_DIR.glob("*.json"):
        try:
            snap_date = datetime.strptime(snap.stem, "%Y-%m-%d").date()
        except ValueError:
            continue
        if snap_date < cutoff:
            snap.unlink(missing_ok=True)
def prune_old_diffs(days: int = 14) -> None:
    """Delete dated diff files older than ``days`` days (UTC calendar dates).

    The cutoff is computed in UTC rather than local time so that, around
    midnight, pruning does not shift by a day relative to a UTC-based date.
    Files whose stem is not a ``YYYY-MM-DD`` date (for example ``latest.json``)
    are left untouched.
    """
    # Local import: the module-level import only brings in datetime/timedelta.
    from datetime import timezone

    if not DIFFS_DIR.exists():
        return
    cutoff = datetime.now(timezone.utc).date() - timedelta(days=days)
    for diff in DIFFS_DIR.glob("*.json"):
        try:
            diff_date = datetime.strptime(diff.stem, "%Y-%m-%d").date()
        except ValueError:
            continue
        if diff_date < cutoff:
            diff.unlink(missing_ok=True)
def _load_snapshot(path: Path) -> Dict:
    """Load a snapshot file, substituting an empty dict for a missing or falsy payload."""
    snapshot = load_json(path, default={})
    return snapshot if snapshot else {}
def build_diff(prev: Dict, curr: Dict, *, dead_links: List[Dict] | None = None) -> Dict:
    """Compare two snapshots and report changes around the "high" tier.

    Args:
        prev: Previous snapshot payload (``entries``: cve -> repo -> info).
        curr: Current snapshot payload.
        dead_links: Optional dead-link records to embed in the diff.

    Returns:
        Dict with ``generated`` (taken from ``curr``), ``new_high_conf_pocs``
        (high now, absent before), ``promoted_to_high`` (high now, other tier
        before), ``demoted_or_removed`` (high before, not high or missing now)
        and ``dead_links``.
    """
    before = prev.get("entries", {})
    after = curr.get("entries", {})

    def lookup(entries: Dict, cve_id: str, repo_name: str):
        # Tolerates a missing CVE as well as a CVE mapped to None.
        return (entries.get(cve_id) or {}).get(repo_name)

    added: List[Dict] = []
    raised: List[Dict] = []
    for cve_id, repos in after.items():
        for repo_name, info in repos.items():
            if info.get("tier") != "high":
                continue
            earlier = lookup(before, cve_id, repo_name)
            if not earlier:
                added.append({"cve_id": cve_id, "repo_full_name": repo_name, "score": info.get("score")})
                continue
            if earlier.get("tier") != "high":
                raised.append(
                    {
                        "cve_id": cve_id,
                        "repo_full_name": repo_name,
                        "score": info.get("score"),
                        "previous_tier": earlier.get("tier"),
                    }
                )

    dropped: List[Dict] = []
    for cve_id, repos in before.items():
        for repo_name, info in repos.items():
            if info.get("tier") != "high":
                continue
            latest = lookup(after, cve_id, repo_name)
            if latest and latest.get("tier") == "high":
                continue
            dropped.append(
                {
                    "cve_id": cve_id,
                    "repo_full_name": repo_name,
                    "previous_score": info.get("score"),
                    "previous_tier": info.get("tier"),
                    "current_tier": latest.get("tier") if latest else None,
                }
            )

    return {
        "generated": curr.get("generated"),
        "new_high_conf_pocs": added,
        "promoted_to_high": raised,
        "demoted_or_removed": dropped,
        "dead_links": dead_links or [],
    }
def write_diff(diff: Dict) -> Path:
    """Save the diff as ``<generated>.json`` and as ``latest.json``.

    Returns:
        Path of the dated diff file.
    """
    ensure_dirs(DIFFS_DIR)
    dated_path = DIFFS_DIR / f"{diff['generated']}.json"
    for destination in (dated_path, DIFFS_DIR / "latest.json"):
        save_json(destination, diff)
    return dated_path
def latest_snapshots() -> Tuple[Dict, Dict]:
    """Return ``(previous, current)`` payloads of the two newest dated snapshots.

    Only files named ``YYYY-MM-DD.json`` are considered. ``latest.json`` is a
    copy of the newest dated snapshot and sorts after every date, so including
    it would make "current" and "previous" identical.

    Returns:
        ``({}, {})`` when no dated snapshot exists; ``({}, current)`` when
        only one exists.
    """
    if not SNAPSHOT_DIR.exists():
        return {}, {}

    dated: List[Path] = []
    for path in SNAPSHOT_DIR.glob("*.json"):
        try:
            datetime.strptime(path.stem, "%Y-%m-%d")
        except ValueError:
            continue  # not a dated snapshot (e.g. latest.json)
        dated.append(path)
    if not dated:
        return {}, {}

    # ISO dates sort chronologically as plain strings.
    dated.sort()
    curr = _load_snapshot(dated[-1])
    prev = _load_snapshot(dated[-2]) if len(dated) > 1 else {}
    return prev, curr
+274
View File
@@ -0,0 +1,274 @@
from __future__ import annotations
import re
from dataclasses import dataclass, field
from datetime import date, datetime, timedelta
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Set, Tuple
from github_client import GitHubClient, SearchResult, build_client
from poc_scoring import match_score, score_repo
from utils import API_DIR, EVIDENCE_DIR, chunked, cve_year, ensure_dirs, isoformat, load_blacklist, load_json, save_json, today_str
# Languages used to partition code-search queries: build_query_pack emits one
# "language:<entry>" code query per item.
LANG_PARTITIONS = ("python", "go", "c", "shell", "powershell", "java", "ruby", "js")
# Case-insensitive matcher for CVE identifiers: "CVE-", a four-digit year, then
# a sequence number of at least four digits.
CVE_RE = re.compile(r"CVE-\d{4}-\d{4,}", re.IGNORECASE)
@dataclass
class MatchEvidence:
    """A single hit linking a repository to a CVE query."""

    path: str  # location text recorded for the hit (see extract_matches)
    match_type: str  # kind of hit, e.g. the query's match_type or a text-match property
    query: str  # search query that produced the hit
    score: Optional[float] = None  # filled in later by the scoring step
@dataclass
class RepoCandidate:
    """A repository surfaced for a CVE, with accumulated match evidence and metadata."""

    cve_id: str
    repo_full_name: str
    repo_url: str
    matches: List[MatchEvidence] = field(default_factory=list)
    metadata: Dict[str, object] = field(default_factory=dict)

    def add_match(self, path: str, match_type: str, query: str) -> None:
        """Record a match unless one with the same ``(path, match_type)`` already exists."""
        already_recorded = any(
            (recorded.path, recorded.match_type) == (path, match_type) for recorded in self.matches
        )
        if not already_recorded:
            self.matches.append(MatchEvidence(path=path, match_type=match_type, query=query))
def build_created_ranges(days: int, *, window: int = 7) -> List[Tuple[str, str]]:
    """Split the last ``days`` days (through today) into consecutive date windows.

    Args:
        days: Look-back span in days; values below 1 are treated as 1.
        window: Window length in days; values below 1 are treated as 1 so the
            cursor always advances (a non-positive window would otherwise
            loop forever).

    Returns:
        Inclusive ``(start, end)`` ISO date pairs in chronological order. The
        last window is shortened so that it ends today. At least one range is
        always returned.
    """
    step = max(window, 1)
    end = date.today()
    cursor = end - timedelta(days=max(days, 1))
    ranges: List[Tuple[str, str]] = []
    while cursor <= end:
        window_end = min(cursor + timedelta(days=step - 1), end)
        ranges.append((cursor.isoformat(), window_end.isoformat()))
        cursor = window_end + timedelta(days=1)
    return ranges
def build_query_pack(cve_id: str, created_range: Tuple[str, str] | None = None) -> List[Dict[str, str]]:
    """Assemble the search queries to run for one CVE.

    The pack holds three repository queries (plain, "poc OR exploit", topic),
    one code query per language in ``LANG_PARTITIONS`` and one code query
    without a language filter.

    Args:
        cve_id: CVE identifier to search for.
        created_range: Optional ``(start, end)`` dates appended to every query
            as a ``created:`` qualifier.

    Returns:
        Dicts with ``kind``, ``query`` and ``match_type`` keys.
    """
    created_suffix = f" created:{created_range[0]}..{created_range[1]}" if created_range else ""

    repository_specs = (
        (f'{cve_id} in:name,description,readme fork:false', "name"),
        (f'{cve_id} (poc OR exploit) in:name,description,readme fork:false', "description"),
        (f"topic:{cve_id.lower()} fork:false", "topic"),
    )
    pack: List[Dict[str, str]] = [
        {"kind": "repositories", "query": text + created_suffix, "match_type": label}
        for text, label in repository_specs
    ]
    pack.extend(
        {"kind": "code", "query": f'{cve_id} in:file language:{lang}{created_suffix}', "match_type": "code"}
        for lang in LANG_PARTITIONS
    )
    # Generic code search without a language partition comes last.
    pack.append({"kind": "code", "query": f"{cve_id} in:file{created_suffix}", "match_type": "code"})
    return pack
def parse_repo_from_item(item: Dict) -> Tuple[str | None, str | None]:
    """Extract ``(repo_full_name, repo_url)`` from a search result item.

    Repository results carry ``full_name`` / ``html_url`` at the top level.
    Results that embed a ``repository`` object (code search hits) use the
    top-level ``html_url`` for the matched file, so the repository's own
    ``html_url`` is preferred whenever that object is present.

    Returns:
        A pair whose members are ``None`` when the value cannot be determined.
    """
    # `or {}` also covers an explicit "repository": null.
    repository = item.get("repository") or {}

    repo_full_name = item.get("full_name") or repository.get("full_name")
    if not repo_full_name:
        owner = (repository.get("owner") or {}).get("login")
        name = repository.get("name")
        # Only combine when both halves exist; never emit a dangling "owner/".
        if owner and name:
            repo_full_name = f"{owner}/{name}"

    repo_url = repository.get("html_url") or item.get("html_url")
    return repo_full_name or None, repo_url or None
def extract_matches(item: Dict, default_type: str, query: str) -> List[MatchEvidence]:
    """Turn a search item into match evidence records.

    One record is produced per ``text_matches`` entry. When the item has none,
    a single record is built from the item's ``path`` (or ``default_type``
    when there is no path).

    Args:
        item: Search result item.
        default_type: Match type used when a text match names no property.
        query: Query string that returned the item.
    """
    evidence: List[MatchEvidence] = []
    for text_match in item.get("text_matches", []) or []:
        prop = text_match.get("property") or text_match.get("object_type") or ""
        # NOTE(review): the text-match fragment, not the item's file path, is
        # stored in the `path` field here - confirm that this is intended.
        location = text_match.get("fragment") or text_match.get("path") or prop or ""
        kind = prop or default_type
        evidence.append(MatchEvidence(path=str(location), match_type=str(kind), query=query))

    if evidence:
        return evidence

    fallback_path = item.get("path") or default_type
    return [MatchEvidence(path=str(fallback_path), match_type=default_type, query=query)]
def normalise_metadata(meta: Dict, fallback_full_name: str, fallback_url: str) -> Dict:
    """Flatten GraphQL-style repository metadata into the pipeline's snake_case record.

    Args:
        meta: Raw metadata (camelCase keys such as ``nameWithOwner``); may be empty.
        fallback_full_name: Used when ``nameWithOwner`` is missing or empty.
        fallback_url: Used when ``url`` is missing or empty.
    """
    topic_container = meta.get("repositoryTopics")
    nodes = topic_container.get("nodes", []) if topic_container else []
    topic_names = ((node.get("topic") or {}).get("name") for node in nodes)
    topics = [name for name in topic_names if name]

    language_info = meta.get("primaryLanguage")
    primary_language = language_info.get("name") if language_info else None

    parent_repo = meta.get("parent") or {}

    normalised = {
        "repo_full_name": meta.get("nameWithOwner") or fallback_full_name,
        "repo_url": meta.get("url") or fallback_url,
        "description": meta.get("description") or "",
        "is_fork": bool(meta.get("isFork")),
        "parent_repo_url": parent_repo.get("url"),
        "stars": meta.get("stargazerCount") or 0,
        "forks": meta.get("forkCount") or 0,
        "archived": bool(meta.get("isArchived")),
        "pushed_at": meta.get("pushedAt"),
        "updated_at": meta.get("updatedAt"),
        "topics": topics,
        "primary_language": primary_language,
    }
    return normalised
class PoCPipeline:
    """Discover, enrich and score candidate PoC repositories for CVE identifiers."""

    def __init__(
        self,
        client: GitHubClient | None = None,
        *,
        blacklist_path: Path | None = None,
        search_ttl: int = 3 * 3600,
        per_page: int = 50,
    ) -> None:
        """Set up the GitHub client, blacklist and search parameters.

        Args:
            client: GitHub client; built via ``build_client()`` when omitted.
            blacklist_path: Passed to ``load_blacklist``.
            search_ttl: Cache lifetime (seconds) for search pages.
            per_page: Page size for every search request, clamped to 1..100.
                The same value decides when a short page ends pagination, so
                request size and stop condition cannot drift apart.
        """
        self.client = client or build_client()
        self.blacklist = load_blacklist(blacklist_path)
        self.search_ttl = search_ttl
        self.per_page = max(1, min(per_page, 100))

    def _run_query(self, query: Dict, page: int) -> SearchResult:
        """Dispatch one query page to the client method matching ``query["kind"]``."""
        if query["kind"] == "repositories":
            return self.client.search_repositories(query["query"], page=page, per_page=self.per_page, ttl=self.search_ttl)
        if query["kind"] == "code":
            return self.client.search_code(query["query"], page=page, per_page=self.per_page, ttl=self.search_ttl)
        return self.client.search_topics(query["query"], page=page, per_page=self.per_page, ttl=self.search_ttl)

    def discover_for_cve(self, cve_id: str, *, days: int, max_pages_repo: int = 2, max_pages_code: int = 2) -> Dict:
        """Search GitHub for PoC repositories of one CVE and score them.

        Args:
            cve_id: CVE identifier.
            days: Look-back span handed to ``build_created_ranges``.
            max_pages_repo: Page cap for repository queries.
            max_pages_code: Page cap for code queries.

        Returns:
            Dict with ``cve_id``, ``last_updated``, ``pocs`` (sorted by
            descending score, then stars) and ``evidence`` (query log plus a
            per-candidate match summary).
        """
        ranges = build_created_ranges(days)
        candidates: Dict[str, RepoCandidate] = {}
        query_log: List[Dict] = []

        for created_range in ranges:
            query_pack = build_query_pack(cve_id, created_range)
            for query in query_pack:
                query_log.append({"query": query["query"], "kind": query["kind"], "window": created_range})
                page_limit = max_pages_code if query["kind"] == "code" else max_pages_repo
                for page in range(1, page_limit + 1):
                    result = self._run_query(query, page)
                    items = result.payload.get("items") or []
                    for item in items:
                        repo_full_name, repo_url = parse_repo_from_item(item)
                        if not repo_full_name or not repo_url:
                            continue
                        candidate = candidates.setdefault(
                            repo_full_name,
                            RepoCandidate(cve_id=cve_id, repo_full_name=repo_full_name, repo_url=repo_url),
                        )
                        for match in extract_matches(item, query["match_type"], query["query"]):
                            candidate.add_match(match.path, match.match_type, match.query)
                    # A short page means there are no further results for this query.
                    if len(items) < self.per_page:
                        break

        metadata = self.client.fetch_repo_metadata(candidates.keys())
        for repo_full_name, candidate in candidates.items():
            meta = metadata.get(repo_full_name, {})
            candidate.metadata = normalise_metadata(meta, repo_full_name, candidate.repo_url)

        repos: List[Dict] = []
        for candidate in candidates.values():
            matches_dicts = []
            for m in candidate.matches:
                m.score = match_score({"path": m.path, "match_type": m.match_type})
                matches_dicts.append({"path": m.path, "match_type": m.match_type, "query": m.query, "score": m.score})
            score, tier = score_repo(candidate.metadata, matches_dicts, self.blacklist)
            repo_entry = {
                **candidate.metadata,
                "matches": matches_dicts,
                "confidence_score": score,
                "confidence_tier": tier,
                "cve_id": cve_id,
            }
            repos.append(repo_entry)

        repos.sort(key=lambda r: (-r["confidence_score"], -r.get("stars", 0)))
        evidence = {
            "queries": query_log,
            "candidates": [
                {
                    "repo_full_name": r["repo_full_name"],
                    "matches": r["matches"],
                    "match_count": len(r["matches"]),
                    "score": r["confidence_score"],
                    "tier": r["confidence_tier"],
                }
                for r in repos
            ],
        }
        return {"cve_id": cve_id, "last_updated": isoformat(), "pocs": repos, "evidence": evidence}

    def discover_many(self, cve_ids: Iterable[str], *, days: int, limit: Optional[int] = None) -> List[Dict]:
        """Run ``discover_for_cve`` for each ID, stopping after ``limit`` IDs when it is truthy."""
        results: List[Dict] = []
        for idx, cve_id in enumerate(cve_ids):
            if limit and idx >= limit:
                break
            results.append(self.discover_for_cve(cve_id, days=days))
        return results
def persist_evidence(results: List[Dict]) -> None:
    """Write each result's ``evidence`` payload to ``EVIDENCE_DIR / "<cve_id>.json"``.

    A result without an ``evidence`` key is written as an empty object.
    """
    ensure_dirs(EVIDENCE_DIR)
    for entry in results:
        destination = EVIDENCE_DIR / f"{entry['cve_id']}.json"
        save_json(destination, entry.get("evidence", {}))
def discover_from_github_list(path: Path) -> List[str]:
    """Collect the distinct CVE IDs mentioned in a text file, in first-seen order.

    Args:
        path: UTF-8 text file to scan line by line.

    Returns:
        Upper-cased CVE identifiers without duplicates; empty when the file
        does not exist.
    """
    if not path.exists():
        return []
    # A dict gives ordered de-duplication in O(1) per ID; checking a growing
    # list for membership is quadratic on large seed files.
    seen: Dict[str, None] = {}
    for line in path.read_text(encoding="utf-8").splitlines():
        for match in CVE_RE.findall(line):
            seen.setdefault(match.upper(), None)
    return list(seen)
def load_existing_cves(api_dir: Path = API_DIR / "cve") -> List[str]:
    """Return the sorted, upper-cased CVE IDs that already have a JSON file in ``api_dir``.

    Only ``CVE-*.json`` files whose stem starts with a match for ``CVE_RE``
    are counted. Returns an empty list when the directory is missing.
    """
    if not api_dir.exists():
        return []
    known = set()
    for json_path in api_dir.glob("CVE-*.json"):
        if CVE_RE.match(json_path.stem):
            known.add(json_path.stem.upper())
    return sorted(known)
def build_scope(
    days: int,
    *,
    github_list: Path,
    existing_api: Path,
    prefer_recent_years: bool = True,
    max_cves: int | None = None,
    low_conf_threshold: int = 1,
) -> List[str]:
    """Choose the CVE IDs to scan in this run.

    Candidates come from ``github_list`` or, when that yields nothing, from
    existing API files. They are optionally narrowed to the current and two
    previous years (only if that leaves anything), then extended with CVEs
    whose existing index entry has at most ``low_conf_threshold`` high plus
    medium confidence PoCs.

    Args:
        days: Currently unused; kept for interface compatibility.
        github_list: Seed text file containing CVE IDs.
        existing_api: Directory of existing per-CVE JSON files.
        prefer_recent_years: Narrow candidates to recent CVE years.
        max_cves: Maximum number of IDs returned; falsy disables the cap.
        low_conf_threshold: Upper bound of high+medium PoCs for a re-scan.

    Returns:
        Ordered list of CVE IDs without duplicates from the low-confidence
        additions.
    """
    candidates = discover_from_github_list(github_list) or load_existing_cves(existing_api)

    if prefer_recent_years:
        min_year = date.today().year - 2
        # One cve_year() call per ID; unparseable IDs (falsy year) are excluded.
        recent = [cve for cve in candidates if (cve_year(cve) or 0) >= min_year]
        candidates = recent or candidates

    low_conf: List[str] = []
    index_path = API_DIR / "index.json"
    if index_path.exists():
        index_payload = load_json(index_path, default={}) or {}
        for item in index_payload.get("items", []):
            confident = (item.get("high_confidence", 0) or 0) + (item.get("medium_confidence", 0) or 0)
            if confident <= low_conf_threshold:
                low_conf.append(item.get("cve_id"))

    # Set membership keeps the merge linear for large candidate lists and
    # prevents the same CVE from being appended twice.
    scheduled = set(candidates)
    scoped = list(candidates)
    for cve in low_conf:
        if cve and cve not in scheduled:
            scheduled.add(cve)
            scoped.append(cve)

    if max_cves:
        scoped = scoped[:max_cves]
    return scoped
+121
View File
@@ -0,0 +1,121 @@
from __future__ import annotations
import re
from datetime import datetime, timedelta, timezone
from typing import Dict, Iterable, List, Tuple
from utils import clamp, parse_date
# Lower-case file extensions (without the dot) that is_doc_path treats as documentation.
DOC_EXTS = {"md", "txt", "rst", "adoc", "markdown", "mkd", "mdown"}
# Substrings that raise a repo's score when found in its description or topics (see score_repo).
POSITIVE_KEYWORDS = ("poc", "exploit", "rce", "lpe", "auth bypass", "bypass")
# Substrings in a description that lower the score when the repo has no non-doc matches (see score_repo).
NEGATIVE_KEYWORDS = ("report", "writeup", "advisory", "changelog")
def is_doc_path(path: str) -> bool:
    """Return True when ``path`` ends with "/" or has an extension listed in ``DOC_EXTS``.

    The comparison is case-insensitive; a path without any dot is never a doc path.
    """
    lowered = path.lower()
    if lowered.endswith("/"):
        return True
    _, dot, extension = lowered.rpartition(".")
    # rpartition yields an empty separator when the path contains no dot.
    return bool(dot) and extension in DOC_EXTS
def match_score(match: Dict) -> float:
    """Score a single match on a 0-100 scale.

    Doc paths start at 30 and everything else at 50; the match type then
    adjusts the value: +10 for exactly "code", +5 when it contains "readme",
    -5 when it contains "topic".
    """
    kind = str(match.get("match_type", "")).lower()
    points = 30 if is_doc_path(str(match.get("path", ""))) else 50
    if kind == "code":
        points += 10
    if "readme" in kind:
        points += 5
    if "topic" in kind:
        points -= 5
    return clamp(points, 0, 100)
def tier_for_score(score: float) -> str:
    """Map a score to its tier: "high" from 75, "medium" from 45, otherwise "low"."""
    for floor, label in ((75, "high"), (45, "medium")):
        if score >= floor:
            return label
    return "low"
def keyword_hits(text: str, keywords: Iterable[str]) -> int:
    """Count how many of ``keywords`` occur as substrings of ``text`` (case-folded via lower()).

    Each keyword counts at most once; falsy text yields 0.
    """
    if not text:
        return 0
    haystack = text.lower()
    return len([keyword for keyword in keywords if keyword in haystack])
def recency_bonus(pushed_at: str | None) -> float:
    """Return a score bonus based on how recently the repository was pushed.

    18.0 within 30 days, 10.0 within 90 days, 5.0 within 180 days, else 0.0.
    Missing or unparseable timestamps yield 0.0; naive timestamps are taken
    as UTC.
    """
    if not pushed_at:
        return 0.0
    pushed = parse_date(pushed_at)
    if not pushed:
        return 0.0
    if pushed.tzinfo is None:
        pushed = pushed.replace(tzinfo=timezone.utc)

    age = datetime.now(timezone.utc) - pushed
    for max_age_days, bonus in ((30, 18.0), (90, 10.0), (180, 5.0)):
        if age <= timedelta(days=max_age_days):
            return bonus
    return 0.0
def score_repo(repo: Dict, matches: List[Dict], blacklist: List[str]) -> Tuple[float, str]:
    """Compute a 0-100 confidence score and tier for a candidate repository.

    The repository may be described with raw GraphQL keys (``isFork``,
    ``isArchived``, ``stargazerCount``, ...) or with the pipeline's normalised
    keys (``is_fork``, ``archived``, ``stars``, ...); both spellings are
    honoured for every field.

    Args:
        repo: Repository metadata.
        matches: Match dicts carrying ``path`` and ``match_type``.
        blacklist: Name patterns; a trailing ``*`` means prefix match,
            otherwise substring match against the lower-cased full name.

    Returns:
        ``(score, tier)`` where the score is clamped to 0..100.
    """
    stars = repo.get("stargazerCount") or repo.get("stars") or 0
    forks = repo.get("forkCount") or repo.get("forks") or 0
    # The pipeline passes normalised metadata, which names these flags
    # "is_fork" / "archived"; reading only the camelCase keys meant the fork
    # and archive penalties were never applied.
    is_fork = bool(repo.get("isFork") or repo.get("is_fork"))
    archived = bool(repo.get("isArchived") or repo.get("archived"))
    topics = [t.lower() for t in repo.get("topics", []) if t]
    name = str(repo.get("nameWithOwner") or repo.get("repo_full_name") or "").lower()
    description = str(repo.get("description") or "").lower()

    non_doc_matches = [m for m in matches if not is_doc_path(str(m.get("path", "")))]
    doc_matches = [m for m in matches if is_doc_path(str(m.get("path", "")))]

    score = 12.0
    if non_doc_matches:
        score += 25 + min(len(non_doc_matches) * 2, 10)
    if doc_matches and not non_doc_matches:
        # Mentioned only in documentation: probably not an actual PoC.
        score -= 20

    score += recency_bonus(repo.get("pushed_at") or repo.get("pushedAt") or repo.get("updated_at"))
    score += min(stars / 50.0, 25.0)
    score += min(forks / 200.0, 5.0)

    score += keyword_hits(description, POSITIVE_KEYWORDS) * 4.0
    score += keyword_hits(" ".join(topics), POSITIVE_KEYWORDS) * 4.0

    negative_bias = keyword_hits(description, NEGATIVE_KEYWORDS)
    if negative_bias and not non_doc_matches:
        score -= 15

    if is_fork:
        score -= 12
    if archived:
        score -= 30

    # The blacklist penalty is applied at most once.
    for forbidden in (entry.lower() for entry in blacklist):
        if not forbidden:
            continue
        if forbidden.endswith("*"):
            prefix = forbidden[:-1]
            if prefix and name.startswith(prefix):
                score -= 40
                break
        elif forbidden in name:
            score -= 40
            break

    for match in matches:
        score += match_score(match) / 25.0

    return clamp(score, 0, 100), tier_for_score(score)
+99
View File
@@ -0,0 +1,99 @@
from __future__ import annotations
from pathlib import Path
from typing import Dict, List
from jinja2 import Environment, FileSystemLoader, select_autoescape
from utils import DOCS_DIR, TEMPLATES_DIR, ensure_dirs
def build_env() -> Environment:
    """Create the Jinja2 environment that loads templates from ``TEMPLATES_DIR``.

    Autoescaping is selected for "html" and "xml" templates, and both block
    trimming and left-stripping of blocks are switched on.
    """
    environment = Environment(
        loader=FileSystemLoader(str(TEMPLATES_DIR)),
        autoescape=select_autoescape(["html", "xml"]),
    )
    for option in ("trim_blocks", "lstrip_blocks"):
        setattr(environment, option, True)
    return environment
class SiteRenderer:
    """Render the static HTML pages (index, PoC list, per-CVE pages, diff) under ``DOCS_DIR``."""

    def __init__(
        self,
        *,
        results: List[Dict],
        index_payload: Dict,
        top_payload: Dict,
        diff_payload: Dict | None = None,
    ) -> None:
        """Store the payloads, derive each result's visible PoCs and create output folders.

        A result shows its high/medium confidence PoCs; when it has none of
        those, all of its PoCs are shown instead.
        """
        shown_tiers = {"high", "medium"}
        self.results = []
        for entry in results:
            preferred = [poc for poc in entry.get("pocs", []) if poc.get("confidence_tier") in shown_tiers]
            shown = preferred if preferred else entry.get("pocs", [])
            self.results.append({**entry, "visible_pocs": shown})

        self.index_payload = index_payload
        self.top_payload = top_payload
        self.diff_payload = diff_payload or {}
        self.env = build_env()

        output_dirs = [DOCS_DIR] + [DOCS_DIR / sub for sub in ("pocs", "cve", "diffs", "assets")]
        ensure_dirs(*output_dirs)

    def render(self, template_name: str, context: Dict, target: Path) -> None:
        """Render ``template_name`` with ``context`` and write it to ``target`` as UTF-8."""
        template = self.env.get_template(template_name)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(template.render(**context), encoding="utf-8")

    def build(self) -> None:
        """Write every page of the site."""
        generated = self.index_payload.get("generated")
        index_items = self.index_payload.get("items", [])
        top_items = self.top_payload.get("items", [])

        def column_total(field: str) -> int:
            return sum(item.get(field, 0) for item in index_items)

        summary = {
            "generated": generated,
            "total_cves": len(index_items),
            "total_pocs": column_total("poc_count"),
            "high_total": column_total("high_confidence"),
            "medium_total": column_total("medium_confidence"),
        }

        self.render(
            "pipeline_index.html",
            {"summary": summary, "top": top_items[:25], "diff": self.diff_payload or {}},
            DOCS_DIR / "index.html",
        )
        self.render(
            "pipeline_pocs.html",
            {"generated": generated, "index": index_items, "top": top_items[:100]},
            DOCS_DIR / "pocs" / "index.html",
        )

        for entry in self.results:
            page_path = DOCS_DIR / "cve" / f"{entry['cve_id']}.html"
            self.render("pipeline_cve.html", {"cve": entry, "generated": generated}, page_path)

        if not self.diff_payload:
            return

        # The diff page is written as diffs/index.html and, when the diff
        # carries a date, additionally under that date.
        diff_context = {"diff": self.diff_payload, "generated": generated}
        diff_date = self.diff_payload.get("generated")
        self.render("pipeline_diff.html", diff_context, DOCS_DIR / "diffs" / "index.html")
        if diff_date:
            self.render("pipeline_diff.html", diff_context, DOCS_DIR / "diffs" / f"{diff_date}.html")
+126 -9
View File
@@ -13,8 +13,13 @@ DATA_DIR = ROOT / "data"
# Output tree: generated pages land in docs/.
DOCS_DIR = ROOT / "docs"
# JSON outputs live under docs/api/v1/.
API_DIR = DOCS_DIR / "api" / "v1"
# Snapshots live under docs/api/v1/snapshots/.
SNAPSHOT_DIR = API_DIR / "snapshots"
# Diffs live under docs/api/v1/diffs/.
DIFFS_DIR = API_DIR / "diffs"
# NOTE(review): presumably holds the "top" JSON listings — confirm against the writers.
TOP_DIR = API_DIR / "top"
# NOTE(review): presumably the template source directory — confirm against the environment setup.
TEMPLATES_DIR = ROOT / "templates"
ASSETS_DIR = DOCS_DIR / "assets"
# Working data kept under data/, outside the docs/ tree.
# NOTE(review): the exact purpose of cache/state/evidence is inferred from the names — confirm.
CACHE_DIR = DATA_DIR / "cache"
STATE_DIR = DATA_DIR / "state"
EVIDENCE_DIR = DATA_DIR / "evidence"
def ensure_dirs(*paths: Path) -> None:
@@ -45,6 +50,21 @@ def today_str() -> str:
return datetime.now(timezone.utc).date().isoformat()
def now_utc() -> datetime:
    """Return the current moment as a timezone-aware ``datetime`` in UTC."""
    utc = timezone.utc
    return datetime.now(tz=utc)
def isoformat(dt: datetime | None = None) -> str:
    """Return ``dt`` as an ISO 8601 string, using the current UTC time when omitted.

    Args:
        dt: Moment to format; a falsy value (normally ``None``) selects
            ``now_utc()``.

    Returns:
        The result of ``datetime.isoformat()`` for the chosen moment.
    """
    moment = dt if dt else now_utc()
    return moment.isoformat()
def parse_date(value: str) -> datetime | None:
    """Parse an ISO 8601 timestamp, returning ``None`` when it cannot be parsed.

    A trailing ``Z``/``z`` (UTC designator) is rewritten to ``+00:00`` because
    ``datetime.fromisoformat`` does not accept it on every Python version.
    Only the trailing designator is rewritten, never a ``Z`` elsewhere in the
    text.

    Args:
        value: Timestamp text such as ``2024-01-02T03:04:05Z``. Non-string
            values (for example ``None`` from a missing JSON field) are
            tolerated and yield ``None``.

    Returns:
        The parsed ``datetime`` (timezone-aware when the text carried an
        offset, naive otherwise), or ``None`` for unparseable input.
    """
    if not isinstance(value, str):
        # Missing/absent fields must not crash callers that expect None.
        return None
    text = value.strip()
    if text.endswith(("Z", "z")):
        text = f"{text[:-1]}+00:00"
    try:
        return datetime.fromisoformat(text)
    except ValueError:
        return None
def slugify(text: str) -> str:
cleaned = re.sub(r"[^A-Za-z0-9]+", "-", text.strip().lower())
cleaned = cleaned.strip("-")
@@ -79,6 +99,7 @@ CVE_SECTION_RE = re.compile(r"^CVE-\d{4}-\d{4,}$", re.IGNORECASE)
def load_poc_index() -> Dict[str, Dict[str, object]]:
"""Load CVE → {desc, poc} mapping from docs/CVE_list.json or markdown files."""
cve_json = DOCS_DIR / "CVE_list.json"
blacklist = load_blacklist()
if cve_json.exists():
data = load_json(cve_json, default=[]) or []
mapping = {}
@@ -86,32 +107,35 @@ def load_poc_index() -> Dict[str, Dict[str, object]]:
cve = str(entry.get("cve", "")).upper()
if not is_valid_cve(cve):
continue
poc_links = stable_unique(entry.get("poc", []) or [])
poc_links = filter_links_by_blacklist(poc_links, blacklist)
mapping[cve] = {
"desc": entry.get("desc", ""),
"poc": stable_unique(entry.get("poc", []) or []),
"poc": poc_links,
}
return mapping
return build_poc_index_from_markdown()
return build_poc_index_from_markdown(blacklist=blacklist)
def build_poc_index_from_markdown(*, blacklist: Optional[List[str]] = None) -> Dict[str, Dict[str, object]]:
    """Build the CVE → {desc, poc} mapping by scanning per-year markdown files.

    Files matching ``<4-digit year dir>/CVE-*.md`` below ``ROOT`` are visited
    in sorted order; files whose stem is not a valid CVE id are skipped.

    Args:
        blacklist: Optional blacklist entries forwarded to
            ``parse_cve_markdown`` so filtered links never enter the mapping.

    Returns:
        Mapping of upper-cased CVE id to ``{"desc": ..., "poc": [...]}``.
    """
    mapping: Dict[str, Dict[str, object]] = {}
    # The glob restricts the directory name to four characters starting with 1 or 2.
    for md_path in sorted(ROOT.glob("[12][0-9][0-9][0-9]/CVE-*.md")):
        cve = md_path.stem.upper()
        if not is_valid_cve(cve):
            continue
        # Each file is parsed once, with the blacklist applied.
        desc, poc_links = parse_cve_markdown(md_path, blacklist=blacklist)
        mapping[cve] = {"desc": desc, "poc": poc_links}
    return mapping
def parse_cve_markdown(path: Path, *, blacklist: Optional[List[str]] = None) -> Tuple[str, List[str]]:
    """Extract the description and PoC links from one CVE markdown file.

    Args:
        path: Markdown file to read (decoded as UTF-8).
        blacklist: Optional blacklist entries; ``None`` is treated as an
            empty list. Passed to ``collect_links`` for both link sections.

    Returns:
        ``(description, poc_links)`` where ``description`` comes from the
        ``### Description`` section and ``poc_links`` combines the
        ``#### Reference`` and ``#### Github`` sections, references first.
    """
    text = path.read_text(encoding="utf-8")
    sections = parse_sections(text)
    description = normalise_block(sections.get("### Description", ""))
    blacklist = blacklist or []
    # Each section is collected exactly once, already filtered by the blacklist.
    references = collect_links(sections.get("#### Reference", ""), blacklist=blacklist)
    github_links = collect_links(sections.get("#### Github", ""), blacklist=blacklist)
    poc_links = stable_unique([*references, *github_links])
    return description, poc_links
@@ -144,7 +168,7 @@ def parse_sections(content: str) -> Dict[str, str]:
return sections
def collect_links(block: str) -> List[str]:
def collect_links(block: str, *, blacklist: Optional[List[str]] = None) -> List[str]:
links: List[str] = []
for raw in block.splitlines():
entry = raw.strip()
@@ -154,7 +178,7 @@ def collect_links(block: str) -> List[str]:
entry = entry[2:].strip()
if entry and entry not in links:
links.append(entry)
return links
return filter_links_by_blacklist(links, blacklist or [])
def is_valid_cve(cve_id: str) -> bool:
@@ -165,6 +189,15 @@ def is_valid_cve(cve_id: str) -> bool:
return year.isdigit() and parts[2].isdigit()
def cve_year(cve_id: str) -> int | None:
    """Return the year component of a CVE identifier.

    Args:
        cve_id: Identifier of the form ``CVE-<year>-<number>``.

    Returns:
        The second dash-separated field as an ``int``, or ``None`` when
        ``is_valid_cve`` rejects the identifier or the field is not numeric.
    """
    year: int | None = None
    if is_valid_cve(cve_id):
        try:
            year = int(cve_id.split("-")[1])
        except (TypeError, ValueError):
            year = None
    return year
# --- Trending PoCs -------------------------------------------------------
# Matches one pipe-delimited table row of the form
#   | <stars> ⭐ | <updated> | [<name>](<url>) | <desc> |
# and exposes the named groups: stars (digits only), updated, name, url, desc.
# The pattern is anchored at both ends, so the row must start and end with "|".
TREND_ROW_RE = re.compile(r"^\|\s*(?P<stars>\d+)\s*⭐\s*\|\s*(?P<updated>[^|]+)\|\s*\[(?P<name>[^\]]+)\]\((?P<url>[^)]+)\)\s*\|\s*(?P<desc>.*)\|$")
@@ -199,3 +232,87 @@ def read_text(path: Path) -> str:
def write_text(path: Path, content: str) -> None:
    """Write ``content`` to ``path`` as UTF-8, creating parent directories first.

    Args:
        path: Destination file; an existing file is overwritten.
        content: Text to write.
    """
    directory = path.parent
    directory.mkdir(parents=True, exist_ok=True)
    path.write_text(content, encoding="utf-8")
# --- New helpers for PoC discovery -------------------------------------------------
def clamp(value: float, minimum: float = 0, maximum: float = 100) -> float:
    """Limit ``value`` to the closed range ``[minimum, maximum]``.

    Args:
        value: Number to constrain.
        minimum: Lower bound (default 0).
        maximum: Upper bound (default 100).

    Returns:
        ``value`` when it lies inside the range, otherwise the nearer bound.
    """
    # Apply the upper bound first, then the lower bound.
    capped = value if value < maximum else maximum
    return capped if capped > minimum else minimum
def chunked(iterable: Iterable, size: int) -> Iterable[List]:
    """Yield consecutive lists of up to ``size`` items taken from ``iterable``.

    Args:
        iterable: Source of items; consumed lazily.
        size: Number of items per chunk. A batch is emitted as soon as its
            length reaches ``size``, so values below 1 produce one-item chunks.

    Yields:
        Lists in input order; the last one may be shorter than ``size``.
    """
    batch: List = []
    for element in iterable:
        batch.append(element)
        if len(batch) < size:
            continue
        yield batch
        # Start a fresh list so previously yielded chunks are never mutated.
        batch = []
    if batch:
        yield batch
def hash_key(text: str) -> str:
    """Return the hexadecimal SHA-256 digest of ``text`` encoded as UTF-8."""
    import hashlib

    digest = hashlib.sha256()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()
def load_blacklist(path: Path | None = None) -> List[str]:
    """Read blacklist entries from a text file, one entry per line.

    Args:
        path: File to read; defaults to ``ROOT / "blacklist.txt"``.

    Returns:
        Whitespace-stripped lines in file order, omitting blank lines and
        lines whose first non-blank character is ``#``. An empty list is
        returned when the file does not exist.
    """
    target = path if path else ROOT / "blacklist.txt"
    if not target.exists():
        return []
    stripped_lines = (raw.strip() for raw in target.read_text(encoding="utf-8").splitlines())
    return [line for line in stripped_lines if line and not line.startswith("#")]
def extract_repo_from_url(url: str) -> str:
    """Return the lower-cased repository name from a GitHub-style link (best effort).

    Accepted shapes include ``https://github.com/owner/repo[/...]``,
    scheme-less ``github.com/owner/repo`` and bare ``owner/repo``. A trailing
    ``.git`` is dropped so clone URLs compare equal to browser URLs.

    Args:
        url: Link text to inspect. Empty or non-string input yields ``""``.

    Returns:
        The second path segment (the repository) when present, otherwise the
        only segment; ``""`` when the link has a host that does not contain
        ``"github"`` or when no path segment exists.
    """
    from urllib.parse import urlparse

    if not url or not isinstance(url, str):
        return ""
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse rejects malformed authorities (e.g. a broken IPv6 literal);
        # fall back to treating the raw text as a plain path.
        host, path = "", url
    else:
        host = (parsed.netloc or "").lower()
        path = parsed.path

    segments = [part for part in path.strip("/").split("/") if part]
    if not host and len(segments) >= 2 and "." in segments[0]:
        # Scheme-less link such as "github.com/owner/repo": urlparse leaves the
        # host inside the path, so peel it off before picking the repo segment.
        host, segments = segments[0].lower(), segments[1:]
    if host and "github" not in host:
        return ""
    if not segments:
        return ""

    repo = (segments[1] if len(segments) >= 2 else segments[0]).lower()
    if repo.endswith(".git"):
        repo = repo[: -len(".git")]
    return repo
def is_blacklisted_repo(url: str, blacklist: List[str]) -> bool:
    """Report whether the repository named in ``url`` matches a blacklist entry.

    Entries are compared case-insensitively after stripping whitespace. An
    entry ending in ``*`` matches any repository name starting with the text
    before the ``*``; any other entry must equal the repository name. Blank
    entries and a bare ``*`` never match.

    Args:
        url: Link whose repository name is obtained via ``extract_repo_from_url``.
        blacklist: Entries to test against.

    Returns:
        ``True`` on the first matching entry, ``False`` otherwise (including
        when no repository name can be extracted).
    """
    repo = extract_repo_from_url(url)
    if not repo:
        return False
    for entry in blacklist:
        pattern = entry.strip().lower()
        if not pattern:
            continue
        if pattern.endswith("*"):
            stem = pattern[:-1]
            matched = bool(stem) and repo.startswith(stem)
        else:
            matched = repo == pattern
        if matched:
            return True
    return False
def filter_links_by_blacklist(links: List[str], blacklist: List[str]) -> List[str]:
    """Drop links whose repository is blacklisted.

    Args:
        links: Candidate links, in order.
        blacklist: Blacklist entries; when empty, ``links`` is returned as-is
            (the same list object, not a copy).

    Returns:
        A new list with the surviving links in their original order, or the
        input list itself when there is nothing to filter against.
    """
    if blacklist:
        return [link for link in links if not is_blacklisted_repo(link, blacklist)]
    return links