Add PoC pipeline with blacklist filtering and Pages build

2026-05-28 11:21:40 +02:00 · 2025-12-17 15:53:37 +01:00
parent b1085c10f5
commit 1f0cd8e78b
20 changed files with 188921 additions and 56 deletions
@@ -7,6 +7,7 @@ pip install -r requirements.txt
 python scripts/fetch_kev.py
 python scripts/fetch_epss.py
 python scripts/build_site.py
+python scripts/build_all.py  # new PoC discovery + scoring pipeline
 ```

-Outputs land in `docs/` and JSON under `docs/api/v1/`. Snapshots live in `docs/api/v1/snapshots/` (last 14 days) and diffs under `docs/api/v1/diff/`.
+Outputs land in `docs/` and JSON under `docs/api/v1/`. Snapshots live in `docs/api/v1/snapshots/` (last 14 days) and diffs under `docs/api/v1/diffs/`.
@@ -0,0 +1,118 @@
+from __future__ import annotations
+
+import argparse
+import sys
+from pathlib import Path
+from typing import Dict, List
+
+import requests
+
+from pipeline_outputs import (
+    build_diff,
+    prune_old_diffs,
+    prune_old_snapshots,
+    summarise_for_snapshot,
+    write_cve_outputs,
+    write_diff,
+    write_index,
+    write_snapshot,
+    write_top,
+)
+from poc_pipeline import PoCPipeline, build_scope, persist_evidence
+from site_renderer import SiteRenderer
+from utils import API_DIR, DOCS_DIR, load_json
+
+
+def load_existing_results(api_dir: Path) -> List[Dict]:
+    """Reload previously written per-CVE API files as pipeline results.
+
+    Only files that contain a ``pocs`` key are kept. A missing directory
+    yields an empty list.
+    """
+    if not api_dir.exists():
+        return []
+    loaded: List[Dict] = []
+    for cve_file in api_dir.glob("CVE-*.json"):
+        record = load_json(cve_file, default={}) or {}
+        if "pocs" not in record:
+            continue
+        loaded.append(
+            {
+                # Fall back to the file name when the payload carries no ID.
+                "cve_id": record.get("cve_id") or cve_file.stem,
+                "pocs": record.get("pocs", []),
+                "last_updated": record.get("last_updated"),
+            }
+        )
+    return loaded
+
+
+def main(argv: List[str] | None = None) -> int:
+    """Run discovery (or reuse existing outputs), write API files, and build the site.
+
+    Args:
+        argv: Command-line arguments; ``None`` lets argparse read ``sys.argv``.
+
+    Returns:
+        ``0`` on success, ``1`` when there were no results to write.
+    """
+    parser = argparse.ArgumentParser(description="Build CVE PoC pipeline outputs, snapshots, and static site")
+    parser.add_argument("--days", type=int, default=7, help="Days window for GitHub discovery windows")
+    parser.add_argument("--mode", choices=["daily", "weekly"], default="daily", help="Run mode to tune scope")
+    parser.add_argument("--limit", type=int, default=50, help="Maximum CVEs to scan per run")
+    parser.add_argument("--cve", action="append", help="Explicit CVE IDs to scan (can be passed multiple times)")
+    parser.add_argument("--skip-discovery", action="store_true", help="Skip GitHub discovery and reuse existing API outputs")
+    parser.add_argument("--check-links", action="store_true", help="Optionally HEAD check repo URLs for dead links")
+    args = parser.parse_args(argv)
+
+    pipeline = PoCPipeline()
+    scope: List[str] = []
+    discovery_days = args.days
+    if args.cve:
+        # Explicit CVE IDs bypass scope building entirely.
+        scope = [cve.upper() for cve in args.cve]
+    elif not args.skip_discovery:
+        prefer_recent = True
+        scan_days = args.days
+        limit = args.limit
+        if args.mode == "weekly":
+            # Weekly runs widen the window to at least 30 days and lift the
+            # recent-year preference and the CVE cap.
+            scan_days = max(scan_days, 30)
+            discovery_days = scan_days
+            prefer_recent = False
+            limit = None
+        scope = build_scope(scan_days, github_list=Path("github.txt"), existing_api=API_DIR / "cve", prefer_recent_years=prefer_recent, max_cves=limit)
+
+    results: List[Dict] = []
+    if args.skip_discovery:
+        results = load_existing_results(API_DIR / "cve")
+    else:
+        for cve_id in scope:
+            try:
+                results.append(pipeline.discover_for_cve(cve_id, days=discovery_days))
+            except Exception as exc:  # noqa: BLE001
+                # Best effort: one failing CVE must not abort the whole run.
+                print(f"[warn] Failed to process {cve_id}: {exc}", file=sys.stderr)
+        persist_evidence(results)
+
+    if not results:
+        print("No results to write; aborting.")
+        return 1
+
+    write_cve_outputs(results)
+    index_payload = write_index(results)
+    top_payload = write_top(results)
+
+    def maybe_check_links() -> List[Dict]:
+        """HEAD-check up to 25 distinct high/medium repo URLs when --check-links is set."""
+        if not args.check_links:
+            return []
+        candidate_urls = [
+            poc["repo_url"]
+            for result in results
+            for poc in result.get("pocs", [])
+            if poc.get("confidence_tier") in {"high", "medium"} and poc.get("repo_url")
+        ]
+        # De-duplicate (order preserved) before truncating so the cap is not
+        # spent on a repository that appears under several CVEs.
+        urls = list(dict.fromkeys(candidate_urls))[:25]
+        dead: List[Dict] = []
+        for url in urls:
+            try:
+                resp = requests.head(url, timeout=5, allow_redirects=True)
+                if resp.status_code >= 400:
+                    dead.append({"url": url, "status": resp.status_code})
+            except requests.RequestException as exc:  # noqa: BLE001
+                dead.append({"url": url, "error": str(exc)})
+        return dead
+
+    snapshot_payload = summarise_for_snapshot(results, top=top_payload)
+    # Read the previous snapshot before write_snapshot overwrites latest.json.
+    prev_snapshot = load_json(API_DIR / "snapshots" / "latest.json", default={}) or {}
+    snapshot_path = write_snapshot(snapshot_payload)
+    diff_payload = build_diff(prev_snapshot, snapshot_payload, dead_links=maybe_check_links())
+    write_diff(diff_payload)
+    prune_old_snapshots()
+    prune_old_diffs()
+
+    renderer = SiteRenderer(results=results, index_payload=index_payload, top_payload=top_payload, diff_payload=diff_payload)
+    renderer.build()
+
+    print(f"Generated site under {DOCS_DIR}")
+    print(f"Wrote latest snapshot to {snapshot_path}")
+    return 0
+
+
+# Script entry point: exit the process with main()'s return code.
+if __name__ == "__main__":
+    raise SystemExit(main())
@@ -0,0 +1,210 @@
+from __future__ import annotations
+
+import json
+import os
+import time
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, Iterable, List, Optional, Tuple
+
+import requests
+
+from utils import CACHE_DIR, chunked, hash_key, isoformat
+
+
+# Accept media type installed on the GitHubClient session for every request.
+TEXT_MATCH_HEADER = "application/vnd.github.text-match+json"
+
+
+class RateLimiter:
+    """Spaces out calls so each bucket stays within a calls-per-minute budget.
+
+    Timing uses ``time.monotonic()`` so wall-clock adjustments cannot shorten
+    or stretch the enforced interval.
+    """
+
+    def __init__(self, calls_per_minute: int) -> None:
+        # Clamp to at least one call per minute to avoid dividing by zero.
+        self.min_interval = 60.0 / max(calls_per_minute, 1)
+        # Monotonic timestamp of the most recent call, keyed by bucket name.
+        self.last_call: Dict[str, float] = {}
+
+    def wait(self, bucket: str) -> None:
+        """Sleep until ``min_interval`` seconds have passed since ``bucket``'s last call.
+
+        The first call for a bucket never sleeps.
+        """
+        last = self.last_call.get(bucket)
+        if last is not None:
+            remaining = self.min_interval - (time.monotonic() - last)
+            if remaining > 0:
+                time.sleep(remaining)
+        self.last_call[bucket] = time.monotonic()
+
+
+class FileCache:
+    """JSON file cache keyed by a hash of the cache key.
+
+    Entries live at ``<base>/<first two digest chars>/<digest>.json`` and
+    record their own UTC expiry timestamp.
+    """
+
+    def __init__(self, base: Path) -> None:
+        self.base = base
+        self.base.mkdir(parents=True, exist_ok=True)
+
+    def _path_for(self, key: str) -> Path:
+        """Return the on-disk location for ``key``."""
+        digest = hash_key(key)
+        return self.base / digest[:2] / f"{digest}.json"
+
+    def load(self, key: str, *, ttl: int) -> Optional[Dict]:
+        """Return the cached payload for ``key``, or ``None`` if absent, unreadable, or expired.
+
+        ``ttl`` is accepted for symmetry with ``save``; expiry is decided by
+        the ``expires_at`` value stored in the entry.
+        """
+        # Local import: the module's import block does not bring in calendar.
+        import calendar
+
+        path = self._path_for(key)
+        if not path.exists():
+            return None
+        try:
+            with path.open("r", encoding="utf-8") as handle:
+                data = json.load(handle)
+        except (OSError, json.JSONDecodeError):
+            return None
+        if not isinstance(data, dict):
+            # A cache file that is not a JSON object is treated as a miss.
+            return None
+        expires_at = data.get("expires_at")
+        if expires_at:
+            try:
+                # save() writes the timestamp with gmtime(), so it must be read
+                # back as UTC; mktime() would interpret it as local time and
+                # skew expiry by the UTC offset.
+                expires_ts = calendar.timegm(time.strptime(expires_at, "%Y-%m-%dT%H:%M:%S"))
+            except (TypeError, ValueError):
+                return None
+            if time.time() > expires_ts:
+                return None
+        return data.get("payload")
+
+    def save(self, key: str, payload: Dict, *, ttl: int) -> None:
+        """Write ``payload`` for ``key`` with an expiry ``ttl`` seconds from now (UTC)."""
+        path = self._path_for(key)
+        path.parent.mkdir(parents=True, exist_ok=True)
+        data = {
+            "fetched_at": isoformat(),
+            "expires_at": time.strftime("%Y-%m-%dT%H:%M:%S", time.gmtime(time.time() + ttl)),
+            "payload": payload,
+        }
+        with path.open("w", encoding="utf-8") as handle:
+            json.dump(data, handle, ensure_ascii=False, indent=2)
+
+
+@dataclass
+class SearchResult:
+    """One page of results returned by a GitHubClient search method."""
+    kind: str  # search kind label, e.g. "repositories" or "code"
+    query: str  # query string that was sent
+    page: int  # page number that was requested
+    payload: Dict  # decoded JSON body (fresh or served from the file cache)
+
+
+class GitHubClient:
+    """GitHub REST search / GraphQL client with file caching and rate limiting.
+
+    Every request goes through ``_request``, which paces calls per bucket
+    ("code", "search", "graphql"), retries transient failures, and waits out
+    rate-limit responses a bounded number of times.
+    """
+
+    # Maximum number of rate-limit sleeps one request may perform before the
+    # error response is raised to the caller.
+    MAX_RATE_LIMIT_WAITS = 3
+
+    def __init__(
+        self,
+        token: Optional[str],
+        *,
+        cache_dir: Path | None = None,
+        code_search_rpm: int = 10,
+        general_rpm: int = 30,
+    ) -> None:
+        self.session = requests.Session()
+        self.session.headers.update({"Accept": TEXT_MATCH_HEADER})
+        if token:
+            self.session.headers["Authorization"] = f"Bearer {token}"
+        self.base_url = "https://api.github.com"
+        self.graphql_url = f"{self.base_url}/graphql"
+        cache_root = cache_dir or CACHE_DIR / "github"
+        self.cache = FileCache(cache_root)
+        self.rate_limiters = {
+            "code": RateLimiter(code_search_rpm),
+            "search": RateLimiter(general_rpm),
+            "graphql": RateLimiter(general_rpm),
+        }
+
+    @staticmethod
+    def _rate_limit_delay(response: requests.Response) -> Optional[int]:
+        """Return seconds to wait if ``response`` signals a rate limit, else ``None``."""
+        headers = response.headers
+        retry_after = headers.get("Retry-After")
+        if retry_after:
+            try:
+                return max(0, int(retry_after))
+            except ValueError:
+                # Not a plain number of seconds; fall through to the reset header.
+                pass
+        reset = headers.get("X-RateLimit-Reset")
+        if "X-RateLimit-Remaining" not in headers or not reset:
+            return None
+        try:
+            remaining = int(headers.get("X-RateLimit-Remaining") or "0")
+            reset_ts = int(reset)
+        except ValueError:
+            return None
+        if remaining > 0:
+            # Quota is left, so this 403/429 is not a primary rate limit.
+            return None
+        return max(0, reset_ts - int(time.time()) + 1)
+
+    def _request(self, method: str, url: str, *, bucket: str, **kwargs) -> requests.Response:
+        """Send one request with pacing, retries, and bounded rate-limit waits.
+
+        Raises:
+            requests.RequestException: after three failed connection attempts.
+            requests.HTTPError: for any final 4xx/5xx response.
+        """
+        self.rate_limiters[bucket].wait(bucket)
+        failures = 0
+        rate_limit_waits = 0
+        while True:
+            try:
+                response = self.session.request(method, url, timeout=30, **kwargs)
+            except requests.RequestException:
+                failures += 1
+                if failures >= 3:
+                    raise
+                time.sleep(2 * failures)
+                continue
+
+            if response.status_code in (403, 429) and rate_limit_waits < self.MAX_RATE_LIMIT_WAITS:
+                delay = self._rate_limit_delay(response)
+                if delay is not None:
+                    # Bounded so a persistently limited token cannot spin forever.
+                    rate_limit_waits += 1
+                    time.sleep(delay)
+                    continue
+            if response.status_code >= 500:
+                failures += 1
+                if failures < 3:
+                    time.sleep(1 + failures)
+                    continue
+            response.raise_for_status()
+            return response
+
+    def _cached_search(self, kind: str, query: str, page: int, per_page: int, ttl: int) -> Dict:
+        """Return the JSON payload for a search page, served from cache when fresh."""
+        cache_key = f"{kind}:{query}:p{page}:n{per_page}"
+        cached = self.cache.load(cache_key, ttl=ttl)
+        if cached is not None:
+            return cached
+
+        url = f"{self.base_url}/search/{kind}"
+        params = {"q": query, "page": page, "per_page": per_page}
+        resp = self._request("GET", url, params=params, bucket="code" if kind == "code" else "search")
+        payload = resp.json()
+        self.cache.save(cache_key, payload, ttl=ttl)
+        return payload
+
+    def search_repositories(self, query: str, *, page: int = 1, per_page: int = 100, ttl: int = 3600) -> SearchResult:
+        """Search repositories and wrap the page in a ``SearchResult``."""
+        return SearchResult("repositories", query, page, self._cached_search("repositories", query, page, per_page, ttl))
+
+    def search_code(self, query: str, *, page: int = 1, per_page: int = 100, ttl: int = 3600) -> SearchResult:
+        """Search code and wrap the page in a ``SearchResult``."""
+        return SearchResult("code", query, page, self._cached_search("code", query, page, per_page, ttl))
+
+    def search_topics(self, query: str, *, page: int = 1, per_page: int = 100, ttl: int = 3600) -> SearchResult:
+        """Run a topic-style query; it is sent to the repositories search endpoint."""
+        return SearchResult("repositories", query, page, self._cached_search("repositories", query, page, per_page, ttl))
+
+    def fetch_repo_metadata(self, full_names: Iterable[str], *, ttl: int = 6 * 3600) -> Dict[str, Dict]:
+        """Fetch repository metadata via batched GraphQL queries, using the cache first.
+
+        Returns:
+            Mapping of ``owner/name`` to the raw GraphQL repository object, or
+            ``{}`` for repositories that could not be resolved.
+        """
+        results: Dict[str, Dict] = {}
+        to_fetch: List[str] = []
+        for name in full_names:
+            cache_key = f"repo-meta:{name}"
+            cached = self.cache.load(cache_key, ttl=ttl)
+            if cached is not None:
+                results[name] = cached
+            else:
+                to_fetch.append(name)
+
+        if not to_fetch:
+            return results
+
+        fields = """
+        nameWithOwner
+        url
+        stargazerCount
+        description
+        forkCount
+        isFork
+        isArchived
+        pushedAt
+        updatedAt
+        primaryLanguage { name }
+        parent { nameWithOwner url }
+        repositoryTopics(first: 20) { nodes { topic { name } } }
+        """
+
+        for batch in chunked(to_fetch, 12):
+            parts = []
+            for idx, full_name in enumerate(batch):
+                if "/" not in full_name:
+                    continue
+                owner, name = full_name.split("/", 1)
+                # Strip quotes so a name cannot terminate the GraphQL string literal.
+                owner = owner.replace('"', "")
+                name = name.replace('"', "")
+                parts.append(f'repo_{idx}: repository(owner: "{owner}", name: "{name}") {{ {fields} }}')
+            if not parts:
+                continue
+            query = "query { " + " ".join(parts) + " }"
+            resp = self._request("POST", self.graphql_url, json={"query": query}, bucket="graphql")
+            data = resp.json()
+            # "data" is null when the whole query failed; treat that as empty.
+            repos = data.get("data") or {}
+            batch_failed = not repos
+            for idx, full_name in enumerate(batch):
+                key = f"repo_{idx}"
+                meta = repos.get(key) or {}
+                if not batch_failed:
+                    # Do not cache blanks from a query that returned no data at
+                    # all, so the next run can retry instead of reusing them.
+                    cache_key = f"repo-meta:{full_name}"
+                    self.cache.save(cache_key, meta, ttl=ttl)
+                results[full_name] = meta
+
+        return results
+
+
+def build_client(token_env: str = "GITHUB_TOKEN") -> GitHubClient:
+    """Create a GitHubClient whose token is read from the ``token_env`` variable.
+
+    An unset variable yields a client with no Authorization header.
+    """
+    return GitHubClient(os.environ.get(token_env), cache_dir=CACHE_DIR / "github")
@@ -0,0 +1,220 @@
+from __future__ import annotations
+
+from collections import Counter
+from datetime import datetime, timedelta
+from pathlib import Path
+from typing import Dict, Iterable, List, Tuple
+
+from utils import API_DIR, DIFFS_DIR, SNAPSHOT_DIR, TOP_DIR, ensure_dirs, load_json, save_json, today_str
+
+
+def write_cve_outputs(results: List[Dict], *, base_dir: Path | None = None) -> None:
+    """Write one ``<cve_id>.json`` file per result.
+
+    Files go to ``base_dir`` when given, otherwise to ``API_DIR / "cve"``.
+    """
+    target_dir = base_dir or API_DIR / "cve"
+    ensure_dirs(target_dir)
+    for result in results:
+        cve_id = result["cve_id"]
+        last_updated = result.get("last_updated") or today_str()
+        poc_records: List[Dict] = []
+        for poc in result.get("pocs", []):
+            poc_records.append(
+                {
+                    "repo_full_name": poc.get("repo_full_name"),
+                    "repo_url": poc.get("repo_url"),
+                    "is_fork": poc.get("is_fork"),
+                    "parent_repo_url": poc.get("parent_repo_url"),
+                    "stars": poc.get("stars"),
+                    "forks": poc.get("forks"),
+                    "archived": poc.get("archived"),
+                    # Fall back to the update time when no push time is known.
+                    "pushed_at": poc.get("pushed_at") or poc.get("updated_at"),
+                    "topics": poc.get("topics", []),
+                    "primary_language": poc.get("primary_language"),
+                    "matches": poc.get("matches", []),
+                    "confidence_score": poc.get("confidence_score"),
+                    "confidence_tier": poc.get("confidence_tier"),
+                }
+            )
+        document = {"cve_id": cve_id, "last_updated": last_updated, "pocs": poc_records}
+        save_json(target_dir / f"{cve_id}.json", document)
+
+
+def build_index(results: List[Dict]) -> Dict:
+    """Summarise each CVE's PoCs into index rows, sorted by CVE ID descending."""
+    rows: List[Dict] = []
+    for result in results:
+        pocs = result.get("pocs", [])
+        tier_counts = Counter(entry.get("confidence_tier") for entry in pocs)
+        language_counts = Counter(
+            language for language in (entry.get("primary_language") for entry in pocs) if language
+        )
+        # Seed with 0.0 so an empty PoC list reports a float zero.
+        best_score = max([0.0, *(entry.get("confidence_score") or 0 for entry in pocs)])
+        rows.append(
+            {
+                "cve_id": result["cve_id"],
+                "poc_count": len(pocs),
+                "high_confidence": tier_counts["high"],
+                "medium_confidence": tier_counts["medium"],
+                "top_languages": [language for language, _ in language_counts.most_common(3)],
+                "max_score": best_score,
+                "last_updated": result.get("last_updated"),
+            }
+        )
+    rows.sort(key=lambda row: row["cve_id"], reverse=True)
+    return {"generated": today_str(), "items": rows}
+
+
+def write_index(results: List[Dict]) -> Dict:
+    """Build the index payload, save it as ``index.json`` under ``API_DIR``, and return it."""
+    ensure_dirs(API_DIR)
+    index_payload = build_index(results)
+    destination = API_DIR / "index.json"
+    save_json(destination, index_payload)
+    return index_payload
+
+
+def write_top(results: List[Dict], *, limit: int = 100) -> Dict:
+    """Rank high/medium-tier PoCs by score, then stars, and save the top ``limit``.
+
+    The payload is written to ``today.json`` under ``TOP_DIR`` and returned.
+    """
+    ensure_dirs(TOP_DIR)
+    ranked = sorted(
+        (
+            {
+                "cve_id": result["cve_id"],
+                "repo_full_name": poc.get("repo_full_name"),
+                "repo_url": poc.get("repo_url"),
+                "score": poc.get("confidence_score"),
+                "tier": poc.get("confidence_tier"),
+                "stars": poc.get("stars"),
+                "primary_language": poc.get("primary_language"),
+            }
+            for result in results
+            for poc in result.get("pocs", [])
+            if poc.get("confidence_tier") in {"high", "medium"}
+        ),
+        # Missing score/stars count as zero; both sort descending.
+        key=lambda entry: (-(entry.get("score") or 0), -(entry.get("stars") or 0)),
+    )
+    top_payload = {"generated": today_str(), "items": ranked[:limit]}
+    save_json(TOP_DIR / "today.json", top_payload)
+    return top_payload
+
+
+def summarise_for_snapshot(results: List[Dict], *, top: Dict | None = None) -> Dict:
+    """Reduce results to ``{cve_id: {repo_full_name: {score, tier}}}`` for snapshotting.
+
+    A truthy ``top`` payload is attached under the ``"top"`` key.
+    """
+    entries: Dict[str, Dict[str, Dict]] = {
+        result["cve_id"]: {
+            poc.get("repo_full_name"): {
+                "score": poc.get("confidence_score"),
+                "tier": poc.get("confidence_tier"),
+            }
+            for poc in result.get("pocs", [])
+        }
+        for result in results
+    }
+    snapshot = {"generated": today_str(), "entries": entries}
+    if top:
+        snapshot["top"] = top
+    return snapshot
+
+
+def write_snapshot(summary: Dict) -> Path:
+    """Save ``summary`` as both a dated snapshot and ``latest.json``.
+
+    Returns the path of the dated file, named after ``summary["generated"]``.
+    """
+    ensure_dirs(SNAPSHOT_DIR)
+    dated_path = SNAPSHOT_DIR / f"{summary['generated']}.json"
+    for destination in (dated_path, SNAPSHOT_DIR / "latest.json"):
+        save_json(destination, summary)
+    return dated_path
+
+
+def prune_old_snapshots(days: int = 14) -> None:
+    """Delete dated snapshot files older than ``days`` days (UTC dates).
+
+    Files whose stem is not a ``YYYY-MM-DD`` date (e.g. ``latest.json``) are
+    left untouched.
+    """
+    # Local import: this module's import block only brings in datetime/timedelta.
+    from datetime import timezone
+
+    if not SNAPSHOT_DIR.exists():
+        return
+    # datetime.utcnow() is deprecated; an aware "now" in UTC gives the same date.
+    cutoff = datetime.now(timezone.utc).date() - timedelta(days=days)
+    for snap in SNAPSHOT_DIR.glob("*.json"):
+        try:
+            snap_date = datetime.strptime(snap.stem, "%Y-%m-%d").date()
+        except ValueError:
+            continue
+        if snap_date < cutoff:
+            snap.unlink(missing_ok=True)
+
+
+def prune_old_diffs(days: int = 14) -> None:
+    """Delete dated diff files older than ``days`` days (UTC dates).
+
+    Files whose stem is not a ``YYYY-MM-DD`` date (e.g. ``latest.json``) are
+    left untouched.
+    """
+    # Local import: this module's import block only brings in datetime/timedelta.
+    from datetime import timezone
+
+    if not DIFFS_DIR.exists():
+        return
+    # Use the UTC date, matching the snapshot pruner, so both retention
+    # windows roll over at the same moment.
+    cutoff = datetime.now(timezone.utc).date() - timedelta(days=days)
+    for diff in DIFFS_DIR.glob("*.json"):
+        try:
+            diff_date = datetime.strptime(diff.stem, "%Y-%m-%d").date()
+        except ValueError:
+            continue
+        if diff_date < cutoff:
+            diff.unlink(missing_ok=True)
+
+
+def _load_snapshot(path: Path) -> Dict:
+    """Load a snapshot file, returning ``{}`` when it is missing or empty."""
+    snapshot = load_json(path, default={})
+    return snapshot if snapshot else {}
+
+
+def build_diff(prev: Dict, curr: Dict, *, dead_links: List[Dict] | None = None) -> Dict:
+    """Compare two snapshots and report changes in high-confidence PoCs.
+
+    Returns a dict listing repos that are newly high tier, repos promoted to
+    high from another tier, and repos that were high before but are no longer
+    (or are gone), plus any ``dead_links`` passed in.
+    """
+    before = prev.get("entries", {})
+    after = curr.get("entries", {})
+
+    def lookup(entries: Dict, cve_id: str, repo_name: str):
+        # Missing CVE buckets behave like empty ones.
+        return (entries.get(cve_id) or {}).get(repo_name)
+
+    new_high: List[Dict] = []
+    promoted: List[Dict] = []
+    for cve_id, repos in after.items():
+        for repo_name, info in repos.items():
+            if info.get("tier") == "high":
+                earlier = lookup(before, cve_id, repo_name)
+                record = {"cve_id": cve_id, "repo_full_name": repo_name, "score": info.get("score")}
+                if not earlier:
+                    new_high.append(record)
+                elif earlier.get("tier") != "high":
+                    record["previous_tier"] = earlier.get("tier")
+                    promoted.append(record)
+
+    demoted: List[Dict] = []
+    for cve_id, repos in before.items():
+        for repo_name, info in repos.items():
+            if info.get("tier") != "high":
+                continue
+            now = lookup(after, cve_id, repo_name)
+            if now and now.get("tier") == "high":
+                continue
+            demoted.append(
+                {
+                    "cve_id": cve_id,
+                    "repo_full_name": repo_name,
+                    "previous_score": info.get("score"),
+                    "previous_tier": info.get("tier"),
+                    "current_tier": now.get("tier") if now else None,
+                }
+            )
+
+    return {
+        "generated": curr.get("generated"),
+        "new_high_conf_pocs": new_high,
+        "promoted_to_high": promoted,
+        "demoted_or_removed": demoted,
+        "dead_links": dead_links or [],
+    }
+
+
+def write_diff(diff: Dict) -> Path:
+    """Save ``diff`` as both a dated file and ``latest.json``.
+
+    Returns the path of the dated file, named after ``diff["generated"]``.
+    """
+    ensure_dirs(DIFFS_DIR)
+    dated_path = DIFFS_DIR / f"{diff['generated']}.json"
+    for destination in (dated_path, DIFFS_DIR / "latest.json"):
+        save_json(destination, diff)
+    return dated_path
+
+
+def latest_snapshots() -> Tuple[Dict, Dict]:
+    """Return ``(previous, current)`` from the two most recent dated snapshots.
+
+    Only files named ``YYYY-MM-DD.json`` are considered. ``latest.json`` is a
+    copy of the newest dated snapshot and sorts after every dated name, so
+    including it would make "current" and "previous" the same data.
+    Missing snapshots are returned as ``{}``.
+    """
+    if not SNAPSHOT_DIR.exists():
+        return {}, {}
+    dated: List[Path] = []
+    for snap in SNAPSHOT_DIR.glob("*.json"):
+        try:
+            datetime.strptime(snap.stem, "%Y-%m-%d")
+        except ValueError:
+            continue
+        dated.append(snap)
+    if not dated:
+        return {}, {}
+    # ISO date names sort chronologically.
+    dated.sort()
+    curr = _load_snapshot(dated[-1])
+    prev = _load_snapshot(dated[-2]) if len(dated) > 1 else {}
+    return prev, curr
@@ -0,0 +1,274 @@
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass, field
+from datetime import date, datetime, timedelta
+from pathlib import Path
+from typing import Dict, Iterable, List, Optional, Set, Tuple
+
+from github_client import GitHubClient, SearchResult, build_client
+from poc_scoring import match_score, score_repo
+from utils import API_DIR, EVIDENCE_DIR, chunked, cve_year, ensure_dirs, isoformat, load_blacklist, load_json, save_json, today_str
+
+
+# Languages used to partition code-search queries (one query per language).
+LANG_PARTITIONS = ("python", "go", "c", "shell", "powershell", "java", "ruby", "js")
+# Matches CVE identifiers such as "CVE-2024-12345", case-insensitively.
+CVE_RE = re.compile(r"CVE-\d{4}-\d{4,}", re.IGNORECASE)
+
+
+@dataclass
+class MatchEvidence:
+    """One search hit linking a repository to the query that found it."""
+    path: str  # matched fragment, file path, or property name (see extract_matches)
+    match_type: str  # matched property name, or the query's default match type
+    query: str  # search query that produced the hit
+    score: float | None = None  # filled in by discover_for_cve via match_score
+
+
+@dataclass
+class RepoCandidate:
+    """A repository found for a CVE, with its accumulated match evidence."""
+
+    cve_id: str
+    repo_full_name: str
+    repo_url: str
+    matches: List[MatchEvidence] = field(default_factory=list)
+    metadata: Dict[str, object] = field(default_factory=dict)
+
+    def add_match(self, path: str, match_type: str, query: str) -> None:
+        """Record a match unless the same ``(path, match_type)`` pair is already stored."""
+        already_recorded = any(
+            seen.path == path and seen.match_type == match_type for seen in self.matches
+        )
+        if not already_recorded:
+            self.matches.append(MatchEvidence(path=path, match_type=match_type, query=query))
+
+
+def build_created_ranges(days: int, *, window: int = 7) -> List[Tuple[str, str]]:
+    """Split the last ``days`` days (ending today) into consecutive date windows.
+
+    Args:
+        days: How far back to start; values below 1 are treated as 1.
+        window: Maximum length of each range in days; values below 1 are
+            treated as 1 so the cursor always advances.
+
+    Returns:
+        Inclusive ``(start, end)`` ISO date pairs covering the whole span in
+        order. At least one range is always returned.
+    """
+    step = max(window, 1)
+    end = date.today()
+    cursor = end - timedelta(days=max(days, 1))
+    ranges: List[Tuple[str, str]] = []
+    while cursor <= end:
+        window_end = min(cursor + timedelta(days=step - 1), end)
+        ranges.append((cursor.isoformat(), window_end.isoformat()))
+        cursor = window_end + timedelta(days=1)
+    return ranges
+
+
+def build_query_pack(cve_id: str, created_range: Tuple[str, str] | None = None) -> List[Dict[str, str]]:
+    """Build the list of search queries to run for one CVE.
+
+    Produces three repository queries, one code query per entry in
+    ``LANG_PARTITIONS``, and a final unpartitioned code query. When
+    ``created_range`` is given, a ``created:`` qualifier is appended to each.
+    """
+    suffix = f" created:{created_range[0]}..{created_range[1]}" if created_range else ""
+
+    repo_queries = (
+        (f'{cve_id} in:name,description,readme fork:false', "name"),
+        (f'{cve_id} (poc OR exploit) in:name,description,readme fork:false', "description"),
+        (f"topic:{cve_id.lower()} fork:false", "topic"),
+    )
+    pack: List[Dict[str, str]] = [
+        {"kind": "repositories", "query": text + suffix, "match_type": label}
+        for text, label in repo_queries
+    ]
+    pack.extend(
+        {"kind": "code", "query": f'{cve_id} in:file language:{lang}{suffix}', "match_type": "code"}
+        for lang in LANG_PARTITIONS
+    )
+    # Catch-all code search without a language partition.
+    pack.append({"kind": "code", "query": f"{cve_id} in:file{suffix}", "match_type": "code"})
+    return pack
+
+
+def parse_repo_from_item(item: Dict) -> Tuple[str | None, str | None]:
+    """Extract ``(owner/name, repository URL)`` from a search result item.
+
+    Handles items that are repositories themselves and items that nest a
+    ``repository`` object. For nested items the repository's own ``html_url``
+    wins, because the item's top-level ``html_url`` then refers to the matched
+    file rather than the repository.
+
+    Returns:
+        A pair whose members are ``None`` when they cannot be determined.
+    """
+    repository = item.get("repository") or {}
+    repo_full_name = item.get("full_name") or repository.get("full_name")
+    repo_url = repository.get("html_url") or item.get("html_url")
+    if not repo_full_name and repository:
+        # Rebuild the full name from owner login and repo name when absent.
+        owner = (repository.get("owner") or {}).get("login")
+        name = repository.get("name")
+        repo_full_name = f"{owner}/{name}" if owner and name else None
+    return repo_full_name, repo_url
+
+
+def extract_matches(item: Dict, default_type: str, query: str) -> List[MatchEvidence]:
+    """Turn an item's ``text_matches`` into evidence records.
+
+    When the item carries no text matches, a single record is returned using
+    the item's ``path`` (or ``default_type``) and ``default_type``.
+    """
+    evidence: List[MatchEvidence] = []
+    for hit in item.get("text_matches") or []:
+        prop = hit.get("property") or hit.get("object_type") or ""
+        location = hit.get("fragment") or hit.get("path") or prop or ""
+        evidence.append(
+            MatchEvidence(path=str(location), match_type=str(prop or default_type), query=query)
+        )
+    if evidence:
+        return evidence
+    fallback_path = item.get("path") or default_type
+    return [MatchEvidence(path=str(fallback_path), match_type=default_type, query=query)]
+
+
+def normalise_metadata(meta: Dict, fallback_full_name: str, fallback_url: str) -> Dict:
+    """Flatten a GraphQL repository object into the pipeline's metadata dict.
+
+    Missing name/URL fall back to the supplied values; missing counts become
+    0, missing flags become False, and a missing description becomes "".
+    """
+    topics_block = meta.get("repositoryTopics")
+    topic_nodes = topics_block.get("nodes", []) if topics_block else []
+    topic_names = ((node.get("topic") or {}).get("name") for node in topic_nodes)
+    topics = [name for name in topic_names if name]
+
+    language_block = meta.get("primaryLanguage")
+    primary_language = language_block.get("name") if language_block else None
+
+    parent_url = (meta.get("parent") or {}).get("url")
+    return {
+        "repo_full_name": meta.get("nameWithOwner") or fallback_full_name,
+        "repo_url": meta.get("url") or fallback_url,
+        "description": meta.get("description") or "",
+        "is_fork": bool(meta.get("isFork")),
+        "parent_repo_url": parent_url,
+        "stars": meta.get("stargazerCount") or 0,
+        "forks": meta.get("forkCount") or 0,
+        "archived": bool(meta.get("isArchived")),
+        "pushed_at": meta.get("pushedAt"),
+        "updated_at": meta.get("updatedAt"),
+        "topics": topics,
+        "primary_language": primary_language,
+    }
+
+
+class PoCPipeline:
+    """Discovers candidate PoC repositories for CVEs on GitHub and scores them."""
+
+    # Page size for every search request. The pagination stop check in
+    # discover_for_cve relies on the same value, so it is defined once here.
+    PER_PAGE = 50
+
+    def __init__(
+        self,
+        client: GitHubClient | None = None,
+        *,
+        blacklist_path: Path | None = None,
+        search_ttl: int = 3 * 3600,
+    ) -> None:
+        self.client = client or build_client()
+        self.blacklist = load_blacklist(blacklist_path)
+        self.search_ttl = search_ttl
+
+    def _run_query(self, query: Dict, page: int) -> SearchResult:
+        """Dispatch one query dict to the matching client search method."""
+        if query["kind"] == "repositories":
+            return self.client.search_repositories(query["query"], page=page, per_page=self.PER_PAGE, ttl=self.search_ttl)
+        if query["kind"] == "code":
+            return self.client.search_code(query["query"], page=page, per_page=self.PER_PAGE, ttl=self.search_ttl)
+        return self.client.search_topics(query["query"], page=page, per_page=self.PER_PAGE, ttl=self.search_ttl)
+
+    def discover_for_cve(self, cve_id: str, *, days: int, max_pages_repo: int = 2, max_pages_code: int = 2) -> Dict:
+        """Search GitHub for repositories mentioning ``cve_id`` and score each one.
+
+        Args:
+            cve_id: CVE identifier to search for.
+            days: How many days back the ``created:`` search windows reach.
+            max_pages_repo: Page cap for repository queries.
+            max_pages_code: Page cap for code queries.
+
+        Returns:
+            Dict with ``cve_id``, ``last_updated``, ``pocs`` (scored repos,
+            best first), and ``evidence`` (queries run and per-repo matches).
+        """
+        ranges = build_created_ranges(days)
+        candidates: Dict[str, RepoCandidate] = {}
+        query_log: List[Dict] = []
+
+        for created_range in ranges:
+            query_pack = build_query_pack(cve_id, created_range)
+            for query in query_pack:
+                query_log.append({"query": query["query"], "kind": query["kind"], "window": created_range})
+                page_limit = max_pages_code if query["kind"] == "code" else max_pages_repo
+                for page in range(1, page_limit + 1):
+                    result = self._run_query(query, page)
+                    # Tolerate a payload whose "items" is missing or null.
+                    items = result.payload.get("items") or []
+                    for item in items:
+                        repo_full_name, repo_url = parse_repo_from_item(item)
+                        if not repo_full_name or not repo_url:
+                            continue
+                        candidate = candidates.setdefault(
+                            repo_full_name,
+                            RepoCandidate(cve_id=cve_id, repo_full_name=repo_full_name, repo_url=repo_url),
+                        )
+                        for match in extract_matches(item, query["match_type"], query["query"]):
+                            candidate.add_match(match.path, match.match_type, match.query)
+                    # A short page means there is nothing further to fetch.
+                    if len(items) < self.PER_PAGE:
+                        break
+
+        metadata = self.client.fetch_repo_metadata(candidates.keys())
+        for repo_full_name, candidate in candidates.items():
+            meta = metadata.get(repo_full_name, {})
+            candidate.metadata = normalise_metadata(meta, repo_full_name, candidate.repo_url)
+
+        repos: List[Dict] = []
+        for candidate in candidates.values():
+            matches_dicts = []
+            for m in candidate.matches:
+                m.score = match_score({"path": m.path, "match_type": m.match_type})
+                matches_dicts.append({"path": m.path, "match_type": m.match_type, "query": m.query, "score": m.score})
+            score, tier = score_repo(candidate.metadata, matches_dicts, self.blacklist)
+            repo_entry = {
+                **candidate.metadata,
+                "matches": matches_dicts,
+                "confidence_score": score,
+                "confidence_tier": tier,
+                "cve_id": cve_id,
+            }
+            repos.append(repo_entry)
+
+        # Highest confidence first; stars break ties.
+        repos.sort(key=lambda r: (-r["confidence_score"], -r.get("stars", 0)))
+
+        evidence = {
+            "queries": query_log,
+            "candidates": [
+                {
+                    "repo_full_name": r["repo_full_name"],
+                    "matches": r["matches"],
+                    "match_count": len(r["matches"]),
+                    "score": r["confidence_score"],
+                    "tier": r["confidence_tier"],
+                }
+                for r in repos
+            ],
+        }
+        return {"cve_id": cve_id, "last_updated": isoformat(), "pocs": repos, "evidence": evidence}
+
+    def discover_many(self, cve_ids: Iterable[str], *, days: int, limit: Optional[int] = None) -> List[Dict]:
+        """Run ``discover_for_cve`` for each ID, stopping after ``limit`` IDs when set."""
+        results: List[Dict] = []
+        for idx, cve_id in enumerate(cve_ids):
+            if limit and idx >= limit:
+                break
+            results.append(self.discover_for_cve(cve_id, days=days))
+        return results
+
+
+def persist_evidence(results: List[Dict]) -> None:
+    """Write each result's ``evidence`` dict to ``<cve_id>.json`` under ``EVIDENCE_DIR``.
+
+    Results without evidence are written as an empty object.
+    """
+    ensure_dirs(EVIDENCE_DIR)
+    for result in results:
+        destination = EVIDENCE_DIR / f"{result['cve_id']}.json"
+        save_json(destination, result.get("evidence", {}))
+
+
+def discover_from_github_list(path: Path) -> List[str]:
+    """Collect the distinct CVE IDs mentioned in a text file, in first-seen order.
+
+    IDs are upper-cased. A missing file yields an empty list.
+    """
+    if not path.exists():
+        return []
+    # A dict gives O(1) de-duplication while preserving insertion order; the
+    # previous list membership test was quadratic in the number of IDs.
+    seen: Dict[str, None] = {}
+    with path.open("r", encoding="utf-8") as handle:
+        for line in handle:
+            for match in CVE_RE.findall(line):
+                seen.setdefault(match.upper(), None)
+    return list(seen)
+
+
+def load_existing_cves(api_dir: Path = API_DIR / "cve") -> List[str]:
+    """Return the sorted, upper-cased CVE IDs that already have a JSON file in ``api_dir``."""
+    if not api_dir.exists():
+        return []
+    known = set()
+    for entry in api_dir.glob("CVE-*.json"):
+        # Skip files whose name does not start with a well-formed CVE ID.
+        if CVE_RE.match(entry.stem):
+            known.add(entry.stem.upper())
+    return sorted(known)
+
+
+def build_scope(
+    days: int,
+    *,
+    github_list: Path,
+    existing_api: Path,
+    prefer_recent_years: bool = True,
+    max_cves: int | None = None,
+    low_conf_threshold: int = 1,
+) -> List[str]:
+    """Choose which CVE IDs to scan in this run.
+
+    Args:
+        days: Currently unused by this function; kept for interface stability.
+        github_list: Text file scanned for CVE IDs (the preferred seed source).
+        existing_api: Directory of per-CVE JSON files, used when the list
+            yields no IDs.
+        prefer_recent_years: Keep only CVEs from the current year and the two
+            before it, unless that would leave nothing.
+        max_cves: Optional cap on the returned list; falsy means no cap.
+        low_conf_threshold: CVEs in ``index.json`` whose high+medium PoC count
+            is at or below this value are appended for re-scanning.
+
+    Returns:
+        Candidate IDs followed by low-confidence IDs not already present.
+    """
+    seeds = discover_from_github_list(github_list)
+    existing = load_existing_cves(existing_api)
+    candidates = seeds or existing
+
+    if prefer_recent_years:
+        min_year = date.today().year - 2
+        # A missing/zero year never passes the comparison.
+        recent = [cve for cve in candidates if (cve_year(cve) or 0) >= min_year]
+        candidates = recent or candidates
+
+    index_path = API_DIR / "index.json"
+    low_conf: List[str] = []
+    if index_path.exists():
+        index_payload = load_json(index_path, default={}) or {}
+        for item in index_payload.get("items", []):
+            score = (item.get("high_confidence", 0) or 0) + (item.get("medium_confidence", 0) or 0)
+            if score <= low_conf_threshold:
+                low_conf.append(item.get("cve_id"))
+
+    # Set membership keeps the merge linear instead of scanning the candidate
+    # list once per low-confidence entry.
+    seen = set(candidates)
+    scoped = list(candidates)
+    for cve in low_conf:
+        if cve and cve not in seen:
+            seen.add(cve)
+            scoped.append(cve)
+    if max_cves:
+        scoped = scoped[:max_cves]
+    return scoped
@@ -0,0 +1,121 @@
+from __future__ import annotations
+
+import re
+from datetime import datetime, timedelta, timezone
+from typing import Dict, Iterable, List, Tuple
+
+from utils import clamp, parse_date
+
+DOC_EXTS = {"md", "txt", "rst", "adoc", "markdown", "mkd", "mdown"}
+POSITIVE_KEYWORDS = ("poc", "exploit", "rce", "lpe", "auth bypass", "bypass")
+NEGATIVE_KEYWORDS = ("report", "writeup", "advisory", "changelog")
+
+
+def is_doc_path(path: str) -> bool:
+    """Tell whether *path* counts as documentation rather than code.
+
+    A path ending in ``/`` is treated as documentation; otherwise the text
+    after the last ``.`` (case-insensitive) must be one of ``DOC_EXTS``.
+    Paths without any ``.`` are not documentation.
+    """
+    lowered = path.lower()
+    if lowered.endswith("/"):
+        return True
+    _, dot, suffix = lowered.rpartition(".")
+    return bool(dot) and suffix in DOC_EXTS
+
+
+def match_score(match: Dict) -> float:
+    """Score a single match from its ``path`` and ``match_type`` fields.
+
+    Non-documentation paths start at 50, documentation paths at 30. A
+    ``code`` match type adds 10, one containing ``readme`` adds 5 and one
+    containing ``topic`` subtracts 5. The result is clamped to 0..100.
+    """
+    location = str(match.get("path", ""))
+    kind = str(match.get("match_type", "")).lower()
+    points = 30 if is_doc_path(location) else 50
+    if kind == "code":
+        points += 10
+    if "readme" in kind:
+        points += 5
+    if "topic" in kind:
+        points -= 5
+    return clamp(points, 0, 100)
+
+
+def tier_for_score(score: float) -> str:
+    """Map a score to ``"high"`` (>= 75), ``"medium"`` (>= 45) or ``"low"``."""
+    for floor, label in ((75, "high"), (45, "medium")):
+        if score >= floor:
+            return label
+    return "low"
+
+
+def keyword_hits(text: str, keywords: Iterable[str]) -> int:
+    """Count how many of *keywords* occur as substrings of *text*, ignoring the case of *text*.
+
+    Each keyword counts at most once; empty or missing text yields 0.
+    """
+    if not text:
+        return 0
+    haystack = text.lower()
+    return len([needle for needle in keywords if needle in haystack])
+
+
+def recency_bonus(pushed_at: str | None) -> float:
+    """Return a score bonus based on how recently the repository was pushed.
+
+    The bonus is 18 within 30 days, 10 within 90 days, 5 within 180 days and
+    0 otherwise, including when the timestamp is missing or unparseable.
+    Timestamps without a timezone are interpreted as UTC.
+    """
+    if not pushed_at:
+        return 0.0
+    pushed = parse_date(pushed_at)
+    if not pushed:
+        return 0.0
+    if pushed.tzinfo is None:
+        pushed = pushed.replace(tzinfo=timezone.utc)
+    age = datetime.now(timezone.utc) - pushed
+    for window_days, bonus in ((30, 18.0), (90, 10.0), (180, 5.0)):
+        if age <= timedelta(days=window_days):
+            return bonus
+    return 0.0
+
+
+def score_repo(repo: Dict, matches: List[Dict], blacklist: List[str]) -> Tuple[float, str]:
+    """Score a repository as a PoC candidate and derive its confidence tier.
+
+    The score starts at 12 and is adjusted by: matches in non-documentation
+    paths (bonus), documentation-only matches (penalty), push recency, stars
+    and forks (both capped), positive keywords in the description and topics,
+    negative keywords in the description when no non-doc match exists,
+    fork/archived status, a blacklist hit, and a per-match contribution from
+    ``match_score``.
+
+    Args:
+        repo: Repository metadata; both camelCase (``stargazerCount``,
+            ``nameWithOwner``, ``pushedAt``) and snake_case fallbacks are read.
+        matches: Match dicts, each read for ``path`` and ``match_type``.
+        blacklist: Name patterns compared case-insensitively against the
+            repository name; a trailing ``*`` means prefix match, anything
+            else is a substring match.
+
+    Returns:
+        ``(score, tier)`` where the score is clamped to 0..100.
+    """
+    stars = repo.get("stargazerCount") or repo.get("stars") or 0
+    forks = repo.get("forkCount") or repo.get("forks") or 0
+    is_fork = bool(repo.get("isFork"))
+    archived = bool(repo.get("isArchived"))
+    # ``topics`` can be present but null; treat that as "no topics" instead of
+    # failing, in line with the ``or`` fallbacks used for the other fields.
+    topics = [t.lower() for t in (repo.get("topics") or []) if t]
+    name = str(repo.get("nameWithOwner") or repo.get("repo_full_name") or "").lower()
+    description = str(repo.get("description") or "").lower()
+
+    non_doc_matches = [m for m in matches if not is_doc_path(str(m.get("path", "")))]
+    doc_matches = [m for m in matches if is_doc_path(str(m.get("path", "")))]
+
+    score = 12.0
+    if non_doc_matches:
+        score += 25 + min(len(non_doc_matches) * 2, 10)
+    if doc_matches and not non_doc_matches:
+        # The CVE is only mentioned in documentation: likely not a working PoC.
+        score -= 20
+
+    score += recency_bonus(repo.get("pushed_at") or repo.get("pushedAt") or repo.get("updated_at"))
+
+    score += min(stars / 50.0, 25.0)
+    score += min(forks / 200.0, 5.0)
+
+    score += keyword_hits(description, POSITIVE_KEYWORDS) * 4.0
+    score += keyword_hits(" ".join(topics), POSITIVE_KEYWORDS) * 4.0
+
+    negative_bias = keyword_hits(description, NEGATIVE_KEYWORDS)
+    if negative_bias and not non_doc_matches:
+        score -= 15
+
+    if is_fork:
+        score -= 12
+    if archived:
+        score -= 30
+
+    # The blacklist penalty is applied at most once, on the first matching entry.
+    for forbidden in (entry.lower() for entry in blacklist):
+        if not forbidden:
+            continue
+        if forbidden.endswith("*"):
+            prefix = forbidden[:-1]
+            if prefix and name.startswith(prefix):
+                score -= 40
+                break
+        elif forbidden in name:
+            score -= 40
+            break
+
+    for match in matches:
+        score += match_score(match) / 25.0
+
+    return clamp(score, 0, 100), tier_for_score(score)
@@ -0,0 +1,99 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Dict, List
+
+from jinja2 import Environment, FileSystemLoader, select_autoescape
+
+from utils import DOCS_DIR, TEMPLATES_DIR, ensure_dirs
+
+
+def build_env() -> Environment:
+    """Create the Jinja2 environment that loads templates from ``TEMPLATES_DIR``.
+
+    Autoescaping is enabled for ``html`` and ``xml`` templates, and both
+    ``trim_blocks`` and ``lstrip_blocks`` are switched on.
+    """
+    return Environment(
+        loader=FileSystemLoader(str(TEMPLATES_DIR)),
+        autoescape=select_autoescape(["html", "xml"]),
+        trim_blocks=True,
+        lstrip_blocks=True,
+    )
+
+
+class SiteRenderer:
+    """Render the pipeline's static HTML pages into ``DOCS_DIR`` from prepared payloads."""
+
+    def __init__(
+        self,
+        *,
+        results: List[Dict],
+        index_payload: Dict,
+        top_payload: Dict,
+        diff_payload: Dict | None = None,
+    ) -> None:
+        """Store the payloads, derive ``visible_pocs`` per result and create output directories.
+
+        Args:
+            results: Per-CVE result dicts; each may carry a ``pocs`` list.
+            index_payload: Index data; ``generated`` and ``items`` are read by ``build``.
+            top_payload: Ranking data; ``items`` is read by ``build``.
+            diff_payload: Optional diff data; ``None`` is stored as an empty dict.
+        """
+        self.results = []
+        for result in results:
+            pocs = result.get("pocs", [])
+            visible = [p for p in pocs if p.get("confidence_tier") in {"high", "medium"}]
+            # When nothing reaches medium confidence, show every PoC instead of none.
+            self.results.append({**result, "visible_pocs": visible or pocs})
+        self.index_payload = index_payload
+        self.top_payload = top_payload
+        self.diff_payload = diff_payload or {}
+        self.env = build_env()
+        ensure_dirs(
+            DOCS_DIR,
+            DOCS_DIR / "pocs",
+            DOCS_DIR / "cve",
+            DOCS_DIR / "diffs",
+            DOCS_DIR / "assets",
+        )
+
+    def render(self, template_name: str, context: Dict, target: Path) -> None:
+        """Render *template_name* with *context* and write the HTML to *target* as UTF-8.
+
+        Missing parent directories of *target* are created.
+        """
+        html = self.env.get_template(template_name).render(**context)
+        target.parent.mkdir(parents=True, exist_ok=True)
+        target.write_text(html, encoding="utf-8")
+
+    def build(self) -> None:
+        """Write the landing page, the PoC listing, one page per CVE and the diff pages."""
+        generated = self.index_payload.get("generated")
+        items = self.index_payload.get("items", [])
+        top_items = self.top_payload.get("items", [])
+
+        def total(field: str) -> int:
+            # ``or 0`` tolerates counts that are stored as null in the index.
+            return sum(item.get(field, 0) or 0 for item in items)
+
+        summary = {
+            "generated": generated,
+            "total_cves": len(items),
+            "total_pocs": total("poc_count"),
+            "high_total": total("high_confidence"),
+            "medium_total": total("medium_confidence"),
+        }
+        self.render(
+            "pipeline_index.html",
+            {
+                "summary": summary,
+                "top": top_items[:25],
+                "diff": self.diff_payload,
+            },
+            DOCS_DIR / "index.html",
+        )
+
+        self.render(
+            "pipeline_pocs.html",
+            {
+                "generated": generated,
+                "index": items,
+                "top": top_items[:100],
+            },
+            DOCS_DIR / "pocs" / "index.html",
+        )
+
+        for result in self.results:
+            self.render(
+                "pipeline_cve.html",
+                {"cve": result, "generated": generated},
+                DOCS_DIR / "cve" / f"{result['cve_id']}.html",
+            )
+
+        if self.diff_payload:
+            diff_context = {"diff": self.diff_payload, "generated": generated}
+            self.render("pipeline_diff.html", diff_context, DOCS_DIR / "diffs" / "index.html")
+            # A dated copy is written next to the index when the diff carries its own date.
+            diff_date = self.diff_payload.get("generated")
+            if diff_date:
+                self.render("pipeline_diff.html", diff_context, DOCS_DIR / "diffs" / f"{diff_date}.html")
@@ -13,8 +13,13 @@ DATA_DIR = ROOT / "data"
 DOCS_DIR = ROOT / "docs"
 API_DIR = DOCS_DIR / "api" / "v1"
 SNAPSHOT_DIR = API_DIR / "snapshots"
+DIFFS_DIR = API_DIR / "diffs"
+TOP_DIR = API_DIR / "top"
 TEMPLATES_DIR = ROOT / "templates"
 ASSETS_DIR = DOCS_DIR / "assets"
+CACHE_DIR = DATA_DIR / "cache"
+STATE_DIR = DATA_DIR / "state"
+EVIDENCE_DIR = DATA_DIR / "evidence"


 def ensure_dirs(*paths: Path) -> None:
@@ -45,6 +50,21 @@ def today_str() -> str:
    return datetime.now(timezone.utc).date().isoformat()


+def now_utc() -> datetime:
+    """Return the current time as a timezone-aware UTC ``datetime``."""
+    return datetime.now(tz=timezone.utc)
+
+
+def isoformat(dt: datetime | None = None) -> str:
+    """Return *dt* as an ISO-8601 string, using the current UTC time when *dt* is omitted."""
+    moment = dt or now_utc()
+    return moment.isoformat()
+
+
+def parse_date(value: str) -> datetime | None:
+    """Parse an ISO-8601 timestamp, returning ``None`` when it cannot be parsed.
+
+    ``Z`` is rewritten to ``+00:00`` before parsing because
+    ``datetime.fromisoformat`` only accepts the ``Z`` designator from
+    Python 3.11 on. Surrounding whitespace is ignored.
+
+    Args:
+        value: Timestamp text; any non-string value (for example ``None``)
+            is treated as unparseable rather than raising.
+
+    Returns:
+        The parsed ``datetime`` (timezone-aware only if the input carried an
+        offset), or ``None``.
+    """
+    if not isinstance(value, str):
+        return None
+    try:
+        return datetime.fromisoformat(value.strip().replace("Z", "+00:00"))
+    except ValueError:
+        return None
+
+
 def slugify(text: str) -> str:
    cleaned = re.sub(r"[^A-Za-z0-9]+", "-", text.strip().lower())
    cleaned = cleaned.strip("-")
@@ -79,6 +99,7 @@ CVE_SECTION_RE = re.compile(r"^CVE-\d{4}-\d{4,}$", re.IGNORECASE)
 def load_poc_index() -> Dict[str, Dict[str, object]]:
    """Load CVE → {desc, poc} mapping from docs/CVE_list.json or markdown files."""
    cve_json = DOCS_DIR / "CVE_list.json"
+    blacklist = load_blacklist()
    if cve_json.exists():
        data = load_json(cve_json, default=[]) or []
        mapping = {}
@@ -86,32 +107,35 @@ def load_poc_index() -> Dict[str, Dict[str, object]]:
            cve = str(entry.get("cve", "")).upper()
            if not is_valid_cve(cve):
                continue
+            poc_links = stable_unique(entry.get("poc", []) or [])
+            poc_links = filter_links_by_blacklist(poc_links, blacklist)
            mapping[cve] = {
                "desc": entry.get("desc", ""),
-                "poc": stable_unique(entry.get("poc", []) or []),
+                "poc": poc_links,
            }
        return mapping

-    return build_poc_index_from_markdown()
+    return build_poc_index_from_markdown(blacklist=blacklist)


-def build_poc_index_from_markdown() -> Dict[str, Dict[str, object]]:
+def build_poc_index_from_markdown(*, blacklist: Optional[List[str]] = None) -> Dict[str, Dict[str, object]]:
    mapping: Dict[str, Dict[str, object]] = {}
    for md_path in sorted(ROOT.glob("[12][0-9][0-9][0-9]/CVE-*.md")):
        cve = md_path.stem.upper()
        if not is_valid_cve(cve):
            continue
-        desc, poc_links = parse_cve_markdown(md_path)
+        desc, poc_links = parse_cve_markdown(md_path, blacklist=blacklist)
        mapping[cve] = {"desc": desc, "poc": poc_links}
    return mapping


-def parse_cve_markdown(path: Path) -> Tuple[str, List[str]]:
+def parse_cve_markdown(path: Path, *, blacklist: Optional[List[str]] = None) -> Tuple[str, List[str]]:
    text = path.read_text(encoding="utf-8")
    sections = parse_sections(text)
    description = normalise_block(sections.get("### Description", ""))
-    references = collect_links(sections.get("#### Reference", ""))
-    github_links = collect_links(sections.get("#### Github", ""))
+    blacklist = blacklist or []
+    references = collect_links(sections.get("#### Reference", ""), blacklist=blacklist)
+    github_links = collect_links(sections.get("#### Github", ""), blacklist=blacklist)
    poc_links = stable_unique([*references, *github_links])
    return description, poc_links

@@ -144,7 +168,7 @@ def parse_sections(content: str) -> Dict[str, str]:
    return sections


-def collect_links(block: str) -> List[str]:
+def collect_links(block: str, *, blacklist: Optional[List[str]] = None) -> List[str]:
    links: List[str] = []
    for raw in block.splitlines():
        entry = raw.strip()
@@ -154,7 +178,7 @@ def collect_links(block: str) -> List[str]:
            entry = entry[2:].strip()
        if entry and entry not in links:
            links.append(entry)
-    return links
+    return filter_links_by_blacklist(links, blacklist or [])


 def is_valid_cve(cve_id: str) -> bool:
@@ -165,6 +189,15 @@ def is_valid_cve(cve_id: str) -> bool:
    return year.isdigit() and parts[2].isdigit()


+def cve_year(cve_id: str) -> int | None:
+    """Return the year component of a CVE ID, or ``None`` when the ID is not valid.
+
+    Validity is decided by ``is_valid_cve``; the year is the second
+    ``-``-separated field.
+    """
+    if is_valid_cve(cve_id):
+        try:
+            return int(cve_id.split("-")[1])
+        except (TypeError, ValueError):
+            pass
+    return None
+
+
 # --- Trending PoCs -------------------------------------------------------

 TREND_ROW_RE = re.compile(r"^\|\s*(?P<stars>\d+)\s*⭐\s*\|\s*(?P<updated>[^|]+)\|\s*\[(?P<name>[^\]]+)\]\((?P<url>[^)]+)\)\s*\|\s*(?P<desc>.*)\|$")
@@ -199,3 +232,87 @@ def read_text(path: Path) -> str:
 def write_text(path: Path, content: str) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(content, encoding="utf-8")
+
+
+# --- New helpers for PoC discovery -------------------------------------------------
+
+
+def clamp(value: float, minimum: float = 0, maximum: float = 100) -> float:
+    """Limit *value* to the range ``minimum``..``maximum`` (upper bound applied first)."""
+    capped = value if value < maximum else maximum
+    return capped if capped > minimum else minimum
+
+
+def chunked(iterable: Iterable, size: int) -> Iterable[List]:
+    """Yield consecutive lists of up to *size* items taken from *iterable*.
+
+    Only the final list may be shorter; nothing is yielded for an empty input.
+    """
+    batch: List = []
+    for element in iterable:
+        batch.append(element)
+        if len(batch) < size:
+            continue
+        yield batch
+        batch = []
+    if batch:
+        yield batch
+
+
+def hash_key(text: str) -> str:
+    """Return the hexadecimal SHA-256 digest of *text* encoded as UTF-8."""
+    from hashlib import sha256
+
+    encoded = text.encode("utf-8")
+    return sha256(encoded).hexdigest()
+
+
+def load_blacklist(path: Path | None = None) -> List[str]:
+    """Read blacklist entries, one per line, from *path* or ``ROOT / "blacklist.txt"``.
+
+    Lines are stripped; blank lines and lines starting with ``#`` are
+    skipped. A missing file yields an empty list.
+    """
+    target = path or ROOT / "blacklist.txt"
+    if not target.exists():
+        return []
+    stripped = (raw.strip() for raw in target.read_text(encoding="utf-8").splitlines())
+    return [line for line in stripped if line and not line.startswith("#")]
+
+
+def extract_repo_from_url(url: str) -> str:
+    """Return the lower-cased repository name segment of a GitHub URL (best effort).
+
+    URLs whose host does not contain ``github`` yield ``""``. For
+    ``owner/repo[/...]`` paths the second segment is returned; a path with a
+    single segment returns that segment. A trailing ``.git`` (clone URL) is
+    dropped so the result can be compared with plain repository names.
+
+    Args:
+        url: Full URL or bare ``owner/repo`` path.
+
+    Returns:
+        The repository name, or ``""`` when none can be determined.
+    """
+    from urllib.parse import urlparse
+
+    try:
+        parsed = urlparse(url)
+    except ValueError:
+        # urlparse rejects some malformed netlocs; fall back to treating the
+        # whole string as a path.
+        path = url
+    else:
+        host = (parsed.netloc or "").lower()
+        if host and "github" not in host:
+            return ""
+        path = parsed.path or url
+    parts = path.strip("/").split("/")
+    # str.split always returns at least one element, so parts[-1] is safe.
+    repo = (parts[1] if len(parts) >= 2 else parts[-1]).lower()
+    if repo.endswith(".git"):
+        repo = repo[: -len(".git")]
+    return repo
+
+
+def is_blacklisted_repo(url: str, blacklist: List[str]) -> bool:
+    """Tell whether the repository named in *url* matches any blacklist entry.
+
+    Entries are stripped and lower-cased. An entry ending in ``*`` matches
+    repository names starting with the text before the ``*``; any other
+    entry must equal the repository name. URLs from which no repository
+    name can be extracted are never blacklisted.
+    """
+    repo = extract_repo_from_url(url)
+    if not repo:
+        return False
+    for entry in blacklist:
+        pattern = entry.strip().lower()
+        if not pattern:
+            continue
+        if pattern.endswith("*"):
+            stem = pattern[:-1]
+            matched = bool(stem) and repo.startswith(stem)
+        else:
+            matched = repo == pattern
+        if matched:
+            return True
+    return False
+
+
+def filter_links_by_blacklist(links: List[str], blacklist: List[str]) -> List[str]:
+    """Return *links* without those pointing at blacklisted repositories.
+
+    With an empty blacklist the input list itself is returned unchanged;
+    otherwise a new list is built, preserving order.
+    """
+    if not blacklist:
+        return links
+    return [link for link in links if not is_blacklisted_repo(link, blacklist)]