Add PoC pipeline with blacklist filtering and Pages build

2026-05-24 15:54:10 +02:00 · 2025-12-17 15:53:37 +01:00
parent b1085c10f5
commit 1f0cd8e78b
20 changed files with 188921 additions and 56 deletions
@@ -13,8 +13,13 @@ DATA_DIR = ROOT / "data"
 DOCS_DIR = ROOT / "docs"
 API_DIR = DOCS_DIR / "api" / "v1"
 SNAPSHOT_DIR = API_DIR / "snapshots"
+DIFFS_DIR = API_DIR / "diffs"
+TOP_DIR = API_DIR / "top"
 TEMPLATES_DIR = ROOT / "templates"
 ASSETS_DIR = DOCS_DIR / "assets"
+CACHE_DIR = DATA_DIR / "cache"
+STATE_DIR = DATA_DIR / "state"
+EVIDENCE_DIR = DATA_DIR / "evidence"


 def ensure_dirs(*paths: Path) -> None:
@@ -45,6 +50,21 @@ def today_str() -> str:
    return datetime.now(timezone.utc).date().isoformat()


+def now_utc() -> datetime:
+    """Return the current moment as a timezone-aware ``datetime`` in UTC."""
+    return datetime.now(tz=timezone.utc)
+
+
+def isoformat(dt: datetime | None = None) -> str:
+    """Render ``dt`` as an ISO 8601 string.
+
+    When ``dt`` is omitted (or otherwise falsy) the value of ``now_utc()``
+    is formatted instead.
+    """
+    moment = dt if dt else now_utc()
+    return moment.isoformat()
+
+
+def parse_date(value: str) -> datetime | None:
+    """Parse an ISO 8601 timestamp, returning ``None`` when it cannot be parsed.
+
+    ``Z`` is rewritten to ``+00:00`` because ``datetime.fromisoformat`` only
+    accepts the ``Z`` designator natively on newer Python versions.
+    Surrounding whitespace is ignored. The result is timezone-aware only
+    when the input carries an offset.
+
+    Non-string input (for example ``None``) yields ``None`` rather than
+    raising ``AttributeError``, matching how unparsable strings are handled.
+    """
+    if not isinstance(value, str):
+        return None
+    try:
+        return datetime.fromisoformat(value.strip().replace("Z", "+00:00"))
+    except ValueError:
+        return None
+
+
 def slugify(text: str) -> str:
    cleaned = re.sub(r"[^A-Za-z0-9]+", "-", text.strip().lower())
    cleaned = cleaned.strip("-")
@@ -79,6 +99,7 @@ CVE_SECTION_RE = re.compile(r"^CVE-\d{4}-\d{4,}$", re.IGNORECASE)
 def load_poc_index() -> Dict[str, Dict[str, object]]:
    """Load CVE → {desc, poc} mapping from docs/CVE_list.json or markdown files."""
    cve_json = DOCS_DIR / "CVE_list.json"
+    blacklist = load_blacklist()
    if cve_json.exists():
        data = load_json(cve_json, default=[]) or []
        mapping = {}
@@ -86,32 +107,35 @@ def load_poc_index() -> Dict[str, Dict[str, object]]:
            cve = str(entry.get("cve", "")).upper()
            if not is_valid_cve(cve):
                continue
+            poc_links = stable_unique(entry.get("poc", []) or [])
+            poc_links = filter_links_by_blacklist(poc_links, blacklist)
            mapping[cve] = {
                "desc": entry.get("desc", ""),
-                "poc": stable_unique(entry.get("poc", []) or []),
+                "poc": poc_links,
            }
        return mapping

-    return build_poc_index_from_markdown()
+    return build_poc_index_from_markdown(blacklist=blacklist)


-def build_poc_index_from_markdown() -> Dict[str, Dict[str, object]]:
+def build_poc_index_from_markdown(*, blacklist: Optional[List[str]] = None) -> Dict[str, Dict[str, object]]:
    mapping: Dict[str, Dict[str, object]] = {}
    for md_path in sorted(ROOT.glob("[12][0-9][0-9][0-9]/CVE-*.md")):
        cve = md_path.stem.upper()
        if not is_valid_cve(cve):
            continue
-        desc, poc_links = parse_cve_markdown(md_path)
+        desc, poc_links = parse_cve_markdown(md_path, blacklist=blacklist)
        mapping[cve] = {"desc": desc, "poc": poc_links}
    return mapping


-def parse_cve_markdown(path: Path) -> Tuple[str, List[str]]:
+def parse_cve_markdown(path: Path, *, blacklist: Optional[List[str]] = None) -> Tuple[str, List[str]]:
    text = path.read_text(encoding="utf-8")
    sections = parse_sections(text)
    description = normalise_block(sections.get("### Description", ""))
-    references = collect_links(sections.get("#### Reference", ""))
-    github_links = collect_links(sections.get("#### Github", ""))
+    blacklist = blacklist or []
+    references = collect_links(sections.get("#### Reference", ""), blacklist=blacklist)
+    github_links = collect_links(sections.get("#### Github", ""), blacklist=blacklist)
    poc_links = stable_unique([*references, *github_links])
    return description, poc_links

@@ -144,7 +168,7 @@ def parse_sections(content: str) -> Dict[str, str]:
    return sections


-def collect_links(block: str) -> List[str]:
+def collect_links(block: str, *, blacklist: Optional[List[str]] = None) -> List[str]:
    links: List[str] = []
    for raw in block.splitlines():
        entry = raw.strip()
@@ -154,7 +178,7 @@ def collect_links(block: str) -> List[str]:
            entry = entry[2:].strip()
        if entry and entry not in links:
            links.append(entry)
-    return links
+    return filter_links_by_blacklist(links, blacklist or [])


 def is_valid_cve(cve_id: str) -> bool:
@@ -165,6 +189,15 @@ def is_valid_cve(cve_id: str) -> bool:
    return year.isdigit() and parts[2].isdigit()


+def cve_year(cve_id: str) -> int | None:
+    """Return the year field of a CVE identifier as an ``int``.
+
+    ``None`` is returned when ``is_valid_cve`` rejects the identifier or when
+    the second dash-separated field cannot be converted to an integer.
+    """
+    if is_valid_cve(cve_id):
+        try:
+            year_field = cve_id.split("-")[1]
+            return int(year_field)
+        except (TypeError, ValueError):
+            # NOTE(review): presumably a safety net for fields that pass the
+            # validator's digit check but that int() still rejects — confirm.
+            return None
+    return None
+
+
 # --- Trending PoCs -------------------------------------------------------

 TREND_ROW_RE = re.compile(r"^\|\s*(?P<stars>\d+)\s*⭐\s*\|\s*(?P<updated>[^|]+)\|\s*\[(?P<name>[^\]]+)\]\((?P<url>[^)]+)\)\s*\|\s*(?P<desc>.*)\|$")
@@ -199,3 +232,87 @@ def read_text(path: Path) -> str:
 def write_text(path: Path, content: str) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(content, encoding="utf-8")
+
+
+# --- New helpers for PoC discovery -------------------------------------------------
+
+
+def clamp(value: float, minimum: float = 0, maximum: float = 100) -> float:
+    """Limit ``value`` to the closed range ``[minimum, maximum]``.
+
+    The upper bound is applied first and the lower bound second, so the
+    lower bound wins if the two bounds are given in the wrong order.
+    """
+    capped = value if value < maximum else maximum
+    return capped if capped > minimum else minimum
+
+
+def chunked(iterable: Iterable, size: int) -> Iterable[List]:
+    """Yield consecutive lists of up to ``size`` items taken from ``iterable``.
+
+    Every chunk holds exactly ``size`` items except possibly the last, which
+    holds the remainder. An empty input yields nothing. A ``size`` below 1
+    results in single-item chunks, since a chunk is emitted as soon as its
+    length reaches ``size``.
+    """
+    pending: List = []
+    for element in iterable:
+        pending.append(element)
+        if len(pending) < size:
+            continue
+        yield pending
+        # Start a fresh list so chunks already handed out are never mutated.
+        pending = []
+    if pending:
+        yield pending
+
+
+def hash_key(text: str) -> str:
+    """Return the hexadecimal SHA-256 digest of ``text`` encoded as UTF-8."""
+    import hashlib
+
+    digest = hashlib.sha256()
+    digest.update(text.encode("utf-8"))
+    return digest.hexdigest()
+
+
+def load_blacklist(path: Path | None = None) -> List[str]:
+    """Read blacklist entries from a text file, one entry per line.
+
+    ``path`` defaults to ``blacklist.txt`` under ``ROOT``. Lines are stripped
+    of surrounding whitespace; blank lines and lines starting with ``#`` are
+    dropped. A missing file yields an empty list.
+    """
+    target = path if path else ROOT / "blacklist.txt"
+    if not target.exists():
+        return []
+    stripped_lines = (
+        raw.strip() for raw in target.read_text(encoding="utf-8").splitlines()
+    )
+    return [line for line in stripped_lines if line and not line.startswith("#")]
+
+
+def extract_repo_from_url(url: str) -> str:
+    """Return the lower-cased repository name segment from a URL (best effort).
+
+    For a URL whose host contains ``github`` (or an input with no host at
+    all, such as ``owner/repo``), the second path segment is returned; when
+    the path has a single segment, that segment is returned. URLs on any
+    other host yield ``""``. A trailing ``.git`` clone suffix is removed so
+    that ``.../owner/repo.git`` and ``.../owner/repo`` resolve to the same
+    name.
+    """
+    from urllib.parse import urlparse
+
+    try:
+        parsed = urlparse(url)
+    except ValueError:
+        # urlparse rejects malformed netlocs (e.g. unbalanced IPv6 brackets);
+        # stay best-effort and treat the raw string as a path instead.
+        path = url
+    else:
+        host = (parsed.netloc or "").lower()
+        if host and "github" not in host:
+            return ""
+        path = parsed.path or url
+    parts = path.strip("/").split("/")
+    # str.split always returns at least one element, so parts[-1] is safe.
+    name = (parts[1] if len(parts) >= 2 else parts[-1]).lower()
+    if name.endswith(".git"):
+        name = name[: -len(".git")]
+    return name
+
+
+def is_blacklisted_repo(url: str, blacklist: List[str]) -> bool:
+    """Report whether the repository named in ``url`` matches the blacklist.
+
+    The repository name comes from ``extract_repo_from_url``; when that
+    yields nothing, the URL is never considered blacklisted. Entries are
+    compared after stripping and lower-casing. An entry ending in ``*`` is a
+    prefix pattern; any other entry must equal the repository name exactly.
+    Blank entries and a bare ``*`` match nothing.
+    """
+    repo = extract_repo_from_url(url)
+    if not repo:
+        return False
+    for raw_entry in blacklist:
+        pattern = raw_entry.strip().lower()
+        if not pattern:
+            continue
+        if not pattern.endswith("*"):
+            if pattern == repo:
+                return True
+            continue
+        stem = pattern[:-1]
+        # An empty stem (entry was just "*") must not match everything.
+        if stem and repo.startswith(stem):
+            return True
+    return False
+
+
+def filter_links_by_blacklist(links: List[str], blacklist: List[str]) -> List[str]:
+    """Drop every link whose repository is blacklisted, preserving order.
+
+    With an empty blacklist the input list itself is returned unchanged (not
+    a copy); otherwise a new list is built from the links for which
+    ``is_blacklisted_repo`` is false.
+    """
+    if blacklist:
+        return [link for link in links if not is_blacklisted_repo(link, blacklist)]
+    return links