Restore README trending tables and auto-build site

2026-05-23 23:14:03 +02:00 · 2025-12-17 21:39:16 +01:00
parent 0afef5597e
commit 5cb3a28aeb
4 changed files with 173 additions and 43 deletions
@@ -1,11 +1,12 @@
 #!/usr/bin/env python3
 """Regenerate the Trending PoCs tables in README.md.

- Consider the latest 4 years (current year and previous 3).
- Require repository name to contain a CVE for that year (e.g., CVE-2025-1234).
- Require a non-empty description (we only want actual PoCs, not empty shells).
- Restrict to repositories updated in the last 4 days.
- Sort by most recently updated, then stars, and emit up to 20 rows per year.
+Goals (matching the legacy README that worked well):
+- Cover the current year plus the previous three.
+- Keep the familiar heading “Latest 20 of N Repositories”.
+- Only show repos updated in the last WINDOW_DAYS.
+- Require a CVE-shaped repo name for that year and a non-empty description.
+- Sort newest first, then by stars, and cap at MAX_ROWS per year.
 """

 from __future__ import annotations
@@ -21,6 +22,7 @@ import requests
 WINDOW_DAYS = 4
 MAX_ROWS = 20
 YEARS_BACK = 4
+MIN_STARS = 0  # exclusive bound used as "stars:>MIN_STARS" in the search query; keep low to capture fresh repos


 class Repo(TypedDict):
@@ -53,34 +55,69 @@ def time_ago(updated_at: str, now: datetime) -> str:
    return "just now"


-def fetch_trending(year: int, cutoff: datetime) -> List[Repo]:
-    query = f"CVE-{year} in:name stars:>2 pushed:>={cutoff.date().isoformat()} archived:false"
+def _search_total(year: int) -> int:
+    """Return the search API's total_count for the CVE-<year> name query, with no push-date filter (used for the table heading)."""
+    stars_clause = f"stars:>{MIN_STARS}" if MIN_STARS >= 0 else "stars:>0"
+    query = f"CVE-{year} in:name {stars_clause} archived:false"
    url = "https://api.github.com/search/repositories"
-    params = {
-        "q": query,
-        "sort": "updated",
-        "order": "desc",
-        "per_page": 100,
-        "page": 1,
-    }
-    resp = requests.get(url, params=params, headers=github_headers(), timeout=30)
+    resp = requests.get(
+        url, params={"q": query, "per_page": 1}, headers=github_headers(), timeout=30
+    )
    resp.raise_for_status()
-    items: Iterable[Repo] = resp.json().get("items", [])
+    return int(resp.json().get("total_count", 0))
+
+
+def fetch_trending(year: int, cutoff: datetime) -> tuple[List[Repo], int]:
+    """Fetch and filter trending repos for a year, returning rows and total_count."""
+    stars_clause = f"stars:>{MIN_STARS}" if MIN_STARS >= 0 else "stars:>0"
+    query = f"CVE-{year} in:name {stars_clause} archived:false pushed:>={cutoff.date().isoformat()}"
+    url = "https://api.github.com/search/repositories"
+    total_count = _search_total(year)
    pattern = re.compile(rf"cve-{year}-\d+", re.IGNORECASE)
    filtered: List[Repo] = []
-    for item in items:
-        name = item.get("name", "")
-        updated_at = item.get("updated_at")
-        description = (item.get("description") or "").strip()
-        if not updated_at or not pattern.search(name or "") or not description:
-            continue
-        updated_dt = datetime.strptime(updated_at, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
-        if updated_dt < cutoff:
-            continue
-        filtered.append(item)
+    seen_urls: set[str] = set()
+
+    # Request result pages until MAX_ROWS fresh repos are collected; range(1, 2) currently limits this to page 1.
+    for page in range(1, 2):
+        params = {
+            "q": query,
+            "sort": "updated",
+            "order": "desc",
+            "per_page": 100,
+            "page": page,
+        }
+        resp = requests.get(url, params=params, headers=github_headers(), timeout=30)
+        resp.raise_for_status()
+        items: Iterable[Repo] = resp.json().get("items", [])
+        if not items:
+            break
+        for item in items:
+            name = item.get("name", "")
+            updated_at = item.get("updated_at")
+            description = (item.get("description") or "").strip()
+            html_url = item.get("html_url")
+            if not updated_at or not html_url or not description:
+                continue
+            if not pattern.search(name or ""):
+                continue
+            updated_dt = datetime.strptime(updated_at, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
+            if updated_dt < cutoff:
+                continue
+            if html_url in seen_urls:
+                continue
+            seen_urls.add(html_url)
+            filtered.append(item)
+        if len(filtered) >= MAX_ROWS:
+            break
+
    # Already sorted by updated desc; break ties by stars
-    filtered.sort(key=lambda r: (-datetime.strptime(r["updated_at"], "%Y-%m-%dT%H:%M:%SZ").timestamp(), -int(r.get("stargazers_count", 0))))
-    return filtered[:MAX_ROWS]
+    filtered.sort(
+        key=lambda r: (
+            -datetime.strptime(r["updated_at"], "%Y-%m-%dT%H:%M:%SZ").timestamp(),
+            -int(r.get("stargazers_count", 0)),
+        )
+    )
+    return filtered[:MAX_ROWS], total_count


 def build_rows(repos: List[Repo], now: datetime) -> List[str]:
@@ -94,16 +131,16 @@ def build_rows(repos: List[Repo], now: datetime) -> List[str]:


 def main() -> None:
-    current_year = datetime.now(timezone.utc).year
-    cutoff = datetime.now(timezone.utc) - timedelta(days=WINDOW_DAYS)
    now = datetime.now(timezone.utc)
+    current_year = now.year
+    cutoff = now - timedelta(days=WINDOW_DAYS)

    output: List[str] = ['<h1 align="center">Recently updated Proof-of-Concepts</h1>']

    for year in range(current_year, current_year - YEARS_BACK, -1):
-        repos = fetch_trending(year, cutoff)
+        repos, total = fetch_trending(year, cutoff)
        output.append(f"\n\n## {year}\n")
-        output.append(f"### Updated in the last {WINDOW_DAYS} days (up to {MAX_ROWS} repos)\n")
+        output.append(f"### Latest {MAX_ROWS} of {total} Repositories\n")
        output.append("| Stars | Updated | Name | Description |")
        output.append("| --- | --- | --- | --- |")
        if repos:
@@ -49,7 +49,7 @@ jobs:
          python scripts/fetch_epss.py

      - name: Build site
-        run: python scripts/build_site.py
+        run: python scripts/build_site.py --html-mode summary

      - name: Configure Pages
        uses: actions/configure-pages@v5