From 1d7e79f7c3b3a8bed43f2cf0547eb3ceb356b6ab Mon Sep 17 00:00:00 2001
From: Garry Tan <garrytan@gmail.com>
Date: Mon, 23 Mar 2026 05:25:13 -0700
Subject: [PATCH] =?UTF-8?q?feat:=20parallelize=20CI=20evals=20=E2=80=94=20?=
 =?UTF-8?q?12=20runners=20(1=20per=20suite)=20for=20~3min=20wall=20clock?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Matrix strategy spins up 12 ubicloud-standard-2 runners simultaneously,
one per test file. Separate report job aggregates all artifacts into a
single PR comment. Bun dependency cache cuts install from ~30s to ~3s.

Runner cost: ~$0.048 (from $0.024) — negligible vs $3-4 API costs.
Wall clock: ~3-4min (from ~8min).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .github/workflows/evals.yml | 95 +++++++++++++++++++++++--------------
 1 file changed, 60 insertions(+), 35 deletions(-)

diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml
index 84a29eee..6fa54a6c 100644
--- a/.github/workflows/evals.yml
+++ b/.github/workflows/evals.yml
@@ -10,7 +10,35 @@ concurrency:
 jobs:
   evals:
     runs-on: ubicloud-standard-2
-    timeout-minutes: 45
+    timeout-minutes: 20
+    strategy:
+      fail-fast: false
+      matrix:
+        suite:
+          - name: llm-judge
+            file: test/skill-llm-eval.test.ts
+          - name: e2e-browse
+            file: test/skill-e2e-browse.test.ts
+          - name: e2e-plan
+            file: test/skill-e2e-plan.test.ts
+          - name: e2e-deploy
+            file: test/skill-e2e-deploy.test.ts
+          - name: e2e-design
+            file: test/skill-e2e-design.test.ts
+          - name: e2e-qa-bugs
+            file: test/skill-e2e-qa-bugs.test.ts
+          - name: e2e-qa-workflow
+            file: test/skill-e2e-qa-workflow.test.ts
+          - name: e2e-review
+            file: test/skill-e2e-review.test.ts
+          - name: e2e-workflow
+            file: test/skill-e2e-workflow.test.ts
+          - name: e2e-routing
+            file: test/skill-routing-e2e.test.ts
+          - name: e2e-codex
+            file: test/codex-e2e.test.ts
+          - name: e2e-gemini
+            file: test/gemini-e2e.test.ts
     steps:
       - uses: actions/checkout@v4
         with:
@@ -18,61 +46,60 @@ jobs:
 
       - uses: oven-sh/setup-bun@v2
 
+      - name: Cache bun dependencies
+        uses: actions/cache@v4
+        with:
+          path: ~/.bun/install/cache
+          key: bun-${{ hashFiles('bun.lockb') }}
+          restore-keys: bun-
+
       - run: bun install
 
       - run: bun run build
 
-      - name: Verify browse binary
-        run: test -f browse/dist/browse || (echo "Browse binary missing after build" && exit 1)
-
       - name: Install Claude CLI
         run: npm i -g @anthropic-ai/claude-code
 
-      - name: Download previous eval baseline
-        uses: dawidd6/action-download-artifact@v6
-        with:
-          name: eval-results
-          branch: main
-          path: /tmp/eval-baseline
-          if_no_artifact_found: warn
-        continue-on-error: true
-
-      - name: Copy baseline for comparison
-        run: |
-          if [ -d /tmp/eval-baseline ]; then
-            mkdir -p ~/.gstack-dev/evals
-            cp /tmp/eval-baseline/*.json ~/.gstack-dev/evals/ 2>/dev/null || true
-          fi
-
-      - name: Run E2E evals
+      - name: Run ${{ matrix.suite.name }}
         env:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
           GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
           EVALS_CONCURRENCY: "40"
-        run: bun run test:evals
+        run: EVALS=1 bun test --retry 2 --concurrent --max-concurrency 40 ${{ matrix.suite.file }}
 
       - name: Upload eval results
         if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: eval-results
+          name: eval-${{ matrix.suite.name }}
           path: ~/.gstack-dev/evals/*.json
           retention-days: 90
 
+  report:
+    runs-on: ubicloud-standard-2
+    needs: evals
+    if: always() && github.event_name == 'pull_request'
+    timeout-minutes: 5
+    steps:
+      - name: Download all eval artifacts
+        uses: actions/download-artifact@v4
+        with:
+          pattern: eval-*
+          path: /tmp/eval-results
+          merge-multiple: true
+
       - name: Post PR comment
-        if: always() && github.event_name == 'pull_request'
         env:
           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
-          # Aggregate results across ALL eval suites (not just the latest file)
-          RESULTS=$(ls -t ~/.gstack-dev/evals/*.json 2>/dev/null | grep -v _partial)
+          RESULTS=$(find /tmp/eval-results -name '*.json' 2>/dev/null | sort)
           if [ -z "$RESULTS" ]; then
             echo "No eval results found"
             exit 0
           fi
 
-          TOTAL=0; PASSED=0; FAILED=0; COST=0
+          TOTAL=0; PASSED=0; FAILED=0; COST="0"
           SUITE_LINES=""
           for f in $RESULTS; do
             T=$(jq -r '.total_tests // 0' "$f")
@@ -95,11 +122,14 @@ jobs:
 
           BODY="## E2E Evals: ${STATUS}
 
-          **${PASSED}/${TOTAL}** tests passed | **\$${COST}** total cost
+          **${PASSED}/${TOTAL}** tests passed | **\$${COST}** total cost | **12 parallel runners**
 
           | Suite | Result | Status | Cost |
           |-------|--------|--------|------|
-          $(echo -e "$SUITE_LINES")"
+          $(echo -e "$SUITE_LINES")
+
+          ---
+          *12x ubicloud-standard-2 ($0.0008/min each) | Wall clock ≈ slowest suite*"
 
           if [ "$FAILED" -gt 0 ]; then
             FAILURES=""
@@ -115,12 +145,7 @@ jobs:
           $(echo -e "$FAILURES")"
           fi
 
-          BODY="${BODY}
-
-          ---
-          *Runner: ubicloud-standard-2 ($0.0008/min) | Concurrency: 40*"
-
-          # Update existing comment or create new one (prevents duplicates on re-runs)
+          # Update existing comment or create new one
           COMMENT_ID=$(gh api repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments \
             --jq '.[] | select(.body | startswith("## E2E Evals")) | .id' | tail -1)