From a5c70977f15aeba46a30d88d23107e97267a67d3 Mon Sep 17 00:00:00 2001
From: Garry Tan
Date: Mon, 23 Mar 2026 05:37:59 -0700
Subject: [PATCH] =?UTF-8?q?feat:=20parallelize=20CI=20evals=20=E2=80=94=20?=
 =?UTF-8?q?12=20runners=20(1=20per=20suite)=20for=20~3min=20wall=20clock?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Switch eval workflow to use Docker container image with pre-baked
toolchain. Each of 12 matrix runners pulls the image, hardlinks cached
node_modules, builds browse, and runs one test suite.

Setup drops from ~70s to ~19s per runner. Wall clock is dominated by
the slowest individual test, not sequential sum.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
Note: layout reconstructed and the "Restore deps" step hand-edited after
format-patch (hardlink failure now falls back to `bun install`). Hunk
offsets and the diffstat were recomputed; the `index` blob hash below was
not regenerated.

 .github/workflows/evals.yml | 40 ++++++++++++++++++++++++++--------------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml
index 6fa54a6c..69fac93a 100644
--- a/.github/workflows/evals.yml
+++ b/.github/workflows/evals.yml
@@ -10,6 +10,11 @@ concurrency:
 jobs:
   evals:
     runs-on: ubicloud-standard-2
+    container:
+      image: ghcr.io/${{ github.repository }}/ci:latest
+      credentials:
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
     timeout-minutes: 20
     strategy:
       fail-fast: false
@@ -44,22 +49,24 @@ jobs:
         with:
           fetch-depth: 0
 
-      - uses: oven-sh/setup-bun@v2
-
-      - name: Cache bun dependencies
-        uses: actions/cache@v4
-        with:
-          path: ~/.bun/install/cache
-          key: bun-${{ hashFiles('bun.lockb') }}
-          restore-keys: bun-
-
-      - run: bun install
+      # Restore pre-installed node_modules from Docker image (~1s vs ~15s install).
+      # Fall back to a fresh install when the manifest differs from the snapshot
+      # baked into the image, or when hardlinking fails (hardlinks cannot cross
+      # filesystems, and the workspace may be mounted separately from /opt).
+      # NOTE(review): the snapshot is named .package-lock.json but is compared
+      # against package.json - confirm the image stores a package.json copy there.
+      - name: Restore deps
+        run: |
+          if diff -q /opt/node_modules_cache/.package-lock.json package.json >/dev/null 2>&1 \
+            && cp -al /opt/node_modules_cache node_modules 2>/dev/null; then
+            echo "Restored node_modules from image cache"
+          else
+            rm -rf node_modules
+            bun install
+          fi
 
       - run: bun run build
 
-      - name: Install Claude CLI
-        run: npm i -g @anthropic-ai/claude-code
-
       - name: Run ${{ matrix.suite.name }}
         env:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
@@ -78,6 +85,11 @@ jobs:
 
   report:
     runs-on: ubicloud-standard-2
+    container:
+      image: ghcr.io/${{ github.repository }}/ci:latest
+      credentials:
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
     needs: evals
     if: always() && github.event_name == 'pull_request'
     timeout-minutes: 5
@@ -129,7 +141,7 @@ jobs:
           $(echo -e "$SUITE_LINES")
 
           ---
-          *12x ubicloud-standard-2 ($0.0008/min each) | Wall clock ≈ slowest suite*"
+          *12x ubicloud-standard-2 (Docker: pre-baked toolchain + deps) | wall clock ≈ slowest suite*"
 
           if [ "$FAILED" -gt 0 ]; then
             FAILURES=""