gstack/.github/workflows/evals.yml

# End-to-end eval suites: run for pull requests that target main, and on demand.
name: E2E Evals
on:
  pull_request:
    branches:
      - main
  # Manual trigger; no inputs are declared.
  workflow_dispatch:

# At most one eval run per PR branch: a newer push cancels the in-flight run.
# `github.head_ref` is only populated for pull_request events, so for
# workflow_dispatch fall back to the unique run id — otherwise every manual run
# would land in the shared group "evals-" and cancel the previous manual run.
concurrency:
  group: evals-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

# Workflow-wide environment, visible to every job and step.
env:
  # Registry path of the CI image (no tag); build-image appends the tag.
  IMAGE: "ghcr.io/${{ github.repository }}/ci"
  # Not referenced by name in this workflow — presumably read by the eval
  # harness to select the test tier; confirm against the test code.
  EVALS_TIER: "gate"

jobs:
  # Build the CI image with the pre-baked toolchain. The tag is a hash of the
  # Dockerfile, package.json and bun.lock, so the image is rebuilt only when one
  # of those inputs changes; otherwise the already-pushed image is reused.
  build-image:
    runs-on: ubicloud-standard-8
    permissions:
      contents: read
      packages: write
    outputs:
      # Fully qualified image reference consumed by the evals job container.
      image-tag: ${{ steps.meta.outputs.tag }}
    steps:
      - uses: actions/checkout@v4

      # Content-addressed tag: <IMAGE>:<hash of the image inputs>.
      - id: meta
        run: |
          echo "tag=${{ env.IMAGE }}:${{ hashFiles('.github/docker/Dockerfile.ci', 'package.json', 'bun.lock') }}" >> "$GITHUB_OUTPUT"

      - uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # Probe the registry for the tag. Any inspect failure (including auth or
      # network errors) is reported as "does not exist" and triggers a build.
      - id: check
        name: Check if image exists
        run: |
          exists=false
          if docker manifest inspect ${{ steps.meta.outputs.tag }} > /dev/null 2>&1; then
            exists=true
          fi
          echo "exists=${exists}" >> "$GITHUB_OUTPUT"

      # The build context below is .github/docker, so the dependency manifests
      # are copied into it first (presumably so Dockerfile.ci can use them).
      - if: steps.check.outputs.exists == 'false'
        run: cp package.json bun.lock .github/docker/

      # Build and push under both the content-hash tag and `latest`.
      - if: steps.check.outputs.exists == 'false'
        uses: docker/build-push-action@v6
        with:
          file: .github/docker/Dockerfile.ci
          context: .github/docker
          push: true
          tags: |
            ${{ steps.meta.outputs.tag }}
            ${{ env.IMAGE }}:latest

  # One matrix job per eval suite, each running inside the image produced by
  # build-image. fail-fast is off so one failing suite does not cancel the rest.
  evals:
    needs: build-image
    runs-on: ${{ matrix.suite.runner || 'ubicloud-standard-8' }}
    timeout-minutes: ${{ matrix.suite.timeout || 25 }}
    container:
      image: ${{ needs.build-image.outputs.image-tag }}
      credentials:
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}
      options: --user runner
    strategy:
      fail-fast: false
      matrix:
        # Each entry needs `name` and `file`. Optional overrides: `runner`
        # (default ubicloud-standard-8) and `timeout` in minutes (default 25).
        suite:
          - name: llm-judge
            file: test/skill-llm-eval.test.ts
          - name: e2e-browse
            file: test/skill-e2e-bws.test.ts
            runner: ubicloud-standard-8
          - name: e2e-plan
            file: test/skill-e2e-plan.test.ts
          - name: e2e-deploy
            file: test/skill-e2e-deploy.test.ts
          - name: e2e-design
            file: test/skill-e2e-design.test.ts
          - name: e2e-qa-bugs
            file: test/skill-e2e-qa-bugs.test.ts
          - name: e2e-qa-workflow
            file: test/skill-e2e-qa-workflow.test.ts
          - name: e2e-review
            file: test/skill-e2e-review.test.ts
          - name: e2e-workflow
            file: test/skill-e2e-workflow.test.ts
          - name: e2e-routing
            file: test/skill-routing-e2e.test.ts
          - name: e2e-codex
            file: test/codex-e2e.test.ts
          - name: e2e-gemini
            file: test/gemini-e2e.test.ts
          # Real-PTY plan-mode smokes. Only the deterministically reliable ones
          # gate CI: office-hours (it asks its mode question first, which the
          # collapsed/bullet prose-AUQ detector catches) and plan-mode-no-op
          # (no ask-first dependency). The plan-eng/plan-design plan-mode and
          # floor smokes run periodically instead because their ask-first
          # behavior is stochastic — see touchfiles E2E_TIERS.
          # This suite depends on the interactive-config seed step further
          # down; without it PTY sessions wedge on the fresh-container
          # onboarding/API-key dialog.
          - name: e2e-pty-plan-smoke
            # Two test files, space-separated; passed unquoted to `bun test`.
            file: test/skill-e2e-office-hours-auto-mode.test.ts test/skill-e2e-plan-mode-no-op.test.ts
            timeout: 35
    steps:
      # fetch-depth 0 checks out the full history rather than a shallow clone.
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      # The Docker build leaves bun's temp dirs owned by root, but the job runs
      # as the `runner` user with HOME=/github/home. Export runner-writable
      # locations for bun's install cache, bun's temp dir and the generic TMPDIR
      # through GITHUB_ENV so every later step picks them up.
      - name: Fix bun temp
        run: |
          bun_cache=/home/runner/.cache/bun
          mkdir -p "$bun_cache"
          printf '%s\n' \
            "BUN_INSTALL_CACHE_DIR=${bun_cache}" \
            "BUN_TMPDIR=${bun_cache}" \
            "TMPDIR=/home/runner/.cache" >> "$GITHUB_ENV"

      # Reuse the node_modules tree baked into the Docker image when the
      # package.json snapshot stored next to it still matches the checkout;
      # otherwise fall back to a real `bun install`.
      #
      # The tree is restored with a plain recursive copy on purpose:
      #   - `ln -s` breaks bun's module resolution: bun resolves a file's
      #     realpath when walking up to find node_modules/<dep>, and from a
      #     symlinked path the realpath escapes the workspace, so sibling deps
      #     no longer resolve.
      #   - `cp -al` (hardlinks) fails with "Invalid cross-device link" because
      #     /opt and /workspace are on different overlay-fs layers.
      # A recursive copy works on every layout. It costs ~5s for ~200 packages
      # of small JS files (vs ~0s for a symlink), still far cheaper than
      # rerunning `bun install` (network + resolution).
      - name: Restore deps
        run: |
          cache=/opt/node_modules_cache
          if [ ! -d "$cache" ] || ! diff -q "$cache/.package.json" package.json >/dev/null 2>&1; then
            bun install
          else
            cp -r "$cache" node_modules
          fi

      # Runs the `build` script from package.json.
      - run: bun run build

      # Browse suite only: prove Playwright can launch Chromium before the real
      # tests start, so broken sandbox/system deps fail fast. Also prints the
      # effective user, HOME and TMPDIR for debugging.
      # NOTE(review): PLAYWRIGHT_BROWSERS_PATH is set only on the Run step's env,
      # not here — this check relies on the image (or Playwright's default) to
      # locate the browser; confirm that is intended.
      - name: Verify Chromium
        if: matrix.suite.name == 'e2e-browse'
        run: |
          printf 'whoami=%s HOME=%s TMPDIR=%s\n' "$(whoami)" "$HOME" "${TMPDIR:-unset}"
          touch /tmp/.bun-test && rm /tmp/.bun-test && echo "/tmp writable"
          bun -e "import { chromium } from 'playwright'; const browser = await chromium.launch({ args: ['--no-sandbox'] }); console.log('Chromium OK'); await browser.close()"

      # PTY smokes spawn the interactive `claude` TUI. A fresh container has no
      # ~/.claude.json, so claude stops at the onboarding screen and the "use
      # detected ANTHROPIC_API_KEY?" dialog, and the spawned session never
      # reaches the skill. This step marks onboarding as complete and
      # pre-approves the key (identified by its last 20 characters), mirroring
      # what the hermetic E2E child env seeds. An existing ~/.claude.json is
      # merged into rather than replaced.
      # Scoped to the PTY suite. The key is passed through this step's own env
      # because the secrets block on the Run step does not apply here.
      - name: Seed claude interactive config
        if: matrix.suite.name == 'e2e-pty-plan-smoke'
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
        run: |
          node -e '
            const { existsSync, readFileSync, writeFileSync } = require("fs");
            const { homedir } = require("os");
            const { join } = require("path");

            const configPath = join(homedir(), ".claude.json");
            const config = existsSync(configPath)
              ? JSON.parse(readFileSync(configPath, "utf8"))
              : {};
            config.hasCompletedOnboarding = true;

            const apiKey = process.env.ANTHROPIC_API_KEY || "";
            if (apiKey) {
              config.customApiKeyResponses = { approved: [apiKey.slice(-20)], rejected: [] };
            }

            writeFileSync(configPath, JSON.stringify(config, null, 2));
            console.log("seeded", configPath);
          '

      # PTY smokes drive the interactive `claude` TUI and send /office-hours and
      # /plan-ceo-review. Claude Code discovers user-scoped skills from
      # $HOME/.claude/skills/<name>/SKILL.md, but .claude/skills is gitignored,
      # so a fresh CI checkout has no registry and claude prints "Unknown
      # command: /plan-ceo-review".
      #
      # This step mirrors setup's --no-prefix registry in its minimal form:
      #   - a `gstack` symlink to the repo root, which resolves the preamble's
      #     absolute ~/.claude/skills/gstack/bin/* and
      #     ~/.claude/skills/gstack/<skill>/sections/* paths;
      #   - one top-level directory per invoked skill, holding symlinks to that
      #     skill's SKILL.md and sections/.
      #
      # ./setup is deliberately not run (it builds binaries, launches Chromium,
      # installs fonts and reads a /dev/tty prompt), and no binary build is
      # needed because SKILL.md, bin/ and sections/ are committed. $HOME is
      # /github/home here; the spawned claude inherits it (this runner adds no
      # HOME/CLAUDE_CONFIG_DIR override and no hermetic mode), and the Seed
      # step already proved claude reads $HOME.
      - name: Register gstack skills for PTY smoke
        if: matrix.suite.name == 'e2e-pty-plan-smoke'
        run: |
          set -eu
          registry="$HOME/.claude/skills"
          checkout="$GITHUB_WORKSPACE"   # /__w/gstack/gstack
          mkdir -p "$registry"
          ln -snf "$checkout" "$registry/gstack"
          for skill in office-hours plan-ceo-review; do
            mkdir -p "$registry/$skill"
            for entry in SKILL.md sections; do
              ln -snf "$checkout/$skill/$entry" "$registry/$skill/$entry"
            done
          done
          echo "--- registry under $registry ---"
          ls -la "$registry/gstack" "$registry/office-hours" "$registry/plan-ceo-review"
          # Fail fast if a committed target was moved or renamed: a dangling
          # symlink would otherwise only show up as a silent "Unknown command"
          # followed by the 35-minute timeout.
          for required in \
            "$registry/office-hours/SKILL.md" \
            "$registry/plan-ceo-review/SKILL.md" \
            "$registry/gstack/bin/gstack-update-check" \
            "$registry/gstack/office-hours/sections/design-and-handoff.md" \
            "$registry/gstack/plan-ceo-review/sections/review-sections.md"; do
            if [ -e "$required" ]; then
              continue
            fi
            echo "ERROR: skill-registry target missing (symlink dangles): $required" >&2
            exit 1
          done
          # Each SKILL.md must declare the exact `name:` the test invokes.
          for skill in office-hours plan-ceo-review; do
            if ! grep -m1 "^name: ${skill}\$" "$registry/$skill/SKILL.md" >/dev/null; then
              echo "ERROR: $skill SKILL.md missing 'name: $skill' frontmatter" >&2
              exit 1
            fi
          done
          echo "skill registry OK"

      # Run this matrix entry's test file(s) with bun. EVALS=1 is set for the
      # test process, and the --max-concurrency flag reads the same
      # EVALS_CONCURRENCY value that is exported to the tests.
      - name: Run ${{ matrix.suite.name }}
        env:
          EVALS: "1"
          EVALS_CONCURRENCY: "40"
          PLAYWRIGHT_BROWSERS_PATH: /opt/playwright-browsers
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
        run: |
          bun test --retry 2 --concurrent --max-concurrency "$EVALS_CONCURRENCY" ${{ matrix.suite.file }}

      # always(): upload whatever result JSON exists even when the suite step
      # failed. One artifact per suite, named eval-<suite>, which the report
      # job later collects with the `eval-*` pattern.
      - name: Upload eval results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: eval-${{ matrix.suite.name }}
          retention-days: 90
          path: "~/.gstack-dev/evals/*.json"

  # Collects the per-suite artifacts and posts/updates the PR summary comment.
  # always() lets it run even when eval jobs failed; the event check skips it
  # for manual runs, which have no pull request to comment on.
  report:
    needs: evals
    if: always() && github.event_name == 'pull_request'
    runs-on: ubicloud-standard-8
    timeout-minutes: 5
    permissions:
      contents: read
      pull-requests: write
      # The comment upsert below calls the REST `/issues/{n}/comments` endpoints
      # (gh api ... issues/comments). With GITHUB_TOKEN those are gated by the
      # `issues` permission, not `pull-requests` — without it the GET returns 401
      # on every PR that produces eval artifacts (PRs with no artifacts exit
      # early and never hit it, which is why this stayed hidden). See #1802 CI fix.
      issues: write
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 1

      # Gather every eval-* artifact into one flat directory.
      # NOTE(review): with merge-multiple, same-named JSON files from different
      # suites would overwrite each other — confirm result file names are unique.
      - name: Download all eval artifacts
        uses: actions/download-artifact@v4
        with:
          pattern: "eval-*"
          merge-multiple: true
          path: /tmp/eval-results

      # Summarize every downloaded eval JSON as one Markdown table and upsert it
      # as a single PR comment: edit the newest existing "## E2E Evals" comment,
      # or create one when none exists. Exits successfully without commenting
      # when no result files were downloaded.
      # Repo and PR number are passed through env instead of being interpolated
      # into the script text.
      - name: Post PR comment
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPO: ${{ github.repository }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
        run: |
          # shellcheck disable=SC2086,SC2059
          RESULTS=$(find /tmp/eval-results -name '*.json' 2>/dev/null | sort)
          if [ -z "$RESULTS" ]; then
            echo "No eval results found"
            exit 0
          fi

          # Rows are joined with real newlines instead of a literal "\n" expanded
          # by `echo -e`, so backslashes inside test names or exit reasons reach
          # the comment verbatim (`echo -e` would rewrite them; "\c" even
          # truncates the output).
          NL=$'\n'

          TOTAL=0; PASSED=0; FAILED=0; COST="0"
          SUITE_LINES=""
          # Read one path per line so file names containing spaces stay intact.
          while IFS= read -r f; do
            if ! jq -e '.total_tests' "$f" >/dev/null 2>&1; then
              echo "Skipping malformed JSON: $f"
              continue
            fi
            T=$(jq -r '.total_tests // 0' "$f")
            P=$(jq -r '.passed // 0' "$f")
            F=$(jq -r '.failed // 0' "$f")
            C=$(jq -r '.total_cost_usd // 0' "$f")
            TIER=$(jq -r '.tier // "unknown"' "$f")
            # Files that recorded no tests contribute nothing to the table.
            [ "$T" -eq 0 ] && continue
            TOTAL=$((TOTAL + T))
            PASSED=$((PASSED + P))
            FAILED=$((FAILED + F))
            # bc prints values below 1 without a leading zero (".25"); restore
            # it so the total is formatted like the per-suite costs.
            COST=$(echo "$COST + $C" | bc | sed 's/^\./0./')
            STATUS_ICON="✅"
            [ "$F" -gt 0 ] && STATUS_ICON="❌"
            # The first column shows the result file's `.tier` field.
            SUITE_LINES="${SUITE_LINES}| ${TIER} | ${P}/${T} | ${STATUS_ICON} | \$${C} |${NL}"
          done <<< "$RESULTS"

          STATUS="✅ PASS"
          [ "$FAILED" -gt 0 ] && STATUS="❌ FAIL"

          # The runner count (13) is hard-coded; keep it in sync with the number
          # of entries in the evals job matrix.
          BODY="## E2E Evals: ${STATUS}

          **${PASSED}/${TOTAL}** tests passed | **\$${COST}** total cost | **13 parallel runners**

          | Suite | Result | Status | Cost |
          |-------|--------|--------|------|
          ${SUITE_LINES%"$NL"}

          ---
          *13x ubicloud-standard-8 (Docker: pre-baked toolchain + deps) | wall clock ≈ slowest suite*"

          # Append one bullet per failed test, but only when something failed.
          if [ "$FAILED" -gt 0 ]; then
            FAILURES=""
            while IFS= read -r f; do
              if ! jq -e '.failed' "$f" >/dev/null 2>&1; then continue; fi
              F=$(jq -r '.failed // 0' "$f")
              [ "$F" -eq 0 ] && continue
              FAILS=$(jq -r '.tests[] | select(.passed == false) | "- ❌ \(.name): \(.exit_reason // "unknown")"' "$f" 2>/dev/null || echo "- ⚠️ $(basename "$f"): parse error")
              FAILURES="${FAILURES}${FAILS}${NL}"
            done <<< "$RESULTS"
            BODY="${BODY}

          ### Failures
          ${FAILURES%"$NL"}"
          fi

          # Update existing comment or create new one. --paginate walks every
          # page of the comment list: the endpoint returns only 30 comments per
          # page by default, so on a busy PR an unpaginated lookup misses the
          # existing summary and a duplicate comment is posted on every run.
          COMMENT_ID=$(gh api --paginate "repos/${REPO}/issues/${PR_NUMBER}/comments" \
            --jq '.[] | select(.body | startswith("## E2E Evals")) | .id' | tail -1)

          if [ -n "$COMMENT_ID" ]; then
            gh api "repos/${REPO}/issues/comments/${COMMENT_ID}" \
              -X PATCH -f body="$BODY"
          else
            gh pr comment "$PR_NUMBER" --body "$BODY"
          fi