# gstack/.github/workflows/evals.yml

# E2E eval suite: runs on every pull request that targets main.
name: E2E Evals
on:
  pull_request:
    branches: [main]

# Least-privilege scopes for the default GITHUB_TOKEN. Declaring any scope sets
# every unlisted scope to `none`, so each one the job needs is spelled out.
permissions:
  contents: read        # actions/checkout
  actions: read         # looking up and downloading the baseline artifact
  pull-requests: write  # `gh pr comment` in the final step

# One in-flight run per pull request: a new push cancels the run it supersedes.
# Keyed on the PR number rather than the head branch name, because branches in
# different forks can share a name and would otherwise cancel each other's runs.
concurrency:
  group: evals-${{ github.event.pull_request.number }}
  cancel-in-progress: true

jobs:
  evals:
    # Runs on the `ubicloud-standard-2` runner label; the job is stopped after
    # 30 minutes.
    runs-on: ubicloud-standard-2
    timeout-minutes: 30
    steps:
      # fetch-depth 0 checks out the complete history instead of a shallow clone.
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      # Toolchain, dependencies, and project build.
      - uses: oven-sh/setup-bun@v2

      - run: bun install

      - run: bun run build

      # Fail the job right away if the build did not leave the browse binary
      # at the expected path.
      - name: Verify browse binary
        run: |
          if [ ! -f browse/dist/browse ]; then
            echo "Browse binary missing after build"
            exit 1
          fi

      - name: Install Claude CLI
        run: npm install --global @anthropic-ai/claude-code

      # Best-effort fetch of the `eval-results` artifact from a run on `main`.
      # A missing artifact only warns, and `continue-on-error` keeps any other
      # failure of this step from failing the job.
      # NOTE(review): no `workflow` input is given and the triggers visible in
      # this file are pull_request only — confirm that some run on `main`
      # actually publishes an `eval-results` artifact this lookup can find;
      # otherwise the baseline is always absent.
      - name: Download previous eval baseline
        uses: dawidd6/action-download-artifact@v6
        continue-on-error: true
        with:
          name: eval-results
          branch: main
          path: /tmp/eval-baseline
          if_no_artifact_found: warn

      # Stage whatever baseline JSON was downloaded into the evals directory.
      # Nothing happens when the download left no directory behind, and a copy
      # that matches no files is silently ignored.
      - name: Copy baseline for comparison
        run: |
          [ -d /tmp/eval-baseline ] || exit 0
          mkdir -p ~/.gstack-dev/evals
          cp /tmp/eval-baseline/*.json ~/.gstack-dev/evals/ 2>/dev/null || true

      # Runs the eval suite through the `test:evals` package script. The three
      # provider API keys come from repository secrets; EVALS_CONCURRENCY is
      # handed to the process as the string "40".
      # NOTE(review): secrets are presumably empty on pull_request runs that
      # originate from forks — confirm how the suite behaves without keys.
      - name: Run E2E evals
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
          EVALS_CONCURRENCY: "40"
        run: bun run test:evals

      # Uploads the result JSON even when an earlier step failed (`always()`),
      # under the same artifact name the baseline download step asks for.
      # The artifact is retained for 90 days.
      # NOTE(review): baseline files copied into this directory earlier in the
      # job match the same glob and would be uploaded again — confirm intended.
      - name: Upload eval results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: eval-results
          path: ~/.gstack-dev/evals/*.json
          retention-days: 90

      # Summarizes the newest result file as a comment on the pull request.
      # Runs even when the eval step failed so failures get reported on the PR.
      - name: Post PR comment
        if: always() && github.event_name == 'pull_request'
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # Passed through the environment instead of being interpolated into
          # the script text.
          PR_NUMBER: ${{ github.event.pull_request.number }}
        run: |
          # Newest result by mtime, skipping files whose name contains "_partial".
          RESULT=$(ls -t ~/.gstack-dev/evals/*.json 2>/dev/null | grep -v _partial | head -1)
          if [ -z "$RESULT" ]; then
            echo "No eval results found"
            exit 0
          fi

          # Default absent counters to 0: a literal "null" would make the numeric
          # tests below error out and would be printed verbatim in the comment.
          TOTAL=$(jq '.total_tests // 0' "$RESULT")
          PASSED=$(jq '.passed // 0' "$RESULT")
          FAILED=$(jq '.failed // 0' "$RESULT")
          COST=$(jq '.total_cost_usd // 0' "$RESULT")
          WALL=$(jq '.wall_clock_ms // 0 | . / 1000 | floor' "$RESULT")

          STATUS="pass"
          if [ "$FAILED" -gt 0 ]; then
            STATUS="FAIL"
          fi

          BODY="**E2E Evals:** ${STATUS} ${PASSED}/${TOTAL} passed | \$${COST} | ${WALL}s wall clock"

          if [ "$FAILED" -gt 0 ]; then
            # `.tests[]?` tolerates a result file that has no per-test array.
            FAILURES=$(jq -r '.tests[]? | select(.passed == false) | "- FAIL \(.name): \(.exit_reason // "unknown")"' "$RESULT")
            # Blank line, a "Failures:" heading, then one bullet per failed test.
            BODY=$(printf '%s\n\nFailures:\n%s' "$BODY" "$FAILURES")
          fi

          gh pr comment "$PR_NUMBER" --body "$BODY"