# Runs the E2E eval suites for every pull request that targets main.
name: E2E Evals
on:
  pull_request:
    branches: [main]

# Only one run stays active per pull request: a newer push cancels the run
# that is still in progress. The group is keyed by PR number rather than by
# head branch name, because PRs opened from different forks can share a
# branch name and would otherwise cancel each other's runs.
concurrency:
  group: evals-${{ github.event.pull_request.number }}
  cancel-in-progress: true

jobs:
  # Runs each eval suite on its own runner (one matrix entry per test file)
  # and uploads whatever JSON files are present under ~/.gstack-dev/evals
  # afterwards, whether the suite passed or not.
  evals:
    runs-on: ubicloud-standard-2
    timeout-minutes: 20
    # Least privilege: the job executes pull-request code, and nothing in it
    # needs more than read access to the repository contents for checkout.
    permissions:
      contents: read
    strategy:
      # Let the remaining suites finish even if one of them fails.
      fail-fast: false
      matrix:
        suite:
          - name: llm-judge
            file: test/skill-llm-eval.test.ts
          - name: e2e-browse
            file: test/skill-e2e-browse.test.ts
          - name: e2e-plan
            file: test/skill-e2e-plan.test.ts
          - name: e2e-deploy
            file: test/skill-e2e-deploy.test.ts
          - name: e2e-design
            file: test/skill-e2e-design.test.ts
          - name: e2e-qa-bugs
            file: test/skill-e2e-qa-bugs.test.ts
          - name: e2e-qa-workflow
            file: test/skill-e2e-qa-workflow.test.ts
          - name: e2e-review
            file: test/skill-e2e-review.test.ts
          - name: e2e-workflow
            file: test/skill-e2e-workflow.test.ts
          - name: e2e-routing
            file: test/skill-routing-e2e.test.ts
          - name: e2e-codex
            file: test/codex-e2e.test.ts
          - name: e2e-gemini
            file: test/gemini-e2e.test.ts
    steps:
      - uses: actions/checkout@v4
        with:
          # Full history instead of the default shallow clone.
          fetch-depth: 0

      - uses: oven-sh/setup-bun@v2

      - name: Cache bun dependencies
        uses: actions/cache@v4
        with:
          path: ~/.bun/install/cache
          # Hash whichever lockfile exists (binary bun.lockb or text
          # bun.lock). With only bun.lockb present the hash is unchanged;
          # without a match on either name the key would be a constant
          # "bun-" and the cache would never be refreshed.
          key: bun-${{ hashFiles('bun.lockb', 'bun.lock') }}
          restore-keys: bun-

      - run: bun install

      - run: bun run build

      - name: Install Claude CLI
        run: npm i -g @anthropic-ai/claude-code

      - name: Run ${{ matrix.suite.name }}
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
          # Single source of truth for the concurrency limit: exported to the
          # test process and also passed to bun's --max-concurrency below.
          EVALS_CONCURRENCY: "40"
        run: EVALS=1 bun test --retry 2 --concurrent --max-concurrency "$EVALS_CONCURRENCY" ${{ matrix.suite.file }}

      - name: Upload eval results
        # Upload even when the suite failed so the report job can list failures.
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: eval-${{ matrix.suite.name }}
          path: ~/.gstack-dev/evals/*.json
          retention-days: 90

  # Collects the JSON result files uploaded by the evals matrix, totals them,
  # and creates or updates a single summary comment on the pull request.
  # Runs even when some suites failed so that failures get reported.
  report:
    runs-on: ubicloud-standard-2
    needs: evals
    if: always() && github.event_name == 'pull_request'
    timeout-minutes: 5
    # Creating or editing the PR comment is the only API write this job makes.
    permissions:
      pull-requests: write
    steps:
      - name: Download all eval artifacts
        uses: actions/download-artifact@v4
        with:
          pattern: eval-*
          path: /tmp/eval-results
          # All artifacts are extracted into the same directory.
          # NOTE(review): same-named files from different suites would
          # overwrite each other here - confirm result file names are unique.
          merge-multiple: true

      - name: Post PR comment
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # This job has no checkout step, so gh cannot infer the repository
          # from a local git remote; GH_REPO names it explicitly.
          GH_REPO: ${{ github.repository }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
        run: |
          RESULTS=$(find /tmp/eval-results -name '*.json' 2>/dev/null | sort)
          if [ -z "$RESULTS" ]; then
            echo "No eval results found"
            exit 0
          fi

          # Accumulate totals and build one markdown table row per result file.
          TOTAL=0; PASSED=0; FAILED=0; COST="0"
          SUITE_LINES=""
          for f in $RESULTS; do
            T=$(jq -r '.total_tests // 0' "$f")
            P=$(jq -r '.passed // 0' "$f")
            F=$(jq -r '.failed // 0' "$f")
            C=$(jq -r '.total_cost_usd // 0' "$f")
            TIER=$(jq -r '.tier // "unknown"' "$f")
            # Result files that recorded no tests are left out of the table.
            [ "$T" -eq 0 ] && continue
            TOTAL=$((TOTAL + T))
            PASSED=$((PASSED + P))
            FAILED=$((FAILED + F))
            # Costs are decimals, which shell arithmetic cannot add; use bc.
            COST=$(echo "$COST + $C" | bc)
            STATUS_ICON="✅"
            [ "$F" -gt 0 ] && STATUS_ICON="❌"
            SUITE_LINES="${SUITE_LINES}| ${TIER} | ${P}/${T} | ${STATUS_ICON} | \$${C} |\n"
          done

          STATUS="✅ PASS"
          [ "$FAILED" -gt 0 ] && STATUS="❌ FAIL"

          # Every literal dollar sign below is escaped: inside double quotes an
          # unescaped "$0" would expand to the path of this script.
          BODY="## E2E Evals: ${STATUS}

          **${PASSED}/${TOTAL}** tests passed | **\$${COST}** total cost | **12 parallel runners**

          | Suite | Result | Status | Cost |
          |-------|--------|--------|------|
          $(echo -e "$SUITE_LINES")

          ---
          *12x ubicloud-standard-2 (\$0.0008/min each) | Wall clock ≈ slowest suite*"

          if [ "$FAILED" -gt 0 ]; then
            FAILURES=""
            for f in $RESULTS; do
              F=$(jq -r '.failed // 0' "$f")
              [ "$F" -eq 0 ] && continue
              # ".tests[]?" yields nothing (instead of making jq fail and
              # aborting the script) when a file has no "tests" array.
              FAILS=$(jq -r '.tests[]? | select(.passed == false) | "- ❌ \(.name): \(.exit_reason // "unknown")"' "$f")
              FAILURES="${FAILURES}${FAILS}\n"
            done
            BODY="${BODY}

          ### Failures
          $(echo -e "$FAILURES")"
          fi

          # Update existing comment or create new one. --paginate walks every
          # page of comments; a single request returns only the first page, so
          # on a busy PR the earlier summary would be missed and duplicated.
          COMMENT_ID=$(gh api --paginate "repos/${GH_REPO}/issues/${PR_NUMBER}/comments" \
            --jq '.[] | select(.body | startswith("## E2E Evals")) | .id' | tail -1)

          if [ -n "$COMMENT_ID" ]; then
            gh api "repos/${GH_REPO}/issues/comments/${COMMENT_ID}" \
              -X PATCH -f body="$BODY"
          else
            gh pr comment "$PR_NUMBER" --body "$BODY"
          fi