# E2E Evals workflow.
#
# On every pull request targeting main:
#   1. `evals`  - runs each eval suite from the matrix on its own runner and
#                 uploads the JSON result files as an artifact.
#   2. `report` - downloads all result artifacts, aggregates them and creates
#                 (or updates) a single summary comment on the pull request.
name: E2E Evals

on:
  pull_request:
    branches: [main]

# A newer push to the same PR branch cancels the still-running eval run.
concurrency:
  group: evals-${{ github.head_ref }}
  cancel-in-progress: true

jobs:
  evals:
    runs-on: ubicloud-standard-2
    timeout-minutes: 20
    strategy:
      # Let every suite finish even if a sibling suite fails, so the report
      # job gets results from all of them.
      fail-fast: false
      matrix:
        suite:
          - name: llm-judge
            file: test/skill-llm-eval.test.ts
          - name: e2e-browse
            file: test/skill-e2e-browse.test.ts
          - name: e2e-plan
            file: test/skill-e2e-plan.test.ts
          - name: e2e-deploy
            file: test/skill-e2e-deploy.test.ts
          - name: e2e-design
            file: test/skill-e2e-design.test.ts
          - name: e2e-qa-bugs
            file: test/skill-e2e-qa-bugs.test.ts
          - name: e2e-qa-workflow
            file: test/skill-e2e-qa-workflow.test.ts
          - name: e2e-review
            file: test/skill-e2e-review.test.ts
          - name: e2e-workflow
            file: test/skill-e2e-workflow.test.ts
          - name: e2e-routing
            file: test/skill-routing-e2e.test.ts
          - name: e2e-codex
            file: test/codex-e2e.test.ts
          - name: e2e-gemini
            file: test/gemini-e2e.test.ts
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - uses: oven-sh/setup-bun@v2

      - name: Cache bun dependencies
        uses: actions/cache@v4
        with:
          path: ~/.bun/install/cache
          key: bun-${{ hashFiles('bun.lockb') }}
          restore-keys: bun-

      - run: bun install

      - run: bun run build

      - name: Install Claude CLI
        run: npm i -g @anthropic-ai/claude-code

      - name: Run ${{ matrix.suite.name }}
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
          EVALS_CONCURRENCY: "40"
        run: EVALS=1 bun test --retry 2 --concurrent --max-concurrency 40 ${{ matrix.suite.file }}

      - name: Upload eval results
        # Upload even when the suite failed so the report can list failures.
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: eval-${{ matrix.suite.name }}
          path: ~/.gstack-dev/evals/*.json
          retention-days: 90

  report:
    runs-on: ubicloud-standard-2
    needs: evals
    # always(): still report when one or more eval suites failed.
    if: always() && github.event_name == 'pull_request'
    timeout-minutes: 5
    # NOTE(review): creating/updating the PR comment needs a GITHUB_TOKEN with
    # write access to pull-request comments - confirm the repository's default
    # token permissions or add an explicit `permissions:` block here.
    steps:
      - name: Download all eval artifacts
        uses: actions/download-artifact@v4
        with:
          pattern: eval-*
          path: /tmp/eval-results
          merge-multiple: true

      - name: Post PR comment
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # This job has no checkout, so `gh` cannot infer the repository from
          # a git remote; GH_REPO tells it explicitly.
          GH_REPO: ${{ github.repository }}
          # Passed via env instead of being interpolated into the script text.
          PR_NUMBER: ${{ github.event.pull_request.number }}
        run: |
          RESULTS=$(find /tmp/eval-results -name '*.json' 2>/dev/null | sort)
          if [ -z "$RESULTS" ]; then
            echo "No eval results found"
            exit 0
          fi

          # Real newline used to join table rows / failure lines, so that
          # backslashes inside test names are never treated as escapes.
          NL=$'\n'

          TOTAL=0; PASSED=0; FAILED=0; COST="0"
          SUITE_LINES=""
          for f in $RESULTS; do
            T=$(jq -r '.total_tests // 0' "$f")
            P=$(jq -r '.passed // 0' "$f")
            F=$(jq -r '.failed // 0' "$f")
            C=$(jq -r '.total_cost_usd // 0' "$f")
            TIER=$(jq -r '.tier // "unknown"' "$f")
            # Skip result files that recorded no tests.
            [ "$T" -eq 0 ] && continue
            TOTAL=$((TOTAL + T))
            PASSED=$((PASSED + P))
            FAILED=$((FAILED + F))
            # Costs are fractional, so sum them with bc rather than $(( )).
            COST=$(echo "$COST + $C" | bc)
            STATUS_ICON="✅"
            [ "$F" -gt 0 ] && STATUS_ICON="❌"
            SUITE_LINES="${SUITE_LINES}| ${TIER} | ${P}/${T} | ${STATUS_ICON} | \$${C} |${NL}"
          done

          STATUS="✅ PASS"
          [ "$FAILED" -gt 0 ] && STATUS="❌ FAIL"

          # `\$` keeps literal dollar signs; an unescaped `$0` would expand to
          # the path of this script. The command substitution strips the
          # trailing newline of SUITE_LINES.
          BODY="## E2E Evals: ${STATUS}

          **${PASSED}/${TOTAL}** tests passed | **\$${COST}** total cost | **12 parallel runners**

          | Suite | Result | Status | Cost |
          |-------|--------|--------|------|
          $(printf '%s' "$SUITE_LINES")

          ---
          *12x ubicloud-standard-2 (\$0.0008/min each) | Wall clock ≈ slowest suite*"

          if [ "$FAILED" -gt 0 ]; then
            FAILURES=""
            for f in $RESULTS; do
              F=$(jq -r '.failed // 0' "$f")
              [ "$F" -eq 0 ] && continue
              # `(.tests // [])` tolerates result files without a tests array
              # instead of aborting the whole step.
              FAILS=$(jq -r '(.tests // [])[] | select(.passed == false) | "- ❌ \(.name): \(.exit_reason // "unknown")"' "$f")
              FAILURES="${FAILURES}${FAILS}${NL}"
            done
            BODY="${BODY}

          ### Failures
          $(printf '%s' "$FAILURES")"
          fi

          # Update existing comment or create new one.
          # --paginate: the comments endpoint is paged, so without it an
          # earlier report comment on a busy PR would not be found.
          COMMENT_ID=$(gh api --paginate "repos/${GH_REPO}/issues/${PR_NUMBER}/comments" \
            --jq '.[] | select(.body | startswith("## E2E Evals")) | .id' | tail -1)

          if [ -n "$COMMENT_ID" ]; then
            gh api "repos/${GH_REPO}/issues/comments/${COMMENT_ID}" \
              -X PATCH -f body="$BODY"
          else
            gh pr comment "$PR_NUMBER" --body "$BODY"
          fi