# E2E eval workflow for pull requests that target main.
#
# Builds the project, runs the eval suites, uploads the resulting JSON files
# as the `eval-results` artifact, and posts (or updates) one summary comment
# on the pull request.
name: E2E Evals

on:
  pull_request:
    branches: [main]

# A newer run for the same head branch cancels the one still in progress.
concurrency:
  group: evals-${{ github.head_ref }}
  cancel-in-progress: true

jobs:
  evals:
    runs-on: ubicloud-standard-2
    timeout-minutes: 45
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - uses: oven-sh/setup-bun@v2

      - run: bun install

      - run: bun run build

      - name: Verify browse binary
        run: test -f browse/dist/browse || (echo "Browse binary missing after build" && exit 1)

      - name: Install Claude CLI
        run: npm i -g @anthropic-ai/claude-code

      # Best effort: a missing baseline only produces a warning and never
      # fails the job.
      - name: Download previous eval baseline
        uses: dawidd6/action-download-artifact@v6
        with:
          name: eval-results
          branch: main
          path: /tmp/eval-baseline
          if_no_artifact_found: warn
        continue-on-error: true

      - name: Copy baseline for comparison
        run: |
          if [ -d /tmp/eval-baseline ]; then
            mkdir -p ~/.gstack-dev/evals
            cp /tmp/eval-baseline/*.json ~/.gstack-dev/evals/ 2>/dev/null || true
          fi

      - name: Run E2E evals
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
          EVALS_CONCURRENCY: "40"
        run: bun run test:evals

      # `always()` keeps the results available even when the eval step failed.
      - name: Upload eval results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: eval-results
          path: ~/.gstack-dev/evals/*.json
          retention-days: 90

      - name: Post PR comment
        if: always() && github.event_name == 'pull_request'
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # Passed through the environment instead of being interpolated into
          # the script text.
          PR_NUMBER: ${{ github.event.pull_request.number }}
        run: |
          # Aggregate results across ALL eval suites (not just the latest file).
          # NOTE(review): the baseline JSON copied into this directory earlier
          # in the job matches the same glob - confirm it is not double-counted.
          #
          # `|| true`: grep exits 1 when it prints nothing, and the default
          # `bash -e` step shell would abort here before the empty check runs.
          RESULTS=$(ls -t ~/.gstack-dev/evals/*.json 2>/dev/null | grep -v _partial || true)
          if [ -z "$RESULTS" ]; then
            echo "No eval results found"
            exit 0
          fi

          TOTAL=0; PASSED=0; FAILED=0; COST=0
          SUITE_LINES=""
          for f in $RESULTS; do
            T=$(jq -r '.total_tests // 0' "$f")
            P=$(jq -r '.passed // 0' "$f")
            F=$(jq -r '.failed // 0' "$f")
            C=$(jq -r '.total_cost_usd // 0' "$f")
            TIER=$(jq -r '.tier // "unknown"' "$f")
            # Skip files that report no tests.
            [ "$T" -eq 0 ] && continue
            TOTAL=$((TOTAL + T))
            PASSED=$((PASSED + P))
            FAILED=$((FAILED + F))
            # Costs are decimals, so shell integer arithmetic cannot sum them.
            COST=$(echo "$COST + $C" | bc)
            STATUS_ICON="✅"
            [ "$F" -gt 0 ] && STATUS_ICON="❌"
            SUITE_LINES="${SUITE_LINES}| ${TIER} | ${P}/${T} | ${STATUS_ICON} | \$${C} |\n"
          done

          STATUS="✅ PASS"
          [ "$FAILED" -gt 0 ] && STATUS="❌ FAIL"

          # The body must keep starting with "## E2E Evals": the lookup below
          # uses that prefix to find the comment from an earlier run.
          BODY="## E2E Evals: ${STATUS}

          **${PASSED}/${TOTAL}** tests passed | **\$${COST}** total cost

          | Suite | Result | Status | Cost |
          |-------|--------|--------|------|
          $(echo -e "$SUITE_LINES")"

          if [ "$FAILED" -gt 0 ]; then
            FAILURES=""
            for f in $RESULTS; do
              F=$(jq -r '.failed // 0' "$f")
              [ "$F" -eq 0 ] && continue
              # `.tests[]?` tolerates a result file without a `tests` array.
              FAILS=$(jq -r '.tests[]? | select(.passed == false) | "- ❌ \(.name): \(.exit_reason // "unknown")"' "$f")
              FAILURES="${FAILURES}${FAILS}\n"
            done
            BODY="${BODY}

          ### Failures
          $(echo -e "$FAILURES")"
          fi

          # `\$0`: an unescaped `$0` inside double quotes expands to the path
          # of the generated step script.
          BODY="${BODY}

          ---
          *Runner: ubicloud-standard-2 (\$0.0008/min) | Concurrency: 40*"

          # Update existing comment or create new one (prevents duplicates on
          # re-runs). `--paginate` searches every page of comments, not only
          # the first one.
          COMMENT_ID=$(gh api --paginate "repos/${GITHUB_REPOSITORY}/issues/${PR_NUMBER}/comments" \
            --jq '.[] | select(.body | startswith("## E2E Evals")) | .id' | tail -1)

          if [ -n "$COMMENT_ID" ]; then
            gh api "repos/${GITHUB_REPOSITORY}/issues/comments/${COMMENT_ID}" \
              -X PATCH -f body="$BODY"
          else
            gh pr comment "$PR_NUMBER" --body "$BODY"
          fi