From 9195b671e891bb0b76f21424405cd1a42de93d41 Mon Sep 17 00:00:00 2001
From: Garry Tan
Date: Mon, 23 Mar 2026 05:21:43 -0700
Subject: [PATCH] =?UTF-8?q?chore:=20optimize=20CI=20eval=20PR=20comment=20?=
 =?UTF-8?q?=E2=80=94=20aggregate=20all=20suites,=20update-not-duplicate?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Opus 4.6 (1M context)
---
Review notes (free text between the three-dash marker and the diff is
not part of the commit message and is skipped when the patch is applied).

What the patch changes in .github/workflows/evals.yml:
  * Job timeout goes from 30 to 45 minutes.
  * The PR-comment step no longer reads only the newest result file. It
    loops over every ~/.gstack-dev/evals/*.json whose name does not
    contain "_partial", skips files reporting 0 total tests, sums
    total_tests / passed / failed / total_cost_usd, and emits one
    markdown table row per file labelled with that file's .tier
    ("unknown" when the field is absent).
  * When any file reports failures, the failing test names and their
    exit_reason are appended under a "### Failures" heading.
  * Instead of always posting a new comment, the step looks for the last
    PR comment whose body starts with "## E2E Evals" and PATCHes it;
    a new comment is created only when none is found.

Fixes folded in during review (no lines added or removed, so the hunk
header and diffstat are unchanged):
  * RESULTS=$(... | grep -v _partial || true)
    With "| head -1" gone, grep is the last command of the substitution
    and exits 1 when there is nothing to list. Under errexit (the default
    bash invocation for run steps uses -e) that aborted the step before
    the "No eval results found" / exit 0 branch could run.
  * Footer cost is written as \$0.0008/min. Unescaped, "$0" inside the
    double-quoted BODY expands to the script name and corrupts the text.
  * The comment lookup passes --paginate so an earlier eval comment is
    still found once the PR has more comments than fit on the first API
    page; otherwise re-runs would start duplicating comments again.

Whitespace caveat: this copy of the patch was re-flowed from a collapsed
single-line form. Indentation follows the usual workflow layout
(steps at 6 spaces, script body at 10) - verify against the target file
before applying.

 .github/workflows/evals.yml | 69 +++++++++++++++++++++++++++++--------
 1 file changed, 54 insertions(+), 15 deletions(-)

diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml
index 945a242d..84a29eee 100644
--- a/.github/workflows/evals.yml
+++ b/.github/workflows/evals.yml
@@ -10,7 +10,7 @@ concurrency:
 jobs:
   evals:
     runs-on: ubicloud-standard-2
-    timeout-minutes: 30
+    timeout-minutes: 45
     steps:
       - uses: actions/checkout@v4
         with:
@@ -65,29 +65,68 @@ jobs:
         env:
           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
-          RESULT=$(ls -t ~/.gstack-dev/evals/*.json 2>/dev/null | grep -v _partial | head -1)
-          if [ -z "$RESULT" ]; then
+          # Aggregate results across ALL eval suites (not just the latest file)
+          RESULTS=$(ls -t ~/.gstack-dev/evals/*.json 2>/dev/null | grep -v _partial || true)
+          if [ -z "$RESULTS" ]; then
             echo "No eval results found"
             exit 0
           fi

-          TOTAL=$(jq .total_tests "$RESULT")
-          PASSED=$(jq .passed "$RESULT")
-          FAILED=$(jq .failed "$RESULT")
-          COST=$(jq .total_cost_usd "$RESULT")
-          WALL=$(jq '.wall_clock_ms // 0 | . / 1000 | floor' "$RESULT")
+          TOTAL=0; PASSED=0; FAILED=0; COST=0
+          SUITE_LINES=""
+          for f in $RESULTS; do
+            T=$(jq -r '.total_tests // 0' "$f")
+            P=$(jq -r '.passed // 0' "$f")
+            F=$(jq -r '.failed // 0' "$f")
+            C=$(jq -r '.total_cost_usd // 0' "$f")
+            TIER=$(jq -r '.tier // "unknown"' "$f")
+            [ "$T" -eq 0 ] && continue
+            TOTAL=$((TOTAL + T))
+            PASSED=$((PASSED + P))
+            FAILED=$((FAILED + F))
+            COST=$(echo "$COST + $C" | bc)
+            STATUS_ICON="✅"
+            [ "$F" -gt 0 ] && STATUS_ICON="❌"
+            SUITE_LINES="${SUITE_LINES}| ${TIER} | ${P}/${T} | ${STATUS_ICON} | \$${C} |\n"
+          done

-          STATUS="pass"
-          [ "$FAILED" -gt 0 ] && STATUS="FAIL"
+          STATUS="✅ PASS"
+          [ "$FAILED" -gt 0 ] && STATUS="❌ FAIL"

-          BODY="**E2E Evals:** ${STATUS} ${PASSED}/${TOTAL} passed | \$${COST} | ${WALL}s wall clock"
+          BODY="## E2E Evals: ${STATUS}
+
+          **${PASSED}/${TOTAL}** tests passed | **\$${COST}** total cost
+
+          | Suite | Result | Status | Cost |
+          |-------|--------|--------|------|
+          $(echo -e "$SUITE_LINES")"

           if [ "$FAILED" -gt 0 ]; then
-            FAILURES=$(jq -r '.tests[] | select(.passed == false) | "- FAIL \(.name): \(.exit_reason // "unknown")"' "$RESULT")
+            FAILURES=""
+            for f in $RESULTS; do
+              F=$(jq -r '.failed // 0' "$f")
+              [ "$F" -eq 0 ] && continue
+              FAILS=$(jq -r '.tests[] | select(.passed == false) | "- ❌ \(.name): \(.exit_reason // "unknown")"' "$f")
+              FAILURES="${FAILURES}${FAILS}\n"
+            done
             BODY="${BODY}

-            Failures:
-            ${FAILURES}"
+            ### Failures
+            $(echo -e "$FAILURES")"
           fi

-          gh pr comment ${{ github.event.pull_request.number }} --body "$BODY"
+          BODY="${BODY}
+
+          ---
+          *Runner: ubicloud-standard-2 (\$0.0008/min) | Concurrency: 40*"
+
+          # Update existing comment or create new one (prevents duplicates on re-runs)
+          COMMENT_ID=$(gh api repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments \
+            --paginate --jq '.[] | select(.body | startswith("## E2E Evals")) | .id' | tail -1)
+
+          if [ -n "$COMMENT_ID" ]; then
+            gh api repos/${{ github.repository }}/issues/comments/$COMMENT_ID \
+              -X PATCH -f body="$BODY"
+          else
+            gh pr comment ${{ github.event.pull_request.number }} --body "$BODY"
+          fi