Files
gstack/.github/workflows/evals.yml
T
2026-03-23 05:21:43 -07:00

133 lines
4.1 KiB
YAML

# End-to-end eval workflow: runs for pull requests that target main.
name: E2E Evals

on:
  pull_request:
    branches: [main]

# One active run per PR head branch; starting a new run cancels the one
# still in progress for the same branch.
concurrency:
  group: evals-${{ github.head_ref }}
  cancel-in-progress: true
jobs:
  # Builds the project, runs the E2E eval suites, uploads the JSON results as
  # an artifact, and posts (or updates) a summary comment on the pull request.
  evals:
    runs-on: ubicloud-standard-2
    timeout-minutes: 45
    env:
      # Read by the eval run and echoed in the PR comment footer, so the
      # reported concurrency cannot drift from the one actually used.
      EVALS_CONCURRENCY: "40"
    steps:
      # fetch-depth: 0 fetches full history instead of a shallow clone.
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - uses: oven-sh/setup-bun@v2

      - run: bun install

      - run: bun run build

      # Fail fast with a readable message if the build did not emit the binary.
      - name: Verify browse binary
        run: |
          if [ ! -f browse/dist/browse ]; then
            echo "Browse binary missing after build"
            exit 1
          fi

      - name: Install Claude CLI
        run: npm i -g @anthropic-ai/claude-code

      # Best effort: a missing baseline only warns and never fails the job.
      # NOTE(review): this workflow only triggers on pull_request; confirm
      # that some run associated with `main` actually publishes an
      # `eval-results` artifact, otherwise this step never finds a baseline.
      - name: Download previous eval baseline
        uses: dawidd6/action-download-artifact@v6
        with:
          name: eval-results
          branch: main
          path: /tmp/eval-baseline
          if_no_artifact_found: warn
        continue-on-error: true

      # Places the baseline JSON files in the directory that the later steps
      # upload and summarize.
      # NOTE(review): the PR comment step aggregates every *.json in that
      # directory -- confirm baseline files are not counted in its totals.
      - name: Copy baseline for comparison
        run: |
          if [ -d /tmp/eval-baseline ]; then
            mkdir -p ~/.gstack-dev/evals
            cp /tmp/eval-baseline/*.json ~/.gstack-dev/evals/ 2>/dev/null || true
          fi

      - name: Run E2E evals
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
        run: bun run test:evals

      # always(): keep the results even when the eval step failed.
      - name: Upload eval results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: eval-results
          path: ~/.gstack-dev/evals/*.json
          retention-days: 90

      - name: Post PR comment
        if: always() && github.event_name == 'pull_request'
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # Context values are handed over as environment variables rather
          # than interpolated into the script text (script-injection hygiene).
          REPO: ${{ github.repository }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
        run: |
          NL=$'\n'

          # Aggregate results across ALL eval suites (not just the latest file).
          # `|| true`: grep exits 1 when nothing matches, and the default
          # `bash -e` shell would abort before the empty check below runs.
          RESULTS=$(ls -t ~/.gstack-dev/evals/*.json 2>/dev/null | grep -v _partial || true)
          if [ -z "$RESULTS" ]; then
            echo "No eval results found"
            exit 0
          fi

          TOTAL=0; PASSED=0; FAILED=0; COST=0
          SUITE_LINES=""
          while IFS= read -r f; do
            T=$(jq -r '.total_tests // 0' "$f")
            P=$(jq -r '.passed // 0' "$f")
            F=$(jq -r '.failed // 0' "$f")
            C=$(jq -r '.total_cost_usd // 0' "$f")
            TIER=$(jq -r '.tier // "unknown"' "$f")
            # Suites that ran no tests are left out of the table and totals.
            if [ "$T" -eq 0 ]; then
              continue
            fi
            TOTAL=$((TOTAL + T))
            PASSED=$((PASSED + P))
            FAILED=$((FAILED + F))
            # awk instead of bc: always available, and it does not drop the
            # leading zero on sub-dollar totals (bc prints ".35").
            COST=$(awk -v a="$COST" -v b="$C" 'BEGIN { printf "%.6f", a + b }')
            STATUS_ICON="✅"
            if [ "$F" -gt 0 ]; then
              STATUS_ICON="❌"
            fi
            SUITE_LINES="${SUITE_LINES}| ${TIER} | ${P}/${T} | ${STATUS_ICON} | \$${C} |${NL}"
          done <<< "$RESULTS"
          COST=$(printf '%.2f' "$COST")

          STATUS="✅ PASS"
          if [ "$FAILED" -gt 0 ]; then
            STATUS="❌ FAIL"
          fi

          # The body must keep starting with "## E2E Evals": the lookup further
          # down uses that prefix to find the comment from a previous run.
          # Rows are joined with real newlines (not `echo -e`) so backslashes
          # in suite data are never interpreted as escape sequences.
          BODY="## E2E Evals: ${STATUS}${NL}${NL}"
          BODY="${BODY}**${PASSED}/${TOTAL}** tests passed | **\$${COST}** total cost${NL}${NL}"
          BODY="${BODY}| Suite | Result | Status | Cost |${NL}"
          BODY="${BODY}|-------|--------|--------|------|${NL}"
          BODY="${BODY}${SUITE_LINES%"$NL"}"

          if [ "$FAILED" -gt 0 ]; then
            FAILURES=""
            while IFS= read -r f; do
              F=$(jq -r '.failed // 0' "$f")
              if [ "$F" -eq 0 ]; then
                continue
              fi
              # `.tests[]?` tolerates a result file without a `tests` array.
              FAILS=$(jq -r '.tests[]? | select(.passed == false) | "- ❌ \(.name): \(.exit_reason // "unknown")"' "$f")
              if [ -n "$FAILS" ]; then
                FAILURES="${FAILURES}${FAILS}${NL}"
              fi
            done <<< "$RESULTS"
            BODY="${BODY}${NL}${NL}### Failures${NL}${NL}${FAILURES%"$NL"}"
          fi

          # `\$0.0008`: unescaped, `$0` would expand to the script path.
          # The blank line before `---` keeps it a horizontal rule instead of
          # turning the preceding line into a heading.
          BODY="${BODY}${NL}${NL}---${NL}${NL}"
          BODY="${BODY}*Runner: ubicloud-standard-2 (\$0.0008/min) | Concurrency: ${EVALS_CONCURRENCY}*"

          # Update existing comment or create new one (prevents duplicates on
          # re-runs). --paginate: the API returns 30 comments per page, so an
          # earlier comment on a busy PR would otherwise be missed.
          COMMENT_ID=$(gh api --paginate "repos/${REPO}/issues/${PR_NUMBER}/comments" \
            --jq '.[] | select(.body | startswith("## E2E Evals")) | .id' | tail -1)
          if [ -n "$COMMENT_ID" ]; then
            gh api "repos/${REPO}/issues/comments/${COMMENT_ID}" \
              -X PATCH -f body="$BODY"
          else
            gh pr comment "$PR_NUMBER" --repo "$REPO" --body "$BODY"
          fi