mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-06 13:45:35 +02:00
merge: resolve conflicts with origin/main (v0.11.11.0)
Merge origin/main into wave3-bugfixes. Resolved conflicts: - VERSION: bumped to 0.11.11.0 (above main's 0.11.10.0) - CHANGELOG: added wave 3 entry above main's recent entries - gen-skill-docs.ts: took main's find-based zsh compat fix - SKILL.md files: regenerated from templates Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,50 @@
|
||||
# gstack CI eval runner — pre-baked toolchain + deps
|
||||
# Rebuild weekly via ci-image.yml, on Dockerfile changes, or on package.json changes
|
||||
FROM ubuntu:24.04
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
# System deps
|
||||
# Base tooling for the later install steps and the CI scripts.
# One package per line, alphabetical, so additions show up as one-line diffs.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        bc \
        ca-certificates \
        curl \
        git \
        gpg \
        jq \
        unzip \
    && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# GitHub CLI
|
||||
# Install the GitHub CLI from its apt repository.
# The keyring is downloaded to a file rather than piped into gpg: RUN uses
# /bin/sh without pipefail, so a failed curl on the left side of a pipe would
# not fail the build step. The apt source line is written with a plain
# redirect for the same reason.
RUN curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg \
        -o /tmp/githubcli-archive-keyring.gpg \
    && gpg --dearmor -o /usr/share/keyrings/githubcli-archive-keyring.gpg \
        /tmp/githubcli-archive-keyring.gpg \
    && rm -f /tmp/githubcli-archive-keyring.gpg \
    && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" \
        > /etc/apt/sources.list.d/github-cli.list \
    && apt-get update && apt-get install -y --no-install-recommends gh \
    && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Node.js 22 LTS (needed for claude CLI)
|
||||
# Run the NodeSource setup script from a file instead of `curl | bash`:
# under /bin/sh (no pipefail) a failed download would be masked by bash
# exiting 0 on empty input, and the failure would only surface later.
RUN curl -fsSL https://deb.nodesource.com/setup_22.x -o /tmp/nodesource_setup.sh \
    && bash /tmp/nodesource_setup.sh \
    && rm -f /tmp/nodesource_setup.sh \
    && apt-get install -y --no-install-recommends nodejs \
    && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Bun (install to /usr/local so non-root users can access it)
|
||||
# BUN_INSTALL is read by the installer and selects the install prefix.
ENV BUN_INSTALL="/usr/local"
# Download the installer first, then run it: with `curl | bash` under /bin/sh
# a failed download does not fail the step (no pipefail).
RUN curl -fsSL https://bun.sh/install -o /tmp/bun-install.sh \
    && bash /tmp/bun-install.sh \
    && rm -f /tmp/bun-install.sh
|
||||
|
||||
# Claude CLI
|
||||
RUN npm i -g @anthropic-ai/claude-code
|
||||
|
||||
# Pre-install dependencies (cached layer — only rebuilds when package.json changes)
|
||||
COPY package.json /workspace/
WORKDIR /workspace

# Install dependencies and relocate them in a single layer.
# Files written by one layer stay in the image even if a later layer moves or
# deletes them, so running `bun install` and `mv` in separate RUNs would store
# node_modules twice.
#
# At runtime: checkout overwrites /workspace, but node_modules persists
# if we move it out of the way and symlink back.
# A package.json snapshot is saved next to it for cache validation at runtime.
# chmod makes the cache readable by the non-root user created below.
RUN bun install \
    && mv /workspace/node_modules /opt/node_modules_cache \
    && cp /workspace/package.json /opt/node_modules_cache/.package.json \
    && chmod -R a+rX /opt/node_modules_cache \
    && rm -rf /tmp/*

# Verify everything works
RUN bun --version && node --version && claude --version && jq --version && gh --version

# Claude CLI refuses --dangerously-skip-permissions as root.
# Create a non-root user for eval runs (GH Actions overrides USER, so
# the workflow must set options.user or use gosu/su-exec at runtime).
RUN useradd -m -s /bin/bash runner
|
||||
@@ -0,0 +1,40 @@
|
||||
name: Build CI Image
|
||||
on:
|
||||
# Rebuild weekly (Monday 6am UTC) to pick up CLI updates
|
||||
schedule:
|
||||
- cron: '0 6 * * 1'
|
||||
# Rebuild on Dockerfile or package.json changes
|
||||
push:
|
||||
branches: [main]
|
||||
paths:
|
||||
- '.github/docker/Dockerfile.ci'
|
||||
- 'package.json'
|
||||
# Manual trigger
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubicloud-standard-2
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
# Copy package.json into the Docker build context (the only file Dockerfile.ci COPYs)
|
||||
- run: cp package.json .github/docker/
|
||||
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- uses: docker/build-push-action@v6
  with:
    context: .github/docker
    file: .github/docker/Dockerfile.ci
    push: true
    # The E2E Evals workflow pulls ci:<hashFiles(Dockerfile.ci, package.json)>
    # and skips its own build when that tag already exists. Publish the same
    # content-hash tag here so the scheduled rebuild actually refreshes the
    # image the evals run in, not just :latest.
    tags: |
      ghcr.io/${{ github.repository }}/ci:latest
      ghcr.io/${{ github.repository }}/ci:${{ github.sha }}
      ghcr.io/${{ github.repository }}/ci:${{ hashFiles('.github/docker/Dockerfile.ci', 'package.json') }}
|
||||
@@ -0,0 +1,213 @@
|
||||
name: E2E Evals
|
||||
on:
|
||||
pull_request:
|
||||
branches: [main]
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
  # github.head_ref is only set for pull_request events. On workflow_dispatch
  # it is empty, which would put every manual run in the same "evals-" group
  # and make them cancel one another; fall back to the unique run id.
  group: evals-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true
|
||||
|
||||
env:
|
||||
IMAGE: ghcr.io/${{ github.repository }}/ci
|
||||
|
||||
jobs:
|
||||
# Build Docker image with pre-baked toolchain (cached — only rebuilds when Dockerfile.ci or package.json changes)
|
||||
build-image:
|
||||
runs-on: ubicloud-standard-2
|
||||
permissions:
|
||||
contents: read
|
||||
packages: write
|
||||
outputs:
|
||||
image-tag: ${{ steps.meta.outputs.tag }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- id: meta
|
||||
run: echo "tag=${{ env.IMAGE }}:${{ hashFiles('.github/docker/Dockerfile.ci', 'package.json') }}" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Check if image exists
|
||||
id: check
|
||||
run: |
|
||||
if docker manifest inspect ${{ steps.meta.outputs.tag }} > /dev/null 2>&1; then
|
||||
echo "exists=true" >> "$GITHUB_OUTPUT"
|
||||
else
|
||||
echo "exists=false" >> "$GITHUB_OUTPUT"
|
||||
fi
|
||||
|
||||
- if: steps.check.outputs.exists == 'false'
|
||||
run: cp package.json .github/docker/
|
||||
|
||||
- if: steps.check.outputs.exists == 'false'
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: .github/docker
|
||||
file: .github/docker/Dockerfile.ci
|
||||
push: true
|
||||
tags: |
|
||||
${{ steps.meta.outputs.tag }}
|
||||
${{ env.IMAGE }}:latest
|
||||
|
||||
evals:
|
||||
runs-on: ubicloud-standard-2
|
||||
needs: build-image
|
||||
container:
|
||||
image: ${{ needs.build-image.outputs.image-tag }}
|
||||
credentials:
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
options: --user runner
|
||||
timeout-minutes: 20
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
suite:
|
||||
- name: llm-judge
|
||||
file: test/skill-llm-eval.test.ts
|
||||
- name: e2e-browse
|
||||
file: test/skill-e2e-browse.test.ts
|
||||
- name: e2e-plan
|
||||
file: test/skill-e2e-plan.test.ts
|
||||
- name: e2e-deploy
|
||||
file: test/skill-e2e-deploy.test.ts
|
||||
- name: e2e-design
|
||||
file: test/skill-e2e-design.test.ts
|
||||
- name: e2e-qa-bugs
|
||||
file: test/skill-e2e-qa-bugs.test.ts
|
||||
- name: e2e-qa-workflow
|
||||
file: test/skill-e2e-qa-workflow.test.ts
|
||||
- name: e2e-review
|
||||
file: test/skill-e2e-review.test.ts
|
||||
- name: e2e-workflow
|
||||
file: test/skill-e2e-workflow.test.ts
|
||||
- name: e2e-routing
|
||||
file: test/skill-routing-e2e.test.ts
|
||||
- name: e2e-codex
|
||||
file: test/codex-e2e.test.ts
|
||||
- name: e2e-gemini
|
||||
file: test/gemini-e2e.test.ts
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
# Restore pre-installed node_modules from Docker image via symlink (~0s vs ~15s install)
|
||||
# If package.json changed since image was built, fall back to fresh install
|
||||
- name: Restore deps
|
||||
run: |
|
||||
if [ -d /opt/node_modules_cache ] && diff -q /opt/node_modules_cache/.package.json package.json >/dev/null 2>&1; then
|
||||
ln -s /opt/node_modules_cache node_modules
|
||||
else
|
||||
bun install
|
||||
fi
|
||||
|
||||
- run: bun run build
|
||||
|
||||
- name: Run ${{ matrix.suite.name }}
  env:
    ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
    EVALS_CONCURRENCY: "40"
  # Pass EVALS_CONCURRENCY to bun instead of repeating the literal, so the
  # runner's concurrency cap and the value the tests read cannot drift apart.
  run: EVALS=1 bun test --retry 2 --concurrent --max-concurrency "$EVALS_CONCURRENCY" ${{ matrix.suite.file }}
|
||||
|
||||
- name: Upload eval results
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: eval-${{ matrix.suite.name }}
|
||||
path: ~/.gstack-dev/evals/*.json
|
||||
retention-days: 90
|
||||
|
||||
report:
|
||||
runs-on: ubicloud-standard-2
|
||||
needs: evals
|
||||
if: always() && github.event_name == 'pull_request'
|
||||
timeout-minutes: 5
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: write
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Download all eval artifacts
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
pattern: eval-*
|
||||
path: /tmp/eval-results
|
||||
merge-multiple: true
|
||||
|
||||
- name: Post PR comment
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: |
|
||||
RESULTS=$(find /tmp/eval-results -name '*.json' 2>/dev/null | sort)
|
||||
if [ -z "$RESULTS" ]; then
|
||||
echo "No eval results found"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
TOTAL=0; PASSED=0; FAILED=0; COST="0"
|
||||
SUITE_LINES=""
|
||||
for f in $RESULTS; do
|
||||
T=$(jq -r '.total_tests // 0' "$f")
|
||||
P=$(jq -r '.passed // 0' "$f")
|
||||
F=$(jq -r '.failed // 0' "$f")
|
||||
C=$(jq -r '.total_cost_usd // 0' "$f")
|
||||
TIER=$(jq -r '.tier // "unknown"' "$f")
|
||||
[ "$T" -eq 0 ] && continue
|
||||
TOTAL=$((TOTAL + T))
|
||||
PASSED=$((PASSED + P))
|
||||
FAILED=$((FAILED + F))
|
||||
COST=$(echo "$COST + $C" | bc)
|
||||
STATUS_ICON="✅"
|
||||
[ "$F" -gt 0 ] && STATUS_ICON="❌"
|
||||
SUITE_LINES="${SUITE_LINES}| ${TIER} | ${P}/${T} | ${STATUS_ICON} | \$${C} |\n"
|
||||
done
|
||||
|
||||
STATUS="✅ PASS"
|
||||
[ "$FAILED" -gt 0 ] && STATUS="❌ FAIL"
|
||||
|
||||
BODY="## E2E Evals: ${STATUS}
|
||||
|
||||
**${PASSED}/${TOTAL}** tests passed | **\$${COST}** total cost | **12 parallel runners**
|
||||
|
||||
| Suite | Result | Status | Cost |
|
||||
|-------|--------|--------|------|
|
||||
$(echo -e "$SUITE_LINES")
|
||||
|
||||
---
|
||||
*12x ubicloud-standard-2 (Docker: pre-baked toolchain + deps) | wall clock ≈ slowest suite*"
|
||||
|
||||
if [ "$FAILED" -gt 0 ]; then
|
||||
FAILURES=""
|
||||
for f in $RESULTS; do
|
||||
F=$(jq -r '.failed // 0' "$f")
|
||||
[ "$F" -eq 0 ] && continue
|
||||
FAILS=$(jq -r '.tests[] | select(.passed == false) | "- ❌ \(.name): \(.exit_reason // "unknown")"' "$f")
|
||||
FAILURES="${FAILURES}${FAILS}\n"
|
||||
done
|
||||
BODY="${BODY}
|
||||
|
||||
### Failures
|
||||
$(echo -e "$FAILURES")"
|
||||
fi
|
||||
|
||||
# Update existing comment or create new one
|
||||
# --paginate: the comments endpoint returns one page at a time, so without it
# an earlier "## E2E Evals" comment on a busy PR is missed and a duplicate is
# posted. The --jq filter runs on every page; tail -1 keeps the newest match.
COMMENT_ID=$(gh api --paginate repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments \
--jq '.[] | select(.body | startswith("## E2E Evals")) | .id' | tail -1)
|
||||
|
||||
if [ -n "$COMMENT_ID" ]; then
|
||||
gh api repos/${{ github.repository }}/issues/comments/$COMMENT_ID \
|
||||
-X PATCH -f body="$BODY"
|
||||
else
|
||||
gh pr comment ${{ github.event.pull_request.number }} --body "$BODY"
|
||||
fi
|
||||
Reference in New Issue
Block a user