diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml
new file mode 100644
index 00000000..945a242d
--- /dev/null
+++ b/.github/workflows/evals.yml
@@ -0,0 +1,114 @@
+# Runs the E2E eval suite on pull requests targeting main and on pushes to
+# main, uploads the result JSON as an artifact, and comments a summary on PRs.
+name: E2E Evals
+on:
+  pull_request:
+    branches: [main]
+  # Runs on main publish the eval-results artifact that the baseline download
+  # step below looks up with `branch: main`.
+  push:
+    branches: [main]
+
+# Explicit token scopes: checkout reads the repo, the baseline download reads
+# workflow artifacts, and `gh pr comment` writes to the pull request.
+permissions:
+  contents: read
+  actions: read
+  pull-requests: write
+
+concurrency:
+  # head_ref is only set on pull_request events; fall back to the pushed ref.
+  group: evals-${{ github.head_ref || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  evals:
+    runs-on: ubicloud-standard-2
+    timeout-minutes: 30
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - uses: oven-sh/setup-bun@v2
+
+      - run: bun install
+
+      - run: bun run build
+
+      - name: Verify browse binary
+        run: test -f browse/dist/browse || (echo "Browse binary missing after build" && exit 1)
+
+      - name: Install Claude CLI
+        run: npm i -g @anthropic-ai/claude-code
+
+      # Best effort: a missing baseline only produces a warning.
+      - name: Download previous eval baseline
+        uses: dawidd6/action-download-artifact@v6
+        with:
+          name: eval-results
+          branch: main
+          path: /tmp/eval-baseline
+          if_no_artifact_found: warn
+        continue-on-error: true
+
+      - name: Copy baseline for comparison
+        run: |
+          if [ -d /tmp/eval-baseline ]; then
+            mkdir -p ~/.gstack-dev/evals
+            cp /tmp/eval-baseline/*.json ~/.gstack-dev/evals/ 2>/dev/null || true
+          fi
+          # Timestamp marker: files copied above are older than it, so the
+          # comment step can tell this run's results apart from the baseline.
+          touch /tmp/eval-run-start
+
+      - name: Run E2E evals
+        env:
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
+          EVALS_CONCURRENCY: "40"
+        run: bun run test:evals
+
+      - name: Upload eval results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: eval-results
+          path: ~/.gstack-dev/evals/*.json
+          retention-days: 90
+
+      - name: Post PR comment
+        if: always() && github.event_name == 'pull_request'
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          # Newest non-partial result written after the marker; baseline copies
+          # are older than the marker and must not be reported as this run.
+          RESULT=$(find ~/.gstack-dev/evals -maxdepth 1 -name '*.json' ! -name '*_partial*' -newer /tmp/eval-run-start -exec ls -t {} + 2>/dev/null | head -1)
+          if [ -z "$RESULT" ]; then
+            echo "No eval results found"
+            exit 0
+          fi
+
+          # Default missing counters to 0 so the integer tests below stay valid.
+          TOTAL=$(jq '.total_tests // 0' "$RESULT")
+          PASSED=$(jq '.passed // 0' "$RESULT")
+          FAILED=$(jq '.failed // 0' "$RESULT")
+          COST=$(jq '.total_cost_usd // 0' "$RESULT")
+          WALL=$(jq '.wall_clock_ms // 0 | . / 1000 | floor' "$RESULT")
+
+          STATUS="pass"
+          [ "$FAILED" -gt 0 ] && STATUS="FAIL"
+
+          BODY="**E2E Evals:** ${STATUS} ${PASSED}/${TOTAL} passed | \$${COST} | ${WALL}s wall clock"
+
+          if [ "$FAILED" -gt 0 ]; then
+            FAILURES=$(jq -r '.tests[] | select(.passed == false) | "- FAIL \(.name): \(.exit_reason // "unknown")"' "$RESULT")
+            BODY="${BODY}
+
+          Failures:
+          ${FAILURES}"
+          fi
+
+          gh pr comment ${{ github.event.pull_request.number }} --body "$BODY"
diff --git a/SKILL.md b/SKILL.md index 75ce5b8f..f9be0a14 100644 --- a/SKILL.md +++ b/SKILL.md @@ -1,6 +1,5 @@ --- name: gstack -version: 1.1.0 description: | Fast headless browser for QA testing and site dogfooding. Navigate pages, interact with elements, verify state, diff before/after, take annotated screenshots, test responsive @@ -14,11 +13,6 @@ description: | /unfreeze; gstack upgrades /gstack-upgrade. If the user opts out of suggestions, stop and run gstack-config set proactive false; if they opt back in, run gstack-config set proactive true. 
-allowed-tools: - - Bash - - Read - - AskUserQuestion - --- @@ -26,23 +20,28 @@ allowed-tools: ## Preamble (run first) ```bash -_UPD=$(~/.claude/skills/gstack/bin/gstack-update-check 2>/dev/null || .claude/skills/gstack/bin/gstack-update-check 2>/dev/null || true) +_ROOT=$(git rev-parse --show-toplevel 2>/dev/null) +GSTACK_ROOT="$HOME/.codex/skills/gstack" +[ -n "$_ROOT" ] && [ -d "$_ROOT/.agents/skills/gstack" ] && GSTACK_ROOT="$_ROOT/.agents/skills/gstack" +GSTACK_BIN="$GSTACK_ROOT/bin" +GSTACK_BROWSE="$GSTACK_ROOT/browse/dist" +_UPD=$($GSTACK_BIN/gstack-update-check 2>/dev/null || .agents/skills/gstack/bin/gstack-update-check 2>/dev/null || true) [ -n "$_UPD" ] && echo "$_UPD" || true mkdir -p ~/.gstack/sessions touch ~/.gstack/sessions/"$PPID" _SESSIONS=$(find ~/.gstack/sessions -mmin -120 -type f 2>/dev/null | wc -l | tr -d ' ') find ~/.gstack/sessions -mmin +120 -type f -delete 2>/dev/null || true -_CONTRIB=$(~/.claude/skills/gstack/bin/gstack-config get gstack_contributor 2>/dev/null || true) -_PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +_CONTRIB=$($GSTACK_BIN/gstack-config get gstack_contributor 2>/dev/null || true) +_PROACTIVE=$($GSTACK_BIN/gstack-config get proactive 2>/dev/null || echo "true") _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" echo "PROACTIVE: $_PROACTIVE" -source <(~/.claude/skills/gstack/bin/gstack-repo-mode 2>/dev/null) || true +source <($GSTACK_BIN/gstack-repo-mode 2>/dev/null) || true REPO_MODE=${REPO_MODE:-unknown} echo "REPO_MODE: $REPO_MODE" _LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no") echo "LAKE_INTRO: $_LAKE_SEEN" -_TEL=$(~/.claude/skills/gstack/bin/gstack-config get telemetry 2>/dev/null || true) +_TEL=$($GSTACK_BIN/gstack-config get telemetry 2>/dev/null || true) _TEL_PROMPTED=$([ -f ~/.gstack/.telemetry-prompted ] && echo "yes" || echo "no") _TEL_START=$(date +%s) _SESSION_ID="$$-$(date +%s)" @@ 
-50,13 +49,13 @@ echo "TELEMETRY: ${_TEL:-off}" echo "TEL_PROMPTED: $_TEL_PROMPTED" mkdir -p ~/.gstack/analytics echo '{"skill":"gstack","ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'","repo":"'$(basename "$(git rev-parse --show-toplevel 2>/dev/null)" 2>/dev/null || echo "unknown")'"}' >> ~/.gstack/analytics/skill-usage.jsonl 2>/dev/null || true -for _PF in ~/.gstack/analytics/.pending-*; do [ -f "$_PF" ] && ~/.claude/skills/gstack/bin/gstack-telemetry-log --event-type skill_run --skill _pending_finalize --outcome unknown --session-id "$_SESSION_ID" 2>/dev/null || true; break; done +for _PF in ~/.gstack/analytics/.pending-*; do [ -f "$_PF" ] && $GSTACK_BIN/gstack-telemetry-log --event-type skill_run --skill _pending_finalize --outcome unknown --session-id "$_SESSION_ID" 2>/dev/null || true; break; done ``` If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke them when the user explicitly asks. The user opted out of proactive suggestions. -If output shows `UPGRADE_AVAILABLE `: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED `: tell user "Running gstack v{to} (just updated!)" and continue. +If output shows `UPGRADE_AVAILABLE `: read `$GSTACK_ROOT/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED `: tell user "Running gstack v{to} (just updated!)" and continue. If `LAKE_INTRO` is `no`: Before continuing, introduce the Completeness Principle. Tell the user: "gstack follows the **Boil the Lake** principle — always do the complete @@ -82,7 +81,7 @@ Options: - A) Help gstack get better! 
(recommended) - B) No thanks -If A: run `~/.claude/skills/gstack/bin/gstack-config set telemetry community` +If A: run `$GSTACK_BIN/gstack-config set telemetry community` If B: ask a follow-up AskUserQuestion: @@ -93,8 +92,8 @@ Options: - A) Sure, anonymous is fine - B) No thanks, fully off -If B→A: run `~/.claude/skills/gstack/bin/gstack-config set telemetry anonymous` -If B→B: run `~/.claude/skills/gstack/bin/gstack-config set telemetry off` +If B→A: run `$GSTACK_BIN/gstack-config set telemetry anonymous` +If B→B: run `$GSTACK_BIN/gstack-config set telemetry off` Always run: ```bash @@ -154,7 +153,7 @@ Never let a noticed issue silently pass. The whole point is proactive communicat ## Search Before Building -Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `$GSTACK_ROOT/ETHOS.md` for the full philosophy. **Three layers of knowledge:** - **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. @@ -252,7 +251,7 @@ Run this bash: _TEL_END=$(date +%s) _TEL_DUR=$(( _TEL_END - _TEL_START )) rm -f ~/.gstack/analytics/.pending-"$_SESSION_ID" 2>/dev/null || true -~/.claude/skills/gstack/bin/gstack-telemetry-log \ +$GSTACK_ROOT/bin/gstack-telemetry-log \ --skill "SKILL_NAME" --duration "$_TEL_DUR" --outcome "OUTCOME" \ --used-browse "USED_BROWSE" --session-id "$_SESSION_ID" 2>/dev/null & ``` @@ -271,7 +270,7 @@ When you are in plan mode and about to call ExitPlanMode: 3. 
If it does NOT — run this command: \`\`\`bash -~/.claude/skills/gstack/bin/gstack-review-read +$GSTACK_ROOT/bin/gstack-review-read \`\`\` Then write a `## GSTACK REVIEW REPORT` section to the end of the plan file: @@ -312,8 +311,8 @@ Auto-shuts down after 30 min idle. State persists between calls (cookies, tabs, ```bash _ROOT=$(git rev-parse --show-toplevel 2>/dev/null) B="" -[ -n "$_ROOT" ] && [ -x "$_ROOT/.claude/skills/gstack/browse/dist/browse" ] && B="$_ROOT/.claude/skills/gstack/browse/dist/browse" -[ -z "$B" ] && B=~/.claude/skills/gstack/browse/dist/browse +[ -n "$_ROOT" ] && [ -x "$_ROOT/.agents/skills/gstack/browse/dist/browse" ] && B="$_ROOT/.agents/skills/gstack/browse/dist/browse" +[ -z "$B" ] && B=$GSTACK_BROWSE/browse if [ -x "$B" ]; then echo "READY: $B" else diff --git a/TODOS.md b/TODOS.md index f30f5550..b25c059d 100644 --- a/TODOS.md +++ b/TODOS.md @@ -338,17 +338,6 @@ **Depends on:** Video recording -### GitHub Actions eval upload - -**What:** Run eval suite in CI, upload result JSON as artifact, post summary comment on PR. - -**Why:** CI integration catches quality regressions before merge and provides persistent eval records per PR. - -**Context:** Requires `ANTHROPIC_API_KEY` in CI secrets. Cost is ~$4/run. Eval persistence system (v0.3.6) writes JSON to `~/.gstack-dev/evals/` — CI would upload as GitHub Actions artifacts and use `eval:compare` to post delta comment. - -**Effort:** M -**Priority:** P2 -**Depends on:** Eval persistence (shipped in v0.3.6) ### E2E model pinning — SHIPPED @@ -539,6 +528,14 @@ Shipped in v0.6.5. 
TemplateContext in gen-skill-docs.ts bakes skill name into pr ## Completed +### CI eval pipeline (v0.9.9.0) +- GitHub Actions eval upload on Ubicloud runners ($0.006/run) +- Within-file test concurrency (test() → testConcurrentIfSelected()) +- Eval artifact upload + PR comment with pass/fail + cost +- Baseline comparison via artifact download from main +- EVALS_CONCURRENCY=40 for ~6min wall clock (was ~18min) +**Completed:** v0.9.9.0 + ### Deploy pipeline (v0.9.8.0) - /land-and-deploy — merge PR, wait for CI/deploy, canary verification - /canary — post-deploy monitoring loop with anomaly detection