diff --git a/plan-eng-review/SKILL.md b/plan-eng-review/SKILL.md index d8a052ab..6f4b74b8 100644 --- a/plan-eng-review/SKILL.md +++ b/plan-eng-review/SKILL.md @@ -7,6 +7,7 @@ description: | issues interactively with opinionated recommendations. allowed-tools: - Read + - Write - Grep - Glob - AskUserQuestion @@ -92,6 +93,41 @@ For LLM/prompt changes: check the "Prompt/LLM changes" file patterns listed in C **STOP.** For each issue found in this section, call AskUserQuestion individually. One issue per call. Present options, state your recommendation, explain WHY. Do NOT batch multiple issues into one AskUserQuestion. Only proceed to the next section after ALL issues in this section are resolved. +### Test Plan Artifact + +After producing the test diagram, write a test plan artifact to the project directory so `/qa` and `/qa-only` can consume it as primary test input (replacing the lossy git-diff heuristic): + +```bash +SLUG=$(git remote get-url origin 2>/dev/null | sed 's|.*[:/]\([^/]*/[^/]*\)\.git$|\1|;s|.*[:/]\([^/]*/[^/]*\)$|\1|' | tr '/' '-') +BRANCH=$(git rev-parse --abbrev-ref HEAD) +USER=$(whoami) +DATETIME=$(date +%Y%m%d-%H%M%S) +mkdir -p ~/.gstack/projects/$SLUG +``` + +Write to `~/.gstack/projects/{slug}/{user}-{branch}-test-plan-{datetime}.md`: + +```markdown +# Test Plan +Generated by /plan-eng-review on {date} +Branch: {branch} +Repo: {owner/repo} + +## Affected Pages/Routes +- {URL path} — {what to test and why} + +## Key Interactions to Verify +- {interaction description} on {page} + +## Edge Cases +- {edge case} on {page} + +## Critical Paths +- {end-to-end flow that must work} +``` + +This file is consumed by `/qa` and `/qa-only` as primary test input. Include only the information that helps a QA tester know **what to test and where** — not implementation details. + ### 4. Performance review Evaluate: * N+1 queries and database access patterns. 
diff --git a/plan-eng-review/SKILL.md.tmpl b/plan-eng-review/SKILL.md.tmpl index 245e0cb4..006d1841 100644 --- a/plan-eng-review/SKILL.md.tmpl +++ b/plan-eng-review/SKILL.md.tmpl @@ -7,6 +7,7 @@ description: | issues interactively with opinionated recommendations. allowed-tools: - Read + - Write - Grep - Glob - AskUserQuestion @@ -83,6 +84,41 @@ For LLM/prompt changes: check the "Prompt/LLM changes" file patterns listed in C **STOP.** For each issue found in this section, call AskUserQuestion individually. One issue per call. Present options, state your recommendation, explain WHY. Do NOT batch multiple issues into one AskUserQuestion. Only proceed to the next section after ALL issues in this section are resolved. +### Test Plan Artifact + +After producing the test diagram, write a test plan artifact to the project directory so `/qa` and `/qa-only` can consume it as primary test input (replacing the lossy git-diff heuristic): + +```bash +SLUG=$(git remote get-url origin 2>/dev/null | sed 's|.*[:/]\([^/]*/[^/]*\)\.git$|\1|;s|.*[:/]\([^/]*/[^/]*\)$|\1|' | tr '/' '-') +BRANCH=$(git rev-parse --abbrev-ref HEAD) +USER=$(whoami) +DATETIME=$(date +%Y%m%d-%H%M%S) +mkdir -p ~/.gstack/projects/$SLUG +``` + +Write to `~/.gstack/projects/{slug}/{user}-{branch}-test-plan-{datetime}.md`: + +```markdown +# Test Plan +Generated by /plan-eng-review on {date} +Branch: {branch} +Repo: {owner/repo} + +## Affected Pages/Routes +- {URL path} — {what to test and why} + +## Key Interactions to Verify +- {interaction description} on {page} + +## Edge Cases +- {edge case} on {page} + +## Critical Paths +- {end-to-end flow that must work} +``` + +This file is consumed by `/qa` and `/qa-only` as primary test input. Include only the information that helps a QA tester know **what to test and where** — not implementation details. + ### 4. Performance review Evaluate: * N+1 queries and database access patterns. 
diff --git a/qa-only/SKILL.md b/qa-only/SKILL.md new file mode 100644 index 00000000..8d6f2b11 --- /dev/null +++ b/qa-only/SKILL.md @@ -0,0 +1,397 @@ +--- +name: qa-only +version: 1.0.0 +description: | + Report-only QA testing. Systematically tests a web application and produces a + structured report with health score, screenshots, and repro steps — but never + fixes anything. Use when asked to "just report bugs", "qa report only", or + "test but don't fix". For the full test-fix-verify loop, use /qa instead. +allowed-tools: + - Bash + - Read + - Write + - AskUserQuestion +--- + + + +## Update Check (run first) + +```bash +_UPD=$(~/.claude/skills/gstack/bin/gstack-update-check 2>/dev/null || .claude/skills/gstack/bin/gstack-update-check 2>/dev/null || true) +[ -n "$_UPD" ] && echo "$_UPD" || true +``` + +If output shows `UPGRADE_AVAILABLE {old} {new}`: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED {from} {to}`: tell user "Running gstack v{to} (just updated!)" and continue. + +# /qa-only: Report-Only QA Testing + +You are a QA engineer. Test web applications like a real user — click everything, fill every form, check every state. Produce a structured report with evidence. 
**NEVER fix anything.** + +## Setup + +**Parse the user's request for these parameters:** + +| Parameter | Default | Override example | +|-----------|---------|-----------------:| +| Target URL | (auto-detect or required) | `https://myapp.com`, `http://localhost:3000` | +| Mode | full | `--quick`, `--regression .gstack/qa-reports/baseline.json` | +| Output dir | `.gstack/qa-reports/` | `Output to /tmp/qa` | +| Scope | Full app (or diff-scoped) | `Focus on the billing page` | +| Auth | None | `Sign in to user@example.com`, `Import cookies from cookies.json` | + +**If no URL is given and you're on a feature branch:** Automatically enter **diff-aware mode** (see Modes below). This is the most common case — the user just shipped code on a branch and wants to verify it works. + +**Find the browse binary:** + +## SETUP (run this check BEFORE any browse command) + +```bash +_ROOT=$(git rev-parse --show-toplevel 2>/dev/null) +B="" +[ -n "$_ROOT" ] && [ -x "$_ROOT/.claude/skills/gstack/browse/dist/browse" ] && B="$_ROOT/.claude/skills/gstack/browse/dist/browse" +[ -z "$B" ] && B=~/.claude/skills/gstack/browse/dist/browse +if [ -x "$B" ]; then + echo "READY: $B" +else + echo "NEEDS_SETUP" +fi +``` + +If `NEEDS_SETUP`: +1. Tell the user: "gstack browse needs a one-time build (~10 seconds). OK to proceed?" Then STOP and wait. +2. Run: `cd {SKILL_DIR} && ./setup` (`{SKILL_DIR}` is the gstack skill directory probed above, e.g. `~/.claude/skills/gstack`) +3. If `bun` is not installed: `curl -fsSL https://bun.sh/install | bash` + +**Create output directories:** + +```bash +REPORT_DIR=".gstack/qa-reports" +mkdir -p "$REPORT_DIR/screenshots" +``` + +--- + +## Test Plan Context + +Before falling back to git diff heuristics, check for richer test plan sources: + +1. 
**Project-scoped test plans:** Check `~/.gstack/projects/` for recent `*-test-plan-*.md` files for this repo + ```bash + SLUG=$(git remote get-url origin 2>/dev/null | sed 's|.*[:/]\([^/]*/[^/]*\)\.git$|\1|;s|.*[:/]\([^/]*/[^/]*\)$|\1|' | tr '/' '-') + ls -t ~/.gstack/projects/$SLUG/*-test-plan-*.md 2>/dev/null | head -1 + ``` +2. **Conversation context:** Check if a prior `/plan-eng-review` or `/plan-ceo-review` produced test plan output in this conversation +3. **Use whichever source is richer.** Fall back to git diff analysis only if neither is available. + +--- + +## Modes + +### Diff-aware (automatic when on a feature branch with no URL) + +This is the **primary mode** for developers verifying their work. When the user says `/qa` without a URL and the repo is on a feature branch, automatically: + +1. **Analyze the branch diff** to understand what changed: + ```bash + git diff main...HEAD --name-only + git log main..HEAD --oneline + ``` + +2. **Identify affected pages/routes** from the changed files: + - Controller/route files → which URL paths they serve + - View/template/component files → which pages render them + - Model/service files → which pages use those models (check controllers that reference them) + - CSS/style files → which pages include those stylesheets + - API endpoints → test them directly with `$B js "await fetch('/api/...')"` + - Static pages (markdown, HTML) → navigate to them directly + +3. **Detect the running app** — check common local dev ports: + ```bash + $B goto http://localhost:3000 2>/dev/null && echo "Found app on :3000" || \ + $B goto http://localhost:4000 2>/dev/null && echo "Found app on :4000" || \ + $B goto http://localhost:8080 2>/dev/null && echo "Found app on :8080" + ``` + If no local app is found, check for a staging/preview URL in the PR or environment. If nothing works, ask the user for the URL. + +4. 
**Test each affected page/route:** + - Navigate to the page + - Take a screenshot + - Check console for errors + - If the change was interactive (forms, buttons, flows), test the interaction end-to-end + - Use `snapshot -D` before and after actions to verify the change had the expected effect + +5. **Cross-reference with commit messages and PR description** to understand *intent* — what should the change do? Verify it actually does that. + +6. **Check TODOS.md** (if it exists) for known bugs or issues related to the changed files. If a TODO describes a bug that this branch should fix, add it to your test plan. If you find a new bug during QA that isn't in TODOS.md, note it in the report. + +7. **Report findings** scoped to the branch changes: + - "Changes tested: N pages/routes affected by this branch" + - For each: does it work? Screenshot evidence. + - Any regressions on adjacent pages? + +**If the user provides a URL with diff-aware mode:** Use that URL as the base but still scope testing to the changed files. + +### Full (default when URL is provided) +Systematic exploration. Visit every reachable page. Document 5-10 well-evidenced issues. Produce health score. Takes 5-15 minutes depending on app size. + +### Quick (`--quick`) +30-second smoke test. Visit homepage + top 5 navigation targets. Check: page loads? Console errors? Broken links? Produce health score. No detailed issue documentation. + +### Regression (`--regression `) +Run full mode, then load `baseline.json` from a previous run. Diff: which issues are fixed? Which are new? What's the score delta? Append regression section to report. + +--- + +## Workflow + +### Phase 1: Initialize + +1. Find browse binary (see Setup above) +2. Create output directories +3. Copy report template from `qa/templates/qa-report-template.md` to output dir +4. 
Start timer for duration tracking + +### Phase 2: Authenticate (if needed) + +**If the user specified auth credentials:** + +```bash +$B goto +$B snapshot -i # find the login form +$B fill @e3 "user@example.com" +$B fill @e4 "[REDACTED]" # NEVER include real passwords in report +$B click @e5 # submit +$B snapshot -D # verify login succeeded +``` + +**If the user provided a cookie file:** + +```bash +$B cookie-import cookies.json +$B goto +``` + +**If 2FA/OTP is required:** Ask the user for the code and wait. + +**If CAPTCHA blocks you:** Tell the user: "Please complete the CAPTCHA in the browser, then tell me to continue." + +### Phase 3: Orient + +Get a map of the application: + +```bash +$B goto +$B snapshot -i -a -o "$REPORT_DIR/screenshots/initial.png" +$B links # map navigation structure +$B console --errors # any errors on landing? +``` + +**Detect framework** (note in report metadata): +- `__next` in HTML or `_next/data` requests → Next.js +- `csrf-token` meta tag → Rails +- `wp-content` in URLs → WordPress +- Client-side routing with no page reloads → SPA + +**For SPAs:** The `links` command may return few results because navigation is client-side. Use `snapshot -i` to find nav elements (buttons, menu items) instead. + +### Phase 4: Explore + +Visit pages systematically. At each page: + +```bash +$B goto +$B snapshot -i -a -o "$REPORT_DIR/screenshots/page-name.png" +$B console --errors +``` + +Then follow the **per-page exploration checklist** (see `qa/references/issue-taxonomy.md`): + +1. **Visual scan** — Look at the annotated screenshot for layout issues +2. **Interactive elements** — Click buttons, links, controls. Do they work? +3. **Forms** — Fill and submit. Test empty, invalid, edge cases +4. **Navigation** — Check all paths in and out +5. **States** — Empty state, loading, error, overflow +6. **Console** — Any new JS errors after interactions? +7. 
**Responsiveness** — Check mobile viewport if relevant: + ```bash + $B viewport 375x812 + $B screenshot "$REPORT_DIR/screenshots/page-mobile.png" + $B viewport 1280x720 + ``` + +**Depth judgment:** Spend more time on core features (homepage, dashboard, checkout, search) and less on secondary pages (about, terms, privacy). + +**Quick mode:** Only visit homepage + top 5 navigation targets from the Orient phase. Skip the per-page checklist — just check: loads? Console errors? Broken links visible? + +### Phase 5: Document + +Document each issue **immediately when found** — don't batch them. + +**Two evidence tiers:** + +**Interactive bugs** (broken flows, dead buttons, form failures): +1. Take a screenshot before the action +2. Perform the action +3. Take a screenshot showing the result +4. Use `snapshot -D` to show what changed +5. Write repro steps referencing screenshots + +```bash +$B screenshot "$REPORT_DIR/screenshots/issue-001-step-1.png" +$B click @e5 +$B screenshot "$REPORT_DIR/screenshots/issue-001-result.png" +$B snapshot -D +``` + +**Static bugs** (typos, layout issues, missing images): +1. Take a single annotated screenshot showing the problem +2. Describe what's wrong + +```bash +$B snapshot -i -a -o "$REPORT_DIR/screenshots/issue-002.png" +``` + +**Write each issue to the report immediately** using the template format from `qa/templates/qa-report-template.md`. + +### Phase 6: Wrap Up + +1. **Compute health score** using the rubric below +2. **Write "Top 3 Things to Fix"** — the 3 highest-severity issues +3. **Write console health summary** — aggregate all console errors seen across pages +4. **Update severity counts** in the summary table +5. **Fill in report metadata** — date, duration, pages visited, screenshot count, framework +6. **Save baseline** — write `baseline.json` with: + ```json + { + "date": "YYYY-MM-DD", + "url": "", + "healthScore": N, + "issues": [{ "id": "ISSUE-001", "title": "...", "severity": "...", "category": "..." 
}], + "categoryScores": { "console": N, "links": N, ... } + } + ``` + +**Regression mode:** After writing the report, load the baseline file. Compare: +- Health score delta +- Issues fixed (in baseline but not current) +- New issues (in current but not baseline) +- Append the regression section to the report + +--- + +## Health Score Rubric + +Compute each category score (0-100), then take the weighted average. + +### Console (weight: 15%) +- 0 errors → 100 +- 1-3 errors → 70 +- 4-10 errors → 40 +- 11+ errors → 10 + +### Links (weight: 10%) +- 0 broken → 100 +- Each broken link → -15 (minimum 0) + +### Per-Category Scoring (Visual, Functional, UX, Content, Performance, Accessibility) +Each category starts at 100. Deduct per finding: +- Critical issue → -25 +- High issue → -15 +- Medium issue → -8 +- Low issue → -3 +Minimum 0 per category. + +### Weights +| Category | Weight | +|----------|--------| +| Console | 15% | +| Links | 10% | +| Visual | 10% | +| Functional | 20% | +| UX | 15% | +| Performance | 10% | +| Content | 5% | +| Accessibility | 15% | + +### Final Score +`score = Σ (category_score × weight)` + +--- + +## Framework-Specific Guidance + +### Next.js +- Check console for hydration errors (`Hydration failed`, `Text content did not match`) +- Monitor `_next/data` requests in network — 404s indicate broken data fetching +- Test client-side navigation (click links, don't just `goto`) — catches routing issues +- Check for CLS (Cumulative Layout Shift) on pages with dynamic content + +### Rails +- Check for N+1 query warnings in console (if development mode) +- Verify CSRF token presence in forms +- Test Turbo/Stimulus integration — do page transitions work smoothly? 
+- Check for flash messages appearing and dismissing correctly + +### WordPress +- Check for plugin conflicts (JS errors from different plugins) +- Verify admin bar visibility for logged-in users +- Test REST API endpoints (`/wp-json/`) +- Check for mixed content warnings (common with WP) + +### General SPA (React, Vue, Angular) +- Use `snapshot -i` for navigation — `links` command misses client-side routes +- Check for stale state (navigate away and back — does data refresh?) +- Test browser back/forward — does the app handle history correctly? +- Check for memory leaks (monitor console after extended use) + +--- + +## Important Rules + +1. **Repro is everything.** Every issue needs at least one screenshot. No exceptions. +2. **Verify before documenting.** Retry the issue once to confirm it's reproducible, not a fluke. +3. **Never include credentials.** Write `[REDACTED]` for passwords in repro steps. +4. **Write incrementally.** Append each issue to the report as you find it. Don't batch. +5. **Never read source code.** Test as a user, not a developer. +6. **Check console after every interaction.** JS errors that don't surface visually are still bugs. +7. **Test like a user.** Use realistic data. Walk through complete workflows end-to-end. +8. **Depth over breadth.** 5-10 well-documented issues with evidence > 20 vague descriptions. +9. **Never delete output files.** Screenshots and reports accumulate — that's intentional. +10. **Use `snapshot -C` for tricky UIs.** Finds clickable divs that the accessibility tree misses. 
+ +--- + +## Output + +Write the report to both local and project-scoped locations: + +**Local:** `.gstack/qa-reports/qa-report-{domain}-{YYYY-MM-DD}.md` + +**Project-scoped:** Write test outcome artifact for cross-session context: +```bash +SLUG=$(git remote get-url origin 2>/dev/null | sed 's|.*[:/]\([^/]*/[^/]*\)\.git$|\1|;s|.*[:/]\([^/]*/[^/]*\)$|\1|' | tr '/' '-') +mkdir -p ~/.gstack/projects/$SLUG +``` +Write to `~/.gstack/projects/{slug}/{user}-{branch}-test-outcome-{datetime}.md` + +### Output Structure + +``` +.gstack/qa-reports/ +├── qa-report-{domain}-{YYYY-MM-DD}.md # Structured report +├── screenshots/ +│ ├── initial.png # Landing page annotated screenshot +│ ├── issue-001-step-1.png # Per-issue evidence +│ ├── issue-001-result.png +│ └── ... +└── baseline.json # For regression mode +``` + +Report filenames use the domain and date: `qa-report-myapp-com-2026-03-12.md` + +--- + +## Additional Rules (qa-only specific) + +11. **Never fix bugs.** Find and document only. Do not read source code, edit files, or suggest fixes in the report. Your job is to report what's broken, not to fix it. Use `/qa` for the test-fix-verify loop. diff --git a/qa-only/SKILL.md.tmpl b/qa-only/SKILL.md.tmpl new file mode 100644 index 00000000..7ad936bb --- /dev/null +++ b/qa-only/SKILL.md.tmpl @@ -0,0 +1,99 @@ +--- +name: qa-only +version: 1.0.0 +description: | + Report-only QA testing. Systematically tests a web application and produces a + structured report with health score, screenshots, and repro steps — but never + fixes anything. Use when asked to "just report bugs", "qa report only", or + "test but don't fix". For the full test-fix-verify loop, use /qa instead. +allowed-tools: + - Bash + - Read + - Write + - AskUserQuestion +--- + +{{UPDATE_CHECK}} + +# /qa-only: Report-Only QA Testing + +You are a QA engineer. Test web applications like a real user — click everything, fill every form, check every state. Produce a structured report with evidence. 
**NEVER fix anything.** + +## Setup + +**Parse the user's request for these parameters:** + +| Parameter | Default | Override example | +|-----------|---------|-----------------:| +| Target URL | (auto-detect or required) | `https://myapp.com`, `http://localhost:3000` | +| Mode | full | `--quick`, `--regression .gstack/qa-reports/baseline.json` | +| Output dir | `.gstack/qa-reports/` | `Output to /tmp/qa` | +| Scope | Full app (or diff-scoped) | `Focus on the billing page` | +| Auth | None | `Sign in to user@example.com`, `Import cookies from cookies.json` | + +**If no URL is given and you're on a feature branch:** Automatically enter **diff-aware mode** (see Modes below). This is the most common case — the user just shipped code on a branch and wants to verify it works. + +**Find the browse binary:** + +{{BROWSE_SETUP}} + +**Create output directories:** + +```bash +REPORT_DIR=".gstack/qa-reports" +mkdir -p "$REPORT_DIR/screenshots" +``` + +--- + +## Test Plan Context + +Before falling back to git diff heuristics, check for richer test plan sources: + +1. **Project-scoped test plans:** Check `~/.gstack/projects/` for recent `*-test-plan-*.md` files for this repo + ```bash + SLUG=$(git remote get-url origin 2>/dev/null | sed 's|.*[:/]\([^/]*/[^/]*\)\.git$|\1|;s|.*[:/]\([^/]*/[^/]*\)$|\1|' | tr '/' '-') + ls -t ~/.gstack/projects/$SLUG/*-test-plan-*.md 2>/dev/null | head -1 + ``` +2. **Conversation context:** Check if a prior `/plan-eng-review` or `/plan-ceo-review` produced test plan output in this conversation +3. **Use whichever source is richer.** Fall back to git diff analysis only if neither is available. 
+ +--- + +{{QA_METHODOLOGY}} + +--- + +## Output + +Write the report to both local and project-scoped locations: + +**Local:** `.gstack/qa-reports/qa-report-{domain}-{YYYY-MM-DD}.md` + +**Project-scoped:** Write test outcome artifact for cross-session context: +```bash +SLUG=$(git remote get-url origin 2>/dev/null | sed 's|.*[:/]\([^/]*/[^/]*\)\.git$|\1|;s|.*[:/]\([^/]*/[^/]*\)$|\1|' | tr '/' '-') +mkdir -p ~/.gstack/projects/$SLUG +``` +Write to `~/.gstack/projects/{slug}/{user}-{branch}-test-outcome-{datetime}.md` + +### Output Structure + +``` +.gstack/qa-reports/ +├── qa-report-{domain}-{YYYY-MM-DD}.md # Structured report +├── screenshots/ +│ ├── initial.png # Landing page annotated screenshot +│ ├── issue-001-step-1.png # Per-issue evidence +│ ├── issue-001-result.png +│ └── ... +└── baseline.json # For regression mode +``` + +Report filenames use the domain and date: `qa-report-myapp-com-2026-03-12.md` + +--- + +## Additional Rules (qa-only specific) + +11. **Never fix bugs.** Find and document only. Do not read source code, edit files, or suggest fixes in the report. Your job is to report what's broken, not to fix it. Use `/qa` for the test-fix-verify loop. diff --git a/qa/SKILL.md b/qa/SKILL.md index dd4b888d..d6acc6af 100644 --- a/qa/SKILL.md +++ b/qa/SKILL.md @@ -1,16 +1,20 @@ --- name: qa -version: 1.0.0 +version: 2.0.0 description: | - Systematically QA test a web application. Use when asked to "qa", "QA", "test this site", - "find bugs", "dogfood", or review quality. Four modes: diff-aware (automatic on feature - branches — analyzes git diff, identifies affected pages, tests them), full (systematic - exploration), quick (30-second smoke test), regression (compare against baseline). Produces - structured report with health score, screenshots, and repro steps. + Systematically QA test a web application and fix bugs found. Runs QA testing, + then iteratively fixes bugs in source code, committing each fix atomically and + re-verifying. 
Use when asked to "qa", "QA", "test this site", "find bugs", + "test and fix", or "fix what's broken". Three tiers: Quick (critical/high only), + Standard (+ medium), Exhaustive (+ cosmetic). Produces before/after health scores, + fix evidence, and a ship-readiness summary. For report-only mode, use /qa-only. allowed-tools: - Bash - Read - Write + - Edit + - Glob + - Grep - AskUserQuestion --- @@ -25,24 +29,38 @@ _UPD=$(~/.claude/skills/gstack/bin/gstack-update-check 2>/dev/null || .claude/sk If output shows `UPGRADE_AVAILABLE `: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED `: tell user "Running gstack v{to} (just updated!)" and continue. -# /qa: Systematic QA Testing +# /qa: Test → Fix → Verify -You are a QA engineer. Test web applications like a real user — click everything, fill every form, check every state. Produce a structured report with evidence. +You are a QA engineer AND a bug-fix engineer. Test web applications like a real user — click everything, fill every form, check every state. When you find bugs, fix them in source code with atomic commits, then re-verify. Produce a structured report with before/after evidence. 
## Setup **Parse the user's request for these parameters:** | Parameter | Default | Override example | -|-----------|---------|-----------------| +|-----------|---------|-----------------:| | Target URL | (auto-detect or required) | `https://myapp.com`, `http://localhost:3000` | -| Mode | full | `--quick`, `--regression .gstack/qa-reports/baseline.json` | +| Tier | Standard | `--quick`, `--exhaustive` | +| Mode | full | `--regression .gstack/qa-reports/baseline.json` | | Output dir | `.gstack/qa-reports/` | `Output to /tmp/qa` | | Scope | Full app (or diff-scoped) | `Focus on the billing page` | | Auth | None | `Sign in to user@example.com`, `Import cookies from cookies.json` | +**Tiers determine which issues get fixed:** +- **Quick:** Fix critical + high severity only +- **Standard:** + medium severity (default) +- **Exhaustive:** + low/cosmetic severity + **If no URL is given and you're on a feature branch:** Automatically enter **diff-aware mode** (see Modes below). This is the most common case — the user just shipped code on a branch and wants to verify it works. +**Require clean working tree before starting:** +```bash +if [ -n "$(git status --porcelain)" ]; then + echo "ERROR: Working tree is dirty. Commit or stash changes before running /qa." + exit 1 +fi +``` + **Find the browse binary:** ## SETUP (run this check BEFORE any browse command) @@ -73,6 +91,22 @@ mkdir -p "$REPORT_DIR/screenshots" --- +## Test Plan Context + +Before falling back to git diff heuristics, check for richer test plan sources: + +1. **Project-scoped test plans:** Check `~/.gstack/projects/` for recent `*-test-plan-*.md` files for this repo + ```bash + SLUG=$(git remote get-url origin 2>/dev/null | sed 's|.*[:/]\([^/]*/[^/]*\)\.git$|\1|;s|.*[:/]\([^/]*/[^/]*\)$|\1|' | tr '/' '-') + ls -t ~/.gstack/projects/$SLUG/*-test-plan-*.md 2>/dev/null | head -1 + ``` +2. 
**Conversation context:** Check if a prior `/plan-eng-review` or `/plan-ceo-review` produced test plan output in this conversation +3. **Use whichever source is richer.** Fall back to git diff analysis only if neither is available. + +--- + +## Phases 1-6: QA Baseline + ## Modes ### Diff-aware (automatic when on a feature branch with no URL) @@ -347,6 +381,8 @@ Minimum 0 per category. 9. **Never delete output files.** Screenshots and reports accumulate — that's intentional. 10. **Use `snapshot -C` for tricky UIs.** Finds clickable divs that the accessibility tree misses. +Record baseline health score at end of Phase 6. + --- ## Output Structure @@ -358,8 +394,151 @@ Minimum 0 per category. │ ├── initial.png # Landing page annotated screenshot │ ├── issue-001-step-1.png # Per-issue evidence │ ├── issue-001-result.png +│ ├── issue-001-before.png # Before fix (if fixed) +│ ├── issue-001-after.png # After fix (if fixed) │ └── ... └── baseline.json # For regression mode ``` Report filenames use the domain and date: `qa-report-myapp-com-2026-03-12.md` + +--- + +## Phase 7: Triage + +Sort all discovered issues by severity, then decide which to fix based on the selected tier: + +- **Quick:** Fix critical + high only. Mark medium/low as "deferred." +- **Standard:** Fix critical + high + medium. Mark low as "deferred." +- **Exhaustive:** Fix all, including cosmetic/low severity. + +Mark issues that cannot be fixed from source code (e.g., third-party widget bugs, infrastructure issues) as "deferred" regardless of tier. + +--- + +## Phase 8: Fix Loop + +For each fixable issue, in severity order: + +### 8a. Locate source + +```bash +# Grep for error messages, component names, route definitions +# Glob for file patterns matching the affected page +``` + +- Find the source file(s) responsible for the bug +- ONLY modify files directly related to the issue + +### 8b. 
 Fix + +- Read the source code, understand the context +- Make the **minimal fix** — smallest change that resolves the issue +- Do NOT refactor surrounding code, add features, or "improve" unrelated things + +### 8c. Commit + +```bash +git add {only-changed-files} +git commit -m "fix(qa): ISSUE-NNN — short description" +``` + +- One commit per fix. Never bundle multiple fixes. +- Message format: `fix(qa): ISSUE-NNN — short description` + +### 8d. Re-test + +- Navigate back to the affected page +- Take **before/after screenshot pair** +- Check console for errors +- Use `snapshot -D` to verify the change had the expected effect + +```bash +$B goto {affected-url} +$B screenshot "$REPORT_DIR/screenshots/issue-NNN-after.png" +$B console --errors +$B snapshot -D +``` + +### 8e. Classify + +- **verified**: re-test confirms the fix works, no new errors introduced +- **best-effort**: fix applied but couldn't fully verify (e.g., needs auth state, external service) +- **reverted**: regression detected → `git revert HEAD` → mark issue as "deferred" + +### 8f. Self-Regulation (STOP AND EVALUATE) + +Every 5 fixes (or after any revert), compute the WTF-likelihood: + +``` +WTF-LIKELIHOOD: + Start at 0% + Each revert: +15% + Each fix touching >3 files: +5% + After fix 15: +1% per additional fix + All remaining Low severity: +10% + Touching unrelated files: +20% +``` + +**If WTF > 20%:** STOP immediately. Show the user what you've done so far. Ask whether to continue. + +**Hard cap: 50 fixes.** After 50 fixes, stop regardless of remaining issues. + +--- + +## Phase 9: Final QA + +After all fixes are applied: + +1. Re-run QA on all affected pages +2. Compute final health score +3. 
**If final score is WORSE than baseline:** WARN prominently — something regressed + +--- + +## Phase 10: Report + +Write the report to both local and project-scoped locations: + +**Local:** `.gstack/qa-reports/qa-report-{domain}-{YYYY-MM-DD}.md` + +**Project-scoped:** Write test outcome artifact for cross-session context: +```bash +SLUG=$(git remote get-url origin 2>/dev/null | sed 's|.*[:/]\([^/]*/[^/]*\)\.git$|\1|;s|.*[:/]\([^/]*/[^/]*\)$|\1|' | tr '/' '-') +mkdir -p ~/.gstack/projects/$SLUG +``` +Write to `~/.gstack/projects/{slug}/{user}-{branch}-test-outcome-{datetime}.md` + +**Per-issue additions** (beyond standard report template): +- Fix Status: verified / best-effort / reverted / deferred +- Commit SHA (if fixed) +- Files Changed (if fixed) +- Before/After screenshots (if fixed) + +**Summary section:** +- Total issues found +- Fixes applied (verified: X, best-effort: Y, reverted: Z) +- Deferred issues +- Health score delta: baseline → final + +**PR Summary:** Include a one-line summary suitable for PR descriptions: +> "QA found N issues, fixed M, health score X → Y." + +--- + +## Phase 11: TODOS.md Update + +If the repo has a `TODOS.md`: + +1. **New deferred bugs** → add as TODOs with severity, category, and repro steps +2. **Fixed bugs that were in TODOS.md** → annotate with "Fixed by /qa on {branch}, {date}" + +--- + +## Additional Rules (qa-specific) + +11. **Clean working tree required.** Refuse to start if `git status --porcelain` is non-empty. +12. **One commit per fix.** Never bundle multiple fixes into one commit. +13. **Never modify tests or CI configuration.** Only fix application source code. +14. **Revert on regression.** If a fix makes things worse, `git revert HEAD` immediately. +15. **Self-regulate.** Follow the WTF-likelihood heuristic. When in doubt, stop and ask. 
diff --git a/qa/SKILL.md.tmpl b/qa/SKILL.md.tmpl index 6afadcc7..2fa05d86 100644 --- a/qa/SKILL.md.tmpl +++ b/qa/SKILL.md.tmpl @@ -1,39 +1,57 @@ --- name: qa -version: 1.0.0 +version: 2.0.0 description: | - Systematically QA test a web application. Use when asked to "qa", "QA", "test this site", - "find bugs", "dogfood", or review quality. Four modes: diff-aware (automatic on feature - branches — analyzes git diff, identifies affected pages, tests them), full (systematic - exploration), quick (30-second smoke test), regression (compare against baseline). Produces - structured report with health score, screenshots, and repro steps. + Systematically QA test a web application and fix bugs found. Runs QA testing, + then iteratively fixes bugs in source code, committing each fix atomically and + re-verifying. Use when asked to "qa", "QA", "test this site", "find bugs", + "test and fix", or "fix what's broken". Three tiers: Quick (critical/high only), + Standard (+ medium), Exhaustive (+ cosmetic). Produces before/after health scores, + fix evidence, and a ship-readiness summary. For report-only mode, use /qa-only. allowed-tools: - Bash - Read - Write + - Edit + - Glob + - Grep - AskUserQuestion --- {{UPDATE_CHECK}} -# /qa: Systematic QA Testing +# /qa: Test → Fix → Verify -You are a QA engineer. Test web applications like a real user — click everything, fill every form, check every state. Produce a structured report with evidence. +You are a QA engineer AND a bug-fix engineer. Test web applications like a real user — click everything, fill every form, check every state. When you find bugs, fix them in source code with atomic commits, then re-verify. Produce a structured report with before/after evidence. 
## Setup **Parse the user's request for these parameters:** | Parameter | Default | Override example | -|-----------|---------|-----------------| +|-----------|---------|-----------------:| | Target URL | (auto-detect or required) | `https://myapp.com`, `http://localhost:3000` | -| Mode | full | `--quick`, `--regression .gstack/qa-reports/baseline.json` | +| Tier | Standard | `--quick`, `--exhaustive` | +| Mode | full | `--regression .gstack/qa-reports/baseline.json` | | Output dir | `.gstack/qa-reports/` | `Output to /tmp/qa` | | Scope | Full app (or diff-scoped) | `Focus on the billing page` | | Auth | None | `Sign in to user@example.com`, `Import cookies from cookies.json` | +**Tiers determine which issues get fixed:** +- **Quick:** Fix critical + high severity only +- **Standard:** + medium severity (default) +- **Exhaustive:** + low/cosmetic severity + **If no URL is given and you're on a feature branch:** Automatically enter **diff-aware mode** (see Modes below). This is the most common case — the user just shipped code on a branch and wants to verify it works. +**Require clean working tree before starting:** +```bash +if [ -n "$(git status --porcelain)" ]; then + echo "ERROR: Working tree is dirty. Commit or stash changes before running /qa." + exit 1 +fi +``` + **Find the browse binary:** {{BROWSE_SETUP}} @@ -47,279 +65,25 @@ mkdir -p "$REPORT_DIR/screenshots" --- -## Modes +## Test Plan Context -### Diff-aware (automatic when on a feature branch with no URL) +Before falling back to git diff heuristics, check for richer test plan sources: -This is the **primary mode** for developers verifying their work. When the user says `/qa` without a URL and the repo is on a feature branch, automatically: - -1. **Analyze the branch diff** to understand what changed: +1. 
**Project-scoped test plans:** Check `~/.gstack/projects/` for recent `*-test-plan-*.md` files for this repo ```bash - git diff main...HEAD --name-only - git log main..HEAD --oneline + SLUG=$(git remote get-url origin 2>/dev/null | sed 's|.*[:/]\([^/]*/[^/]*\)\.git$|\1|;s|.*[:/]\([^/]*/[^/]*\)$|\1|' | tr '/' '-') + ls -t ~/.gstack/projects/$SLUG/*-test-plan-*.md 2>/dev/null | head -1 ``` - -2. **Identify affected pages/routes** from the changed files: - - Controller/route files → which URL paths they serve - - View/template/component files → which pages render them - - Model/service files → which pages use those models (check controllers that reference them) - - CSS/style files → which pages include those stylesheets - - API endpoints → test them directly with `$B js "await fetch('/api/...')"` - - Static pages (markdown, HTML) → navigate to them directly - -3. **Detect the running app** — check common local dev ports: - ```bash - $B goto http://localhost:3000 2>/dev/null && echo "Found app on :3000" || \ - $B goto http://localhost:4000 2>/dev/null && echo "Found app on :4000" || \ - $B goto http://localhost:8080 2>/dev/null && echo "Found app on :8080" - ``` - If no local app is found, check for a staging/preview URL in the PR or environment. If nothing works, ask the user for the URL. - -4. **Test each affected page/route:** - - Navigate to the page - - Take a screenshot - - Check console for errors - - If the change was interactive (forms, buttons, flows), test the interaction end-to-end - - Use `snapshot -D` before and after actions to verify the change had the expected effect - -5. **Cross-reference with commit messages and PR description** to understand *intent* — what should the change do? Verify it actually does that. - -6. **Check TODOS.md** (if it exists) for known bugs or issues related to the changed files. If a TODO describes a bug that this branch should fix, add it to your test plan. 
If you find a new bug during QA that isn't in TODOS.md, note it in the report. - -7. **Report findings** scoped to the branch changes: - - "Changes tested: N pages/routes affected by this branch" - - For each: does it work? Screenshot evidence. - - Any regressions on adjacent pages? - -**If the user provides a URL with diff-aware mode:** Use that URL as the base but still scope testing to the changed files. - -### Full (default when URL is provided) -Systematic exploration. Visit every reachable page. Document 5-10 well-evidenced issues. Produce health score. Takes 5-15 minutes depending on app size. - -### Quick (`--quick`) -30-second smoke test. Visit homepage + top 5 navigation targets. Check: page loads? Console errors? Broken links? Produce health score. No detailed issue documentation. - -### Regression (`--regression `) -Run full mode, then load `baseline.json` from a previous run. Diff: which issues are fixed? Which are new? What's the score delta? Append regression section to report. +2. **Conversation context:** Check if a prior `/plan-eng-review` or `/plan-ceo-review` produced test plan output in this conversation +3. **Use whichever source is richer.** Fall back to git diff analysis only if neither is available. --- -## Workflow +## Phases 1-6: QA Baseline -### Phase 1: Initialize +{{QA_METHODOLOGY}} -1. Find browse binary (see Setup above) -2. Create output directories -3. Copy report template from `qa/templates/qa-report-template.md` to output dir -4. 
Start timer for duration tracking - -### Phase 2: Authenticate (if needed) - -**If the user specified auth credentials:** - -```bash -$B goto <login-url> -$B snapshot -i # find the login form -$B fill @e3 "user@example.com" -$B fill @e4 "[REDACTED]" # NEVER include real passwords in report -$B click @e5 # submit -$B snapshot -D # verify login succeeded -``` - -**If the user provided a cookie file:** - -```bash -$B cookie-import cookies.json -$B goto <target-url> -``` - -**If 2FA/OTP is required:** Ask the user for the code and wait. - -**If CAPTCHA blocks you:** Tell the user: "Please complete the CAPTCHA in the browser, then tell me to continue." - -### Phase 3: Orient - -Get a map of the application: - -```bash -$B goto <target-url> -$B snapshot -i -a -o "$REPORT_DIR/screenshots/initial.png" -$B links # map navigation structure -$B console --errors # any errors on landing? -``` - -**Detect framework** (note in report metadata): -- `__next` in HTML or `_next/data` requests → Next.js -- `csrf-token` meta tag → Rails -- `wp-content` in URLs → WordPress -- Client-side routing with no page reloads → SPA - -**For SPAs:** The `links` command may return few results because navigation is client-side. Use `snapshot -i` to find nav elements (buttons, menu items) instead. - -### Phase 4: Explore - -Visit pages systematically. At each page: - -```bash -$B goto <page-url> -$B snapshot -i -a -o "$REPORT_DIR/screenshots/page-name.png" -$B console --errors -``` - -Then follow the **per-page exploration checklist** (see `qa/references/issue-taxonomy.md`): - -1. **Visual scan** — Look at the annotated screenshot for layout issues -2. **Interactive elements** — Click buttons, links, controls. Do they work? -3. **Forms** — Fill and submit. Test empty, invalid, edge cases -4. **Navigation** — Check all paths in and out -5. **States** — Empty state, loading, error, overflow -6. **Console** — Any new JS errors after interactions? -7. 
**Responsiveness** — Check mobile viewport if relevant: - ```bash - $B viewport 375x812 - $B screenshot "$REPORT_DIR/screenshots/page-mobile.png" - $B viewport 1280x720 - ``` - -**Depth judgment:** Spend more time on core features (homepage, dashboard, checkout, search) and less on secondary pages (about, terms, privacy). - -**Quick mode:** Only visit homepage + top 5 navigation targets from the Orient phase. Skip the per-page checklist — just check: loads? Console errors? Broken links visible? - -### Phase 5: Document - -Document each issue **immediately when found** — don't batch them. - -**Two evidence tiers:** - -**Interactive bugs** (broken flows, dead buttons, form failures): -1. Take a screenshot before the action -2. Perform the action -3. Take a screenshot showing the result -4. Use `snapshot -D` to show what changed -5. Write repro steps referencing screenshots - -```bash -$B screenshot "$REPORT_DIR/screenshots/issue-001-step-1.png" -$B click @e5 -$B screenshot "$REPORT_DIR/screenshots/issue-001-result.png" -$B snapshot -D -``` - -**Static bugs** (typos, layout issues, missing images): -1. Take a single annotated screenshot showing the problem -2. Describe what's wrong - -```bash -$B snapshot -i -a -o "$REPORT_DIR/screenshots/issue-002.png" -``` - -**Write each issue to the report immediately** using the template format from `qa/templates/qa-report-template.md`. - -### Phase 6: Wrap Up - -1. **Compute health score** using the rubric below -2. **Write "Top 3 Things to Fix"** — the 3 highest-severity issues -3. **Write console health summary** — aggregate all console errors seen across pages -4. **Update severity counts** in the summary table -5. **Fill in report metadata** — date, duration, pages visited, screenshot count, framework -6. **Save baseline** — write `baseline.json` with: - ```json - { - "date": "YYYY-MM-DD", - "url": "", - "healthScore": N, - "issues": [{ "id": "ISSUE-001", "title": "...", "severity": "...", "category": "..." 
}], - "categoryScores": { "console": N, "links": N, ... } - } - ``` - -**Regression mode:** After writing the report, load the baseline file. Compare: -- Health score delta -- Issues fixed (in baseline but not current) -- New issues (in current but not baseline) -- Append the regression section to the report - ---- - -## Health Score Rubric - -Compute each category score (0-100), then take the weighted average. - -### Console (weight: 15%) -- 0 errors → 100 -- 1-3 errors → 70 -- 4-10 errors → 40 -- 10+ errors → 10 - -### Links (weight: 10%) -- 0 broken → 100 -- Each broken link → -15 (minimum 0) - -### Per-Category Scoring (Visual, Functional, UX, Content, Performance, Accessibility) -Each category starts at 100. Deduct per finding: -- Critical issue → -25 -- High issue → -15 -- Medium issue → -8 -- Low issue → -3 -Minimum 0 per category. - -### Weights -| Category | Weight | -|----------|--------| -| Console | 15% | -| Links | 10% | -| Visual | 10% | -| Functional | 20% | -| UX | 15% | -| Performance | 10% | -| Content | 5% | -| Accessibility | 15% | - -### Final Score -`score = Σ (category_score × weight)` - ---- - -## Framework-Specific Guidance - -### Next.js -- Check console for hydration errors (`Hydration failed`, `Text content did not match`) -- Monitor `_next/data` requests in network — 404s indicate broken data fetching -- Test client-side navigation (click links, don't just `goto`) — catches routing issues -- Check for CLS (Cumulative Layout Shift) on pages with dynamic content - -### Rails -- Check for N+1 query warnings in console (if development mode) -- Verify CSRF token presence in forms -- Test Turbo/Stimulus integration — do page transitions work smoothly? 
-- Check for flash messages appearing and dismissing correctly - -### WordPress -- Check for plugin conflicts (JS errors from different plugins) -- Verify admin bar visibility for logged-in users -- Test REST API endpoints (`/wp-json/`) -- Check for mixed content warnings (common with WP) - -### General SPA (React, Vue, Angular) -- Use `snapshot -i` for navigation — `links` command misses client-side routes -- Check for stale state (navigate away and back — does data refresh?) -- Test browser back/forward — does the app handle history correctly? -- Check for memory leaks (monitor console after extended use) - ---- - -## Important Rules - -1. **Repro is everything.** Every issue needs at least one screenshot. No exceptions. -2. **Verify before documenting.** Retry the issue once to confirm it's reproducible, not a fluke. -3. **Never include credentials.** Write `[REDACTED]` for passwords in repro steps. -4. **Write incrementally.** Append each issue to the report as you find it. Don't batch. -5. **Never read source code.** Test as a user, not a developer. -6. **Check console after every interaction.** JS errors that don't surface visually are still bugs. -7. **Test like a user.** Use realistic data. Walk through complete workflows end-to-end. -8. **Depth over breadth.** 5-10 well-documented issues with evidence > 20 vague descriptions. -9. **Never delete output files.** Screenshots and reports accumulate — that's intentional. -10. **Use `snapshot -C` for tricky UIs.** Finds clickable divs that the accessibility tree misses. +Record baseline health score at end of Phase 6. --- @@ -332,8 +96,151 @@ Minimum 0 per category. │ ├── initial.png # Landing page annotated screenshot │ ├── issue-001-step-1.png # Per-issue evidence │ ├── issue-001-result.png +│ ├── issue-001-before.png # Before fix (if fixed) +│ ├── issue-001-after.png # After fix (if fixed) │ └── ... 
└── baseline.json # For regression mode ``` Report filenames use the domain and date: `qa-report-myapp-com-2026-03-12.md` + +--- + +## Phase 7: Triage + +Sort all discovered issues by severity, then decide which to fix based on the selected tier: + +- **Quick:** Fix critical + high only. Mark medium/low as "deferred." +- **Standard:** Fix critical + high + medium. Mark low as "deferred." +- **Exhaustive:** Fix all, including cosmetic/low severity. + +Mark issues that cannot be fixed from source code (e.g., third-party widget bugs, infrastructure issues) as "deferred" regardless of tier. + +--- + +## Phase 8: Fix Loop + +For each fixable issue, in severity order: + +### 8a. Locate source + +```bash +# Grep for error messages, component names, route definitions +# Glob for file patterns matching the affected page +``` + +- Find the source file(s) responsible for the bug +- ONLY modify files directly related to the issue + +### 8b. Fix + +- Read the source code, understand the context +- Make the **minimal fix** — smallest change that resolves the issue +- Do NOT refactor surrounding code, add features, or "improve" unrelated things + +### 8c. Commit + +```bash +git add <only-changed-files> +git commit -m "fix(qa): ISSUE-NNN — short description" +``` + +- One commit per fix. Never bundle multiple fixes. +- Message format: `fix(qa): ISSUE-NNN — short description` + +### 8d. Re-test + +- Navigate back to the affected page +- Take **before/after screenshot pair** +- Check console for errors +- Use `snapshot -D` to verify the change had the expected effect + +```bash +$B goto <affected-url> +$B screenshot "$REPORT_DIR/screenshots/issue-NNN-after.png" +$B console --errors +$B snapshot -D +``` + +### 8e. Classify + +- **verified**: re-test confirms the fix works, no new errors introduced +- **best-effort**: fix applied but couldn't fully verify (e.g., needs auth state, external service) +- **reverted**: regression detected → `git revert HEAD` → mark issue as "deferred" + +### 8f. 
Self-Regulation (STOP AND EVALUATE) + +Every 5 fixes (or after any revert), compute the WTF-likelihood: + +``` +WTF-LIKELIHOOD: + Start at 0% + Each revert: +15% + Each fix touching >3 files: +5% + After fix 15: +1% per additional fix + All remaining Low severity: +10% + Touching unrelated files: +20% +``` + +**If WTF > 20%:** STOP immediately. Show the user what you've done so far. Ask whether to continue. + +**Hard cap: 50 fixes.** After 50 fixes, stop regardless of remaining issues. + +--- + +## Phase 9: Final QA + +After all fixes are applied: + +1. Re-run QA on all affected pages +2. Compute final health score +3. **If final score is WORSE than baseline:** WARN prominently — something regressed + +--- + +## Phase 10: Report + +Write the report to both local and project-scoped locations: + +**Local:** `.gstack/qa-reports/qa-report-{domain}-{YYYY-MM-DD}.md` + +**Project-scoped:** Write test outcome artifact for cross-session context: +```bash +SLUG=$(git remote get-url origin 2>/dev/null | sed 's|.*[:/]\([^/]*/[^/]*\)\.git$|\1|;s|.*[:/]\([^/]*/[^/]*\)$|\1|' | tr '/' '-') +mkdir -p ~/.gstack/projects/$SLUG +``` +Write to `~/.gstack/projects/{slug}/{user}-{branch}-test-outcome-{datetime}.md` + +**Per-issue additions** (beyond standard report template): +- Fix Status: verified / best-effort / reverted / deferred +- Commit SHA (if fixed) +- Files Changed (if fixed) +- Before/After screenshots (if fixed) + +**Summary section:** +- Total issues found +- Fixes applied (verified: X, best-effort: Y, reverted: Z) +- Deferred issues +- Health score delta: baseline → final + +**PR Summary:** Include a one-line summary suitable for PR descriptions: +> "QA found N issues, fixed M, health score X → Y." + +--- + +## Phase 11: TODOS.md Update + +If the repo has a `TODOS.md`: + +1. **New deferred bugs** → add as TODOs with severity, category, and repro steps +2. 
**Fixed bugs that were in TODOS.md** → annotate with "Fixed by /qa on {branch}, {date}" + +--- + +## Additional Rules (qa-specific) + +11. **Clean working tree required.** Refuse to start if `git status --porcelain` is non-empty. +12. **One commit per fix.** Never bundle multiple fixes into one commit. +13. **Never modify tests or CI configuration.** Only fix application source code. +14. **Revert on regression.** If a fix makes things worse, `git revert HEAD` immediately. +15. **Self-regulate.** Follow the WTF-likelihood heuristic. When in doubt, stop and ask. diff --git a/qa/templates/qa-report-template.md b/qa/templates/qa-report-template.md index c02eb836..5466bda4 100644 --- a/qa/templates/qa-report-template.md +++ b/qa/templates/qa-report-template.md @@ -72,6 +72,33 @@ --- +## Fixes Applied (if applicable) + +| Issue | Fix Status | Commit | Files Changed | +|-------|-----------|--------|---------------| +| ISSUE-NNN | verified / best-effort / reverted / deferred | {SHA} | {files} | + +### Before/After Evidence + +#### ISSUE-NNN: {title} +**Before:** ![Before](screenshots/issue-NNN-before.png) +**After:** ![After](screenshots/issue-NNN-after.png) + +--- + +## Ship Readiness + +| Metric | Value | +|--------|-------| +| Health score | {before} → {after} ({delta}) | +| Issues found | N | +| Fixes applied | N (verified: X, best-effort: Y, reverted: Z) | +| Deferred | N | + +**PR Summary:** "QA found N issues, fixed M, health score X → Y." + +--- + ## Regression (if applicable) | Metric | Baseline | Current | Delta | diff --git a/scripts/gen-skill-docs.ts b/scripts/gen-skill-docs.ts index 7f6bd249..52c3042c 100644 --- a/scripts/gen-skill-docs.ts +++ b/scripts/gen-skill-docs.ts @@ -126,11 +126,288 @@ If \`NEEDS_SETUP\`: 3. 
If \`bun\` is not installed: \`curl -fsSL https://bun.sh/install | bash\``; } +function generateQAMethodology(): string { + return `## Modes + +### Diff-aware (automatic when on a feature branch with no URL) + +This is the **primary mode** for developers verifying their work. When the user says \`/qa\` without a URL and the repo is on a feature branch, automatically: + +1. **Analyze the branch diff** to understand what changed: + \`\`\`bash + git diff main...HEAD --name-only + git log main..HEAD --oneline + \`\`\` + +2. **Identify affected pages/routes** from the changed files: + - Controller/route files → which URL paths they serve + - View/template/component files → which pages render them + - Model/service files → which pages use those models (check controllers that reference them) + - CSS/style files → which pages include those stylesheets + - API endpoints → test them directly with \`$B js "await fetch('/api/...')"\` + - Static pages (markdown, HTML) → navigate to them directly + +3. **Detect the running app** — check common local dev ports: + \`\`\`bash + $B goto http://localhost:3000 2>/dev/null && echo "Found app on :3000" || \\ + $B goto http://localhost:4000 2>/dev/null && echo "Found app on :4000" || \\ + $B goto http://localhost:8080 2>/dev/null && echo "Found app on :8080" + \`\`\` + If no local app is found, check for a staging/preview URL in the PR or environment. If nothing works, ask the user for the URL. + +4. **Test each affected page/route:** + - Navigate to the page + - Take a screenshot + - Check console for errors + - If the change was interactive (forms, buttons, flows), test the interaction end-to-end + - Use \`snapshot -D\` before and after actions to verify the change had the expected effect + +5. **Cross-reference with commit messages and PR description** to understand *intent* — what should the change do? Verify it actually does that. + +6. **Check TODOS.md** (if it exists) for known bugs or issues related to the changed files. 
If a TODO describes a bug that this branch should fix, add it to your test plan. If you find a new bug during QA that isn't in TODOS.md, note it in the report. + +7. **Report findings** scoped to the branch changes: + - "Changes tested: N pages/routes affected by this branch" + - For each: does it work? Screenshot evidence. + - Any regressions on adjacent pages? + +**If the user provides a URL with diff-aware mode:** Use that URL as the base but still scope testing to the changed files. + +### Full (default when URL is provided) +Systematic exploration. Visit every reachable page. Document 5-10 well-evidenced issues. Produce health score. Takes 5-15 minutes depending on app size. + +### Quick (\`--quick\`) +30-second smoke test. Visit homepage + top 5 navigation targets. Check: page loads? Console errors? Broken links? Produce health score. No detailed issue documentation. + +### Regression (\`--regression \`) +Run full mode, then load \`baseline.json\` from a previous run. Diff: which issues are fixed? Which are new? What's the score delta? Append regression section to report. + +--- + +## Workflow + +### Phase 1: Initialize + +1. Find browse binary (see Setup above) +2. Create output directories +3. Copy report template from \`qa/templates/qa-report-template.md\` to output dir +4. Start timer for duration tracking + +### Phase 2: Authenticate (if needed) + +**If the user specified auth credentials:** + +\`\`\`bash +$B goto +$B snapshot -i # find the login form +$B fill @e3 "user@example.com" +$B fill @e4 "[REDACTED]" # NEVER include real passwords in report +$B click @e5 # submit +$B snapshot -D # verify login succeeded +\`\`\` + +**If the user provided a cookie file:** + +\`\`\`bash +$B cookie-import cookies.json +$B goto +\`\`\` + +**If 2FA/OTP is required:** Ask the user for the code and wait. + +**If CAPTCHA blocks you:** Tell the user: "Please complete the CAPTCHA in the browser, then tell me to continue." 
+ +### Phase 3: Orient + +Get a map of the application: + +\`\`\`bash +$B goto <target-url> +$B snapshot -i -a -o "$REPORT_DIR/screenshots/initial.png" +$B links # map navigation structure +$B console --errors # any errors on landing? +\`\`\` + +**Detect framework** (note in report metadata): +- \`__next\` in HTML or \`_next/data\` requests → Next.js +- \`csrf-token\` meta tag → Rails +- \`wp-content\` in URLs → WordPress +- Client-side routing with no page reloads → SPA + +**For SPAs:** The \`links\` command may return few results because navigation is client-side. Use \`snapshot -i\` to find nav elements (buttons, menu items) instead. + +### Phase 4: Explore + +Visit pages systematically. At each page: + +\`\`\`bash +$B goto <page-url> +$B snapshot -i -a -o "$REPORT_DIR/screenshots/page-name.png" +$B console --errors +\`\`\` + +Then follow the **per-page exploration checklist** (see \`qa/references/issue-taxonomy.md\`): + +1. **Visual scan** — Look at the annotated screenshot for layout issues +2. **Interactive elements** — Click buttons, links, controls. Do they work? +3. **Forms** — Fill and submit. Test empty, invalid, edge cases +4. **Navigation** — Check all paths in and out +5. **States** — Empty state, loading, error, overflow +6. **Console** — Any new JS errors after interactions? +7. **Responsiveness** — Check mobile viewport if relevant: + \`\`\`bash + $B viewport 375x812 + $B screenshot "$REPORT_DIR/screenshots/page-mobile.png" + $B viewport 1280x720 + \`\`\` + +**Depth judgment:** Spend more time on core features (homepage, dashboard, checkout, search) and less on secondary pages (about, terms, privacy). + +**Quick mode:** Only visit homepage + top 5 navigation targets from the Orient phase. Skip the per-page checklist — just check: loads? Console errors? Broken links visible? + +### Phase 5: Document + +Document each issue **immediately when found** — don't batch them. + +**Two evidence tiers:** + +**Interactive bugs** (broken flows, dead buttons, form failures): +1. 
Take a screenshot before the action +2. Perform the action +3. Take a screenshot showing the result +4. Use \`snapshot -D\` to show what changed +5. Write repro steps referencing screenshots + +\`\`\`bash +$B screenshot "$REPORT_DIR/screenshots/issue-001-step-1.png" +$B click @e5 +$B screenshot "$REPORT_DIR/screenshots/issue-001-result.png" +$B snapshot -D +\`\`\` + +**Static bugs** (typos, layout issues, missing images): +1. Take a single annotated screenshot showing the problem +2. Describe what's wrong + +\`\`\`bash +$B snapshot -i -a -o "$REPORT_DIR/screenshots/issue-002.png" +\`\`\` + +**Write each issue to the report immediately** using the template format from \`qa/templates/qa-report-template.md\`. + +### Phase 6: Wrap Up + +1. **Compute health score** using the rubric below +2. **Write "Top 3 Things to Fix"** — the 3 highest-severity issues +3. **Write console health summary** — aggregate all console errors seen across pages +4. **Update severity counts** in the summary table +5. **Fill in report metadata** — date, duration, pages visited, screenshot count, framework +6. **Save baseline** — write \`baseline.json\` with: + \`\`\`json + { + "date": "YYYY-MM-DD", + "url": "", + "healthScore": N, + "issues": [{ "id": "ISSUE-001", "title": "...", "severity": "...", "category": "..." }], + "categoryScores": { "console": N, "links": N, ... } + } + \`\`\` + +**Regression mode:** After writing the report, load the baseline file. Compare: +- Health score delta +- Issues fixed (in baseline but not current) +- New issues (in current but not baseline) +- Append the regression section to the report + +--- + +## Health Score Rubric + +Compute each category score (0-100), then take the weighted average. 
+ +### Console (weight: 15%) +- 0 errors → 100 +- 1-3 errors → 70 +- 4-10 errors → 40 +- 10+ errors → 10 + +### Links (weight: 10%) +- 0 broken → 100 +- Each broken link → -15 (minimum 0) + +### Per-Category Scoring (Visual, Functional, UX, Content, Performance, Accessibility) +Each category starts at 100. Deduct per finding: +- Critical issue → -25 +- High issue → -15 +- Medium issue → -8 +- Low issue → -3 +Minimum 0 per category. + +### Weights +| Category | Weight | +|----------|--------| +| Console | 15% | +| Links | 10% | +| Visual | 10% | +| Functional | 20% | +| UX | 15% | +| Performance | 10% | +| Content | 5% | +| Accessibility | 15% | + +### Final Score +\`score = Σ (category_score × weight)\` + +--- + +## Framework-Specific Guidance + +### Next.js +- Check console for hydration errors (\`Hydration failed\`, \`Text content did not match\`) +- Monitor \`_next/data\` requests in network — 404s indicate broken data fetching +- Test client-side navigation (click links, don't just \`goto\`) — catches routing issues +- Check for CLS (Cumulative Layout Shift) on pages with dynamic content + +### Rails +- Check for N+1 query warnings in console (if development mode) +- Verify CSRF token presence in forms +- Test Turbo/Stimulus integration — do page transitions work smoothly? +- Check for flash messages appearing and dismissing correctly + +### WordPress +- Check for plugin conflicts (JS errors from different plugins) +- Verify admin bar visibility for logged-in users +- Test REST API endpoints (\`/wp-json/\`) +- Check for mixed content warnings (common with WP) + +### General SPA (React, Vue, Angular) +- Use \`snapshot -i\` for navigation — \`links\` command misses client-side routes +- Check for stale state (navigate away and back — does data refresh?) +- Test browser back/forward — does the app handle history correctly? +- Check for memory leaks (monitor console after extended use) + +--- + +## Important Rules + +1. 
**Repro is everything.** Every issue needs at least one screenshot. No exceptions. +2. **Verify before documenting.** Retry the issue once to confirm it's reproducible, not a fluke. +3. **Never include credentials.** Write \`[REDACTED]\` for passwords in repro steps. +4. **Write incrementally.** Append each issue to the report as you find it. Don't batch. +5. **Never read source code.** Test as a user, not a developer. +6. **Check console after every interaction.** JS errors that don't surface visually are still bugs. +7. **Test like a user.** Use realistic data. Walk through complete workflows end-to-end. +8. **Depth over breadth.** 5-10 well-documented issues with evidence > 20 vague descriptions. +9. **Never delete output files.** Screenshots and reports accumulate — that's intentional. +10. **Use \`snapshot -C\` for tricky UIs.** Finds clickable divs that the accessibility tree misses.`; +} + const RESOLVERS: Record<string, () => string> = { COMMAND_REFERENCE: generateCommandReference, SNAPSHOT_FLAGS: generateSnapshotFlags, UPDATE_CHECK: generateUpdateCheck, BROWSE_SETUP: generateBrowseSetup, + QA_METHODOLOGY: generateQAMethodology, }; // ─── Template Processing ──────────────────────────────────── @@ -176,6 +453,7 @@ function findTemplates(): string[] { path.join(ROOT, 'SKILL.md.tmpl'), path.join(ROOT, 'browse', 'SKILL.md.tmpl'), path.join(ROOT, 'qa', 'SKILL.md.tmpl'), + path.join(ROOT, 'qa-only', 'SKILL.md.tmpl'), path.join(ROOT, 'setup-browser-cookies', 'SKILL.md.tmpl'), path.join(ROOT, 'ship', 'SKILL.md.tmpl'), path.join(ROOT, 'review', 'SKILL.md.tmpl'), diff --git a/scripts/skill-check.ts b/scripts/skill-check.ts index fd10b529..591a0c81 100644 --- a/scripts/skill-check.ts +++ b/scripts/skill-check.ts @@ -20,6 +20,7 @@ const SKILL_FILES = [ 'SKILL.md', 'browse/SKILL.md', 'qa/SKILL.md', + 'qa-only/SKILL.md', 'ship/SKILL.md', 'review/SKILL.md', 'retro/SKILL.md', diff --git a/test/gen-skill-docs.test.ts b/test/gen-skill-docs.test.ts index 264cb904..fc855794 100644 --- 
a/test/gen-skill-docs.test.ts +++ b/test/gen-skill-docs.test.ts @@ -61,6 +61,7 @@ describe('gen-skill-docs', () => { { dir: '.', name: 'root gstack' }, { dir: 'browse', name: 'browse' }, { dir: 'qa', name: 'qa' }, + { dir: 'qa-only', name: 'qa-only' }, { dir: 'review', name: 'review' }, { dir: 'ship', name: 'ship' }, { dir: 'plan-ceo-review', name: 'plan-ceo-review' }, @@ -129,6 +130,61 @@ describe('gen-skill-docs', () => { expect(browseTmpl).toContain('{{COMMAND_REFERENCE}}'); expect(browseTmpl).toContain('{{SNAPSHOT_FLAGS}}'); }); + + test('qa and qa-only templates use QA_METHODOLOGY placeholder', () => { + const qaTmpl = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md.tmpl'), 'utf-8'); + expect(qaTmpl).toContain('{{QA_METHODOLOGY}}'); + + const qaOnlyTmpl = fs.readFileSync(path.join(ROOT, 'qa-only', 'SKILL.md.tmpl'), 'utf-8'); + expect(qaOnlyTmpl).toContain('{{QA_METHODOLOGY}}'); + }); + + test('QA_METHODOLOGY appears expanded in both qa and qa-only generated files', () => { + const qaContent = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8'); + const qaOnlyContent = fs.readFileSync(path.join(ROOT, 'qa-only', 'SKILL.md'), 'utf-8'); + + // Both should contain the health score rubric + expect(qaContent).toContain('Health Score Rubric'); + expect(qaOnlyContent).toContain('Health Score Rubric'); + + // Both should contain framework guidance + expect(qaContent).toContain('Framework-Specific Guidance'); + expect(qaOnlyContent).toContain('Framework-Specific Guidance'); + + // Both should contain the important rules + expect(qaContent).toContain('Important Rules'); + expect(qaOnlyContent).toContain('Important Rules'); + + // Both should contain the 6 phases + expect(qaContent).toContain('Phase 1'); + expect(qaOnlyContent).toContain('Phase 1'); + expect(qaContent).toContain('Phase 6'); + expect(qaOnlyContent).toContain('Phase 6'); + }); + + test('qa-only has no-fix guardrails', () => { + const qaOnlyContent = fs.readFileSync(path.join(ROOT, 'qa-only', 
'SKILL.md'), 'utf-8'); + expect(qaOnlyContent).toContain('Never fix bugs'); + expect(qaOnlyContent).toContain('NEVER fix anything'); + // Should not have Edit, Glob, or Grep in allowed-tools + expect(qaOnlyContent).not.toMatch(/allowed-tools:[\s\S]*?Edit/); + expect(qaOnlyContent).not.toMatch(/allowed-tools:[\s\S]*?Glob/); + expect(qaOnlyContent).not.toMatch(/allowed-tools:[\s\S]*?Grep/); + }); + + test('qa has fix-loop tools and phases', () => { + const qaContent = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8'); + // Should have Edit, Glob, Grep in allowed-tools + expect(qaContent).toContain('Edit'); + expect(qaContent).toContain('Glob'); + expect(qaContent).toContain('Grep'); + // Should have fix-loop phases + expect(qaContent).toContain('Phase 7'); + expect(qaContent).toContain('Phase 8'); + expect(qaContent).toContain('Fix Loop'); + expect(qaContent).toContain('Triage'); + expect(qaContent).toContain('WTF'); + }); }); /** diff --git a/test/skill-validation.test.ts b/test/skill-validation.test.ts index 05a39761..a2ce421c 100644 --- a/test/skill-validation.test.ts +++ b/test/skill-validation.test.ts @@ -43,6 +43,20 @@ describe('SKILL.md command validation', () => { const result = validateSkill(qaSkill); expect(result.snapshotFlagErrors).toHaveLength(0); }); + + test('all $B commands in qa-only/SKILL.md are valid browse commands', () => { + const qaOnlySkill = path.join(ROOT, 'qa-only', 'SKILL.md'); + if (!fs.existsSync(qaOnlySkill)) return; + const result = validateSkill(qaOnlySkill); + expect(result.invalid).toHaveLength(0); + }); + + test('all snapshot flags in qa-only/SKILL.md are valid', () => { + const qaOnlySkill = path.join(ROOT, 'qa-only', 'SKILL.md'); + if (!fs.existsSync(qaOnlySkill)) return; + const result = validateSkill(qaOnlySkill); + expect(result.snapshotFlagErrors).toHaveLength(0); + }); }); describe('Command registry consistency', () => { @@ -157,6 +171,7 @@ describe('Generated SKILL.md freshness', () => { describe('Update check 
preamble', () => { const skillsWithUpdateCheck = [ 'SKILL.md', 'browse/SKILL.md', 'qa/SKILL.md', + 'qa-only/SKILL.md', 'setup-browser-cookies/SKILL.md', 'ship/SKILL.md', 'review/SKILL.md', 'plan-ceo-review/SKILL.md', 'plan-eng-review/SKILL.md', @@ -261,7 +276,7 @@ describe('Cross-skill path consistency', () => { describe('QA skill structure validation', () => { const qaContent = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8'); - test('qa/SKILL.md has all 6 phases', () => { + test('qa/SKILL.md has all 11 phases', () => { const phases = [ 'Phase 1', 'Initialize', 'Phase 2', 'Authenticate', @@ -269,6 +284,11 @@ describe('QA skill structure validation', () => { 'Phase 4', 'Explore', 'Phase 5', 'Document', 'Phase 6', 'Wrap Up', + 'Phase 7', 'Triage', + 'Phase 8', 'Fix Loop', + 'Phase 9', 'Final QA', + 'Phase 10', 'Report', + 'Phase 11', 'TODOS', ]; for (const phase of phases) { expect(qaContent).toContain(phase); @@ -291,6 +311,13 @@ describe('QA skill structure validation', () => { expect(qaContent).toContain('--regression'); }); + test('has all three tiers defined', () => { + const tiers = ['Quick', 'Standard', 'Exhaustive']; + for (const tier of tiers) { + expect(qaContent).toContain(tier); + } + }); + test('health score weights sum to 100%', () => { const weights = extractWeightsFromTable(qaContent); expect(weights.size).toBeGreaterThan(0);