diff --git a/scripts/resolvers/index.ts b/scripts/resolvers/index.ts index 7ac7f1a2..2898b9a8 100644 --- a/scripts/resolvers/index.ts +++ b/scripts/resolvers/index.ts @@ -16,6 +16,7 @@ import { generateSlugEval, generateSlugSetup, generateBaseBranchDetect, generate import { generateLearningsSearch, generateLearningsLog } from './learnings'; import { generateConfidenceCalibration } from './confidence'; import { generateInvokeSkill } from './composition'; +import { generateReviewArmy } from './review-army'; export const RESOLVERS: Record = { SLUG_EVAL: generateSlugEval, @@ -56,4 +57,5 @@ export const RESOLVERS: Record = { CONFIDENCE_CALIBRATION: generateConfidenceCalibration, INVOKE_SKILL: generateInvokeSkill, CHANGELOG_WORKFLOW: generateChangelogWorkflow, + REVIEW_ARMY: generateReviewArmy, }; diff --git a/scripts/resolvers/review-army.ts b/scripts/resolvers/review-army.ts new file mode 100644 index 00000000..c4cee821 --- /dev/null +++ b/scripts/resolvers/review-army.ts @@ -0,0 +1,190 @@ +/** + * Review Army resolver — parallel specialist reviewers for /review + * + * Generates template prose that instructs Claude to: + * 1. Detect stack and scope (via gstack-diff-scope) + * 2. Select and dispatch specialist subagents in parallel + * 3. Collect, parse, merge, and deduplicate JSON findings + * 4. Feed merged findings into the existing Fix-First pipeline + * + * Shipped as Release 2 of the self-learning roadmap (SELF_LEARNING_V0.md). + */ +import type { TemplateContext } from './types'; + +function generateSpecialistSelection(ctx: TemplateContext): string { + return `## Step 4.5: Review Army — Specialist Dispatch + +### Detect stack and scope + +\`\`\`bash +source <(${ctx.paths.binDir}/gstack-diff-scope 2>/dev/null) || true +# Detect stack for specialist context +STACK="" +[ -f Gemfile ] && STACK="\${STACK}ruby " +[ -f package.json ] && STACK="\${STACK}node " +[ -f requirements.txt ] || [ -f pyproject.toml ] && STACK="\${STACK}python " +[ -f go.mod ] && STACK="\${STACK}go " +[ -f Cargo.toml ] && STACK="\${STACK}rust " +echo "STACK: \${STACK:-unknown}" +DIFF_LINES=$(git diff origin/ --stat | tail -1 | grep -oE '[0-9]+ insertion' | grep -oE '[0-9]+' || echo "0") +echo "DIFF_LINES: $DIFF_LINES" +\`\`\` + +### Select specialists + +Based on the scope signals above, select which specialists to dispatch. + +**Always-on (dispatch on every review with 50+ changed lines):** +1. **Testing** — read \`${ctx.paths.skillRoot}/review/specialists/testing.md\` +2. **Maintainability** — read \`${ctx.paths.skillRoot}/review/specialists/maintainability.md\` + +**If DIFF_LINES < 50:** Skip all specialists. Print: "Small diff ($DIFF_LINES lines) — specialists skipped." Continue to Step 5. + +**Conditional (dispatch if the matching scope signal is true):** +3. **Security** — if SCOPE_AUTH=true, OR if SCOPE_BACKEND=true AND DIFF_LINES > 100. Read \`${ctx.paths.skillRoot}/review/specialists/security.md\` +4. **Performance** — if SCOPE_BACKEND=true OR SCOPE_FRONTEND=true. Read \`${ctx.paths.skillRoot}/review/specialists/performance.md\` +5. **Data Migration** — if SCOPE_MIGRATIONS=true. Read \`${ctx.paths.skillRoot}/review/specialists/data-migration.md\` +6. **API Contract** — if SCOPE_API=true. Read \`${ctx.paths.skillRoot}/review/specialists/api-contract.md\` +7. **Design** — if SCOPE_FRONTEND=true. Use the existing design review checklist at \`${ctx.paths.skillRoot}/review/design-checklist.md\` + +Note which specialists were selected and which were skipped. Print the selection: +"Dispatching N specialists: [names]. Skipped: [names] (scope not detected)."`; +} + +function generateSpecialistDispatch(ctx: TemplateContext): string { + return `### Dispatch specialists in parallel + +For each selected specialist, launch an independent subagent via the Agent tool. +**Launch ALL selected specialists in a single message** (multiple Agent tool calls) +so they run in parallel. Each subagent has fresh context — no prior review bias. + +**Each specialist subagent prompt:** + +Construct the prompt for each specialist. The prompt includes: + +1. The specialist's checklist content (you already read the file above) +2. Stack context: "This is a {STACK} project." +3. Past learnings for this domain (if any exist): + +\`\`\`bash +${ctx.paths.binDir}/gstack-learnings-search --type pitfall --query "{specialist domain}" --limit 5 2>/dev/null || true +\`\`\` + +If learnings are found, include them: "Past learnings for this domain: {learnings}" + +4. Instructions: + +"You are a specialist code reviewer. Read the checklist below, then run +\`git diff origin/\` to get the full diff. Apply the checklist against the diff. + +For each finding, output a JSON object on its own line: +{\\"severity\\":\\"CRITICAL|INFORMATIONAL\\",\\"confidence\\":N,\\"path\\":\\"file\\",\\"line\\":N,\\"category\\":\\"category\\",\\"summary\\":\\"description\\",\\"fix\\":\\"recommended fix\\",\\"fingerprint\\":\\"path:line:category\\",\\"specialist\\":\\"name\\"} + +Required fields: severity, confidence, path, category, summary, specialist. +Optional: line, fix, fingerprint, evidence. + +If no findings: output \`NO FINDINGS\` and nothing else. +Do not output anything else — no preamble, no summary, no commentary. + +Stack context: {STACK} +Past learnings: {learnings or 'none'} + +CHECKLIST: +{checklist content}" + +**Subagent configuration:** +- Use \`subagent_type: "general-purpose"\` +- Do NOT use \`run_in_background\` — all specialists must complete before merge +- If any specialist subagent fails or times out, log the failure and continue with results from successful specialists. Specialists are additive — partial results are better than no results.`; +} + +function generateFindingsMerge(_ctx: TemplateContext): string { + return `### Step 4.6: Collect and merge findings + +After all specialist subagents complete, collect their outputs. + +**Parse findings:** +For each specialist's output: +1. If output is "NO FINDINGS" — skip, this specialist found nothing +2. Otherwise, parse each line as a JSON object. Skip lines that are not valid JSON. +3. Collect all parsed findings into a single list, tagged with their specialist name. + +**Fingerprint and deduplicate:** +For each finding, compute its fingerprint: +- If \`fingerprint\` field is present, use it +- Otherwise: \`{path}:{line}:{category}\` (if line is present) or \`{path}:{category}\` + +Group findings by fingerprint. For findings sharing the same fingerprint: +- Keep the finding with the highest confidence score +- Tag it: "MULTI-SPECIALIST CONFIRMED ({specialist1} + {specialist2})" +- Boost confidence by +1 (cap at 10) +- Note the confirming specialists in the output + +**Apply confidence gates:** +- Confidence 7+: show normally in the findings output +- Confidence 5-6: show with caveat "Medium confidence — verify this is actually an issue" +- Confidence 3-4: move to appendix (suppress from main findings) +- Confidence 1-2: suppress entirely + +**Compute PR Quality Score:** +After merging, compute the quality score: +\`quality_score = max(0, 10 - (critical_count * 2 + informational_count * 0.5))\` +Cap at 10. Log this in the review result at the end. + +**Output merged findings:** +Present the merged findings in the same format as the current review: + +\`\`\` +SPECIALIST REVIEW: N findings (X critical, Y informational) from Z specialists + +[For each finding, in order: CRITICAL first, then INFORMATIONAL, sorted by confidence descending] +[SEVERITY] (confidence: N/10, specialist: name) path:line — summary + Fix: recommended fix + [If MULTI-SPECIALIST CONFIRMED: show confirmation note] + +PR Quality Score: X/10 +\`\`\` + +These findings flow into Step 5 Fix-First alongside the CRITICAL pass findings from Step 4. +The Fix-First heuristic applies identically — specialist findings follow the same AUTO-FIX vs ASK classification.`; +} + +function generateRedTeam(ctx: TemplateContext): string { + return `### Red Team dispatch (conditional) + +**Activation:** Only if DIFF_LINES > 200 OR any specialist produced a CRITICAL finding. + +If activated, dispatch one more subagent via the Agent tool (foreground, not background). + +The Red Team subagent receives: +1. The red-team checklist from \`${ctx.paths.skillRoot}/review/specialists/red-team.md\` +2. The merged specialist findings from Step 4.6 (so it knows what was already caught) +3. The git diff command + +Prompt: "You are a red team reviewer. The code has already been reviewed by N specialists +who found the following issues: {merged findings summary}. Your job is to find what they +MISSED. Read the checklist, run \`git diff origin/\`, and look for gaps. +Output findings as JSON objects (same schema as the specialists). Focus on cross-cutting +concerns, integration boundary issues, and failure modes that specialist checklists +don't cover." + +If the Red Team finds additional issues, merge them into the findings list before +Step 5 Fix-First. Red Team findings are tagged with \`"specialist":"red-team"\`. + +If the Red Team returns NO FINDINGS, note: "Red Team review: no additional issues found." +If the Red Team subagent fails or times out, skip silently and continue.`; +} + +export function generateReviewArmy(ctx: TemplateContext): string { + // Codex host: strip entirely — Codex should not run Review Army + if (ctx.host === 'codex') return ''; + + const sections = [ + generateSpecialistSelection(ctx), + generateSpecialistDispatch(ctx), + generateFindingsMerge(ctx), + generateRedTeam(ctx), + ]; + + return sections.join('\n\n---\n\n'); +} diff --git a/scripts/resolvers/review.ts b/scripts/resolvers/review.ts index 5db22644..55d4abb6 100644 --- a/scripts/resolvers/review.ts +++ b/scripts/resolvers/review.ts @@ -784,16 +784,71 @@ After producing the completion checklist: **Include in PR body (Step 8):** Add a \`## Plan Completion\` section with the checklist summary.`); } else { - // review mode + // review mode — enhanced Delivery Integrity (Release 2: Review Army) sections.push(` +### Fallback Intent Sources (when no plan file found) + +When no plan file is detected, use these secondary intent sources: + +1. **Commit messages:** Run \`git log origin/..HEAD --oneline\`. Use judgment to extract real intent: + - Commits with actionable verbs ("add", "implement", "fix", "create", "remove", "update") are intent signals + - Skip noise: "WIP", "tmp", "squash", "merge", "chore", "typo", "fixup" + - Extract the intent behind the commit, not the literal message +2. **TODOS.md:** If it exists, check for items related to this branch or recent dates +3. **PR description:** Run \`gh pr view --json body -q .body 2>/dev/null\` for intent context + +**With fallback sources:** Apply the same Cross-Reference classification (DONE/PARTIAL/NOT DONE/CHANGED) using best-effort matching. Note that fallback-sourced items are lower confidence than plan-file items. + +### Investigation Depth + +For each PARTIAL or NOT DONE item, investigate WHY: + +1. Check \`git log origin/..HEAD --oneline\` for commits that suggest the work was started, attempted, or reverted +2. Read the relevant code to understand what was built instead +3. Determine the likely reason from this list: + - **Scope cut** — evidence of intentional removal (revert commit, removed TODO) + - **Context exhaustion** — work started but stopped mid-way (partial implementation, no follow-up commits) + - **Misunderstood requirement** — something was built but it doesn't match what the plan described + - **Blocked by dependency** — plan item depends on something that isn't available + - **Genuinely forgotten** — no evidence of any attempt + +Output for each discrepancy: +\`\`\` +DISCREPANCY: {PARTIAL|NOT_DONE} | {plan item} | {what was actually delivered} +INVESTIGATION: {likely reason with evidence from git log / code} +IMPACT: {HIGH|MEDIUM|LOW} — {what breaks or degrades if this stays undelivered} +\`\`\` + +### Learnings Logging (plan-file discrepancies only) + +**Only for discrepancies sourced from plan files** (not commit messages or TODOS.md), log a learning so future sessions know this pattern occurred: + +\`\`\`bash +~/.claude/skills/gstack/bin/gstack-learnings-log '{ + "type": "pitfall", + "key": "plan-delivery-gap-KEBAB_SUMMARY", + "insight": "Planned X but delivered Y because Z", + "confidence": 8, + "source": "observed", + "files": ["PLAN_FILE_PATH"] +}' +\`\`\` + +Replace KEBAB_SUMMARY with a kebab-case summary of the gap, and fill in the actual values. + +**Do NOT log learnings from commit-message-derived or TODOS.md-derived discrepancies.** These are informational in the review output but too noisy for durable memory. + ### Integration with Scope Drift Detection The plan completion results augment the existing Scope Drift Detection. If a plan file is found: - **NOT DONE items** become additional evidence for **MISSING REQUIREMENTS** in the scope drift report. - **Items in the diff that don't match any plan item** become evidence for **SCOPE CREEP** detection. +- **HIGH-impact discrepancies** trigger AskUserQuestion: + - Show the investigation findings + - Options: A) Stop and implement missing items, B) Ship anyway + create P1 TODOs, C) Intentionally dropped -This is **INFORMATIONAL** — does not block the review (consistent with existing scope drift behavior). +This is **INFORMATIONAL** unless HIGH-impact discrepancies are found (then it gates via AskUserQuestion). Update the scope drift output to include plan file context: @@ -803,11 +858,11 @@ Intent: Plan: Delivered: <1-line summary of what the diff actually does> Plan items: N DONE, M PARTIAL, K NOT DONE -[If NOT DONE: list each missing item] +[If NOT DONE: list each missing item with investigation] [If scope creep: list each out-of-scope change not in the plan] \`\`\` -**No plan file found:** Fall back to existing scope drift behavior (check TODOS.md and PR description only).`); +**No plan file found:** Use commit messages and TODOS.md as fallback sources (see above). If no intent sources at all, skip with: "No intent sources detected — skipping completion audit."`); } return sections.join('\n');