From 8b16aa977b96bf708b963eef7ca0ae6fbf26678e Mon Sep 17 00:00:00 2001
From: Garry Tan <garrytan@gmail.com>
Date: Fri, 20 Mar 2026 08:33:59 -0700
Subject: [PATCH] =?UTF-8?q?feat:=20test=20failure=20ownership=20triage=20?=
 =?UTF-8?q?=E2=80=94=20see=20something=20say=20something?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds two new preamble sections to all gstack skills:
- Repo Ownership Mode: explains solo vs collaborative behavior
- See Something, Say Something: proactive issue flagging principle

Adds {{TEST_FAILURE_TRIAGE}} template variable (opt-in, used by /ship):
- Classifies test failures as in-branch vs pre-existing
- Solo mode defaults to "investigate and fix now"
- Collaborative mode offers "blame + assign GitHub issue" option
- Also offers P0 TODO and skip options

/ship Step 3 now triages test failures instead of hard-stopping on all
failures. In-branch failures still block shipping. Pre-existing failures
get user-directed triage based on repo mode.

Adds P2 TODO for gstack notes system (deferred lightweight reminder).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 TODOS.md                      |  12 ++++
 scripts/gen-skill-docs.ts     | 112 ++++++++++++++++++++++++++++++++++
 ship/SKILL.md.tmpl            |   8 ++-
 test/helpers/touchfiles.ts    |   4 +-
 test/skill-validation.test.ts |  55 +++++++++++++++++
 5 files changed, 187 insertions(+), 4 deletions(-)

diff --git a/TODOS.md b/TODOS.md
index 766c3a78..2b2e324d 100644
--- a/TODOS.md
+++ b/TODOS.md
@@ -163,6 +163,18 @@
 **Priority:** P2
 **Depends on:** None
 
+### Gstack notes system for deferred test failures
+
+**What:** Add lightweight notes persistence — JSON files in `~/.gstack/projects/{SLUG}/notes/`, surfaced at session start via preamble, manual resolve.
+
+**Why:** Gives solo devs a "fix it later" path for pre-existing test failures that auto-surfaces reminders next session. Currently the triage offers fix/TODO/skip but no lightweight "remind me" option.
+
+**Context:** Deferred from the test failure ownership PR because auto-resolve by test name matching is fragile (renamed tests, split failures, changed filenames break matching). Start with manual resolve only. Schema: `{type, title, description, test_file, error_summary, branch_when_noticed, created, priority, status}`. Surface in preamble with cap of 5 notes shown.
+
+**Effort:** S
+**Priority:** P2
+**Depends on:** Test failure ownership triage (bin/gstack-repo-mode + {{TEST_FAILURE_TRIAGE}})
+
 ### Post-deploy verification (ship + browse)
 
 **What:** After push, browse staging/preview URL, screenshot key pages, check console for JS errors, compare staging vs prod via snapshot diff. Include verification screenshots in PR body. STOP if critical errors found.
diff --git a/scripts/gen-skill-docs.ts b/scripts/gen-skill-docs.ts
index 8ac36a46..820671a6 100644
--- a/scripts/gen-skill-docs.ts
+++ b/scripts/gen-skill-docs.ts
@@ -151,6 +151,8 @@ _PROACTIVE=$(${ctx.paths.binDir}/gstack-config get proactive 2>/dev/null || echo
 _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown")
 echo "BRANCH: $_BRANCH"
 echo "PROACTIVE: $_PROACTIVE"
+source <(${ctx.paths.binDir}/gstack-repo-mode 2>/dev/null) || REPO_MODE=unknown
+echo "REPO_MODE: $REPO_MODE"
 _LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no")
 echo "LAKE_INTRO: $_LAKE_SEEN"
 _TEL=$(~/.claude/skills/gstack/bin/gstack-config get telemetry 2>/dev/null || true)
@@ -262,6 +264,114 @@ AI-assisted coding makes the marginal cost of completeness near-zero. When you p
 - BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.")`;
 }
 
+function generateRepoModeSection(): string {
+  return `## Repo Ownership Mode — See Something, Say Something
+
+\`REPO_MODE\` from the preamble tells you who owns issues in this repo:
+
+- **\`solo\`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action.
+- **\`collaborative\`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing.
+- **\`unknown\`** — Treat as collaborative (safer default — ask before fixing).
+
+**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on.
+
+Never let a noticed issue silently pass. The whole point is proactive communication.`;
+}
+
+function generateTestFailureTriage(): string {
+  return `## Test Failure Ownership Triage
+
+When tests fail, do NOT immediately stop. First, determine ownership:
+
+### Step T1: Classify each failure
+
+For each failing test:
+
+1. **Get the files changed on this branch:**
+   \`\`\`bash
+   git diff origin/<base>...HEAD --name-only
+   \`\`\`
+
+2. **Classify the failure:**
+   - **In-branch** if: the failing test file itself was modified on this branch, OR the test output references code that was changed on this branch, OR you can trace the failure to a change in the branch diff.
+   - **Likely pre-existing** if: neither the test file nor the code it tests was modified on this branch, AND the failure is unrelated to any branch change you can identify.
+   - **When ambiguous, default to in-branch.** It is safer to stop the developer than to let a broken test ship. Only classify as pre-existing when you are confident.
+
+   This classification is heuristic — use your judgment reading the diff and the test output. You do not have a programmatic dependency graph.
+
+### Step T2: Handle in-branch failures
+
+**STOP.** These are your failures. Show them and do not proceed. The developer must fix their own broken tests before shipping.
+
+### Step T3: Handle pre-existing failures
+
+Check \`REPO_MODE\` from the preamble output.
+
+**If REPO_MODE is \`solo\`:**
+
+Use AskUserQuestion:
+
+> These test failures appear pre-existing (not caused by your branch changes):
+>
+> [list each failure with file:line and brief error description]
+>
+> Since this is a solo repo, you're the only one who will fix these.
+>
+> RECOMMENDATION: Choose A — fix now while the context is fresh. Completeness: 9/10.
+> A) Investigate and fix now (human: ~2-4h / CC: ~15min) — Completeness: 10/10
+> B) Add as P0 TODO — fix after this branch lands — Completeness: 7/10
+> C) Skip — I know about this, ship anyway — Completeness: 3/10
+
+**If REPO_MODE is \`collaborative\` or \`unknown\`:**
+
+Use AskUserQuestion:
+
+> These test failures appear pre-existing (not caused by your branch changes):
+>
+> [list each failure with file:line and brief error description]
+>
+> This is a collaborative repo — these may be someone else's responsibility.
+>
+> RECOMMENDATION: Choose B — assign it to whoever broke it so the right person fixes it. Completeness: 9/10.
+> A) Investigate and fix now anyway — Completeness: 10/10
+> B) Blame + assign GitHub issue to the author — Completeness: 9/10
+> C) Add as P0 TODO — Completeness: 7/10
+> D) Skip — ship anyway — Completeness: 3/10
+
+### Step T4: Execute the chosen action
+
+**If "Investigate and fix now":**
+- Switch to /investigate mindset: root cause first, then minimal fix.
+- Fix the pre-existing failure.
+- Commit the fix separately from the branch's changes: \`git commit -m "fix: pre-existing test failure in <test-file>"\`
+- Continue with the workflow.
+
+**If "Add as P0 TODO":**
+- If \`TODOS.md\` exists, add the entry following the format in \`review/TODOS-format.md\` (or \`.claude/skills/review/TODOS-format.md\`).
+- If \`TODOS.md\` does not exist, create it with the standard header and add the entry.
+- Entry should include: title, the error output, which branch it was noticed on, and priority P0.
+- Continue with the workflow — treat the pre-existing failure as non-blocking.
+
+**If "Blame + assign GitHub issue" (collaborative only):**
+- Find who last modified the failing area:
+  \`\`\`bash
+  git log --format="%an (%ae)" -1 -- <failing-test-file>
+  \`\`\`
+- Create a GitHub issue assigned to that person:
+  \`\`\`bash
+  gh issue create \\
+    --title "Pre-existing test failure: <test-name>" \\
+    --body "Found failing on branch <current-branch>. Failure is pre-existing.\\n\\n**Error:**\\n\`\`\`\\n<first 10 lines>\\n\`\`\`\\n\\n**Last modified by:** <author>\\n**Noticed by:** gstack /ship on <date>" \\
+    --assignee "<github-username>"
+  \`\`\`
+- If \`gh\` is not available or \`--assignee\` fails (user not in org, etc.), create the issue without assignee and note who should look at it in the body.
+- Continue with the workflow.
+
+**If "Skip":**
+- Continue with the workflow.
+- Note in output: "Pre-existing test failure skipped: <test-name>"`;
+}
+
 function generateContributorMode(): string {
   return `## Contributor Mode
 
@@ -364,6 +474,7 @@ function generatePreamble(ctx: TemplateContext): string {
     generateTelemetryPrompt(ctx),
     generateAskUserFormat(ctx),
     generateCompletenessSection(),
+    generateRepoModeSection(),
     generateContributorMode(),
     generateCompletionStatus(),
   ].join('\n\n');
@@ -1272,6 +1383,7 @@ const RESOLVERS: Record<string, (ctx: TemplateContext) => string> = {
   DESIGN_REVIEW_LITE: generateDesignReviewLite,
   REVIEW_DASHBOARD: generateReviewDashboard,
   TEST_BOOTSTRAP: generateTestBootstrap,
+  TEST_FAILURE_TRIAGE: generateTestFailureTriage,
 };
 
 // ─── Codex Helpers ───────────────────────────────────────────
diff --git a/ship/SKILL.md.tmpl b/ship/SKILL.md.tmpl
index 22dff7d0..d9669bd6 100644
--- a/ship/SKILL.md.tmpl
+++ b/ship/SKILL.md.tmpl
@@ -26,7 +26,7 @@ You are running the `/ship` workflow. This is a **non-interactive, fully automat
 **Only stop for:**
 - On the base branch (abort)
 - Merge conflicts that can't be auto-resolved (stop, show conflicts)
-- Test failures (stop, show failures)
+- In-branch test failures (pre-existing failures are triaged, not auto-blocking)
 - Pre-landing review finds ASK items that need user judgment
 - MINOR or MAJOR version bump needed (ask — see Step 4)
 - Greptile review comments that need user decision (complex fixes, false positives)
@@ -118,7 +118,11 @@ wait
 
 After both complete, read the output files and check pass/fail.
 
-**If any test fails:** Show the failures and **STOP**. Do not proceed.
+**If any test fails:** Do NOT immediately stop. Apply the Test Failure Ownership Triage:
+
+{{TEST_FAILURE_TRIAGE}}
+
+**After triage:** If any in-branch failures remain unfixed, **STOP**. Do not proceed. If all failures were pre-existing and handled (fixed, TODOed, assigned, or skipped), continue to Step 3.25.
 
 **If all pass:** Continue silently — just note the counts briefly.
 
diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts
index 53cc709c..2ba0e10b 100644
--- a/test/helpers/touchfiles.ts
+++ b/test/helpers/touchfiles.ts
@@ -64,7 +64,7 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
   'plan-eng-review-artifact':  ['plan-eng-review/**'],
 
   // Ship
-  'ship-base-branch': ['ship/**'],
+  'ship-base-branch': ['ship/**', 'bin/gstack-repo-mode'],
 
   // Retro
   'retro':             ['retro/**'],
@@ -84,7 +84,7 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
   'qa-bootstrap': ['qa/**', 'browse/src/**', 'ship/**'],
 
   // Ship coverage audit
-  'ship-coverage-audit': ['ship/**'],
+  'ship-coverage-audit': ['ship/**', 'bin/gstack-repo-mode'],
 
   // Design
   'design-consultation-core':     ['design-consultation/**'],
diff --git a/test/skill-validation.test.ts b/test/skill-validation.test.ts
index ea683762..6d68f086 100644
--- a/test/skill-validation.test.ts
+++ b/test/skill-validation.test.ts
@@ -1324,3 +1324,58 @@ describe('Codex skill validation', () => {
     }
   });
 });
+
+// --- Repo mode and test failure triage validation ---
+
+describe('Repo mode preamble validation', () => {
+  test('generated SKILL.md preamble contains REPO_MODE output', () => {
+    const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
+    expect(content).toContain('REPO_MODE:');
+    expect(content).toContain('gstack-repo-mode');
+  });
+
+  test('generated SKILL.md contains See Something Say Something section', () => {
+    const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
+    expect(content).toContain('See Something, Say Something');
+    expect(content).toContain('REPO_MODE');
+    expect(content).toContain('solo');
+    expect(content).toContain('collaborative');
+  });
+});
+
+describe('Test failure triage in ship skill', () => {
+  test('ship/SKILL.md contains Test Failure Ownership Triage', () => {
+    const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
+    expect(content).toContain('Test Failure Ownership Triage');
+  });
+
+  test('ship/SKILL.md triage uses git diff for classification', () => {
+    const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
+    expect(content).toContain('git diff origin/<base>...HEAD --name-only');
+  });
+
+  test('ship/SKILL.md triage has solo and collaborative paths', () => {
+    const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
+    expect(content).toContain('REPO_MODE');
+    expect(content).toContain('solo');
+    expect(content).toContain('collaborative');
+    expect(content).toContain('Investigate and fix now');
+    expect(content).toContain('Add as P0 TODO');
+  });
+
+  test('ship/SKILL.md triage has GitHub issue assignment for collaborative mode', () => {
+    const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
+    expect(content).toContain('gh issue create');
+    expect(content).toContain('--assignee');
+  });
+
+  test('{{TEST_FAILURE_TRIAGE}} placeholder is fully resolved in ship/SKILL.md', () => {
+    const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
+    expect(content).not.toContain('{{TEST_FAILURE_TRIAGE}}');
+  });
+
+  test('ship/SKILL.md uses in-branch language for stop condition', () => {
+    const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
+    expect(content).toContain('In-branch test failures');
+  });
+});