Merge remote-tracking branch 'origin/main' into garrytan/workspace-aware-ship

Rebumped v1.8.0.0 -> v1.11.0.0 (minor-past main's v1.10.1.0) using bin/gstack-next-version — the same queue-aware path this branch introduces. CHANGELOG repositioned so v1.11.0.0 sits above main's new entries (v1.10.1.0 / v1.10.0.0 / v1.9.0.0). Conflicts resolved: - VERSION, package.json: rebumped to v1.11.0.0 (util-picked) - bin/gstack-config: merged both lists (workspace_root + gbrain keys) - CHANGELOG.md: hoisted v1.11.0.0 entry above main's new entries Pre-existing failures in main (4) documented but not fixed in this PR: 1. gstack-brain-sync secret scan > blocks bearer-json (brain-sync tests) 2. no files larger than 2MB (security-bench fixture, already TODO'd) 3. selectTests > skill-specific change (touchfiles scoping) 4. Opus 4.7 overlay pacing directive (expectation stale after v1.10.1.0 removed the Fan out nudge) Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-02 11:45:20 +02:00 · 2026-04-23 21:20:25 -07:00
parent 416a56a5c8
commit a64d70ba35
87 changed files with 14392 additions and 788 deletions
@@ -0,0 +1,133 @@
+/**
+ * Preflight for the overlay efficacy harness.
+ *
+ * Confirms, before any paid eval runs:
+ *   1. `@anthropic-ai/claude-agent-sdk` loads and `query()` is the expected shape.
+ *   2. `claude-opus-4-7` is a live API model ID (not a Claude Code alias).
+ *   3. The SDK event stream contains the types we assume (system init, assistant,
+ *      result) with the fields we destructure.
+ *   4. `scripts/resolvers/model-overlay.ts` resolves `{{INHERIT:claude}}` against
+ *      `opus-4-7.md` AND the resolved text contains the "Fan out explicitly" nudge.
+ *   5. A local `claude` binary exists at `which claude` so binary pinning is possible.
+ *
+ * Run: bun run scripts/preflight-agent-sdk.ts
+ *
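+ * Exit 0 on success. Exit non-zero with a clear message on any failure. No
+ * side effects beyond stdout and a ~15 token API call. When ANTHROPIC_API_KEY
+ * is unset the live SDK query is skipped rather than failed, so exit 0 then
+ * only covers the overlay-resolver and local-binary checks.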
+ */
+
+import { query, type SDKMessage } from '@anthropic-ai/claude-agent-sdk';
+import { readOverlay } from './resolvers/model-overlay';
+import { execSync } from 'child_process';
+
+/**
+ * Runs the preflight checks in sequence, printing one `ok` / `FAIL` / `skip`
+ * line per check, and exits the process with status 1 if any check failed.
+ *
+ * Sections, in execution order:
+ *   1. Overlay resolver: `readOverlay('opus-4-7')` returns non-empty text with
+ *      no leftover `{{INHERIT:` directive and containing "Fan out explicitly".
+ *   2. Binary pinning: `which claude` resolves to a path.
+ *   3. SDK query end-to-end: one `query()` call against `claude-opus-4-7`,
+ *      then checks for system/init, assistant, and result events. Skipped
+ *      (not failed) when ANTHROPIC_API_KEY is unset.
+ *
+ * Failures are collected rather than thrown so every section still runs.
+ */
+async function main(): Promise<void> {
+  const failures: string[] = [];
+  const pass = (msg: string) => console.log(`  ok  ${msg}`);
+  const fail = (msg: string) => {
+    console.log(`  FAIL  ${msg}`);
+    failures.push(msg);
+  };
+
+  // 1. Overlay resolver + fanout nudge text
+  console.log('1. Overlay resolver');
+  const resolved = readOverlay('opus-4-7');
+  if (!resolved) {
+    fail("readOverlay('opus-4-7') returned empty");
+  } else {
+    pass(`resolved overlay length: ${resolved.length} chars`);
+    if (resolved.includes('{{INHERIT:')) {
+      fail('resolved overlay still contains {{INHERIT:...}} directive');
+    } else {
+      pass('no unresolved INHERIT directives');
+    }
+    // NOTE(review): the merge commit that carries this file says v1.10.1.0
+    // removed the "Fan out" nudge from the Opus 4.7 overlay — confirm this
+    // expectation is still valid before relying on it.
+    if (!/Fan out explicitly/i.test(resolved)) {
+      fail('resolved overlay does not contain "Fan out explicitly" text');
+    } else {
+      pass('fanout nudge text present in resolved overlay');
+    }
+  }
+
+  // 2. Local claude binary exists
+  console.log('\n2. Binary pinning');
+  let claudePath: string | null = null;
+  try {
+    claudePath = execSync('which claude', { encoding: 'utf-8' }).trim();
+    pass(`local claude binary: ${claudePath}`);
+  } catch {
+    fail('`which claude` failed — cannot pin binary');
+  }
+
+  // 3. SDK query end-to-end
+  console.log('\n3. SDK query end-to-end');
+  if (!process.env.ANTHROPIC_API_KEY) {
+    // Deliberately a skip, not a failure: nothing is pushed onto `failures`.
+    console.log('  skip  ANTHROPIC_API_KEY not set — cannot test live query');
+  } else {
+    try {
+      const events: SDKMessage[] = [];
+      const q = query({
+        prompt: 'say pong',
+        options: {
+          model: 'claude-opus-4-7',
+          systemPrompt: '',
+          tools: [],
+          permissionMode: 'bypassPermissions',
+          allowDangerouslySkipPermissions: true,
+          settingSources: [],
+          maxTurns: 1,
+          pathToClaudeCodeExecutable: claudePath ?? undefined,
+          env: { ANTHROPIC_API_KEY: process.env.ANTHROPIC_API_KEY },
+        },
+      });
+      // Drain the whole stream first; the checks below inspect it as a batch.
+      for await (const ev of q) events.push(ev);
+      pass(`received ${events.length} events`);
+
+      const init = events.find(
+        (e) => e.type === 'system' && (e as { subtype?: string }).subtype === 'init',
+      ) as { claude_code_version?: string; model?: string } | undefined;
+      if (!init) {
+        fail('no system/init event received');
+      } else {
+        pass(`system init: claude_code_version=${init.claude_code_version}, model=${init.model}`);
+      }
+
+      const assistantEvents = events.filter((e) => e.type === 'assistant');
+      if (assistantEvents.length === 0) {
+        fail('no assistant events received — model ID may be rejected');
+      } else {
+        pass(`received ${assistantEvents.length} assistant event(s)`);
+        const first = assistantEvents[0] as { message?: { content?: unknown[] } };
+        const content = first.message?.content;
+        if (!Array.isArray(content)) {
+          fail('first assistant event has no content[] array');
+        } else {
+          pass(`first assistant content[] has ${content.length} block(s)`);
+        }
+      }
+
+      const result = events.find((e) => e.type === 'result') as
+        | { subtype?: string; is_error?: boolean; total_cost_usd?: number; num_turns?: number }
+        | undefined;
+      if (!result) {
+        fail('no result event received');
+      } else {
+        // `?.toFixed()` on a missing cost would render as "$undefined"; print n/a instead.
+        const cost =
+          typeof result.total_cost_usd === 'number' ? `$${result.total_cost_usd.toFixed(4)}` : 'n/a';
+        const summary = `result: subtype=${result.subtype}, cost=${cost}, turns=${result.num_turns}`;
+        // The presence of a result event is not enough: it may carry a
+        // non-success subtype or an error flag, which must fail the preflight.
+        if (result.subtype !== 'success' || result.is_error === true) {
+          fail(`${summary} — query did not complete successfully`);
+        } else {
+          pass(summary);
+        }
+      }
+    } catch (err) {
+      fail(`SDK query threw: ${err instanceof Error ? err.message : String(err)}`);
+    }
+  }
+
+  console.log();
+  if (failures.length > 0) {
+    console.log(`PREFLIGHT FAILED: ${failures.length} check(s) failed`);
+    process.exit(1);
+  }
+  console.log('PREFLIGHT OK');
+}
+
+// Entry point: log anything main() rejects with and exit non-zero.
+main().catch((error) => {
+  console.error(error);
+  process.exit(1);
+});
@@ -24,7 +24,7 @@ const OVERLAY_DIR = path.resolve(import.meta.dir, '../../model-overlays');

 const INHERIT_RE = /^\s*\{\{INHERIT:([a-z0-9-]+(?:\.[0-9]+)*)\}\}\s*\n/;

-function readOverlay(model: string, seen: Set<string> = new Set()): string {
+export function readOverlay(model: string, seen: Set<string> = new Set()): string {
  if (seen.has(model)) return ''; // cycle guard
  seen.add(model);

@@ -37,6 +37,9 @@ import { generateWritingStyleMigration } from './preamble/generate-writing-style
 // Host-specific instructions
 import { generateBrainHealthInstruction } from './preamble/generate-brain-health-instruction';

+// GBrain cross-machine sync (runs at skill start; end-side handled in completion-status)
+import { generateBrainSyncBlock } from './preamble/generate-brain-sync-block';
+
 // Behavioral / voice
 import { generateVoiceDirective } from './preamble/generate-voice-directive';

@@ -84,11 +87,16 @@ export function generatePreamble(ctx: TemplateContext): string {
    generateVendoringDeprecation(ctx),
    generateSpawnedSessionCheck(),
    generateBrainHealthInstruction(ctx),
+    // AskUserQuestion Format renders BEFORE the model overlay so the pacing rule
+    // is the ambient default; the overlay's behavioral nudges land as subordinate
+    // patches. Opus 4.7 reads top-to-bottom and absorbs the first pacing directive
+    // it hits; reversing this order regresses plan-review cadence (v1.6.4.0 bug).
+    ...(tier >= 2 ? [generateAskUserFormat(ctx)] : []),
+    generateBrainSyncBlock(ctx),
    generateModelOverlay(ctx),
    generateVoiceDirective(tier),
    ...(tier >= 2 ? [
      generateContextRecovery(ctx),
-      generateAskUserFormat(ctx),
      generateWritingStyle(ctx),
      generateCompletenessSection(),
      generateConfusionProtocol(),
@@ -3,16 +3,130 @@ import type { TemplateContext } from '../types';
 export function generateAskUserFormat(_ctx: TemplateContext): string {
  return `## AskUserQuestion Format

-**ALWAYS follow this structure for every AskUserQuestion call. All four elements are non-skippable. If you find yourself about to skip any of them, stop and back up.**
+**ALWAYS follow this structure for every AskUserQuestion call. Every element is non-skippable. If you find yourself about to skip any of them, stop and back up.**

-1. **Re-ground:** State the project, the current branch (use the \`_BRANCH\` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences)
-2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time.
-3. **Recommend (ALWAYS):** Every question ends with \`RECOMMENDATION: Choose [X] because [one-line reason]\` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind.
-4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with \`Completeness: N/10\` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip \`Completeness: N/10\` entirely and write one line: \`Note: options differ in kind, not coverage — no completeness score.\` Do not fabricate filler scores.
-5. **Options:** Lettered options: \`A) ... B) ... C) ...\` — when an option involves effort, show both scales: \`(human: ~X / CC: ~Y)\`
+### Required shape

-Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex.
+Every AskUserQuestion reads like a decision brief, not a bullet list:

-Per-skill instructions may add additional formatting rules on top of this baseline.`;
+\`\`\`
+D<N> — <one-line question title>
+
+ELI10: <plain English a 16-year-old could follow, 2-4 sentences, name the stakes>
+
+Stakes if we pick wrong: <one sentence on what breaks, what user sees, what's lost>
+
+Recommendation: <choice> because <one-line reason>
+
+Completeness: A=X/10, B=Y/10   (or: Note: options differ in kind, not coverage — no completeness score)
+
+Pros / cons:
+
+A) <option label> (recommended)
+  ✅ <pro — concrete, observable, ≥40 chars>
+  ✅ <pro>
+  ❌ <con — honest, ≥40 chars>
+
+B) <option label>
+  ✅ <pro>
+  ❌ <con>
+
+Net: <one-line synthesis of what you're actually trading off>
+\`\`\`
+
+### Element rules
+
+1. **D-numbering.** First question in a skill invocation is \`D1\`. Increment per
+   question within the same skill. This is a model-level instruction, not a
+   runtime counter — you count your own questions. Nested skill invocation
+   (e.g., \`/plan-ceo-review\` running \`/office-hours\` inline) starts its own
+   D1; label as \`D1 (office-hours)\` to disambiguate when the user will see
+   both. Drift is expected over long sessions; minor inconsistency is fine.
+
+2. **Re-ground.** Before ELI10, state the project, current branch (use the
+   \`_BRANCH\` value from the preamble, NOT conversation history or gitStatus),
+   and the current plan/task. 1-2 sentences. Assume the user hasn't looked at
+   this window in 20 minutes.
+
+3. **ELI10 (ALWAYS).** Explain in plain English a smart 16-year-old could
+   follow. Concrete examples and analogies, not function names. Say what it
+   DOES, not what it's called. This is not preamble — the user is about to
+   make a decision and needs context. Even in terse mode, emit the ELI10.
+
+4. **Stakes if we pick wrong (ALWAYS).** One sentence naming what breaks in
+   concrete terms (pain avoided / capability unlocked / consequence named).
+   "Users see a 3-second spinner" beats "performance may degrade." Forces
+   the trade-off to be real.
+
+5. **Recommendation (ALWAYS).** \`Recommendation: <choice> because <one-line
+   reason>\` on its own line. Never omit it. Required for every AskUserQuestion,
+   even when neutral-posture (see rule 9). The \`(recommended)\` label on the
+   option is REQUIRED — \`scripts/resolvers/question-tuning.ts\` reads it to
+   power the AUTO_DECIDE path. Omitting it breaks auto-decide.
+
+6. **Completeness scoring (when meaningful).** When options differ in
+   coverage (full test coverage vs happy path vs shortcut, complete error
+   handling vs partial), score each \`Completeness: N/10\` on its own line.
+   Calibration: 10 = complete, 7 = happy path only, 3 = shortcut. Flag any
+   option ≤5 where a higher-completeness option exists. When options differ
+   in kind (review posture, architectural A-vs-B, cherry-pick Add/Defer/Skip,
+   two different kinds of systems), SKIP the score and write one line:
+   \`Note: options differ in kind, not coverage — no completeness score.\`
+   Do NOT fabricate filler scores — empty 10/10 on every option is worse
+   than no score.
+
+7. **Pros / cons block.** Every option gets per-bullet ✅ (pro) and ❌ (con)
+   markers. Rules:
+   - **Minimum 2 pros and 1 con per option.** If you can't name a con for
+     the recommended option, the recommendation is hollow — go find one. If
+     you can't name a pro for the rejected option, the question isn't real.
+   - **Minimum 40 characters per bullet.** \`✅ Simple\` is not a pro. \`✅
+     Reuses the YAML frontmatter format already in MEMORY.md, zero new
+     parser\` is a pro. Concrete, observable, specific.
+   - **Hard-stop escape** for genuinely one-sided choices (destructive-action
+     confirmation, one-way doors): a single bullet \`✅ No cons — this is a
+     hard-stop choice\` satisfies the rule. Use sparingly; overuse flips a
+     decision brief into theater.
+
+8. **Net line (ALWAYS).** Closes the decision with a one-sentence synthesis
+   of what the user is actually trading off. From the reference screenshot:
+   *"The new-format case is speculative. The copy-format case is immediate
+   leverage. Copy now, evolve later if a real pattern emerges."* Not a
+   summary — a verdict frame.
+
+9. **Neutral-posture handling.** When the skill explicitly says "neutral
+   recommendation posture" (SELECTIVE EXPANSION cherry-picks, taste calls,
+   kind-differentiated choices where neither side dominates), the
+   Recommendation line reads: \`Recommendation: <default-choice> — this is a
+   taste call, no strong preference either way\`. The \`(recommended)\` label
+   STAYS on the default option (machine-readable hint for AUTO_DECIDE). The
+   \`— this is a taste call\` prose is the human-readable neutrality signal.
+   Both coexist.
+
+10. **Effort both-scales.** When an option involves effort, show both human
+    and CC scales: \`(human: ~2 days / CC: ~15 min)\`.
+
+11. **Tool_use, not prose.** A markdown block labeled \`Question:\` is not a
+    question — the user never sees it as interactive. If you wrote one in
+    prose, stop and reissue as an actual AskUserQuestion tool_use. The rich
+    markdown goes in the question body; the \`options\` array stays short
+    labels (A, B, C).
+
+### Self-check before emitting
+
+Before calling AskUserQuestion, verify:
+- [ ] D<N> header present
+- [ ] ELI10 paragraph present (stakes line too)
+- [ ] Recommendation line present with concrete reason
+- [ ] Completeness scored (coverage) OR kind-note present (kind)
+- [ ] Every option has ≥2 ✅ and ≥1 ❌, each ≥40 chars (or hard-stop escape)
+- [ ] (recommended) label on one option (even for neutral-posture — see rule 9)
+- [ ] Net line closes the decision
+- [ ] You are calling the tool, not writing prose
+
+If you'd need to read the source to understand your own explanation, it's
+too complex — simplify before emitting.
+
+Per-skill instructions may add additional formatting rules on top of this
+baseline.`;
 }
-
@@ -0,0 +1,124 @@
+/**
+ * gbrain-sync preamble block.
+ *
+ * Emits bash that runs at every skill invocation, in this order:
+ *   1. If ~/.gstack-brain-remote.txt exists AND ~/.gstack/.git is missing AND
+ *      gbrain_sync_mode reads "off", surface a restore-available hint (does
+ *      NOT auto-run restore).
+ *   2. If sync is on, on first skill of the day (24h cache via
+ *      .brain-last-pull): `git fetch` + ff-only merge (JSONL merge driver
+ *      handles conflicts).
+ *   3. If sync is on, run `gstack-brain-sync --once` (drain + push).
+ *   4. Emit a `BRAIN_SYNC:` status line so every skill surfaces health.
+ *
+ * Also emits prose instructions for the host LLM to fire a one-time privacy
+ * stop-gate via AskUserQuestion when gbrain_sync_mode is unset and gbrain
+ * is available on the host.
+ *
+ * Block emitted across all tiers. Internal bash short-circuits when feature
+ * is disabled; cost is <5ms.
+ *
+ * Skill-end sync is handled by the completion-status generator via a call
+ * to `gstack-brain-sync --discover-new` + `--once`.
+ */
+import type { TemplateContext } from '../types';
+
+/**
+ * Builds the "GBrain Sync (skill start)" preamble section as markdown text.
+ *
+ * The returned string contains, in order: a bash block (restore hint,
+ * once-per-day pull, queue drain, `BRAIN_SYNC:` status line), an optional
+ * restore-offer paragraph, the one-time privacy stop-gate instructions, and
+ * the skill-end sync commands.
+ *
+ * Every bash block after the first spells out the full binary path instead of
+ * reusing `$_BRAIN_CONFIG_BIN` / `$_BRAIN_SYNC_BIN`: those variables are set
+ * in the first block and are not guaranteed to exist if the host runs a later
+ * block in a fresh shell.
+ *
+ * NOTE(review): the restore hint is only printed when gbrain_sync_mode reads
+ * "off", yet its text tells the user to set gbrain_sync_mode to "off" to
+ * dismiss it — confirm which of the two is intended.
+ *
+ * @param ctx Template context; reads `ctx.host` (the restore-offer paragraph
+ *   is emitted only for 'gbrain' and 'hermes') and `ctx.paths.binDir`.
+ * @returns The markdown section text.
+ */
+export function generateBrainSyncBlock(ctx: TemplateContext): string {
+  const isBrainHost = ctx.host === 'gbrain' || ctx.host === 'hermes';
+  return `## GBrain Sync (skill start)
+
+\`\`\`bash
+# gbrain-sync: drain pending writes, pull once per day. Silent no-op when
+# the feature isn't initialized or gbrain_sync_mode is "off". See
+# docs/gbrain-sync.md.
+
+_GSTACK_HOME="\${GSTACK_HOME:-$HOME/.gstack}"
+_BRAIN_REMOTE_FILE="$HOME/.gstack-brain-remote.txt"
+_BRAIN_SYNC_BIN="${ctx.paths.binDir}/gstack-brain-sync"
+_BRAIN_CONFIG_BIN="${ctx.paths.binDir}/gstack-config"
+
+_BRAIN_SYNC_MODE=$("$_BRAIN_CONFIG_BIN" get gbrain_sync_mode 2>/dev/null || echo off)
+
+# New-machine hint: URL file present, local .git missing, sync not yet enabled.
+if [ -f "$_BRAIN_REMOTE_FILE" ] && [ ! -d "$_GSTACK_HOME/.git" ] && [ "$_BRAIN_SYNC_MODE" = "off" ]; then
+  _BRAIN_NEW_URL=$(head -1 "$_BRAIN_REMOTE_FILE" 2>/dev/null | tr -d '[:space:]')
+  if [ -n "$_BRAIN_NEW_URL" ]; then
+    echo "BRAIN_SYNC: brain repo detected: $_BRAIN_NEW_URL"
+    echo "BRAIN_SYNC: run 'gstack-brain-restore' to pull your cross-machine memory (or 'gstack-config set gbrain_sync_mode off' to dismiss forever)"
+  fi
+fi
+
+# Active-sync path.
+if [ -d "$_GSTACK_HOME/.git" ] && [ "$_BRAIN_SYNC_MODE" != "off" ]; then
+  # Once-per-day pull.
+  _BRAIN_LAST_PULL_FILE="$_GSTACK_HOME/.brain-last-pull"
+  _BRAIN_NOW=$(date +%s)
+  _BRAIN_DO_PULL=1
+  if [ -f "$_BRAIN_LAST_PULL_FILE" ]; then
+    _BRAIN_LAST=$(cat "$_BRAIN_LAST_PULL_FILE" 2>/dev/null || echo 0)
+    _BRAIN_AGE=$(( _BRAIN_NOW - _BRAIN_LAST ))
+    [ "$_BRAIN_AGE" -lt 86400 ] && _BRAIN_DO_PULL=0
+  fi
+  if [ "$_BRAIN_DO_PULL" = "1" ]; then
+    ( cd "$_GSTACK_HOME" && git fetch origin >/dev/null 2>&1 && git merge --ff-only "origin/$(git rev-parse --abbrev-ref HEAD)" >/dev/null 2>&1 ) || true
+    echo "$_BRAIN_NOW" > "$_BRAIN_LAST_PULL_FILE"
+  fi
+  # Drain pending queue, push.
+  "$_BRAIN_SYNC_BIN" --once 2>/dev/null || true
+fi
+
+# Status line — always emitted, easy to grep.
+if [ -d "$_GSTACK_HOME/.git" ] && [ "$_BRAIN_SYNC_MODE" != "off" ]; then
+  _BRAIN_QUEUE_DEPTH=0
+  [ -f "$_GSTACK_HOME/.brain-queue.jsonl" ] && _BRAIN_QUEUE_DEPTH=$(wc -l < "$_GSTACK_HOME/.brain-queue.jsonl" | tr -d ' ')
+  _BRAIN_LAST_PUSH="never"
+  [ -f "$_GSTACK_HOME/.brain-last-push" ] && _BRAIN_LAST_PUSH=$(cat "$_GSTACK_HOME/.brain-last-push" 2>/dev/null || echo never)
+  echo "BRAIN_SYNC: mode=$_BRAIN_SYNC_MODE | last_push=$_BRAIN_LAST_PUSH | queue=$_BRAIN_QUEUE_DEPTH"
+else
+  echo "BRAIN_SYNC: off"
+fi
+\`\`\`
+
+${isBrainHost ? `If the bash output shows \`BRAIN_SYNC: brain repo detected\`, the user copied their remote URL file to this machine but hasn't restored yet. Offer to run \`gstack-brain-restore\` via AskUserQuestion. If the user agrees, run the command; otherwise continue without sync.` : ''}
+
+**Privacy stop-gate (fires ONCE per machine).**
+
+If the bash output shows \`BRAIN_SYNC: off\` AND the config value
+\`gbrain_sync_mode_prompted\` is \`false\` AND gbrain is detected on this host
+(either \`gbrain doctor --fast --json\` succeeds or the \`gbrain\` binary is in PATH),
+fire a one-time privacy gate via AskUserQuestion:
+
+> gstack can publish your session memory (learnings, plans, designs, retros) to a
+> private GitHub repo that GBrain indexes across your machines. Higher tiers
+> include behavioral data (session timelines, developer profile). How much do you
+> want to sync?
+
+Options:
+- A) Everything allowlisted (recommended — maximum cross-machine memory)
+- B) Only artifacts (plans, designs, retros, learnings) — skip timelines and profile
+- C) Decline — keep everything local
+
+After the user answers, run (substituting the chosen value):
+
+\`\`\`bash
+# Chosen mode: full | artifacts-only | off
+"${ctx.paths.binDir}/gstack-config" set gbrain_sync_mode <choice>
+"${ctx.paths.binDir}/gstack-config" set gbrain_sync_mode_prompted true
+\`\`\`
+
+If A or B was chosen AND \`~/.gstack/.git\` doesn't exist, ask a follow-up:
+"Set up the GBrain sync repo now? (runs \`gstack-brain-init\`)"
+- A) Yes, run it now
+- B) Show me the command, I'll run it myself
+
+Do not block the skill. Emit the question, continue the skill workflow. The
+next skill run picks up wherever this left off.
+
+**At skill END (before the telemetry block),** run these bash commands to
+catch artifact writes (design docs, plans, retros) that skipped the writer
+shims, plus drain any still-pending queue entries:
+
+\`\`\`bash
+"${ctx.paths.binDir}/gstack-brain-sync" --discover-new 2>/dev/null || true
+"${ctx.paths.binDir}/gstack-brain-sync" --once 2>/dev/null || true
+\`\`\`
+`;
+}