feat(test): transcript-section-logger + ship-action fingerprint (T10)

Pure-analysis module over a SkillTestResult/NDJSON transcript: - extractSectionReads(): which sections/*.md a run opened (post-carve check) - extractShipActions(): observable action fingerprint (merge/test/bump/changelog/commit/push/pr) that works on the MONOLITH too, so a baseline captured before the carve can detect a sectioned-ship regression - baseline read/write + compareShipActions() for baseline-first dogfooding (T10). Baseline-first answers the Codex outside-voice critique that a logger in the same PR as the carve is post-failure telemetry without a pre-carve reference. 11 unit tests, all green. Paid monolith baseline capture runs separately. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-17 15:20:11 +02:00 · 2026-05-29 21:19:10 -07:00
parent ce5fbfa99f
commit 1ed4106399
2 changed files with 332 additions and 0 deletions
@@ -0,0 +1,196 @@
+/**
+ * Transcript section logger (v2 plan T10).
+ *
+ * Two jobs, both pure analysis over a SkillTestResult / NDJSON transcript:
+ *
+ *  1. extractSectionReads()  — which `sections/*.md` files a run actually Read.
+ *     Used by the sectioned world (post-carve) to verify the agent opened the
+ *     chapters its situation required.
+ *
+ *  2. extractShipActions()   — an observable ACTION fingerprint of a /ship run
+ *     (ran tests, bumped VERSION, wrote CHANGELOG, created PR, ...). This works
+ *     on BOTH the monolith and the sectioned skill, which is the whole point:
+ *     capture a baseline on the current monolith ship FIRST, then assert the
+ *     sectioned ship still performs the same actions. A section-read check alone
+ *     can't catch "agent read the chapter but skipped the step"; the action
+ *     fingerprint can.
+ *
+ * Why baseline-first (Codex outside-voice critique on the T9 plan): a logger
+ * shipped in the same PR as the carve is post-failure telemetry unless it has a
+ * pre-carve reference. writeShipBaseline() persists the monolith's action
+ * fingerprint so compareShipActions() can flag a regression introduced by the
+ * carve.
+ *
+ * Pure functions, no I/O except the explicit read/write baseline helpers. The
+ * unit tests drive these with synthetic transcripts — no paid run needed to
+ * validate the logic.
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+
+/** Minimal shape we need from SkillTestResult — kept structural so callers can
+ *  pass a full SkillTestResult or a hand-built fixture in unit tests. */
+export interface ToolCallLike {
+  /** Tool name as recorded in the transcript; this module matches 'Read', 'Bash', 'Write' and 'Edit'. */
+  tool: string;
+  /** Raw tool input. Typed `unknown` so each analyzer narrows it before reading `file_path` / `command`. */
+  input: unknown;
+  /** Tool output text, if recorded. Not read by any analyzer in this module. */
+  output?: string;
+}
+/** Minimal transcript shape consumed by the analyzers in this module. */
+export interface TranscriptResultLike {
+  /** Tool calls of the run; array order is what "first-Read order" refers to. */
+  toolCalls: ToolCallLike[];
+  /** Optional run-level output. Not read by any analyzer in this module. */
+  output?: string;
+}
+
+/**
+ * Extract `file_path` from a tool-call input of unknown shape.
+ *
+ * @returns the path when `input` is a non-null object whose `file_path` is a
+ *   string; null for every other shape.
+ */
+function readFilePath(input: unknown): string | null {
+  // Guard first: primitives and null/undefined carry no properties to read.
+  if (typeof input !== 'object' || !input) return null;
+  const candidate = (input as Record<string, unknown>).file_path;
+  return typeof candidate === 'string' ? candidate : null;
+}
+
+/**
+ * Extract the `command` string from a Bash tool-call input of unknown shape.
+ *
+ * @returns the command when `input` is a non-null object whose `command` is a
+ *   string; null for every other shape.
+ */
+function bashCommand(input: unknown): string | null {
+  const record =
+    input !== null && typeof input === 'object'
+      ? (input as Record<string, unknown>)
+      : undefined;
+  const command = record?.command;
+  return typeof command === 'string' ? command : null;
+}
+
+/**
+ * Every `sections/<name>.md` file the run Read, normalized to the section
+ * basename (e.g. "version-bump.md"). Deduped, in first-Read order.
+ *
+ * Matching is on the trailing path segment `sections/<file>.md`, with either
+ * `/` or `\` as the separator, so it works whether the host recorded a
+ * relative, absolute, prefixed-install, or Windows-style path.
+ *
+ * @param result Transcript whose `toolCalls` are scanned. Only calls with
+ *   `tool === 'Read'` and a string `file_path` input are considered.
+ * @returns Section basenames in the order they were first Read.
+ */
+export function extractSectionReads(result: TranscriptResultLike): string[] {
+  // A Set iterates in insertion order, so it dedupes and keeps first-Read
+  // order in a single pass.
+  const sections = new Set<string>();
+  for (const call of result.toolCalls) {
+    if (call.tool !== 'Read') continue;
+    const fp = readFilePath(call.input);
+    if (!fp) continue;
+    // `sections` must be a whole path segment (so `notsections/x.md` is
+    // rejected) and the file must sit directly inside it.
+    const name = /(?:^|[\\/])sections[\\/]([A-Za-z0-9._-]+\.md)$/.exec(fp)?.[1];
+    if (name !== undefined) sections.add(name);
+  }
+  return Array.from(sections);
+}
+
+/**
+ * The canonical /ship action vocabulary. Each action is detected from the Bash
+ * commands the agent ran (plus a couple of Write/Edit signals). Order is the
+ * rough ship sequence; detection is order-independent.
+ *
+ * Keep this list aligned with the ship skeleton's numbered steps. The
+ * section-loading eval asserts the sectioned ship still triggers the same
+ * actions a monolith run did for the same fixture situation.
+ *
+ * Declared `as const` so the element type stays a union of string literals
+ * instead of widening to `string`.
+ */
+export const SHIP_ACTIONS = [
+  'merged_base',       // git merge <base>
+  'ran_tests',         // bun test / npm test / the project test cmd
+  'bumped_version',    // wrote VERSION / package.json version / ran gstack-version-bump
+  'wrote_changelog',   // edited CHANGELOG.md
+  'committed',         // git commit
+  'pushed',            // git push
+  'opened_pr',         // gh pr create / glab mr create
+] as const;
+/** Union of the action names in SHIP_ACTIONS, derived from the tuple so the two cannot drift apart. */
+export type ShipAction = (typeof SHIP_ACTIONS)[number];
+
+/**
+ * Bash-command detectors: one regex per action, matched against the raw
+ * command text. These are heuristics, not a shell parser.
+ *
+ * The `git merge` / `git commit` patterns end in `(?![\w-])` rather than `\b`.
+ * `\b` also matches before a hyphen, which made read-only plumbing such as
+ * `git merge-base` register as a merge (and `git commit-graph` as a commit).
+ *
+ * NOTE(review): `wrote_changelog` fires on any command that mentions
+ * CHANGELOG.md, including read-only ones such as `cat CHANGELOG.md`.
+ */
+const BASH_ACTION_PATTERNS: Array<{ action: ShipAction; re: RegExp }> = [
+  { action: 'merged_base', re: /\bgit\s+merge(?![\w-])/ },
+  { action: 'ran_tests', re: /\b(bun\s+test|npm\s+(run\s+)?test|yarn\s+test|pytest|go\s+test|cargo\s+test|rspec)\b/ },
+  { action: 'bumped_version', re: /gstack-version-bump\b|gstack-next-version\b|>\s*VERSION\b|npm\s+version\b/ },
+  { action: 'wrote_changelog', re: /CHANGELOG\.md/ },
+  { action: 'committed', re: /\bgit\s+commit(?![\w-])/ },
+  { action: 'pushed', re: /\bgit\s+push\b/ },
+  { action: 'opened_pr', re: /\bgh\s+pr\s+create\b|\bglab\s+mr\s+create\b/ },
+];
+
+/**
+ * The observable action fingerprint of a ship run. Works on monolith AND
+ * sectioned skills because it reads what the agent DID (Bash + file writes),
+ * not which prose it loaded.
+ *
+ * Detection sources:
+ *  - Bash calls: every BASH_ACTION_PATTERNS regex is tried against the
+ *    command, so one compound command can register several actions.
+ *  - Write/Edit calls: a `file_path` ending in `CHANGELOG.md` counts as
+ *    `wrote_changelog`; a `file_path` whose basename is exactly `VERSION`
+ *    counts as `bumped_version`. Both `/` and `\` are accepted as the
+ *    separator before `VERSION`. Write/Edit calls on package.json are not
+ *    inspected here.
+ *
+ * @param result Transcript whose `toolCalls` are scanned.
+ * @returns Detected actions, deduped, in SHIP_ACTIONS order regardless of the
+ *   order they were executed in.
+ */
+export function extractShipActions(result: TranscriptResultLike): ShipAction[] {
+  const found = new Set<ShipAction>();
+  for (const call of result.toolCalls) {
+    if (call.tool === 'Bash') {
+      const cmd = bashCommand(call.input);
+      if (!cmd) continue;
+      for (const { action, re } of BASH_ACTION_PATTERNS) {
+        if (re.test(cmd)) found.add(action);
+      }
+    } else if (call.tool === 'Write' || call.tool === 'Edit') {
+      const fp = readFilePath(call.input);
+      if (!fp) continue;
+      if (/CHANGELOG\.md$/.test(fp)) found.add('wrote_changelog');
+      // VERSION must be the whole basename (not e.g. `MY_VERSION`); accept
+      // Windows-style separators as well as POSIX ones.
+      if (/(?:^|[\\/])VERSION$/.test(fp)) found.add('bumped_version');
+    }
+  }
+  // Preserve canonical order.
+  return SHIP_ACTIONS.filter(a => found.has(a));
+}
+
+/** One captured ship run; this is the object writeShipBaseline() serializes to JSON. */
+export interface ShipBaseline {
+  /** Free-form label for the captured run (the unit tests use 'monolith'). */
+  tag: string;
+  /** Fixture/situation id this baseline was captured for. Also used as the baseline's file name. */
+  situation: string;
+  /** Action fingerprint observed on the monolith ship. */
+  actions: ShipAction[];
+  /** Section reads observed (empty on the monolith — present after carve). */
+  sectionReads: string[];
+  /** Capture timestamp as a string. NOTE(review): the unit tests use ISO-8601 UTC, but no format is enforced here. */
+  capturedAt: string;
+}
+
+/** Default baseline directory, `<home>/.gstack-dev/ship-baselines`; resolved once when the module loads. */
+const DEFAULT_BASELINE_DIR = path.join(os.homedir(), '.gstack-dev', 'ship-baselines');
+
+/**
+ * Resolve the JSON file that holds the baseline for one situation.
+ *
+ * @param situation Situation id; becomes the file name with `.json` appended.
+ * @param dir Directory holding the baselines (defaults to DEFAULT_BASELINE_DIR).
+ * @returns `<dir>/<situation>.json`. Nothing is read or created.
+ */
+export function baselinePath(situation: string, dir = DEFAULT_BASELINE_DIR): string {
+  const fileName = `${situation}.json`;
+  return path.join(dir, fileName);
+}
+
+/**
+ * Persist a ship baseline (used once on the monolith, before the carve).
+ *
+ * Creates `dir` (and any missing parents) if needed, then writes the baseline
+ * as 2-space-indented JSON with a trailing newline. An existing file for the
+ * same situation is overwritten.
+ *
+ * @returns The path of the file that was written.
+ */
+export function writeShipBaseline(baseline: ShipBaseline, dir = DEFAULT_BASELINE_DIR): string {
+  fs.mkdirSync(dir, { recursive: true });
+  const target = baselinePath(baseline.situation, dir);
+  const serialized = `${JSON.stringify(baseline, null, 2)}\n`;
+  fs.writeFileSync(target, serialized);
+  return target;
+}
+
+/**
+ * Read a previously-captured baseline, or null if no usable one exists.
+ *
+ * Never throws. Null is returned when the file is missing, unreadable, not
+ * valid JSON, or parses to something without an `actions` array (e.g. `null`,
+ * `[]`, `{}`). The last check keeps a malformed file from being handed to
+ * compareShipActions(), which dereferences `baseline.actions`.
+ *
+ * @param situation Situation id the baseline was stored under.
+ * @param dir Directory holding the baselines (defaults to DEFAULT_BASELINE_DIR).
+ */
+export function readShipBaseline(situation: string, dir = DEFAULT_BASELINE_DIR): ShipBaseline | null {
+  let parsed: unknown;
+  try {
+    parsed = JSON.parse(fs.readFileSync(baselinePath(situation, dir), 'utf-8'));
+  } catch {
+    // Best-effort by design: any read or parse failure means "no baseline".
+    return null;
+  }
+  // JSON.parse accepts any JSON value, so narrow before trusting the shape.
+  if (typeof parsed !== 'object' || parsed === null) return null;
+  if (!Array.isArray((parsed as Record<string, unknown>).actions)) return null;
+  return parsed as ShipBaseline;
+}
+
+/** Result of comparing one run's action fingerprint against a baseline. */
+export interface ShipActionDiff {
+  /** Baseline actions absent from the current run — the regression set. */
+  missing: ShipAction[];
+  /** Current-run actions absent from the baseline (usually fine). */
+  added: ShipAction[];
+  /** True exactly when `missing` is empty; `added` never affects it. */
+  ok: boolean;
+}
+
+/**
+ * Diff a current sectioned-ship run against the monolith baseline. An action
+ * that is in the baseline but not in the current run is the carve regression
+ * we care about: the sectioned ship stopped doing something the monolith did.
+ *
+ * @param baseline Baseline whose `actions` are the reference fingerprint.
+ * @param current Actions observed on the run under test.
+ * @returns `missing` in baseline order, `added` in `current` order, and `ok`.
+ */
+export function compareShipActions(baseline: ShipBaseline, current: ShipAction[]): ShipActionDiff {
+  const performedNow = new Set<ShipAction>(current);
+  const performedBefore = new Set<ShipAction>(baseline.actions);
+
+  const missing: ShipAction[] = [];
+  for (const action of baseline.actions) {
+    if (!performedNow.has(action)) missing.push(action);
+  }
+
+  const added: ShipAction[] = [];
+  for (const action of current) {
+    if (!performedBefore.has(action)) added.push(action);
+  }
+
+  return { missing, added, ok: missing.length === 0 };
+}
@@ -0,0 +1,136 @@
+/**
+ * Unit tests for the transcript section logger (T10). Pure-function coverage —
+ * no paid run needed. Drives the analyzers with synthetic tool-call transcripts.
+ */
+
+import { describe, test, expect, afterAll } from 'bun:test';
+import * as fs from 'fs';
+import * as os from 'os';
+import * as path from 'path';
+import {
+  extractSectionReads,
+  extractShipActions,
+  compareShipActions,
+  writeShipBaseline,
+  readShipBaseline,
+  baselinePath,
+  SHIP_ACTIONS,
+  type ToolCallLike,
+  type ShipBaseline,
+} from './helpers/transcript-section-logger';
+
+/** Build a synthetic Read tool call for `fp`. */
+function read(fp: string): ToolCallLike {
+  return { tool: 'Read', input: { file_path: fp }, output: '' };
+}
+/** Build a synthetic Bash tool call that runs `command`. */
+function bash(command: string): ToolCallLike {
+  return { tool: 'Bash', input: { command }, output: '' };
+}
+
+// extractSectionReads: path matching, filtering, and dedupe/order guarantees.
+describe('extractSectionReads', () => {
+  test('picks up section reads via the /sections/<file>.md segment', () => {
+    // Absolute, relative and prefixed-install paths all reduce to the basename.
+    const toolCalls: ToolCallLike[] = [
+      read('/Users/x/.claude/skills/gstack-ship/sections/version-bump.md'),
+      read('ship/sections/changelog.md'),
+      read('/abs/.factory/skills/gstack-ship/sections/review-army.md'),
+    ];
+    const sections = extractSectionReads({ toolCalls });
+    expect(sections).toEqual(['version-bump.md', 'changelog.md', 'review-army.md']);
+  });
+
+  test('ignores non-section reads and non-Read tools', () => {
+    const toolCalls: ToolCallLike[] = [
+      read('ship/SKILL.md'),
+      read('/some/sections-like/notsections/x.md'),
+      bash('cat ship/sections/version-bump.md'), // bash, not a Read
+    ];
+    expect(extractSectionReads({ toolCalls })).toEqual([]);
+  });
+
+  test('dedupes and preserves first-read order', () => {
+    // tests.md is read twice; only its first position must survive.
+    const toolCalls: ToolCallLike[] = [
+      read('ship/sections/tests.md'),
+      read('ship/sections/version-bump.md'),
+      read('ship/sections/tests.md'),
+    ];
+    expect(extractSectionReads({ toolCalls })).toEqual(['tests.md', 'version-bump.md']);
+  });
+});
+
+// extractShipActions: Bash + Write/Edit detection and canonical ordering.
+describe('extractShipActions', () => {
+  test('detects the full action fingerprint from bash + writes', () => {
+    // One signal per action; the changelog comes from an Edit, the rest from Bash.
+    const toolCalls: ToolCallLike[] = [
+      bash('git merge origin/main'),
+      bash('bun test'),
+      bash('gstack-version-bump --bump minor'),
+      { tool: 'Edit', input: { file_path: 'CHANGELOG.md' }, output: '' },
+      bash('git commit -m "v1.2.0.0 feat"'),
+      bash('git push origin HEAD'),
+      bash('gh pr create --base main'),
+    ];
+    const fingerprint = extractShipActions({ toolCalls });
+    expect(fingerprint).toEqual([...SHIP_ACTIONS]);
+  });
+
+  test('returns canonical order regardless of execution order', () => {
+    // PR creation is executed first here but must still be reported last.
+    const toolCalls: ToolCallLike[] = [
+      bash('gh pr create --base main'),
+      bash('git merge origin/main'),
+    ];
+    expect(extractShipActions({ toolCalls })).toEqual(['merged_base', 'opened_pr']);
+  });
+
+  test('VERSION write counts as a version bump even without the CLI', () => {
+    const versionWrite: ToolCallLike = { tool: 'Write', input: { file_path: 'VERSION' }, output: '' };
+    expect(extractShipActions({ toolCalls: [versionWrite] })).toEqual(['bumped_version']);
+  });
+
+  test('empty run produces empty fingerprint', () => {
+    const fingerprint = extractShipActions({ toolCalls: [] });
+    expect(fingerprint).toEqual([]);
+  });
+});
+
+// compareShipActions: dropped actions fail the diff, extra actions do not.
+describe('compareShipActions', () => {
+  const baseline: ShipBaseline = {
+    tag: 'monolith',
+    situation: 'fresh-version-changing',
+    actions: ['merged_base', 'ran_tests', 'bumped_version', 'wrote_changelog', 'committed', 'pushed', 'opened_pr'],
+    sectionReads: [],
+    capturedAt: '2026-05-30T00:00:00Z',
+  };
+
+  test('flags a dropped action as the carve regression', () => {
+    const current = baseline.actions.filter(a => a !== 'bumped_version');
+    const diff = compareShipActions(baseline, current);
+    expect(diff.ok).toBe(false);
+    expect(diff.missing).toEqual(['bumped_version']);
+  });
+
+  test('passes when the sectioned run performs every baseline action', () => {
+    // The repeated 'merged_base' is already in the baseline, so it must be
+    // reported neither as missing nor as added.
+    const diff = compareShipActions(baseline, [...baseline.actions, 'merged_base']);
+    expect(diff.ok).toBe(true);
+    expect(diff.missing).toEqual([]);
+    expect(diff.added).toEqual([]);
+  });
+
+  test('reports extra actions as added without failing the diff', () => {
+    // A narrower baseline is needed: the full one already holds every action,
+    // so nothing could ever show up in `added` against it.
+    const narrow: ShipBaseline = { ...baseline, actions: ['ran_tests', 'committed'] };
+    const diff = compareShipActions(narrow, ['ran_tests', 'committed', 'pushed']);
+    expect(diff.ok).toBe(true);
+    expect(diff.missing).toEqual([]);
+    expect(diff.added).toEqual(['pushed']);
+  });
+});
+
+// Baseline read/write helpers, exercised against a throwaway temp directory.
+describe('baseline persistence', () => {
+  const tmpPrefix = path.join(os.tmpdir(), 'ship-baseline-');
+  const dir = fs.mkdtempSync(tmpPrefix);
+
+  // Cleanup is best-effort: a failed removal must not fail the suite.
+  afterAll(() => {
+    try {
+      fs.rmSync(dir, { recursive: true, force: true });
+    } catch {
+      /* noop */
+    }
+  });
+
+  test('round-trips a baseline to disk', () => {
+    const baseline: ShipBaseline = {
+      tag: 'monolith',
+      situation: 'no-plan-file',
+      actions: ['ran_tests', 'committed'],
+      sectionReads: [],
+      capturedAt: '2026-05-30T00:00:00Z',
+    };
+    const writtenTo = writeShipBaseline(baseline, dir);
+    expect(writtenTo).toBe(baselinePath('no-plan-file', dir));
+    const loaded = readShipBaseline('no-plan-file', dir);
+    expect(loaded).toEqual(baseline);
+  });
+
+  test('returns null when no baseline captured yet', () => {
+    const loaded = readShipBaseline('never-captured', dir);
+    expect(loaded).toBeNull();
+  });
+});