feat(test): transcript-section-logger + ship-action fingerprint (T10)

Pure-analysis module over a SkillTestResult/NDJSON transcript:
- extractSectionReads(): which sections/*.md a run opened (post-carve check)
- extractShipActions(): observable action fingerprint (merge/test/bump/
  changelog/commit/push/pr) that works on the MONOLITH too, so a baseline
  captured before the carve can detect a sectioned-ship regression
- baseline read/write + compareShipActions() for baseline-first dogfooding (T10)

Baseline-first answers the Codex outside-voice critique that a logger in the
same PR as the carve is post-failure telemetry without a pre-carve reference.

11 unit tests, all green. Paid monolith baseline capture runs separately.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-05-29 21:19:10 -07:00
parent ce5fbfa99f
commit 1ed4106399
2 changed files with 332 additions and 0 deletions
+196
View File
@@ -0,0 +1,196 @@
/**
* Transcript section logger (v2 plan T10).
*
* Two jobs, both pure analysis over a SkillTestResult / NDJSON transcript:
*
* 1. extractSectionReads() — which `sections/*.md` files a run actually Read.
* Used by the sectioned world (post-carve) to verify the agent opened the
* chapters its situation required.
*
* 2. extractShipActions() — an observable ACTION fingerprint of a /ship run
* (ran tests, bumped VERSION, wrote CHANGELOG, created PR, ...). This works
* on BOTH the monolith and the sectioned skill, which is the whole point:
* capture a baseline on the current monolith ship FIRST, then assert the
* sectioned ship still performs the same actions. A section-read check alone
* can't catch "agent read the chapter but skipped the step"; the action
* fingerprint can.
*
* Why baseline-first (Codex outside-voice critique on the T9 plan): a logger
* shipped in the same PR as the carve is post-failure telemetry unless it has a
* pre-carve reference. writeShipBaseline() persists the monolith's action
* fingerprint so that compareShipActions() can flag a regression introduced by
* the carve.
*
* Pure functions, no I/O except the explicit read/write baseline helpers. The
* unit tests drive these with synthetic transcripts — no paid run needed to
* validate the logic.
*/
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
/**
 * Minimal shape we need from SkillTestResult — kept structural so callers can
 * pass a full SkillTestResult or a hand-built fixture in unit tests.
 */
export interface ToolCallLike {
/** Tool name. The extractors in this module compare it against 'Read', 'Bash', 'Write' and 'Edit'. */
tool: string;
/** Raw tool input of unknown shape; only a string `file_path` or `command` property is ever read from it. */
input: unknown;
/** Optional tool output text; not consulted by the extractors defined here. */
output?: string;
}
/**
 * Structural view of one run's transcript: the tool calls it made plus an
 * optional output string. This is the only input the extractors require.
 */
export interface TranscriptResultLike {
/** Tool calls of the run; extractSectionReads() reports sections in the order they first appear in this array. */
toolCalls: ToolCallLike[];
/** Optional run output; not consulted by the extractors defined here. */
output?: string;
}
/**
 * Extract `file_path` from a tool-call input whose shape is not trusted.
 *
 * @param input - Raw tool-call input (any value).
 * @returns The `file_path` property when `input` is a non-null object and the
 *   property is a string (the empty string included); `null` otherwise.
 */
function readFilePath(input: unknown): string | null {
  if (typeof input !== 'object' || input === null) return null;
  const candidate = (input as Record<string, unknown>).file_path;
  return typeof candidate === 'string' ? candidate : null;
}
/**
 * Extract the `command` string from a Bash tool-call input of unknown shape.
 *
 * @param input - Raw tool-call input (any value).
 * @returns The `command` property when `input` is a non-null object and the
 *   property is a string; `null` otherwise.
 */
function bashCommand(input: unknown): string | null {
  const isObject = input !== null && typeof input === 'object';
  if (!isObject) return null;
  const { command } = input as Record<string, unknown>;
  return typeof command === 'string' ? command : null;
}
/**
 * List the `sections/<name>.md` files a run opened with the Read tool.
 *
 * Each hit is reduced to its basename (e.g. "version-bump.md"). The result is
 * deduplicated and keeps first-Read order. A path counts when it ends in a
 * `sections/<file>.md` segment, either at the very start of the path or after
 * a `/`, so relative, absolute and prefixed install paths are all recognised.
 *
 * @param result - Transcript whose `toolCalls` are scanned.
 * @returns Section basenames in the order they were first read.
 */
export function extractSectionReads(result: TranscriptResultLike): string[] {
  const sectionFile = /(?:^|\/)sections\/([A-Za-z0-9._-]+\.md)$/;
  // A Set iterates in insertion order and ignores repeat adds, which yields
  // "deduped, first-Read order" without a parallel array.
  const names = new Set<string>();
  for (const { tool, input } of result.toolCalls) {
    if (tool !== 'Read') continue;
    const filePath = readFilePath(input);
    const hit = filePath ? sectionFile.exec(filePath) : null;
    if (hit) names.add(hit[1]);
  }
  return [...names];
}
/**
 * The canonical /ship action vocabulary. Each action is detected from the Bash
 * commands the agent ran (plus a couple of Write/Edit signals). Order is the
 * rough ship sequence; detection is order-independent.
 *
 * Keep this list aligned with the ship skeleton's numbered steps. The
 * section-loading eval asserts the sectioned ship still triggers the same
 * actions a monolith run did for the same fixture situation.
 */
export const SHIP_ACTIONS = [
  'merged_base', // git merge <base>
  'ran_tests', // bun/npm/yarn test, pytest, go test, cargo test, rspec
  'bumped_version', // wrote VERSION, ran `npm version`, or ran a gstack version CLI
  'wrote_changelog', // touched CHANGELOG.md
  'committed', // git commit
  'pushed', // git push
  'opened_pr', // gh pr create / glab mr create
] as const;

/** Union of the action names listed in SHIP_ACTIONS. */
export type ShipAction = (typeof SHIP_ACTIONS)[number];

/**
 * One Bash-command detector per action, in SHIP_ACTIONS order. A command may
 * satisfy several detectors (e.g. `git commit ... && git push`).
 *
 * The patterns carry no `g`/`y` flag on purpose: they are reused across calls
 * with `re.test()`, and a sticky/global regex would carry `lastIndex` state
 * from one command to the next.
 */
const BASH_ACTION_PATTERNS: Array<{ action: ShipAction; re: RegExp }> = [
  // `(?![\w-])` instead of a trailing `\b`: a word boundary also sits before
  // `-`, so `\bgit\s+merge\b` used to fire on the read-only `git merge-base`
  // (and `merge-tree`, `merge-file`), reporting a merge that never happened.
  { action: 'merged_base', re: /\bgit\s+merge(?![\w-])/ },
  { action: 'ran_tests', re: /\b(bun\s+test|npm\s+(run\s+)?test|yarn\s+test|pytest|go\s+test|cargo\s+test|rspec)\b/ },
  { action: 'bumped_version', re: /gstack-version-bump\b|gstack-next-version\b|>\s*VERSION\b|npm\s+version\b/ },
  // NOTE(review): matches ANY mention of CHANGELOG.md, including read-only
  // commands such as `cat CHANGELOG.md` — confirm this breadth is intended.
  { action: 'wrote_changelog', re: /CHANGELOG\.md/ },
  // Same lookahead as merged_base: excludes `git commit-graph` / `git commit-tree`.
  { action: 'committed', re: /\bgit\s+commit(?![\w-])/ },
  { action: 'pushed', re: /\bgit\s+push\b/ },
  { action: 'opened_pr', re: /\bgh\s+pr\s+create\b|\bglab\s+mr\s+create\b/ },
];
/**
 * The observable action fingerprint of a ship run. Works on monolith AND
 * sectioned skills because it reads what the agent DID (Bash + file writes),
 * not which prose it loaded.
 *
 * Detection sources:
 * - `Bash` calls: the command string is tested against every entry of
 *   BASH_ACTION_PATTERNS, so one command can contribute several actions.
 * - `Write` / `Edit` calls: a `file_path` whose final segment is exactly
 *   `CHANGELOG.md` counts as `wrote_changelog`; one whose final segment is
 *   exactly `VERSION` counts as `bumped_version`.
 *
 * @param result - Transcript whose `toolCalls` are scanned.
 * @returns The detected actions, deduplicated, in SHIP_ACTIONS order
 *   (independent of the order in which they were executed).
 */
export function extractShipActions(result: TranscriptResultLike): ShipAction[] {
  // Both file checks are anchored on a whole path segment so look-alikes such
  // as `docs/OLD-CHANGELOG.md` or `NEXT_VERSION` do not count. `\` is accepted
  // next to `/` so the anchor does not reject Windows-style paths.
  const changelogFile = /(?:^|[\\/])CHANGELOG\.md$/;
  const versionFile = /(?:^|[\\/])VERSION$/;

  const found = new Set<ShipAction>();
  for (const call of result.toolCalls) {
    if (call.tool === 'Bash') {
      const cmd = bashCommand(call.input);
      if (!cmd) continue;
      for (const { action, re } of BASH_ACTION_PATTERNS) {
        if (re.test(cmd)) found.add(action);
      }
    } else if (call.tool === 'Write' || call.tool === 'Edit') {
      const fp = readFilePath(call.input);
      if (!fp) continue;
      if (changelogFile.test(fp)) found.add('wrote_changelog');
      if (versionFile.test(fp)) found.add('bumped_version');
    }
  }
  // Preserve canonical order.
  return SHIP_ACTIONS.filter(a => found.has(a));
}
/** One captured ship run, as serialized to JSON by writeShipBaseline(). */
export interface ShipBaseline {
/** Free-form label for the capture (the unit-test fixtures use 'monolith'). */
tag: string;
/** Fixture/situation id this baseline was captured for. baselinePath() also uses it as the file-name stem. */
situation: string;
/** Action fingerprint observed on the monolith ship. */
actions: ShipAction[];
/** Section reads observed (empty on the monolith — present after carve). */
sectionReads: string[];
/** Capture time as a string. Unit-test fixtures use ISO-8601 UTC; presumably real captures do too — TODO confirm with the capture script. */
capturedAt: string;
}
/** Default baseline directory, `<home>/.gstack-dev/ship-baselines`, computed once at module load. */
const DEFAULT_BASELINE_DIR = path.join(os.homedir(), '.gstack-dev', 'ship-baselines');

/**
 * Compute where the baseline for a situation is stored.
 *
 * @param situation - Situation id; used verbatim as the file-name stem.
 * @param dir - Directory holding the baselines (defaults to DEFAULT_BASELINE_DIR).
 * @returns `<dir>/<situation>.json`, joined with the platform path separator.
 */
export function baselinePath(situation: string, dir = DEFAULT_BASELINE_DIR): string {
  const fileName = situation + '.json';
  return path.join(dir, fileName);
}
/**
 * Persist a ship baseline (used once on the monolith, before the carve).
 *
 * Creates `dir` (and any missing parents) if needed, then writes the baseline
 * as 2-space-indented JSON followed by a trailing newline, overwriting any
 * existing file for the same situation.
 *
 * @param baseline - Baseline to store; `baseline.situation` selects the file.
 * @param dir - Target directory (defaults to DEFAULT_BASELINE_DIR).
 * @returns The path of the file that was written.
 */
export function writeShipBaseline(baseline: ShipBaseline, dir = DEFAULT_BASELINE_DIR): string {
  fs.mkdirSync(dir, { recursive: true });
  const target = baselinePath(baseline.situation, dir);
  const serialized = `${JSON.stringify(baseline, null, 2)}\n`;
  fs.writeFileSync(target, serialized);
  return target;
}
/**
 * Read a previously-captured baseline, or null if none usable exists yet.
 *
 * Never throws. `null` is returned when the file is missing or unreadable,
 * when it is not valid JSON, or when the parsed value does not have the
 * ShipBaseline shape (string `tag` / `situation` / `capturedAt`, string-array
 * `actions` / `sectionReads`). The shape check keeps a malformed file such as
 * `{}` from being handed to compareShipActions(), which dereferences
 * `baseline.actions`.
 *
 * Entries of `actions` are only checked to be strings; they are not matched
 * against SHIP_ACTIONS.
 *
 * @param situation - Situation id the baseline was stored under.
 * @param dir - Directory holding the baselines (defaults to DEFAULT_BASELINE_DIR).
 * @returns The stored baseline, or `null` as described above.
 */
export function readShipBaseline(situation: string, dir = DEFAULT_BASELINE_DIR): ShipBaseline | null {
  let parsed: unknown;
  try {
    parsed = JSON.parse(fs.readFileSync(baselinePath(situation, dir), 'utf-8'));
  } catch {
    // Missing/unreadable file or invalid JSON: report "no baseline", as before.
    return null;
  }

  if (typeof parsed !== 'object' || parsed === null) return null;
  const candidate = parsed as Record<string, unknown>;

  const isStringArray = (value: unknown): value is string[] =>
    Array.isArray(value) && value.every(item => typeof item === 'string');

  const wellFormed =
    typeof candidate.tag === 'string' &&
    typeof candidate.situation === 'string' &&
    typeof candidate.capturedAt === 'string' &&
    isStringArray(candidate.actions) &&
    isStringArray(candidate.sectionReads);

  return wellFormed ? (parsed as ShipBaseline) : null;
}
/** Result of compareShipActions(): how a current run's actions differ from a baseline's. */
export interface ShipActionDiff {
/** Actions the baseline performed that the current run did NOT (the regression set). */
missing: ShipAction[];
/** Actions the current run performed that the baseline did not (usually fine). */
added: ShipAction[];
/** True when no baseline action was dropped, i.e. `missing` is empty; `added` does not affect it. */
ok: boolean;
}
/**
 * Diff a current (sectioned-ship) run against the monolith baseline.
 *
 * The regression we care about is a dropped action: something the baseline
 * did that the current run no longer does. Extra actions are reported but do
 * not fail the comparison.
 *
 * @param baseline - Reference run; only `baseline.actions` is consulted.
 * @param current - Action fingerprint of the run under test.
 * @returns `missing` in baseline order, `added` in `current` order, and
 *   `ok === true` exactly when `missing` is empty.
 */
export function compareShipActions(baseline: ShipBaseline, current: ShipAction[]): ShipActionDiff {
  const performedNow = new Set(current);
  const performedThen = new Set(baseline.actions);

  const missing: ShipAction[] = [];
  for (const action of baseline.actions) {
    if (!performedNow.has(action)) missing.push(action);
  }

  const added: ShipAction[] = [];
  for (const action of current) {
    if (!performedThen.has(action)) added.push(action);
  }

  return { missing, added, ok: missing.length === 0 };
}
+136
View File
@@ -0,0 +1,136 @@
/**
* Unit tests for the transcript section logger (T10). Pure-function coverage —
* no paid run needed. Drives the analyzers with synthetic tool-call transcripts.
*/
import { describe, test, expect, afterAll } from 'bun:test';
import * as fs from 'fs';
import * as os from 'os';
import * as path from 'path';
import {
extractSectionReads,
extractShipActions,
compareShipActions,
writeShipBaseline,
readShipBaseline,
baselinePath,
SHIP_ACTIONS,
type ToolCallLike,
type ShipBaseline,
} from './helpers/transcript-section-logger';
/** Build a synthetic `Read` tool call targeting `fp`, with empty output. */
function read(fp: string): ToolCallLike {
  return { tool: 'Read', input: { file_path: fp }, output: '' };
}

/** Build a synthetic `Bash` tool call running `command`, with empty output. */
function bash(command: string): ToolCallLike {
  return { tool: 'Bash', input: { command: command }, output: '' };
}
describe('extractSectionReads', () => {
  test('picks up section reads via the /sections/<file>.md segment', () => {
    // Absolute, relative and alternate-host install prefixes all end in
    // sections/<file>.md and must all be recognised.
    const toolCalls = [
      '/Users/x/.claude/skills/gstack-ship/sections/version-bump.md',
      'ship/sections/changelog.md',
      '/abs/.factory/skills/gstack-ship/sections/review-army.md',
    ].map(fp => read(fp));
    const sections = extractSectionReads({ toolCalls });
    expect(sections).toEqual(['version-bump.md', 'changelog.md', 'review-army.md']);
  });

  test('ignores non-section reads and non-Read tools', () => {
    const toolCalls = [
      read('ship/SKILL.md'),
      read('/some/sections-like/notsections/x.md'),
      bash('cat ship/sections/version-bump.md'), // bash, not a Read
    ];
    const sections = extractSectionReads({ toolCalls });
    expect(sections).toEqual([]);
  });

  test('dedupes and preserves first-read order', () => {
    // tests.md is read twice; only its first position may survive.
    const toolCalls = ['ship/sections/tests.md', 'ship/sections/version-bump.md', 'ship/sections/tests.md'].map(
      fp => read(fp),
    );
    const sections = extractSectionReads({ toolCalls });
    expect(sections).toEqual(['tests.md', 'version-bump.md']);
  });
});
describe('extractShipActions', () => {
  /** Synthetic file-targeting call (Write/Edit) with empty output. */
  const fileCall = (tool: string, file_path: string): ToolCallLike => ({ tool, input: { file_path }, output: '' });

  test('detects the full action fingerprint from bash + writes', () => {
    // One call per action, issued in the natural ship sequence.
    const toolCalls: ToolCallLike[] = [
      bash('git merge origin/main'),
      bash('bun test'),
      bash('gstack-version-bump --bump minor'),
      fileCall('Edit', 'CHANGELOG.md'),
      bash('git commit -m "v1.2.0.0 feat"'),
      bash('git push origin HEAD'),
      bash('gh pr create --base main'),
    ];
    const fingerprint = extractShipActions({ toolCalls });
    expect(fingerprint).toEqual([...SHIP_ACTIONS]);
  });

  test('returns canonical order regardless of execution order', () => {
    const outOfOrder = [bash('gh pr create --base main'), bash('git merge origin/main')];
    const fingerprint = extractShipActions({ toolCalls: outOfOrder });
    expect(fingerprint).toEqual(['merged_base', 'opened_pr']);
  });

  test('VERSION write counts as a version bump even without the CLI', () => {
    const fingerprint = extractShipActions({ toolCalls: [fileCall('Write', 'VERSION')] });
    expect(fingerprint).toEqual(['bumped_version']);
  });

  test('empty run produces empty fingerprint', () => {
    const fingerprint = extractShipActions({ toolCalls: [] });
    expect(fingerprint).toEqual([]);
  });
});
describe('compareShipActions', () => {
  // Reference run that performed every action in the vocabulary.
  const baseline: ShipBaseline = {
    tag: 'monolith',
    situation: 'fresh-version-changing',
    actions: ['merged_base', 'ran_tests', 'bumped_version', 'wrote_changelog', 'committed', 'pushed', 'opened_pr'],
    sectionReads: [],
    capturedAt: '2026-05-30T00:00:00Z',
  };

  test('flags a dropped action as the carve regression', () => {
    const withoutBump = baseline.actions.filter(action => action !== 'bumped_version');
    const { ok, missing } = compareShipActions(baseline, withoutBump);
    expect(ok).toBe(false);
    expect(missing).toEqual(['bumped_version']);
  });

  test('passes when the sectioned run performs every baseline action', () => {
    // A repeated action in the current run must not count against it.
    const repeated: ShipBaseline['actions'] = [...baseline.actions, 'merged_base'];
    const { ok, missing } = compareShipActions(baseline, repeated);
    expect(ok).toBe(true);
    expect(missing).toEqual([]);
  });
});
describe('baseline persistence', () => {
  // Scratch directory so the suite never touches the real ~/.gstack-dev tree.
  const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'ship-baseline-'));

  afterAll(() => {
    // Best-effort cleanup; a failure here must not fail the suite.
    try {
      fs.rmSync(dir, { recursive: true, force: true });
    } catch {
      /* noop */
    }
  });

  test('round-trips a baseline to disk', () => {
    const situation = 'no-plan-file';
    const baseline: ShipBaseline = {
      tag: 'monolith',
      situation,
      actions: ['ran_tests', 'committed'],
      sectionReads: [],
      capturedAt: '2026-05-30T00:00:00Z',
    };
    const writtenTo = writeShipBaseline(baseline, dir);
    expect(writtenTo).toBe(baselinePath(situation, dir));
    expect(readShipBaseline(situation, dir)).toEqual(baseline);
  });

  test('returns null when no baseline captured yet', () => {
    const absent = readShipBaseline('never-captured', dir);
    expect(absent).toBeNull();
  });
});