mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-06 13:45:35 +02:00
Merge remote-tracking branch 'origin/main' into garrytan/create-gbrain-skill
This commit is contained in:
@@ -0,0 +1,820 @@
|
||||
/**
|
||||
* Unit tests for test/helpers/agent-sdk-runner.ts.
|
||||
*
|
||||
* Runs in free `bun test` (no API calls). Uses a stub QueryProvider to
|
||||
* simulate SDK event streams — happy path, rate-limit retries across all
|
||||
* three shapes, persistent failure, non-retryable error, options
|
||||
* propagation, concurrency cap.
|
||||
*
|
||||
* Also covers validateFixtures() rejections.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
import type {
|
||||
SDKMessage,
|
||||
Options,
|
||||
Query,
|
||||
} from '@anthropic-ai/claude-agent-sdk';
|
||||
import {
|
||||
runAgentSdkTest,
|
||||
toSkillTestResult,
|
||||
firstTurnParallelism,
|
||||
isRateLimitThrown,
|
||||
isRateLimitResult,
|
||||
isRateLimitEvent,
|
||||
RateLimitExhaustedError,
|
||||
__resetSemaphoreForTests,
|
||||
type QueryProvider,
|
||||
type AgentSdkResult,
|
||||
} from '../test/helpers/agent-sdk-runner';
|
||||
import {
|
||||
validateFixtures,
|
||||
fanoutPass,
|
||||
type OverlayFixture,
|
||||
} from '../test/fixtures/overlay-nudges';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Stub SDK event builders
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
let uuidCounter = 0;
|
||||
function uuid(): string {
|
||||
return `00000000-0000-0000-0000-${String(++uuidCounter).padStart(12, '0')}`;
|
||||
}
|
||||
|
||||
function systemInit(model = 'claude-opus-4-7', version = '2.1.117'): SDKMessage {
|
||||
return {
|
||||
type: 'system',
|
||||
subtype: 'init',
|
||||
apiKeySource: 'user',
|
||||
claude_code_version: version,
|
||||
cwd: '/tmp/x',
|
||||
tools: ['Read'],
|
||||
mcp_servers: [],
|
||||
model,
|
||||
permissionMode: 'bypassPermissions',
|
||||
slash_commands: [],
|
||||
output_style: 'default',
|
||||
skills: [],
|
||||
plugins: [],
|
||||
uuid: uuid(),
|
||||
session_id: 'test-session',
|
||||
} as unknown as SDKMessage;
|
||||
}
|
||||
|
||||
function assistantTurn(
|
||||
blocks: Array<{ type: 'text'; text: string } | { type: 'tool_use'; name: string; input: unknown }>,
|
||||
): SDKMessage {
|
||||
return {
|
||||
type: 'assistant',
|
||||
parent_tool_use_id: null,
|
||||
uuid: uuid(),
|
||||
session_id: 'test-session',
|
||||
message: {
|
||||
id: 'msg_' + uuid(),
|
||||
type: 'message',
|
||||
role: 'assistant',
|
||||
model: 'claude-opus-4-7',
|
||||
content: blocks.map((b) => ({ ...b })),
|
||||
stop_reason: 'end_turn',
|
||||
stop_sequence: null,
|
||||
usage: {
|
||||
input_tokens: 10,
|
||||
output_tokens: 20,
|
||||
cache_creation_input_tokens: 0,
|
||||
cache_read_input_tokens: 0,
|
||||
service_tier: 'standard',
|
||||
},
|
||||
},
|
||||
} as unknown as SDKMessage;
|
||||
}
|
||||
|
||||
function resultSuccess(cost = 0.01, turns = 1): SDKMessage {
|
||||
return {
|
||||
type: 'result',
|
||||
subtype: 'success',
|
||||
duration_ms: 100,
|
||||
duration_api_ms: 50,
|
||||
is_error: false,
|
||||
num_turns: turns,
|
||||
result: 'done',
|
||||
stop_reason: 'end_turn',
|
||||
total_cost_usd: cost,
|
||||
usage: {
|
||||
input_tokens: 10,
|
||||
output_tokens: 20,
|
||||
cache_creation_input_tokens: 0,
|
||||
cache_read_input_tokens: 0,
|
||||
server_tool_use: {},
|
||||
service_tier: 'standard',
|
||||
},
|
||||
modelUsage: {},
|
||||
permission_denials: [],
|
||||
uuid: uuid(),
|
||||
session_id: 'test-session',
|
||||
} as unknown as SDKMessage;
|
||||
}
|
||||
|
||||
function resultRateLimit(): SDKMessage {
|
||||
return {
|
||||
type: 'result',
|
||||
subtype: 'error_during_execution',
|
||||
duration_ms: 100,
|
||||
duration_api_ms: 50,
|
||||
is_error: true,
|
||||
num_turns: 0,
|
||||
stop_reason: null,
|
||||
total_cost_usd: 0,
|
||||
usage: {
|
||||
input_tokens: 0,
|
||||
output_tokens: 0,
|
||||
cache_creation_input_tokens: 0,
|
||||
cache_read_input_tokens: 0,
|
||||
server_tool_use: {},
|
||||
service_tier: 'standard',
|
||||
},
|
||||
modelUsage: {},
|
||||
permission_denials: [],
|
||||
errors: ['rate limit exceeded (429)'],
|
||||
uuid: uuid(),
|
||||
session_id: 'test-session',
|
||||
} as unknown as SDKMessage;
|
||||
}
|
||||
|
||||
function rateLimitEvent(): SDKMessage {
|
||||
return {
|
||||
type: 'rate_limit_event',
|
||||
rate_limit_info: {
|
||||
status: 'rejected',
|
||||
rateLimitType: 'five_hour',
|
||||
},
|
||||
uuid: uuid(),
|
||||
session_id: 'test-session',
|
||||
} as unknown as SDKMessage;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Stub query provider
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
interface StubConfig {
|
||||
/** One event stream per call. Exhausted calls throw. */
|
||||
streams: SDKMessage[][];
|
||||
/** Throw this error on the Nth call (0-indexed). */
|
||||
throwAt?: number;
|
||||
throwError?: unknown;
|
||||
/** Track calls for assertions. */
|
||||
calls: Array<{ prompt: string; options: Options | undefined; startedAt: number; endedAt?: number }>;
|
||||
}
|
||||
|
||||
function makeStubProvider(config: StubConfig): QueryProvider {
|
||||
let callIdx = -1;
|
||||
const provider: QueryProvider = (params) => {
|
||||
callIdx++;
|
||||
const idx = callIdx;
|
||||
const startedAt = Date.now();
|
||||
const prompt = typeof params.prompt === 'string' ? params.prompt : '<iterable>';
|
||||
config.calls.push({ prompt, options: params.options, startedAt });
|
||||
|
||||
if (config.throwAt !== undefined && idx === config.throwAt) {
|
||||
const err = config.throwError ?? new Error('stub throw');
|
||||
// Return an async generator that throws on first next().
|
||||
const gen = (async function* (): AsyncGenerator<SDKMessage, void> {
|
||||
throw err;
|
||||
})();
|
||||
return gen as unknown as Query;
|
||||
}
|
||||
|
||||
const stream = config.streams[idx];
|
||||
if (!stream) {
|
||||
const gen = (async function* (): AsyncGenerator<SDKMessage, void> {
|
||||
throw new Error(`stub has no stream for call ${idx}`);
|
||||
})();
|
||||
return gen as unknown as Query;
|
||||
}
|
||||
|
||||
const gen = (async function* (): AsyncGenerator<SDKMessage, void> {
|
||||
try {
|
||||
for (const ev of stream) {
|
||||
yield ev;
|
||||
}
|
||||
} finally {
|
||||
config.calls[idx]!.endedAt = Date.now();
|
||||
}
|
||||
})();
|
||||
return gen as unknown as Query;
|
||||
};
|
||||
return provider;
|
||||
}
|
||||
|
||||
const BASE_OPTS = {
|
||||
systemPrompt: '',
|
||||
userPrompt: 'test prompt',
|
||||
workingDirectory: '/tmp/test-dir',
|
||||
maxRetries: 3,
|
||||
};
|
||||
|
||||
// Reset semaphore before each test that depends on fresh capacity.
|
||||
function freshSem(cap = 10): void {
|
||||
__resetSemaphoreForTests(cap);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Happy path
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('runAgentSdkTest — happy path', () => {
|
||||
test('collects events, assistantTurns, toolCalls, and result fields', async () => {
|
||||
freshSem();
|
||||
const stub: StubConfig = {
|
||||
streams: [
|
||||
[
|
||||
systemInit(),
|
||||
assistantTurn([
|
||||
{ type: 'text', text: 'reading files' },
|
||||
{ type: 'tool_use', name: 'Read', input: { path: 'a.txt' } },
|
||||
{ type: 'tool_use', name: 'Read', input: { path: 'b.txt' } },
|
||||
]),
|
||||
assistantTurn([{ type: 'text', text: 'done' }]),
|
||||
resultSuccess(0.05, 2),
|
||||
],
|
||||
],
|
||||
calls: [],
|
||||
};
|
||||
const result = await runAgentSdkTest({
|
||||
...BASE_OPTS,
|
||||
queryProvider: makeStubProvider(stub),
|
||||
});
|
||||
|
||||
expect(result.events.length).toBe(4);
|
||||
expect(result.assistantTurns.length).toBe(2);
|
||||
expect(result.toolCalls.length).toBe(2);
|
||||
expect(result.toolCalls[0]!.tool).toBe('Read');
|
||||
expect(result.output).toContain('reading files');
|
||||
expect(result.output).toContain('done');
|
||||
expect(result.exitReason).toBe('success');
|
||||
expect(result.turnsUsed).toBe(2);
|
||||
expect(result.costUsd).toBe(0.05);
|
||||
expect(result.sdkClaudeCodeVersion).toBe('2.1.117');
|
||||
expect(result.model).toBe('claude-opus-4-7');
|
||||
expect(result.firstResponseMs).toBeGreaterThanOrEqual(0);
|
||||
});
|
||||
|
||||
test('first-turn parallelism: 3 tool_use blocks in first assistant turn', async () => {
|
||||
freshSem();
|
||||
const stub: StubConfig = {
|
||||
streams: [
|
||||
[
|
||||
systemInit(),
|
||||
assistantTurn([
|
||||
{ type: 'tool_use', name: 'Read', input: { path: 'a' } },
|
||||
{ type: 'tool_use', name: 'Read', input: { path: 'b' } },
|
||||
{ type: 'tool_use', name: 'Read', input: { path: 'c' } },
|
||||
]),
|
||||
resultSuccess(),
|
||||
],
|
||||
],
|
||||
calls: [],
|
||||
};
|
||||
const result = await runAgentSdkTest({
|
||||
...BASE_OPTS,
|
||||
queryProvider: makeStubProvider(stub),
|
||||
});
|
||||
expect(firstTurnParallelism(result.assistantTurns[0])).toBe(3);
|
||||
});
|
||||
|
||||
test('first-turn parallelism: 0 when first turn is text-only', async () => {
|
||||
freshSem();
|
||||
const stub: StubConfig = {
|
||||
streams: [
|
||||
[
|
||||
systemInit(),
|
||||
assistantTurn([{ type: 'text', text: 'thinking' }]),
|
||||
resultSuccess(),
|
||||
],
|
||||
],
|
||||
calls: [],
|
||||
};
|
||||
const result = await runAgentSdkTest({
|
||||
...BASE_OPTS,
|
||||
queryProvider: makeStubProvider(stub),
|
||||
});
|
||||
expect(firstTurnParallelism(result.assistantTurns[0])).toBe(0);
|
||||
});
|
||||
|
||||
test('first-turn parallelism: 0 when no first turn', () => {
|
||||
expect(firstTurnParallelism(undefined)).toBe(0);
|
||||
});
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Options propagation
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('runAgentSdkTest — options propagation', () => {
|
||||
test('systemPrompt, model, cwd, allowedTools, disallowedTools, permissionMode, settingSources, env, pathToClaudeCodeExecutable reach query()', async () => {
|
||||
freshSem();
|
||||
const stub: StubConfig = {
|
||||
streams: [[systemInit(), assistantTurn([{ type: 'text', text: 'ok' }]), resultSuccess()]],
|
||||
calls: [],
|
||||
};
|
||||
await runAgentSdkTest({
|
||||
systemPrompt: 'you are a test overlay',
|
||||
userPrompt: 'go',
|
||||
workingDirectory: '/tmp/spec-dir',
|
||||
model: 'claude-opus-4-7',
|
||||
maxTurns: 7,
|
||||
allowedTools: ['Read', 'Glob'],
|
||||
disallowedTools: ['Bash', 'Write'],
|
||||
permissionMode: 'bypassPermissions',
|
||||
settingSources: [],
|
||||
env: { ANTHROPIC_API_KEY: 'fake' },
|
||||
pathToClaudeCodeExecutable: '/fake/path/claude',
|
||||
queryProvider: makeStubProvider(stub),
|
||||
});
|
||||
|
||||
const opts = stub.calls[0]!.options!;
|
||||
expect(opts.systemPrompt).toBe('you are a test overlay');
|
||||
expect(opts.model).toBe('claude-opus-4-7');
|
||||
expect(opts.cwd).toBe('/tmp/spec-dir');
|
||||
expect(opts.maxTurns).toBe(7);
|
||||
expect(opts.tools).toEqual(['Read', 'Glob']);
|
||||
expect(opts.allowedTools).toEqual(['Read', 'Glob']);
|
||||
expect(opts.disallowedTools).toEqual(['Bash', 'Write']);
|
||||
expect(opts.permissionMode).toBe('bypassPermissions');
|
||||
expect(opts.allowDangerouslySkipPermissions).toBe(true);
|
||||
expect(opts.settingSources).toEqual([]);
|
||||
expect(opts.env).toEqual({ ANTHROPIC_API_KEY: 'fake' });
|
||||
expect(opts.pathToClaudeCodeExecutable).toBe('/fake/path/claude');
|
||||
});
|
||||
|
||||
test('empty systemPrompt means no systemPrompt option passed', async () => {
|
||||
freshSem();
|
||||
const stub: StubConfig = {
|
||||
streams: [[systemInit(), assistantTurn([{ type: 'text', text: 'ok' }]), resultSuccess()]],
|
||||
calls: [],
|
||||
};
|
||||
await runAgentSdkTest({
|
||||
...BASE_OPTS,
|
||||
queryProvider: makeStubProvider(stub),
|
||||
});
|
||||
// systemPrompt is undefined when empty string passed (so SDK uses no override)
|
||||
expect(stub.calls[0]!.options!.systemPrompt).toBeUndefined();
|
||||
});
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// canUseTool extension (D10 CEO / D4 eng)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('runAgentSdkTest — canUseTool extension', () => {
|
||||
test('permissionMode flips to "default" when canUseTool is supplied', async () => {
|
||||
freshSem();
|
||||
const stub: StubConfig = {
|
||||
streams: [[systemInit(), assistantTurn([{ type: 'text', text: 'ok' }]), resultSuccess()]],
|
||||
calls: [],
|
||||
};
|
||||
await runAgentSdkTest({
|
||||
...BASE_OPTS,
|
||||
queryProvider: makeStubProvider(stub),
|
||||
canUseTool: async (_toolName, input) => ({ behavior: 'allow', updatedInput: input }),
|
||||
});
|
||||
const opts = stub.calls[0]!.options!;
|
||||
expect(opts.permissionMode).toBe('default');
|
||||
expect(opts.allowDangerouslySkipPermissions).toBe(false);
|
||||
});
|
||||
|
||||
test('permissionMode stays "bypassPermissions" when canUseTool is NOT supplied', async () => {
|
||||
freshSem();
|
||||
const stub: StubConfig = {
|
||||
streams: [[systemInit(), assistantTurn([{ type: 'text', text: 'ok' }]), resultSuccess()]],
|
||||
calls: [],
|
||||
};
|
||||
await runAgentSdkTest({
|
||||
...BASE_OPTS,
|
||||
queryProvider: makeStubProvider(stub),
|
||||
});
|
||||
const opts = stub.calls[0]!.options!;
|
||||
expect(opts.permissionMode).toBe('bypassPermissions');
|
||||
expect(opts.allowDangerouslySkipPermissions).toBe(true);
|
||||
});
|
||||
|
||||
test('canUseTool callback reaches the SDK options', async () => {
|
||||
freshSem();
|
||||
const stub: StubConfig = {
|
||||
streams: [[systemInit(), assistantTurn([{ type: 'text', text: 'ok' }]), resultSuccess()]],
|
||||
calls: [],
|
||||
};
|
||||
const cb = async (_toolName: string, input: Record<string, unknown>) => ({
|
||||
behavior: 'allow' as const,
|
||||
updatedInput: input,
|
||||
});
|
||||
await runAgentSdkTest({
|
||||
...BASE_OPTS,
|
||||
queryProvider: makeStubProvider(stub),
|
||||
canUseTool: cb,
|
||||
});
|
||||
const opts = stub.calls[0]!.options! as Options & { canUseTool?: unknown };
|
||||
expect(typeof opts.canUseTool).toBe('function');
|
||||
});
|
||||
|
||||
test('AskUserQuestion is auto-added to allowedTools when canUseTool is supplied', async () => {
|
||||
freshSem();
|
||||
const stub: StubConfig = {
|
||||
streams: [[systemInit(), assistantTurn([{ type: 'text', text: 'ok' }]), resultSuccess()]],
|
||||
calls: [],
|
||||
};
|
||||
await runAgentSdkTest({
|
||||
...BASE_OPTS,
|
||||
allowedTools: ['Read', 'Grep'], // explicitly omits AskUserQuestion
|
||||
queryProvider: makeStubProvider(stub),
|
||||
canUseTool: async (_toolName, input) => ({ behavior: 'allow', updatedInput: input }),
|
||||
});
|
||||
const opts = stub.calls[0]!.options!;
|
||||
expect(opts.allowedTools).toContain('AskUserQuestion');
|
||||
expect(opts.tools).toContain('AskUserQuestion');
|
||||
});
|
||||
|
||||
test('AskUserQuestion is NOT auto-added when canUseTool is absent', async () => {
|
||||
freshSem();
|
||||
const stub: StubConfig = {
|
||||
streams: [[systemInit(), assistantTurn([{ type: 'text', text: 'ok' }]), resultSuccess()]],
|
||||
calls: [],
|
||||
};
|
||||
await runAgentSdkTest({
|
||||
...BASE_OPTS,
|
||||
allowedTools: ['Read', 'Grep'],
|
||||
queryProvider: makeStubProvider(stub),
|
||||
});
|
||||
const opts = stub.calls[0]!.options!;
|
||||
expect(opts.allowedTools).not.toContain('AskUserQuestion');
|
||||
});
|
||||
|
||||
test('passThroughNonAskUserQuestion helper returns allow+updatedInput', async () => {
|
||||
const { passThroughNonAskUserQuestion } = await import('../test/helpers/agent-sdk-runner');
|
||||
const result = passThroughNonAskUserQuestion('Read', { file_path: '/tmp/x' });
|
||||
expect(result.behavior).toBe('allow');
|
||||
expect(result.updatedInput).toEqual({ file_path: '/tmp/x' });
|
||||
});
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Rate-limit retry (three shapes)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('runAgentSdkTest — rate-limit retry', () => {
|
||||
test('retryable on thrown 429-shaped error, then succeeds on 2nd attempt', async () => {
|
||||
freshSem();
|
||||
const stub: StubConfig = {
|
||||
streams: [
|
||||
// call 0: throws (handled via throwAt below)
|
||||
[],
|
||||
// call 1: success
|
||||
[systemInit(), assistantTurn([{ type: 'text', text: 'ok' }]), resultSuccess()],
|
||||
],
|
||||
throwAt: 0,
|
||||
throwError: Object.assign(new Error('429 too many requests'), { status: 429 }),
|
||||
calls: [],
|
||||
};
|
||||
const result = await runAgentSdkTest({
|
||||
...BASE_OPTS,
|
||||
queryProvider: makeStubProvider(stub),
|
||||
maxRetries: 2,
|
||||
});
|
||||
expect(result.exitReason).toBe('success');
|
||||
expect(stub.calls.length).toBe(2);
|
||||
});
|
||||
|
||||
test('retryable on result-message rate-limit, then succeeds', async () => {
|
||||
freshSem();
|
||||
const stub: StubConfig = {
|
||||
streams: [
|
||||
[systemInit(), resultRateLimit()],
|
||||
[systemInit(), assistantTurn([{ type: 'text', text: 'ok' }]), resultSuccess()],
|
||||
],
|
||||
calls: [],
|
||||
};
|
||||
const result = await runAgentSdkTest({
|
||||
...BASE_OPTS,
|
||||
queryProvider: makeStubProvider(stub),
|
||||
maxRetries: 2,
|
||||
});
|
||||
expect(result.exitReason).toBe('success');
|
||||
expect(stub.calls.length).toBe(2);
|
||||
});
|
||||
|
||||
test('retryable on mid-stream SDKRateLimitEvent, then succeeds', async () => {
|
||||
freshSem();
|
||||
const stub: StubConfig = {
|
||||
streams: [
|
||||
[systemInit(), rateLimitEvent()],
|
||||
[systemInit(), assistantTurn([{ type: 'text', text: 'ok' }]), resultSuccess()],
|
||||
],
|
||||
calls: [],
|
||||
};
|
||||
const result = await runAgentSdkTest({
|
||||
...BASE_OPTS,
|
||||
queryProvider: makeStubProvider(stub),
|
||||
maxRetries: 2,
|
||||
});
|
||||
expect(result.exitReason).toBe('success');
|
||||
expect(stub.calls.length).toBe(2);
|
||||
});
|
||||
|
||||
test('onRetry callback is invoked between attempts', async () => {
|
||||
freshSem();
|
||||
const resets: string[] = [];
|
||||
const stub: StubConfig = {
|
||||
streams: [
|
||||
[],
|
||||
[systemInit(), assistantTurn([{ type: 'text', text: 'ok' }]), resultSuccess()],
|
||||
],
|
||||
throwAt: 0,
|
||||
throwError: Object.assign(new Error('429'), { status: 429 }),
|
||||
calls: [],
|
||||
};
|
||||
await runAgentSdkTest({
|
||||
...BASE_OPTS,
|
||||
queryProvider: makeStubProvider(stub),
|
||||
maxRetries: 2,
|
||||
onRetry: (dir) => resets.push(dir),
|
||||
});
|
||||
expect(resets.length).toBe(1);
|
||||
expect(resets[0]).toBe('/tmp/test-dir');
|
||||
});
|
||||
|
||||
test('persistent 429 throws RateLimitExhaustedError after maxRetries', async () => {
|
||||
freshSem();
|
||||
const stub: StubConfig = {
|
||||
streams: [[], [], [], []], // 4 empty streams; throw on each
|
||||
calls: [],
|
||||
};
|
||||
// Every call throws:
|
||||
let callCount = 0;
|
||||
const alwaysThrowProvider: QueryProvider = (params) => {
|
||||
callCount++;
|
||||
stub.calls.push({
|
||||
prompt: typeof params.prompt === 'string' ? params.prompt : '',
|
||||
options: params.options,
|
||||
startedAt: Date.now(),
|
||||
});
|
||||
const gen = (async function* (): AsyncGenerator<SDKMessage, void> {
|
||||
throw Object.assign(new Error('429 always'), { status: 429 });
|
||||
})();
|
||||
return gen as unknown as Query;
|
||||
};
|
||||
|
||||
let caught: unknown = null;
|
||||
try {
|
||||
await runAgentSdkTest({
|
||||
...BASE_OPTS,
|
||||
queryProvider: alwaysThrowProvider,
|
||||
maxRetries: 2,
|
||||
});
|
||||
} catch (err) {
|
||||
caught = err;
|
||||
}
|
||||
expect(caught).toBeInstanceOf(RateLimitExhaustedError);
|
||||
expect((caught as RateLimitExhaustedError).attempts).toBe(3); // initial + 2 retries
|
||||
expect(callCount).toBe(3);
|
||||
});
|
||||
|
||||
test('non-429 error is NOT retried, propagates immediately', async () => {
|
||||
__resetSemaphoreForTests(10);
|
||||
let callCount = 0;
|
||||
const throwOnce: QueryProvider = () => {
|
||||
callCount++;
|
||||
const gen = (async function* (): AsyncGenerator<SDKMessage, void> {
|
||||
throw new Error('generic auth failure');
|
||||
})();
|
||||
return gen as unknown as Query;
|
||||
};
|
||||
let caught: unknown = null;
|
||||
try {
|
||||
await runAgentSdkTest({
|
||||
...BASE_OPTS,
|
||||
queryProvider: throwOnce,
|
||||
maxRetries: 3,
|
||||
});
|
||||
} catch (err) {
|
||||
caught = err;
|
||||
}
|
||||
expect(caught).toBeInstanceOf(Error);
|
||||
expect((caught as Error).message).toBe('generic auth failure');
|
||||
expect(callCount).toBe(1);
|
||||
});
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Rate-limit detectors (unit)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('rate-limit detectors', () => {
|
||||
test('isRateLimitThrown matches status 429, message, name', () => {
|
||||
expect(isRateLimitThrown(Object.assign(new Error('boom'), { status: 429 }))).toBe(true);
|
||||
expect(isRateLimitThrown(new Error('429 Too Many Requests'))).toBe(true);
|
||||
expect(isRateLimitThrown(new Error('rate-limit exceeded'))).toBe(true);
|
||||
expect(isRateLimitThrown(Object.assign(new Error('x'), { name: 'RateLimitError' }))).toBe(true);
|
||||
expect(isRateLimitThrown(new Error('auth failed'))).toBe(false);
|
||||
expect(isRateLimitThrown(null)).toBe(false);
|
||||
});
|
||||
|
||||
test('isRateLimitResult matches error_during_execution with 429-shaped errors', () => {
|
||||
expect(isRateLimitResult(resultRateLimit())).toBe(true);
|
||||
expect(isRateLimitResult(resultSuccess())).toBe(false);
|
||||
});
|
||||
|
||||
test('isRateLimitEvent matches rate_limit_event with status=rejected', () => {
|
||||
expect(isRateLimitEvent(rateLimitEvent())).toBe(true);
|
||||
expect(isRateLimitEvent(resultSuccess())).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Semaphore concurrency cap
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('runAgentSdkTest — concurrency', () => {
|
||||
test('process-level semaphore caps concurrent queries', async () => {
|
||||
__resetSemaphoreForTests(2);
|
||||
let inFlight = 0;
|
||||
let peakInFlight = 0;
|
||||
const slowStub: QueryProvider = () => {
|
||||
const gen = (async function* (): AsyncGenerator<SDKMessage, void> {
|
||||
inFlight++;
|
||||
if (inFlight > peakInFlight) peakInFlight = inFlight;
|
||||
yield systemInit();
|
||||
await new Promise((r) => setTimeout(r, 30));
|
||||
yield assistantTurn([{ type: 'text', text: 'ok' }]);
|
||||
yield resultSuccess();
|
||||
inFlight--;
|
||||
})();
|
||||
return gen as unknown as Query;
|
||||
};
|
||||
|
||||
await Promise.all(
|
||||
Array.from({ length: 6 }, (_, i) =>
|
||||
runAgentSdkTest({
|
||||
...BASE_OPTS,
|
||||
userPrompt: `trial-${i}`,
|
||||
queryProvider: slowStub,
|
||||
}),
|
||||
),
|
||||
);
|
||||
|
||||
expect(peakInFlight).toBeLessThanOrEqual(2);
|
||||
expect(peakInFlight).toBeGreaterThan(0);
|
||||
});
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// toSkillTestResult shape
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('toSkillTestResult', () => {
|
||||
test('produces a SkillTestResult-shaped object', async () => {
|
||||
freshSem();
|
||||
const stub: StubConfig = {
|
||||
streams: [[systemInit(), assistantTurn([{ type: 'text', text: 'hi' }]), resultSuccess(0.02, 1)]],
|
||||
calls: [],
|
||||
};
|
||||
const r = await runAgentSdkTest({
|
||||
...BASE_OPTS,
|
||||
queryProvider: makeStubProvider(stub),
|
||||
});
|
||||
const s = toSkillTestResult(r);
|
||||
expect(s.toolCalls).toBeArray();
|
||||
expect(s.browseErrors).toBeArray();
|
||||
expect(s.exitReason).toBe('success');
|
||||
expect(s.duration).toBeNumber();
|
||||
expect(s.output).toBe('hi');
|
||||
expect(s.costEstimate.estimatedCost).toBe(0.02);
|
||||
expect(s.costEstimate.turnsUsed).toBe(1);
|
||||
expect(s.model).toBe('claude-opus-4-7');
|
||||
expect(s.firstResponseMs).toBeNumber();
|
||||
expect(s.maxInterTurnMs).toBeNumber();
|
||||
expect(s.transcript).toBeArray();
|
||||
});
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Fixture validator
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('validateFixtures', () => {
|
||||
function base(overrides: Partial<OverlayFixture> = {}): OverlayFixture {
|
||||
return {
|
||||
id: 'test-fixture',
|
||||
overlayPath: 'model-overlays/opus-4-7.md',
|
||||
model: 'claude-opus-4-7',
|
||||
trials: 10,
|
||||
setupWorkspace: () => {},
|
||||
userPrompt: 'go',
|
||||
metric: () => 0,
|
||||
pass: fanoutPass,
|
||||
...overrides,
|
||||
};
|
||||
}
|
||||
|
||||
test('passes for a valid fixture', () => {
|
||||
expect(() => validateFixtures([base()])).not.toThrow();
|
||||
});
|
||||
|
||||
test('rejects empty id', () => {
|
||||
expect(() => validateFixtures([base({ id: '' })])).toThrow(/id must be/);
|
||||
});
|
||||
|
||||
test('rejects id with uppercase or unsafe chars', () => {
|
||||
expect(() => validateFixtures([base({ id: 'Test_Fixture' })])).toThrow(/id must be/);
|
||||
});
|
||||
|
||||
test('rejects duplicate ids', () => {
|
||||
expect(() => validateFixtures([base(), base()])).toThrow(/duplicate fixture id/);
|
||||
});
|
||||
|
||||
test('rejects non-integer trials', () => {
|
||||
expect(() => validateFixtures([base({ trials: 3.5 })])).toThrow(/trials must be/);
|
||||
});
|
||||
|
||||
test('rejects trials < 3', () => {
|
||||
expect(() => validateFixtures([base({ trials: 2 })])).toThrow(/trials must be/);
|
||||
});
|
||||
|
||||
test('rejects concurrency < 1', () => {
|
||||
expect(() => validateFixtures([base({ concurrency: 0 })])).toThrow(/concurrency must be/);
|
||||
});
|
||||
|
||||
test('rejects non-integer concurrency', () => {
|
||||
expect(() => validateFixtures([base({ concurrency: 2.5 })])).toThrow(/concurrency must be/);
|
||||
});
|
||||
|
||||
test('rejects empty model', () => {
|
||||
expect(() => validateFixtures([base({ model: '' })])).toThrow(/model must be/);
|
||||
});
|
||||
|
||||
test('rejects empty userPrompt', () => {
|
||||
expect(() => validateFixtures([base({ userPrompt: '' })])).toThrow(/userPrompt must be/);
|
||||
});
|
||||
|
||||
test('rejects absolute overlayPath', () => {
|
||||
expect(() => validateFixtures([base({ overlayPath: '/etc/passwd' })])).toThrow(/overlayPath must be/);
|
||||
});
|
||||
|
||||
test("rejects overlayPath containing '..'", () => {
|
||||
expect(() =>
|
||||
validateFixtures([base({ overlayPath: '../outside/file.md' })]),
|
||||
).toThrow(/overlayPath must be/);
|
||||
});
|
||||
|
||||
test('rejects missing overlay file', () => {
|
||||
expect(() =>
|
||||
validateFixtures([base({ overlayPath: 'model-overlays/nonexistent.md' })]),
|
||||
).toThrow(/overlay file not found/);
|
||||
});
|
||||
|
||||
test('rejects non-function setupWorkspace', () => {
|
||||
expect(() =>
|
||||
validateFixtures([base({ setupWorkspace: 'not a function' as unknown as (d: string) => void })]),
|
||||
).toThrow(/setupWorkspace must be a function/);
|
||||
});
|
||||
|
||||
test('rejects non-function metric', () => {
|
||||
expect(() =>
|
||||
validateFixtures([base({ metric: null as unknown as (r: AgentSdkResult) => number })]),
|
||||
).toThrow(/metric must be a function/);
|
||||
});
|
||||
|
||||
test('rejects non-function pass', () => {
|
||||
expect(() =>
|
||||
validateFixtures([base({ pass: undefined as unknown as OverlayFixture['pass'] })]),
|
||||
).toThrow(/pass must be a function/);
|
||||
});
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// fanoutPass predicate
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('fanoutPass predicate', () => {
|
||||
test('accepts mean lift >= 0.5 AND >=3/10 overlay trials >= 2', () => {
|
||||
const overlay = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2];
|
||||
const off = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
|
||||
expect(fanoutPass({ overlay, off })).toBe(true);
|
||||
});
|
||||
|
||||
test('rejects when mean lift < 0.5', () => {
|
||||
const overlay = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1];
|
||||
const off = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1];
|
||||
expect(fanoutPass({ overlay, off })).toBe(false);
|
||||
});
|
||||
|
||||
test('rejects when mean lift >= 0.5 but <3 overlay trials emit >=2', () => {
|
||||
// Mean overlay = 1.2, off = 0.0, lift 1.2 but only 2 trials at >=2
|
||||
const overlay = [2, 2, 1, 1, 1, 1, 1, 1, 1, 1];
|
||||
const off = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
|
||||
expect(fanoutPass({ overlay, off })).toBe(false);
|
||||
});
|
||||
});
|
||||
@@ -0,0 +1,113 @@
|
||||
/**
|
||||
* E2E harness audit — every skill with `interactive: true` in its frontmatter
|
||||
* must have at least one test file that uses `canUseTool` via the extended
|
||||
* agent-sdk-runner. This prevents future drift where a skill opts into the
|
||||
* handshake without adding real coverage.
|
||||
*
|
||||
* Runs as a free unit test (no API calls). Pure filesystem scan.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
|
||||
const ROOT = path.resolve(import.meta.dir, '..');
|
||||
const SKILL_GLOBS = [
|
||||
'plan-ceo-review',
|
||||
'plan-eng-review',
|
||||
'plan-design-review',
|
||||
'plan-devex-review',
|
||||
'office-hours',
|
||||
'codex',
|
||||
'investigate',
|
||||
'qa',
|
||||
'retro',
|
||||
'cso',
|
||||
'review',
|
||||
'ship',
|
||||
'design-review',
|
||||
'devex-review',
|
||||
'qa-only',
|
||||
'design-consultation',
|
||||
'design-shotgun',
|
||||
'autoplan',
|
||||
'land-and-deploy',
|
||||
'plan-tune',
|
||||
'document-release',
|
||||
'context-save',
|
||||
'context-restore',
|
||||
'health',
|
||||
'setup-deploy',
|
||||
'setup-browser-cookies',
|
||||
'canary',
|
||||
'learn',
|
||||
'benchmark',
|
||||
'benchmark-models',
|
||||
'make-pdf',
|
||||
'open-gstack-browser',
|
||||
'gstack-upgrade',
|
||||
'pair-agent',
|
||||
'design-html',
|
||||
'freeze',
|
||||
'unfreeze',
|
||||
'careful',
|
||||
'guard',
|
||||
];
|
||||
|
||||
/**
|
||||
* Load .tmpl files for each skill and return the names of those that have
|
||||
* `interactive: true` in frontmatter.
|
||||
*/
|
||||
function findInteractiveSkills(): string[] {
|
||||
const interactive: string[] = [];
|
||||
for (const skill of SKILL_GLOBS) {
|
||||
const tmplPath = path.join(ROOT, skill, 'SKILL.md.tmpl');
|
||||
if (!fs.existsSync(tmplPath)) continue;
|
||||
const content = fs.readFileSync(tmplPath, 'utf-8');
|
||||
// Frontmatter lives between the first '---' and the next '---'.
|
||||
const fmEnd = content.indexOf('\n---', 4);
|
||||
if (fmEnd < 0) continue;
|
||||
const frontmatter = content.slice(0, fmEnd);
|
||||
if (/^interactive:\s*true\s*$/m.test(frontmatter)) {
|
||||
interactive.push(skill);
|
||||
}
|
||||
}
|
||||
return interactive;
|
||||
}
|
||||
|
||||
/**
|
||||
* Scan a test file's contents for the canUseTool-via-harness pattern.
|
||||
* Either: direct canUseTool usage in runAgentSdkTest, or usage of the
|
||||
* shared plan-mode-handshake-helpers that wrap it.
|
||||
*/
|
||||
function hasCanUseToolCoverage(testFile: string): boolean {
|
||||
const content = fs.readFileSync(testFile, 'utf-8');
|
||||
if (content.includes('canUseTool')) return true;
|
||||
if (content.includes('runPlanModeHandshakeTest')) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
describe('E2E harness audit — interactive skills must have canUseTool coverage', () => {
|
||||
test('every interactive: true skill has at least one canUseTool test', () => {
|
||||
const interactive = findInteractiveSkills();
|
||||
expect(interactive.length).toBeGreaterThan(0);
|
||||
|
||||
const testFiles = fs
|
||||
.readdirSync(path.join(ROOT, 'test'))
|
||||
.filter((f) => f.startsWith('skill-e2e-') && f.endsWith('.test.ts'))
|
||||
.map((f) => path.join(ROOT, 'test', f));
|
||||
|
||||
const filesWithCoverage = testFiles.filter(hasCanUseToolCoverage);
|
||||
|
||||
for (const skill of interactive) {
|
||||
// Match the skill name in any test file that uses canUseTool. File
|
||||
// naming convention is `skill-e2e-<skill>-*.test.ts` — either the full
|
||||
// name (plan-ceo-review) or a subset token.
|
||||
const hasDedicatedTest = filesWithCoverage.some((f) => {
|
||||
const base = path.basename(f, '.test.ts');
|
||||
return base.includes(skill) || base.includes(skill.replace(/-review$/, ''));
|
||||
});
|
||||
expect(hasDedicatedTest, `skill "${skill}" has interactive:true but no canUseTool-based E2E test`).toBe(true);
|
||||
}
|
||||
});
|
||||
});
|
||||
+264
-22
@@ -355,6 +355,234 @@ AI orchestrator (e.g., OpenClaw). In spawned sessions:
|
||||
- Focus on completing the task and reporting results via prose output.
|
||||
- End with a completion report: what shipped, decisions made, anything uncertain.
|
||||
|
||||
## AskUserQuestion Format
|
||||
|
||||
**ALWAYS follow this structure for every AskUserQuestion call. Every element is non-skippable. If you find yourself about to skip any of them, stop and back up.**
|
||||
|
||||
### Required shape
|
||||
|
||||
Every AskUserQuestion reads like a decision brief, not a bullet list:
|
||||
|
||||
```
|
||||
D<N> — <one-line question title>
|
||||
|
||||
ELI10: <plain English a 16-year-old could follow, 2-4 sentences, name the stakes>
|
||||
|
||||
Stakes if we pick wrong: <one sentence on what breaks, what user sees, what's lost>
|
||||
|
||||
Recommendation: <choice> because <one-line reason>
|
||||
|
||||
Completeness: A=X/10, B=Y/10 (or: Note: options differ in kind, not coverage — no completeness score)
|
||||
|
||||
Pros / cons:
|
||||
|
||||
A) <option label> (recommended)
|
||||
✅ <pro — concrete, observable, ≥40 chars>
|
||||
✅ <pro>
|
||||
❌ <con — honest, ≥40 chars>
|
||||
|
||||
B) <option label>
|
||||
✅ <pro>
|
||||
❌ <con>
|
||||
|
||||
Net: <one-line synthesis of what you're actually trading off>
|
||||
```
|
||||
|
||||
### Element rules
|
||||
|
||||
1. **D-numbering.** First question in a skill invocation is `D1`. Increment per
|
||||
question within the same skill. This is a model-level instruction, not a
|
||||
runtime counter — you count your own questions. Nested skill invocation
|
||||
(e.g., `/plan-ceo-review` running `/office-hours` inline) starts its own
|
||||
D1; label as `D1 (office-hours)` to disambiguate when the user will see
|
||||
both. Drift is expected over long sessions; minor inconsistency is fine.
|
||||
|
||||
2. **Re-ground.** Before ELI10, state the project, current branch (use the
|
||||
`_BRANCH` value from the preamble, NOT conversation history or gitStatus),
|
||||
and the current plan/task. 1-2 sentences. Assume the user hasn't looked at
|
||||
this window in 20 minutes.
|
||||
|
||||
3. **ELI10 (ALWAYS).** Explain in plain English a smart 16-year-old could
|
||||
follow. Concrete examples and analogies, not function names. Say what it
|
||||
DOES, not what it's called. This is not preamble — the user is about to
|
||||
make a decision and needs context. Even in terse mode, emit the ELI10.
|
||||
|
||||
4. **Stakes if we pick wrong (ALWAYS).** One sentence naming what breaks in
|
||||
concrete terms (pain avoided / capability unlocked / consequence named).
|
||||
"Users see a 3-second spinner" beats "performance may degrade." Forces
|
||||
the trade-off to be real.
|
||||
|
||||
5. **Recommendation (ALWAYS).** `Recommendation: <choice> because <one-line
|
||||
reason>` on its own line. Never omit it. Required for every AskUserQuestion,
|
||||
even when neutral-posture (see rule 8). The `(recommended)` label on the
|
||||
option is REQUIRED — `scripts/resolvers/question-tuning.ts` reads it to
|
||||
power the AUTO_DECIDE path. Omitting it breaks auto-decide.
|
||||
|
||||
6. **Completeness scoring (when meaningful).** When options differ in
|
||||
coverage (full test coverage vs happy path vs shortcut, complete error
|
||||
handling vs partial), score each `Completeness: N/10` on its own line.
|
||||
Calibration: 10 = complete, 7 = happy path only, 3 = shortcut. Flag any
|
||||
option ≤5 where a higher-completeness option exists. When options differ
|
||||
in kind (review posture, architectural A-vs-B, cherry-pick Add/Defer/Skip,
|
||||
two different kinds of systems), SKIP the score and write one line:
|
||||
`Note: options differ in kind, not coverage — no completeness score.`
|
||||
Do NOT fabricate filler scores — empty 10/10 on every option is worse
|
||||
than no score.
|
||||
|
||||
7. **Pros / cons block.** Every option gets per-bullet ✅ (pro) and ❌ (con)
|
||||
markers. Rules:
|
||||
- **Minimum 2 pros and 1 con per option.** If you can't name a con for
|
||||
the recommended option, the recommendation is hollow — go find one. If
|
||||
you can't name a pro for the rejected option, the question isn't real.
|
||||
- **Minimum 40 characters per bullet.** `✅ Simple` is not a pro. `✅
|
||||
Reuses the YAML frontmatter format already in MEMORY.md, zero new
|
||||
parser` is a pro. Concrete, observable, specific.
|
||||
- **Hard-stop escape** for genuinely one-sided choices (destructive-action
|
||||
confirmation, one-way doors): a single bullet `✅ No cons — this is a
|
||||
hard-stop choice` satisfies the rule. Use sparingly; overuse flips a
|
||||
decision brief into theater.
|
||||
|
||||
8. **Net line (ALWAYS).** Closes the decision with a one-sentence synthesis
|
||||
of what the user is actually trading off. From the reference screenshot:
|
||||
*"The new-format case is speculative. The copy-format case is immediate
|
||||
leverage. Copy now, evolve later if a real pattern emerges."* Not a
|
||||
summary — a verdict frame.
|
||||
|
||||
9. **Neutral-posture handling.** When the skill explicitly says "neutral
|
||||
recommendation posture" (SELECTIVE EXPANSION cherry-picks, taste calls,
|
||||
kind-differentiated choices where neither side dominates), the
|
||||
Recommendation line reads: `Recommendation: <default-choice> — this is a
|
||||
taste call, no strong preference either way`. The `(recommended)` label
|
||||
STAYS on the default option (machine-readable hint for AUTO_DECIDE). The
|
||||
`— this is a taste call` prose is the human-readable neutrality signal.
|
||||
Both coexist.
|
||||
|
||||
10. **Effort both-scales.** When an option involves effort, show both human
|
||||
and CC scales: `(human: ~2 days / CC: ~15 min)`.
|
||||
|
||||
11. **Tool_use, not prose.** A markdown block labeled `Question:` is not a
|
||||
question — the user never sees it as interactive. If you wrote one in
|
||||
prose, stop and reissue as an actual AskUserQuestion tool_use. The rich
|
||||
markdown goes in the question body; the `options` array stays short
|
||||
labels (A, B, C).
|
||||
|
||||
### Self-check before emitting
|
||||
|
||||
Before calling AskUserQuestion, verify:
|
||||
- [ ] D<N> header present
|
||||
- [ ] ELI10 paragraph present (stakes line too)
|
||||
- [ ] Recommendation line present with concrete reason
|
||||
- [ ] Completeness scored (coverage) OR kind-note present (kind)
|
||||
- [ ] Every option has ≥2 ✅ and ≥1 ❌, each ≥40 chars (or hard-stop escape)
|
||||
- [ ] (recommended) label on one option (even for neutral-posture — see rule 9)
|
||||
- [ ] Net line closes the decision
|
||||
- [ ] You are calling the tool, not writing prose
|
||||
|
||||
If you'd need to read the source to understand your own explanation, it's
|
||||
too complex — simplify before emitting.
|
||||
|
||||
Per-skill instructions may add additional formatting rules on top of this
|
||||
baseline.
|
||||
|
||||
## GBrain Sync (skill start)
|
||||
|
||||
```bash
|
||||
# gbrain-sync: drain pending writes, pull once per day. Silent no-op when
|
||||
# the feature isn't initialized or gbrain_sync_mode is "off". See
|
||||
# docs/gbrain-sync.md.
|
||||
|
||||
_GSTACK_HOME="${GSTACK_HOME:-$HOME/.gstack}"
|
||||
_BRAIN_REMOTE_FILE="$HOME/.gstack-brain-remote.txt"
|
||||
_BRAIN_SYNC_BIN="~/.claude/skills/gstack/bin/gstack-brain-sync"
|
||||
_BRAIN_CONFIG_BIN="~/.claude/skills/gstack/bin/gstack-config"
|
||||
|
||||
_BRAIN_SYNC_MODE=$("$_BRAIN_CONFIG_BIN" get gbrain_sync_mode 2>/dev/null || echo off)
|
||||
|
||||
# New-machine hint: URL file present, local .git missing, sync not yet enabled.
|
||||
if [ -f "$_BRAIN_REMOTE_FILE" ] && [ ! -d "$_GSTACK_HOME/.git" ] && [ "$_BRAIN_SYNC_MODE" = "off" ]; then
|
||||
_BRAIN_NEW_URL=$(head -1 "$_BRAIN_REMOTE_FILE" 2>/dev/null | tr -d '[:space:]')
|
||||
if [ -n "$_BRAIN_NEW_URL" ]; then
|
||||
echo "BRAIN_SYNC: brain repo detected: $_BRAIN_NEW_URL"
|
||||
echo "BRAIN_SYNC: run 'gstack-brain-restore' to pull your cross-machine memory (or 'gstack-config set gbrain_sync_mode off' to dismiss forever)"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Active-sync path.
|
||||
if [ -d "$_GSTACK_HOME/.git" ] && [ "$_BRAIN_SYNC_MODE" != "off" ]; then
|
||||
# Once-per-day pull.
|
||||
_BRAIN_LAST_PULL_FILE="$_GSTACK_HOME/.brain-last-pull"
|
||||
_BRAIN_NOW=$(date +%s)
|
||||
_BRAIN_DO_PULL=1
|
||||
if [ -f "$_BRAIN_LAST_PULL_FILE" ]; then
|
||||
_BRAIN_LAST=$(cat "$_BRAIN_LAST_PULL_FILE" 2>/dev/null || echo 0)
|
||||
_BRAIN_AGE=$(( _BRAIN_NOW - _BRAIN_LAST ))
|
||||
[ "$_BRAIN_AGE" -lt 86400 ] && _BRAIN_DO_PULL=0
|
||||
fi
|
||||
if [ "$_BRAIN_DO_PULL" = "1" ]; then
|
||||
( cd "$_GSTACK_HOME" && git fetch origin >/dev/null 2>&1 && git merge --ff-only "origin/$(git rev-parse --abbrev-ref HEAD)" >/dev/null 2>&1 ) || true
|
||||
echo "$_BRAIN_NOW" > "$_BRAIN_LAST_PULL_FILE"
|
||||
fi
|
||||
# Drain pending queue, push.
|
||||
"$_BRAIN_SYNC_BIN" --once 2>/dev/null || true
|
||||
fi
|
||||
|
||||
# Status line — always emitted, easy to grep.
|
||||
if [ -d "$_GSTACK_HOME/.git" ] && [ "$_BRAIN_SYNC_MODE" != "off" ]; then
|
||||
_BRAIN_QUEUE_DEPTH=0
|
||||
[ -f "$_GSTACK_HOME/.brain-queue.jsonl" ] && _BRAIN_QUEUE_DEPTH=$(wc -l < "$_GSTACK_HOME/.brain-queue.jsonl" | tr -d ' ')
|
||||
_BRAIN_LAST_PUSH="never"
|
||||
[ -f "$_GSTACK_HOME/.brain-last-push" ] && _BRAIN_LAST_PUSH=$(cat "$_GSTACK_HOME/.brain-last-push" 2>/dev/null || echo never)
|
||||
echo "BRAIN_SYNC: mode=$_BRAIN_SYNC_MODE | last_push=$_BRAIN_LAST_PUSH | queue=$_BRAIN_QUEUE_DEPTH"
|
||||
else
|
||||
echo "BRAIN_SYNC: off"
|
||||
fi
|
||||
```
|
||||
|
||||
|
||||
|
||||
**Privacy stop-gate (fires ONCE per machine).**
|
||||
|
||||
If the bash output shows `BRAIN_SYNC: off` AND the config value
|
||||
`gbrain_sync_mode_prompted` is `false` AND gbrain is detected on this host
|
||||
(either `gbrain doctor --fast --json` succeeds or the `gbrain` binary is in PATH),
|
||||
fire a one-time privacy gate via AskUserQuestion:
|
||||
|
||||
> gstack can publish your session memory (learnings, plans, designs, retros) to a
|
||||
> private GitHub repo that GBrain indexes across your machines. Higher tiers
|
||||
> include behavioral data (session timelines, developer profile). How much do you
|
||||
> want to sync?
|
||||
|
||||
Options:
|
||||
- A) Everything allowlisted (recommended — maximum cross-machine memory)
|
||||
- B) Only artifacts (plans, designs, retros, learnings) — skip timelines and profile
|
||||
- C) Decline — keep everything local
|
||||
|
||||
After the user answers, run (substituting the chosen value):
|
||||
|
||||
```bash
|
||||
# Chosen mode: full | artifacts-only | off
|
||||
"$_BRAIN_CONFIG_BIN" set gbrain_sync_mode <choice>
|
||||
"$_BRAIN_CONFIG_BIN" set gbrain_sync_mode_prompted true
|
||||
```
|
||||
|
||||
If A or B was chosen AND `~/.gstack/.git` doesn't exist, ask a follow-up:
|
||||
"Set up the GBrain sync repo now? (runs `gstack-brain-init`)"
|
||||
- A) Yes, run it now
|
||||
- B) Show me the command, I'll run it myself
|
||||
|
||||
Do not block the skill. Emit the question, continue the skill workflow. The
|
||||
next skill run picks up wherever this left off.
|
||||
|
||||
**At skill END (before the telemetry block),** run these bash commands to
|
||||
catch artifact writes (design docs, plans, retros) that skipped the writer
|
||||
shims, plus drain any still-pending queue entries:
|
||||
|
||||
```bash
|
||||
"~/.claude/skills/gstack/bin/gstack-brain-sync" --discover-new 2>/dev/null || true
|
||||
"~/.claude/skills/gstack/bin/gstack-brain-sync" --once 2>/dev/null || true
|
||||
```
|
||||
|
||||
|
||||
## Model-Specific Behavioral Patch (claude)
|
||||
|
||||
The following nudges are tuned for the claude model family. They are
|
||||
@@ -468,20 +696,6 @@ are shown, synthesize a one-paragraph welcome briefing before proceeding:
|
||||
"Welcome back to {branch}. Last session: /{skill} ({outcome}). [Checkpoint summary if
|
||||
available]. [Health score if available]." Keep it to 2-3 sentences.
|
||||
|
||||
## AskUserQuestion Format
|
||||
|
||||
**ALWAYS follow this structure for every AskUserQuestion call. All four elements are non-skippable. If you find yourself about to skip any of them, stop and back up.**
|
||||
|
||||
1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences)
|
||||
2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time.
|
||||
3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind.
|
||||
4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores.
|
||||
5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)`
|
||||
|
||||
Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex.
|
||||
|
||||
Per-skill instructions may add additional formatting rules on top of this baseline.
|
||||
|
||||
## Writing Style (skip entirely if `EXPLAIN_LEVEL: terse` appears in the preamble echo OR the user's current message explicitly requests terse / no-explanations output)
|
||||
|
||||
These rules apply to every AskUserQuestion, every response you write to the user, and every review finding. They compose with the AskUserQuestion Format section above: Format = *how* a question is structured; Writing Style = *the prose quality of the content inside it*.
|
||||
@@ -2522,8 +2736,8 @@ fi
|
||||
Read the `STATE:` line and dispatch:
|
||||
|
||||
- **FRESH** → proceed with the bump action below (steps 1–4).
|
||||
- **ALREADY_BUMPED** → skip the bump. Reuse `CURRENT_VERSION` for CHANGELOG and PR body. Continue to the next step.
|
||||
- **DRIFT_STALE_PKG** → a prior `/ship` bumped `VERSION` but failed to update `package.json`. Run the sync-only repair block below (after step 4). Do NOT re-bump. Reuse `CURRENT_VERSION` for CHANGELOG and PR body.
|
||||
- **ALREADY_BUMPED** → skip the bump by default, BUT check for queue drift first: call `bin/gstack-next-version` with the implied bump level (derived from `CURRENT_VERSION` vs `BASE_VERSION`), compare its `.version` against `CURRENT_VERSION`. If they differ (queue moved since last ship), use **AskUserQuestion**: "VERSION drift detected: you claim v<CURRENT> but next available is v<NEW> (queue moved). A) Rebump to v<NEW> and rewrite CHANGELOG header + PR title (recommended), B) Keep v<CURRENT> — will be rejected by CI version-gate until resolved." If A, treat this as FRESH with `NEW_VERSION=<new>` and run steps 1-4 (which will also trigger Step 13 CHANGELOG header rewrite and Step 19 PR title rewrite). If B, reuse `CURRENT_VERSION` and warn that CI will likely reject. If util is offline, warn and reuse `CURRENT_VERSION`.
|
||||
- **DRIFT_STALE_PKG** → a prior `/ship` bumped `VERSION` but failed to update `package.json`. Run the sync-only repair block below (after step 4). Do NOT re-bump. Reuse `CURRENT_VERSION` for CHANGELOG and PR body. (Queue check still runs in ALREADY_BUMPED terms after repair.)
|
||||
- **DRIFT_UNEXPECTED** → `/ship` has halted (exit 1). Resolve manually; /ship cannot tell which file is authoritative.
|
||||
|
||||
1. Read the current `VERSION` file (4-digit format: `MAJOR.MINOR.PATCH.MICRO`)
|
||||
@@ -2536,9 +2750,33 @@ Read the `STATE:` line and dispatch:
|
||||
- **MINOR** (2nd digit): **ASK the user** if ANY feature signal is detected, OR 500+ lines changed, OR new modules/packages added
|
||||
- **MAJOR** (1st digit): **ASK the user** — only for milestones or breaking changes
|
||||
|
||||
3. Compute the new version:
|
||||
- Bumping a digit resets all digits to its right to 0
|
||||
- Example: `0.19.1.0` + PATCH → `0.19.2.0`
|
||||
Save the chosen level as `BUMP_LEVEL` (one of `major`, `minor`, `patch`, `micro`). This is the user-intended level. The next step decides *placement* — the level stays the same even if queue-aware allocation has to advance past a claimed slot.
|
||||
|
||||
3. **Queue-aware version pick (workspace-aware ship, v1.6.4.0+).** Call `bin/gstack-next-version` to see what's already claimed by open PRs + active sibling Conductor worktrees, then render the queue state to the user:
|
||||
|
||||
```bash
|
||||
QUEUE_JSON=$(bun run bin/gstack-next-version \
|
||||
--base <base> \
|
||||
--bump "$BUMP_LEVEL" \
|
||||
--current-version "$BASE_VERSION" 2>/dev/null || echo '{"offline":true}')
|
||||
NEW_VERSION=$(echo "$QUEUE_JSON" | jq -r '.version // empty')
|
||||
CLAIMED_COUNT=$(echo "$QUEUE_JSON" | jq -r '.claimed | length')
|
||||
ACTIVE_SIBLING_COUNT=$(echo "$QUEUE_JSON" | jq -r '.active_siblings | length')
|
||||
OFFLINE=$(echo "$QUEUE_JSON" | jq -r '.offline // false')
|
||||
REASON=$(echo "$QUEUE_JSON" | jq -r '.reason // ""')
|
||||
```
|
||||
|
||||
- If `OFFLINE=true` or the util fails (auth expired, no `gh`/`glab`, network): fall back to local `BUMP_LEVEL` arithmetic (bump `BASE_VERSION` at the chosen level). Print `⚠ workspace-aware ship offline — using local bump only`. Continue.
|
||||
- If `CLAIMED_COUNT > 0`: render the queue table to the user so they can see landing order at a glance:
|
||||
```
|
||||
Queue on <base> (vBASE_VERSION):
|
||||
#<pr> <branch> → v<version> [⚠ collision with #<other>]
|
||||
Active sibling workspaces (WIP, not yet PR'd):
|
||||
<path> → v<version> (committed Nh ago)
|
||||
Your branch will claim: vNEW_VERSION (<reason>)
|
||||
```
|
||||
- If `ACTIVE_SIBLING_COUNT > 0` and any active sibling's VERSION is `>= NEW_VERSION`, use **AskUserQuestion**: "Sibling workspace <path> has v<X> committed <N>h ago but hasn't PR'd yet. Wait for them to ship first, or advance past? A) Advance past (recommended for unrelated work), B) Abort /ship and sync up with sibling first."
|
||||
- Validate `NEW_VERSION` matches `MAJOR.MINOR.PATCH.MICRO`. If util returns an empty or malformed version, fall back to local bump.
|
||||
|
||||
4. **Validate** `NEW_VERSION` and write it to **both** `VERSION` and `package.json`. This block runs only when `STATE: FRESH`.
|
||||
|
||||
@@ -2879,7 +3117,11 @@ gh pr view --json url,number,state -q 'if .state == "OPEN" then "PR #\(.number):
|
||||
glab mr view -F json 2>/dev/null | jq -r 'if .state == "opened" then "MR_EXISTS" else "NO_MR" end' 2>/dev/null || echo "NO_MR"
|
||||
```
|
||||
|
||||
If an **open** PR/MR already exists: **update** the PR body using `gh pr edit --body "..."` (GitHub) or `glab mr update -d "..."` (GitLab). Always regenerate the PR body from scratch using this run's fresh results (test output, coverage audit, review findings, adversarial review, TODOS summary, documentation_section from Step 18). Never reuse stale PR body content from a prior run. Print the existing URL and continue to Step 20.
|
||||
If an **open** PR/MR already exists: **update** the PR body using `gh pr edit --body "..."` (GitHub) or `glab mr update -d "..."` (GitLab). Always regenerate the PR body from scratch using this run's fresh results (test output, coverage audit, review findings, adversarial review, TODOS summary, documentation_section from Step 18). Never reuse stale PR body content from a prior run.
|
||||
|
||||
**Also update the PR title** if the version changed on rerun. PR titles use the workspace-aware format `v<NEW_VERSION> <type>: <summary>` — version ALWAYS first. If the current title's version prefix doesn't match `NEW_VERSION`, run `gh pr edit --title "v$NEW_VERSION <type>: <summary>"` (or the `glab mr update -t ...` equivalent). This keeps the title truthful when Step 12's queue-drift detection rebumps a stale version. If the title has no `v<X.Y.Z.W>` prefix (a custom title kept intentionally), leave the title alone — only rewrite titles that already follow the format.
|
||||
|
||||
Print the existing URL and continue to Step 20.
|
||||
|
||||
If no PR/MR exists: create a pull request (GitHub) or merge request (GitLab) using the platform detected in Step 0.
|
||||
|
||||
@@ -2947,7 +3189,7 @@ you missed it.>
|
||||
**If GitHub:**
|
||||
|
||||
```bash
|
||||
gh pr create --base <base> --title "<type>: <summary>" --body "$(cat <<'EOF'
|
||||
gh pr create --base <base> --title "v$NEW_VERSION <type>: <summary>" --body "$(cat <<'EOF'
|
||||
<PR body from above>
|
||||
EOF
|
||||
)"
|
||||
@@ -2956,7 +3198,7 @@ EOF
|
||||
**If GitLab:**
|
||||
|
||||
```bash
|
||||
glab mr create -b <base> -t "<type>: <summary>" -d "$(cat <<'EOF'
|
||||
glab mr create -b <base> -t "v$NEW_VERSION <type>: <summary>" -d "$(cat <<'EOF'
|
||||
<MR body from above>
|
||||
EOF
|
||||
)"
|
||||
|
||||
+264
-22
@@ -344,6 +344,234 @@ AI orchestrator (e.g., OpenClaw). In spawned sessions:
|
||||
- Focus on completing the task and reporting results via prose output.
|
||||
- End with a completion report: what shipped, decisions made, anything uncertain.
|
||||
|
||||
## AskUserQuestion Format
|
||||
|
||||
**ALWAYS follow this structure for every AskUserQuestion call. Every element is non-skippable. If you find yourself about to skip any of them, stop and back up.**
|
||||
|
||||
### Required shape
|
||||
|
||||
Every AskUserQuestion reads like a decision brief, not a bullet list:
|
||||
|
||||
```
|
||||
D<N> — <one-line question title>
|
||||
|
||||
ELI10: <plain English a 16-year-old could follow, 2-4 sentences, name the stakes>
|
||||
|
||||
Stakes if we pick wrong: <one sentence on what breaks, what user sees, what's lost>
|
||||
|
||||
Recommendation: <choice> because <one-line reason>
|
||||
|
||||
Completeness: A=X/10, B=Y/10 (or: Note: options differ in kind, not coverage — no completeness score)
|
||||
|
||||
Pros / cons:
|
||||
|
||||
A) <option label> (recommended)
|
||||
✅ <pro — concrete, observable, ≥40 chars>
|
||||
✅ <pro>
|
||||
❌ <con — honest, ≥40 chars>
|
||||
|
||||
B) <option label>
|
||||
✅ <pro>
|
||||
❌ <con>
|
||||
|
||||
Net: <one-line synthesis of what you're actually trading off>
|
||||
```
|
||||
|
||||
### Element rules
|
||||
|
||||
1. **D-numbering.** First question in a skill invocation is `D1`. Increment per
|
||||
question within the same skill. This is a model-level instruction, not a
|
||||
runtime counter — you count your own questions. Nested skill invocation
|
||||
(e.g., `/plan-ceo-review` running `/office-hours` inline) starts its own
|
||||
D1; label as `D1 (office-hours)` to disambiguate when the user will see
|
||||
both. Drift is expected over long sessions; minor inconsistency is fine.
|
||||
|
||||
2. **Re-ground.** Before ELI10, state the project, current branch (use the
|
||||
`_BRANCH` value from the preamble, NOT conversation history or gitStatus),
|
||||
and the current plan/task. 1-2 sentences. Assume the user hasn't looked at
|
||||
this window in 20 minutes.
|
||||
|
||||
3. **ELI10 (ALWAYS).** Explain in plain English a smart 16-year-old could
|
||||
follow. Concrete examples and analogies, not function names. Say what it
|
||||
DOES, not what it's called. This is not preamble — the user is about to
|
||||
make a decision and needs context. Even in terse mode, emit the ELI10.
|
||||
|
||||
4. **Stakes if we pick wrong (ALWAYS).** One sentence naming what breaks in
|
||||
concrete terms (pain avoided / capability unlocked / consequence named).
|
||||
"Users see a 3-second spinner" beats "performance may degrade." Forces
|
||||
the trade-off to be real.
|
||||
|
||||
5. **Recommendation (ALWAYS).** `Recommendation: <choice> because <one-line
|
||||
reason>` on its own line. Never omit it. Required for every AskUserQuestion,
|
||||
even when neutral-posture (see rule 8). The `(recommended)` label on the
|
||||
option is REQUIRED — `scripts/resolvers/question-tuning.ts` reads it to
|
||||
power the AUTO_DECIDE path. Omitting it breaks auto-decide.
|
||||
|
||||
6. **Completeness scoring (when meaningful).** When options differ in
|
||||
coverage (full test coverage vs happy path vs shortcut, complete error
|
||||
handling vs partial), score each `Completeness: N/10` on its own line.
|
||||
Calibration: 10 = complete, 7 = happy path only, 3 = shortcut. Flag any
|
||||
option ≤5 where a higher-completeness option exists. When options differ
|
||||
in kind (review posture, architectural A-vs-B, cherry-pick Add/Defer/Skip,
|
||||
two different kinds of systems), SKIP the score and write one line:
|
||||
`Note: options differ in kind, not coverage — no completeness score.`
|
||||
Do NOT fabricate filler scores — empty 10/10 on every option is worse
|
||||
than no score.
|
||||
|
||||
7. **Pros / cons block.** Every option gets per-bullet ✅ (pro) and ❌ (con)
|
||||
markers. Rules:
|
||||
- **Minimum 2 pros and 1 con per option.** If you can't name a con for
|
||||
the recommended option, the recommendation is hollow — go find one. If
|
||||
you can't name a pro for the rejected option, the question isn't real.
|
||||
- **Minimum 40 characters per bullet.** `✅ Simple` is not a pro. `✅
|
||||
Reuses the YAML frontmatter format already in MEMORY.md, zero new
|
||||
parser` is a pro. Concrete, observable, specific.
|
||||
- **Hard-stop escape** for genuinely one-sided choices (destructive-action
|
||||
confirmation, one-way doors): a single bullet `✅ No cons — this is a
|
||||
hard-stop choice` satisfies the rule. Use sparingly; overuse flips a
|
||||
decision brief into theater.
|
||||
|
||||
8. **Net line (ALWAYS).** Closes the decision with a one-sentence synthesis
|
||||
of what the user is actually trading off. From the reference screenshot:
|
||||
*"The new-format case is speculative. The copy-format case is immediate
|
||||
leverage. Copy now, evolve later if a real pattern emerges."* Not a
|
||||
summary — a verdict frame.
|
||||
|
||||
9. **Neutral-posture handling.** When the skill explicitly says "neutral
|
||||
recommendation posture" (SELECTIVE EXPANSION cherry-picks, taste calls,
|
||||
kind-differentiated choices where neither side dominates), the
|
||||
Recommendation line reads: `Recommendation: <default-choice> — this is a
|
||||
taste call, no strong preference either way`. The `(recommended)` label
|
||||
STAYS on the default option (machine-readable hint for AUTO_DECIDE). The
|
||||
`— this is a taste call` prose is the human-readable neutrality signal.
|
||||
Both coexist.
|
||||
|
||||
10. **Effort both-scales.** When an option involves effort, show both human
|
||||
and CC scales: `(human: ~2 days / CC: ~15 min)`.
|
||||
|
||||
11. **Tool_use, not prose.** A markdown block labeled `Question:` is not a
|
||||
question — the user never sees it as interactive. If you wrote one in
|
||||
prose, stop and reissue as an actual AskUserQuestion tool_use. The rich
|
||||
markdown goes in the question body; the `options` array stays short
|
||||
labels (A, B, C).
|
||||
|
||||
### Self-check before emitting
|
||||
|
||||
Before calling AskUserQuestion, verify:
|
||||
- [ ] D<N> header present
|
||||
- [ ] ELI10 paragraph present (stakes line too)
|
||||
- [ ] Recommendation line present with concrete reason
|
||||
- [ ] Completeness scored (coverage) OR kind-note present (kind)
|
||||
- [ ] Every option has ≥2 ✅ and ≥1 ❌, each ≥40 chars (or hard-stop escape)
|
||||
- [ ] (recommended) label on one option (even for neutral-posture — see rule 9)
|
||||
- [ ] Net line closes the decision
|
||||
- [ ] You are calling the tool, not writing prose
|
||||
|
||||
If you'd need to read the source to understand your own explanation, it's
|
||||
too complex — simplify before emitting.
|
||||
|
||||
Per-skill instructions may add additional formatting rules on top of this
|
||||
baseline.
|
||||
|
||||
## GBrain Sync (skill start)
|
||||
|
||||
```bash
|
||||
# gbrain-sync: drain pending writes, pull once per day. Silent no-op when
|
||||
# the feature isn't initialized or gbrain_sync_mode is "off". See
|
||||
# docs/gbrain-sync.md.
|
||||
|
||||
_GSTACK_HOME="${GSTACK_HOME:-$HOME/.gstack}"
|
||||
_BRAIN_REMOTE_FILE="$HOME/.gstack-brain-remote.txt"
|
||||
_BRAIN_SYNC_BIN="$GSTACK_BIN/gstack-brain-sync"
|
||||
_BRAIN_CONFIG_BIN="$GSTACK_BIN/gstack-config"
|
||||
|
||||
_BRAIN_SYNC_MODE=$("$_BRAIN_CONFIG_BIN" get gbrain_sync_mode 2>/dev/null || echo off)
|
||||
|
||||
# New-machine hint: URL file present, local .git missing, sync not yet enabled.
|
||||
if [ -f "$_BRAIN_REMOTE_FILE" ] && [ ! -d "$_GSTACK_HOME/.git" ] && [ "$_BRAIN_SYNC_MODE" = "off" ]; then
|
||||
_BRAIN_NEW_URL=$(head -1 "$_BRAIN_REMOTE_FILE" 2>/dev/null | tr -d '[:space:]')
|
||||
if [ -n "$_BRAIN_NEW_URL" ]; then
|
||||
echo "BRAIN_SYNC: brain repo detected: $_BRAIN_NEW_URL"
|
||||
echo "BRAIN_SYNC: run 'gstack-brain-restore' to pull your cross-machine memory (or 'gstack-config set gbrain_sync_mode off' to dismiss forever)"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Active-sync path.
|
||||
if [ -d "$_GSTACK_HOME/.git" ] && [ "$_BRAIN_SYNC_MODE" != "off" ]; then
|
||||
# Once-per-day pull.
|
||||
_BRAIN_LAST_PULL_FILE="$_GSTACK_HOME/.brain-last-pull"
|
||||
_BRAIN_NOW=$(date +%s)
|
||||
_BRAIN_DO_PULL=1
|
||||
if [ -f "$_BRAIN_LAST_PULL_FILE" ]; then
|
||||
_BRAIN_LAST=$(cat "$_BRAIN_LAST_PULL_FILE" 2>/dev/null || echo 0)
|
||||
_BRAIN_AGE=$(( _BRAIN_NOW - _BRAIN_LAST ))
|
||||
[ "$_BRAIN_AGE" -lt 86400 ] && _BRAIN_DO_PULL=0
|
||||
fi
|
||||
if [ "$_BRAIN_DO_PULL" = "1" ]; then
|
||||
( cd "$_GSTACK_HOME" && git fetch origin >/dev/null 2>&1 && git merge --ff-only "origin/$(git rev-parse --abbrev-ref HEAD)" >/dev/null 2>&1 ) || true
|
||||
echo "$_BRAIN_NOW" > "$_BRAIN_LAST_PULL_FILE"
|
||||
fi
|
||||
# Drain pending queue, push.
|
||||
"$_BRAIN_SYNC_BIN" --once 2>/dev/null || true
|
||||
fi
|
||||
|
||||
# Status line — always emitted, easy to grep.
|
||||
if [ -d "$_GSTACK_HOME/.git" ] && [ "$_BRAIN_SYNC_MODE" != "off" ]; then
|
||||
_BRAIN_QUEUE_DEPTH=0
|
||||
[ -f "$_GSTACK_HOME/.brain-queue.jsonl" ] && _BRAIN_QUEUE_DEPTH=$(wc -l < "$_GSTACK_HOME/.brain-queue.jsonl" | tr -d ' ')
|
||||
_BRAIN_LAST_PUSH="never"
|
||||
[ -f "$_GSTACK_HOME/.brain-last-push" ] && _BRAIN_LAST_PUSH=$(cat "$_GSTACK_HOME/.brain-last-push" 2>/dev/null || echo never)
|
||||
echo "BRAIN_SYNC: mode=$_BRAIN_SYNC_MODE | last_push=$_BRAIN_LAST_PUSH | queue=$_BRAIN_QUEUE_DEPTH"
|
||||
else
|
||||
echo "BRAIN_SYNC: off"
|
||||
fi
|
||||
```
|
||||
|
||||
|
||||
|
||||
**Privacy stop-gate (fires ONCE per machine).**
|
||||
|
||||
If the bash output shows `BRAIN_SYNC: off` AND the config value
|
||||
`gbrain_sync_mode_prompted` is `false` AND gbrain is detected on this host
|
||||
(either `gbrain doctor --fast --json` succeeds or the `gbrain` binary is in PATH),
|
||||
fire a one-time privacy gate via AskUserQuestion:
|
||||
|
||||
> gstack can publish your session memory (learnings, plans, designs, retros) to a
|
||||
> private GitHub repo that GBrain indexes across your machines. Higher tiers
|
||||
> include behavioral data (session timelines, developer profile). How much do you
|
||||
> want to sync?
|
||||
|
||||
Options:
|
||||
- A) Everything allowlisted (recommended — maximum cross-machine memory)
|
||||
- B) Only artifacts (plans, designs, retros, learnings) — skip timelines and profile
|
||||
- C) Decline — keep everything local
|
||||
|
||||
After the user answers, run (substituting the chosen value):
|
||||
|
||||
```bash
|
||||
# Chosen mode: full | artifacts-only | off
|
||||
"$_BRAIN_CONFIG_BIN" set gbrain_sync_mode <choice>
|
||||
"$_BRAIN_CONFIG_BIN" set gbrain_sync_mode_prompted true
|
||||
```
|
||||
|
||||
If A or B was chosen AND `~/.gstack/.git` doesn't exist, ask a follow-up:
|
||||
"Set up the GBrain sync repo now? (runs `gstack-brain-init`)"
|
||||
- A) Yes, run it now
|
||||
- B) Show me the command, I'll run it myself
|
||||
|
||||
Do not block the skill. Emit the question, continue the skill workflow. The
|
||||
next skill run picks up wherever this left off.
|
||||
|
||||
**At skill END (before the telemetry block),** run these bash commands to
|
||||
catch artifact writes (design docs, plans, retros) that skipped the writer
|
||||
shims, plus drain any still-pending queue entries:
|
||||
|
||||
```bash
|
||||
"$GSTACK_BIN/gstack-brain-sync" --discover-new 2>/dev/null || true
|
||||
"$GSTACK_BIN/gstack-brain-sync" --once 2>/dev/null || true
|
||||
```
|
||||
|
||||
|
||||
## Model-Specific Behavioral Patch (claude)
|
||||
|
||||
The following nudges are tuned for the claude model family. They are
|
||||
@@ -457,20 +685,6 @@ are shown, synthesize a one-paragraph welcome briefing before proceeding:
|
||||
"Welcome back to {branch}. Last session: /{skill} ({outcome}). [Checkpoint summary if
|
||||
available]. [Health score if available]." Keep it to 2-3 sentences.
|
||||
|
||||
## AskUserQuestion Format
|
||||
|
||||
**ALWAYS follow this structure for every AskUserQuestion call. All four elements are non-skippable. If you find yourself about to skip any of them, stop and back up.**
|
||||
|
||||
1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences)
|
||||
2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time.
|
||||
3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind.
|
||||
4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores.
|
||||
5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)`
|
||||
|
||||
Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex.
|
||||
|
||||
Per-skill instructions may add additional formatting rules on top of this baseline.
|
||||
|
||||
## Writing Style (skip entirely if `EXPLAIN_LEVEL: terse` appears in the preamble echo OR the user's current message explicitly requests terse / no-explanations output)
|
||||
|
||||
These rules apply to every AskUserQuestion, every response you write to the user, and every review finding. They compose with the AskUserQuestion Format section above: Format = *how* a question is structured; Writing Style = *the prose quality of the content inside it*.
|
||||
@@ -2137,8 +2351,8 @@ fi
|
||||
Read the `STATE:` line and dispatch:
|
||||
|
||||
- **FRESH** → proceed with the bump action below (steps 1–4).
|
||||
- **ALREADY_BUMPED** → skip the bump. Reuse `CURRENT_VERSION` for CHANGELOG and PR body. Continue to the next step.
|
||||
- **DRIFT_STALE_PKG** → a prior `/ship` bumped `VERSION` but failed to update `package.json`. Run the sync-only repair block below (after step 4). Do NOT re-bump. Reuse `CURRENT_VERSION` for CHANGELOG and PR body.
|
||||
- **ALREADY_BUMPED** → skip the bump by default, BUT check for queue drift first: call `bin/gstack-next-version` with the implied bump level (derived from `CURRENT_VERSION` vs `BASE_VERSION`), compare its `.version` against `CURRENT_VERSION`. If they differ (queue moved since last ship), use **AskUserQuestion**: "VERSION drift detected: you claim v<CURRENT> but next available is v<NEW> (queue moved). A) Rebump to v<NEW> and rewrite CHANGELOG header + PR title (recommended), B) Keep v<CURRENT> — will be rejected by CI version-gate until resolved." If A, treat this as FRESH with `NEW_VERSION=<new>` and run steps 1-4 (which will also trigger Step 13 CHANGELOG header rewrite and Step 19 PR title rewrite). If B, reuse `CURRENT_VERSION` and warn that CI will likely reject. If util is offline, warn and reuse `CURRENT_VERSION`.
|
||||
- **DRIFT_STALE_PKG** → a prior `/ship` bumped `VERSION` but failed to update `package.json`. Run the sync-only repair block below (after step 4). Do NOT re-bump. Reuse `CURRENT_VERSION` for CHANGELOG and PR body. (Queue check still runs in ALREADY_BUMPED terms after repair.)
|
||||
- **DRIFT_UNEXPECTED** → `/ship` has halted (exit 1). Resolve manually; /ship cannot tell which file is authoritative.
|
||||
|
||||
1. Read the current `VERSION` file (4-digit format: `MAJOR.MINOR.PATCH.MICRO`)
|
||||
@@ -2151,9 +2365,33 @@ Read the `STATE:` line and dispatch:
|
||||
- **MINOR** (2nd digit): **ASK the user** if ANY feature signal is detected, OR 500+ lines changed, OR new modules/packages added
|
||||
- **MAJOR** (1st digit): **ASK the user** — only for milestones or breaking changes
|
||||
|
||||
3. Compute the new version:
|
||||
- Bumping a digit resets all digits to its right to 0
|
||||
- Example: `0.19.1.0` + PATCH → `0.19.2.0`
|
||||
Save the chosen level as `BUMP_LEVEL` (one of `major`, `minor`, `patch`, `micro`). This is the user-intended level. The next step decides *placement* — the level stays the same even if queue-aware allocation has to advance past a claimed slot.
|
||||
|
||||
3. **Queue-aware version pick (workspace-aware ship, v1.6.4.0+).** Call `bin/gstack-next-version` to see what's already claimed by open PRs + active sibling Conductor worktrees, then render the queue state to the user:
|
||||
|
||||
```bash
|
||||
QUEUE_JSON=$(bun run bin/gstack-next-version \
|
||||
--base <base> \
|
||||
--bump "$BUMP_LEVEL" \
|
||||
--current-version "$BASE_VERSION" 2>/dev/null || echo '{"offline":true}')
|
||||
NEW_VERSION=$(echo "$QUEUE_JSON" | jq -r '.version // empty')
|
||||
CLAIMED_COUNT=$(echo "$QUEUE_JSON" | jq -r '.claimed | length')
|
||||
ACTIVE_SIBLING_COUNT=$(echo "$QUEUE_JSON" | jq -r '.active_siblings | length')
|
||||
OFFLINE=$(echo "$QUEUE_JSON" | jq -r '.offline // false')
|
||||
REASON=$(echo "$QUEUE_JSON" | jq -r '.reason // ""')
|
||||
```
|
||||
|
||||
- If `OFFLINE=true` or the util fails (auth expired, no `gh`/`glab`, network): fall back to local `BUMP_LEVEL` arithmetic (bump `BASE_VERSION` at the chosen level). Print `⚠ workspace-aware ship offline — using local bump only`. Continue.
|
||||
- If `CLAIMED_COUNT > 0`: render the queue table to the user so they can see landing order at a glance:
|
||||
```
|
||||
Queue on <base> (vBASE_VERSION):
|
||||
#<pr> <branch> → v<version> [⚠ collision with #<other>]
|
||||
Active sibling workspaces (WIP, not yet PR'd):
|
||||
<path> → v<version> (committed Nh ago)
|
||||
Your branch will claim: vNEW_VERSION (<reason>)
|
||||
```
|
||||
- If `ACTIVE_SIBLING_COUNT > 0` and any active sibling's VERSION is `>= NEW_VERSION`, use **AskUserQuestion**: "Sibling workspace <path> has v<X> committed <N>h ago but hasn't PR'd yet. Wait for them to ship first, or advance past? A) Advance past (recommended for unrelated work), B) Abort /ship and sync up with sibling first."
|
||||
- Validate `NEW_VERSION` matches `MAJOR.MINOR.PATCH.MICRO`. If util returns an empty or malformed version, fall back to local bump.
|
||||
|
||||
4. **Validate** `NEW_VERSION` and write it to **both** `VERSION` and `package.json`. This block runs only when `STATE: FRESH`.
|
||||
|
||||
@@ -2494,7 +2732,11 @@ gh pr view --json url,number,state -q 'if .state == "OPEN" then "PR #\(.number):
|
||||
glab mr view -F json 2>/dev/null | jq -r 'if .state == "opened" then "MR_EXISTS" else "NO_MR" end' 2>/dev/null || echo "NO_MR"
|
||||
```
|
||||
|
||||
If an **open** PR/MR already exists: **update** the PR body using `gh pr edit --body "..."` (GitHub) or `glab mr update -d "..."` (GitLab). Always regenerate the PR body from scratch using this run's fresh results (test output, coverage audit, review findings, adversarial review, TODOS summary, documentation_section from Step 18). Never reuse stale PR body content from a prior run. Print the existing URL and continue to Step 20.
|
||||
If an **open** PR/MR already exists: **update** the PR body using `gh pr edit --body "..."` (GitHub) or `glab mr update -d "..."` (GitLab). Always regenerate the PR body from scratch using this run's fresh results (test output, coverage audit, review findings, adversarial review, TODOS summary, documentation_section from Step 18). Never reuse stale PR body content from a prior run.
|
||||
|
||||
**Also update the PR title** if the version changed on rerun. PR titles use the workspace-aware format `v<NEW_VERSION> <type>: <summary>` — version ALWAYS first. If the current title's version prefix doesn't match `NEW_VERSION`, run `gh pr edit --title "v$NEW_VERSION <type>: <summary>"` (or the `glab mr update -t ...` equivalent). This keeps the title truthful when Step 12's queue-drift detection rebumps a stale version. If the title has no `v<X.Y.Z.W>` prefix (a custom title kept intentionally), leave the title alone — only rewrite titles that already follow the format.
|
||||
|
||||
Print the existing URL and continue to Step 20.
|
||||
|
||||
If no PR/MR exists: create a pull request (GitHub) or merge request (GitLab) using the platform detected in Step 0.
|
||||
|
||||
@@ -2562,7 +2804,7 @@ you missed it.>
|
||||
**If GitHub:**
|
||||
|
||||
```bash
|
||||
gh pr create --base <base> --title "<type>: <summary>" --body "$(cat <<'EOF'
|
||||
gh pr create --base <base> --title "v$NEW_VERSION <type>: <summary>" --body "$(cat <<'EOF'
|
||||
<PR body from above>
|
||||
EOF
|
||||
)"
|
||||
@@ -2571,7 +2813,7 @@ EOF
|
||||
**If GitLab:**
|
||||
|
||||
```bash
|
||||
glab mr create -b <base> -t "<type>: <summary>" -d "$(cat <<'EOF'
|
||||
glab mr create -b <base> -t "v$NEW_VERSION <type>: <summary>" -d "$(cat <<'EOF'
|
||||
<MR body from above>
|
||||
EOF
|
||||
)"
|
||||
|
||||
+264
-22
@@ -346,6 +346,234 @@ AI orchestrator (e.g., OpenClaw). In spawned sessions:
|
||||
- Focus on completing the task and reporting results via prose output.
|
||||
- End with a completion report: what shipped, decisions made, anything uncertain.
|
||||
|
||||
## AskUserQuestion Format
|
||||
|
||||
**ALWAYS follow this structure for every AskUserQuestion call. Every element is non-skippable. If you find yourself about to skip any of them, stop and back up.**
|
||||
|
||||
### Required shape
|
||||
|
||||
Every AskUserQuestion reads like a decision brief, not a bullet list:
|
||||
|
||||
```
|
||||
D<N> — <one-line question title>
|
||||
|
||||
ELI10: <plain English a 16-year-old could follow, 2-4 sentences, name the stakes>
|
||||
|
||||
Stakes if we pick wrong: <one sentence on what breaks, what user sees, what's lost>
|
||||
|
||||
Recommendation: <choice> because <one-line reason>
|
||||
|
||||
Completeness: A=X/10, B=Y/10 (or: Note: options differ in kind, not coverage — no completeness score)
|
||||
|
||||
Pros / cons:
|
||||
|
||||
A) <option label> (recommended)
|
||||
✅ <pro — concrete, observable, ≥40 chars>
|
||||
✅ <pro>
|
||||
❌ <con — honest, ≥40 chars>
|
||||
|
||||
B) <option label>
|
||||
✅ <pro>
|
||||
❌ <con>
|
||||
|
||||
Net: <one-line synthesis of what you're actually trading off>
|
||||
```
|
||||
|
||||
### Element rules
|
||||
|
||||
1. **D-numbering.** First question in a skill invocation is `D1`. Increment per
|
||||
question within the same skill. This is a model-level instruction, not a
|
||||
runtime counter — you count your own questions. Nested skill invocation
|
||||
(e.g., `/plan-ceo-review` running `/office-hours` inline) starts its own
|
||||
D1; label as `D1 (office-hours)` to disambiguate when the user will see
|
||||
both. Drift is expected over long sessions; minor inconsistency is fine.
|
||||
|
||||
2. **Re-ground.** Before ELI10, state the project, current branch (use the
|
||||
`_BRANCH` value from the preamble, NOT conversation history or gitStatus),
|
||||
and the current plan/task. 1-2 sentences. Assume the user hasn't looked at
|
||||
this window in 20 minutes.
|
||||
|
||||
3. **ELI10 (ALWAYS).** Explain in plain English a smart 16-year-old could
|
||||
follow. Concrete examples and analogies, not function names. Say what it
|
||||
DOES, not what it's called. This is not preamble — the user is about to
|
||||
make a decision and needs context. Even in terse mode, emit the ELI10.
|
||||
|
||||
4. **Stakes if we pick wrong (ALWAYS).** One sentence naming what breaks in
|
||||
concrete terms (pain avoided / capability unlocked / consequence named).
|
||||
"Users see a 3-second spinner" beats "performance may degrade." Forces
|
||||
the trade-off to be real.
|
||||
|
||||
5. **Recommendation (ALWAYS).** `Recommendation: <choice> because <one-line
|
||||
reason>` on its own line. Never omit it. Required for every AskUserQuestion,
|
||||
even when neutral-posture (see rule 8). The `(recommended)` label on the
|
||||
option is REQUIRED — `scripts/resolvers/question-tuning.ts` reads it to
|
||||
power the AUTO_DECIDE path. Omitting it breaks auto-decide.
|
||||
|
||||
6. **Completeness scoring (when meaningful).** When options differ in
|
||||
coverage (full test coverage vs happy path vs shortcut, complete error
|
||||
handling vs partial), score each `Completeness: N/10` on its own line.
|
||||
Calibration: 10 = complete, 7 = happy path only, 3 = shortcut. Flag any
|
||||
option ≤5 where a higher-completeness option exists. When options differ
|
||||
in kind (review posture, architectural A-vs-B, cherry-pick Add/Defer/Skip,
|
||||
two different kinds of systems), SKIP the score and write one line:
|
||||
`Note: options differ in kind, not coverage — no completeness score.`
|
||||
Do NOT fabricate filler scores — empty 10/10 on every option is worse
|
||||
than no score.
|
||||
|
||||
7. **Pros / cons block.** Every option gets per-bullet ✅ (pro) and ❌ (con)
|
||||
markers. Rules:
|
||||
- **Minimum 2 pros and 1 con per option.** If you can't name a con for
|
||||
the recommended option, the recommendation is hollow — go find one. If
|
||||
you can't name a pro for the rejected option, the question isn't real.
|
||||
- **Minimum 40 characters per bullet.** `✅ Simple` is not a pro. `✅
|
||||
Reuses the YAML frontmatter format already in MEMORY.md, zero new
|
||||
parser` is a pro. Concrete, observable, specific.
|
||||
- **Hard-stop escape** for genuinely one-sided choices (destructive-action
|
||||
confirmation, one-way doors): a single bullet `✅ No cons — this is a
|
||||
hard-stop choice` satisfies the rule. Use sparingly; overuse flips a
|
||||
decision brief into theater.
|
||||
|
||||
8. **Net line (ALWAYS).** Closes the decision with a one-sentence synthesis
|
||||
of what the user is actually trading off. From the reference screenshot:
|
||||
*"The new-format case is speculative. The copy-format case is immediate
|
||||
leverage. Copy now, evolve later if a real pattern emerges."* Not a
|
||||
summary — a verdict frame.
|
||||
|
||||
9. **Neutral-posture handling.** When the skill explicitly says "neutral
|
||||
recommendation posture" (SELECTIVE EXPANSION cherry-picks, taste calls,
|
||||
kind-differentiated choices where neither side dominates), the
|
||||
Recommendation line reads: `Recommendation: <default-choice> — this is a
|
||||
taste call, no strong preference either way`. The `(recommended)` label
|
||||
STAYS on the default option (machine-readable hint for AUTO_DECIDE). The
|
||||
`— this is a taste call` prose is the human-readable neutrality signal.
|
||||
Both coexist.
|
||||
|
||||
10. **Effort both-scales.** When an option involves effort, show both human
|
||||
and CC scales: `(human: ~2 days / CC: ~15 min)`.
|
||||
|
||||
11. **Tool_use, not prose.** A markdown block labeled `Question:` is not a
|
||||
question — the user never sees it as interactive. If you wrote one in
|
||||
prose, stop and reissue as an actual AskUserQuestion tool_use. The rich
|
||||
markdown goes in the question body; the `options` array stays short
|
||||
labels (A, B, C).
|
||||
|
||||
### Self-check before emitting
|
||||
|
||||
Before calling AskUserQuestion, verify:
|
||||
- [ ] D<N> header present
|
||||
- [ ] ELI10 paragraph present (stakes line too)
|
||||
- [ ] Recommendation line present with concrete reason
|
||||
- [ ] Completeness scored (coverage) OR kind-note present (kind)
|
||||
- [ ] Every option has ≥2 ✅ and ≥1 ❌, each ≥40 chars (or hard-stop escape)
|
||||
- [ ] (recommended) label on one option (even for neutral-posture — see rule 9)
|
||||
- [ ] Net line closes the decision
|
||||
- [ ] You are calling the tool, not writing prose
|
||||
|
||||
If you'd need to read the source to understand your own explanation, it's
|
||||
too complex — simplify before emitting.
|
||||
|
||||
Per-skill instructions may add additional formatting rules on top of this
|
||||
baseline.
|
||||
|
||||
## GBrain Sync (skill start)
|
||||
|
||||
```bash
|
||||
# gbrain-sync: drain pending writes, pull once per day. Silent no-op when
|
||||
# the feature isn't initialized or gbrain_sync_mode is "off". See
|
||||
# docs/gbrain-sync.md.
|
||||
|
||||
_GSTACK_HOME="${GSTACK_HOME:-$HOME/.gstack}"
|
||||
_BRAIN_REMOTE_FILE="$HOME/.gstack-brain-remote.txt"
|
||||
_BRAIN_SYNC_BIN="$GSTACK_BIN/gstack-brain-sync"
|
||||
_BRAIN_CONFIG_BIN="$GSTACK_BIN/gstack-config"
|
||||
|
||||
_BRAIN_SYNC_MODE=$("$_BRAIN_CONFIG_BIN" get gbrain_sync_mode 2>/dev/null || echo off)
|
||||
|
||||
# New-machine hint: URL file present, local .git missing, sync not yet enabled.
|
||||
if [ -f "$_BRAIN_REMOTE_FILE" ] && [ ! -d "$_GSTACK_HOME/.git" ] && [ "$_BRAIN_SYNC_MODE" = "off" ]; then
|
||||
_BRAIN_NEW_URL=$(head -1 "$_BRAIN_REMOTE_FILE" 2>/dev/null | tr -d '[:space:]')
|
||||
if [ -n "$_BRAIN_NEW_URL" ]; then
|
||||
echo "BRAIN_SYNC: brain repo detected: $_BRAIN_NEW_URL"
|
||||
echo "BRAIN_SYNC: run 'gstack-brain-restore' to pull your cross-machine memory (or 'gstack-config set gbrain_sync_mode off' to dismiss forever)"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Active-sync path.
|
||||
if [ -d "$_GSTACK_HOME/.git" ] && [ "$_BRAIN_SYNC_MODE" != "off" ]; then
|
||||
# Once-per-day pull.
|
||||
_BRAIN_LAST_PULL_FILE="$_GSTACK_HOME/.brain-last-pull"
|
||||
_BRAIN_NOW=$(date +%s)
|
||||
_BRAIN_DO_PULL=1
|
||||
if [ -f "$_BRAIN_LAST_PULL_FILE" ]; then
|
||||
_BRAIN_LAST=$(cat "$_BRAIN_LAST_PULL_FILE" 2>/dev/null || echo 0)
|
||||
_BRAIN_AGE=$(( _BRAIN_NOW - _BRAIN_LAST ))
|
||||
[ "$_BRAIN_AGE" -lt 86400 ] && _BRAIN_DO_PULL=0
|
||||
fi
|
||||
if [ "$_BRAIN_DO_PULL" = "1" ]; then
|
||||
( cd "$_GSTACK_HOME" && git fetch origin >/dev/null 2>&1 && git merge --ff-only "origin/$(git rev-parse --abbrev-ref HEAD)" >/dev/null 2>&1 ) || true
|
||||
echo "$_BRAIN_NOW" > "$_BRAIN_LAST_PULL_FILE"
|
||||
fi
|
||||
# Drain pending queue, push.
|
||||
"$_BRAIN_SYNC_BIN" --once 2>/dev/null || true
|
||||
fi
|
||||
|
||||
# Status line — always emitted, easy to grep.
|
||||
if [ -d "$_GSTACK_HOME/.git" ] && [ "$_BRAIN_SYNC_MODE" != "off" ]; then
|
||||
_BRAIN_QUEUE_DEPTH=0
|
||||
[ -f "$_GSTACK_HOME/.brain-queue.jsonl" ] && _BRAIN_QUEUE_DEPTH=$(wc -l < "$_GSTACK_HOME/.brain-queue.jsonl" | tr -d ' ')
|
||||
_BRAIN_LAST_PUSH="never"
|
||||
[ -f "$_GSTACK_HOME/.brain-last-push" ] && _BRAIN_LAST_PUSH=$(cat "$_GSTACK_HOME/.brain-last-push" 2>/dev/null || echo never)
|
||||
echo "BRAIN_SYNC: mode=$_BRAIN_SYNC_MODE | last_push=$_BRAIN_LAST_PUSH | queue=$_BRAIN_QUEUE_DEPTH"
|
||||
else
|
||||
echo "BRAIN_SYNC: off"
|
||||
fi
|
||||
```
|
||||
|
||||
|
||||
|
||||
**Privacy stop-gate (fires ONCE per machine).**
|
||||
|
||||
If the bash output shows `BRAIN_SYNC: off` AND the config value
|
||||
`gbrain_sync_mode_prompted` is `false` AND gbrain is detected on this host
|
||||
(either `gbrain doctor --fast --json` succeeds or the `gbrain` binary is in PATH),
|
||||
fire a one-time privacy gate via AskUserQuestion:
|
||||
|
||||
> gstack can publish your session memory (learnings, plans, designs, retros) to a
|
||||
> private GitHub repo that GBrain indexes across your machines. Higher tiers
|
||||
> include behavioral data (session timelines, developer profile). How much do you
|
||||
> want to sync?
|
||||
|
||||
Options:
|
||||
- A) Everything allowlisted (recommended — maximum cross-machine memory)
|
||||
- B) Only artifacts (plans, designs, retros, learnings) — skip timelines and profile
|
||||
- C) Decline — keep everything local
|
||||
|
||||
After the user answers, run (substituting the chosen value):
|
||||
|
||||
```bash
|
||||
# Chosen mode: full | artifacts-only | off
|
||||
"$_BRAIN_CONFIG_BIN" set gbrain_sync_mode <choice>
|
||||
"$_BRAIN_CONFIG_BIN" set gbrain_sync_mode_prompted true
|
||||
```
|
||||
|
||||
If A or B was chosen AND `~/.gstack/.git` doesn't exist, ask a follow-up:
|
||||
"Set up the GBrain sync repo now? (runs `gstack-brain-init`)"
|
||||
- A) Yes, run it now
|
||||
- B) Show me the command, I'll run it myself
|
||||
|
||||
Do not block the skill. Emit the question, continue the skill workflow. The
|
||||
next skill run picks up wherever this left off.
|
||||
|
||||
**At skill END (before the telemetry block),** run these bash commands to
|
||||
catch artifact writes (design docs, plans, retros) that skipped the writer
|
||||
shims, plus drain any still-pending queue entries:
|
||||
|
||||
```bash
|
||||
"$GSTACK_BIN/gstack-brain-sync" --discover-new 2>/dev/null || true
|
||||
"$GSTACK_BIN/gstack-brain-sync" --once 2>/dev/null || true
|
||||
```
|
||||
|
||||
|
||||
## Model-Specific Behavioral Patch (claude)
|
||||
|
||||
The following nudges are tuned for the claude model family. They are
|
||||
@@ -459,20 +687,6 @@ are shown, synthesize a one-paragraph welcome briefing before proceeding:
|
||||
"Welcome back to {branch}. Last session: /{skill} ({outcome}). [Checkpoint summary if
|
||||
available]. [Health score if available]." Keep it to 2-3 sentences.
|
||||
|
||||
## AskUserQuestion Format
|
||||
|
||||
**ALWAYS follow this structure for every AskUserQuestion call. All four elements are non-skippable. If you find yourself about to skip any of them, stop and back up.**
|
||||
|
||||
1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences)
|
||||
2. **Simplify (ELI10, ALWAYS):** Explain what's happening in plain English a smart 16-year-old could follow. Concrete examples and analogies, not function names or internal jargon. Say what it DOES, not what it's called. State the stakes: what breaks if we pick wrong. This is NOT optional verbosity and it is NOT preamble — the user is about to make a decision and needs context. Even if you'd normally stay terse, emit the ELI10 paragraph. The user will ask for it anyway; do it the first time.
|
||||
3. **Recommend (ALWAYS):** Every question ends with `RECOMMENDATION: Choose [X] because [one-line reason]` on its own line. Never omit it. Never collapse it into the options list. Required for every AskUserQuestion, regardless of whether the options are coverage-differentiated or different-in-kind.
|
||||
4. **Score completeness (when meaningful):** When options differ in coverage (e.g. full test coverage vs happy path vs shortcut, complete error handling vs partial), score each with `Completeness: N/10` on its own line. Calibration: 10 = complete (all edge cases, full coverage), 7 = happy path only, 3 = shortcut. Flag any option ≤5 where a higher-completeness option exists. When options differ in kind (picking a review posture, picking an architectural approach, cherry-pick Add/Defer/Skip, choosing between two different kinds of systems), the completeness axis doesn't apply — skip `Completeness: N/10` entirely and write one line: `Note: options differ in kind, not coverage — no completeness score.` Do not fabricate filler scores.
|
||||
5. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)`
|
||||
|
||||
Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex.
|
||||
|
||||
Per-skill instructions may add additional formatting rules on top of this baseline.
|
||||
|
||||
## Writing Style (skip entirely if `EXPLAIN_LEVEL: terse` appears in the preamble echo OR the user's current message explicitly requests terse / no-explanations output)
|
||||
|
||||
These rules apply to every AskUserQuestion, every response you write to the user, and every review finding. They compose with the AskUserQuestion Format section above: Format = *how* a question is structured; Writing Style = *the prose quality of the content inside it*.
|
||||
@@ -2513,8 +2727,8 @@ fi
|
||||
Read the `STATE:` line and dispatch:
|
||||
|
||||
- **FRESH** → proceed with the bump action below (steps 1–4).
|
||||
- **ALREADY_BUMPED** → skip the bump. Reuse `CURRENT_VERSION` for CHANGELOG and PR body. Continue to the next step.
|
||||
- **DRIFT_STALE_PKG** → a prior `/ship` bumped `VERSION` but failed to update `package.json`. Run the sync-only repair block below (after step 4). Do NOT re-bump. Reuse `CURRENT_VERSION` for CHANGELOG and PR body.
|
||||
- **ALREADY_BUMPED** → skip the bump by default, BUT check for queue drift first: call `bin/gstack-next-version` with the implied bump level (derived from `CURRENT_VERSION` vs `BASE_VERSION`), compare its `.version` against `CURRENT_VERSION`. If they differ (queue moved since last ship), use **AskUserQuestion**: "VERSION drift detected: you claim v<CURRENT> but next available is v<NEW> (queue moved). A) Rebump to v<NEW> and rewrite CHANGELOG header + PR title (recommended), B) Keep v<CURRENT> — will be rejected by CI version-gate until resolved." If A, treat this as FRESH with `NEW_VERSION=<new>` and run steps 1-4 (which will also trigger Step 13 CHANGELOG header rewrite and Step 19 PR title rewrite). If B, reuse `CURRENT_VERSION` and warn that CI will likely reject. If util is offline, warn and reuse `CURRENT_VERSION`.
|
||||
- **DRIFT_STALE_PKG** → a prior `/ship` bumped `VERSION` but failed to update `package.json`. Run the sync-only repair block below (after step 4). Do NOT re-bump. Reuse `CURRENT_VERSION` for CHANGELOG and PR body. (Queue check still runs in ALREADY_BUMPED terms after repair.)
|
||||
- **DRIFT_UNEXPECTED** → `/ship` has halted (exit 1). Resolve manually; /ship cannot tell which file is authoritative.
|
||||
|
||||
1. Read the current `VERSION` file (4-digit format: `MAJOR.MINOR.PATCH.MICRO`)
|
||||
@@ -2527,9 +2741,33 @@ Read the `STATE:` line and dispatch:
|
||||
- **MINOR** (2nd digit): **ASK the user** if ANY feature signal is detected, OR 500+ lines changed, OR new modules/packages added
|
||||
- **MAJOR** (1st digit): **ASK the user** — only for milestones or breaking changes
|
||||
|
||||
3. Compute the new version:
|
||||
- Bumping a digit resets all digits to its right to 0
|
||||
- Example: `0.19.1.0` + PATCH → `0.19.2.0`
|
||||
Save the chosen level as `BUMP_LEVEL` (one of `major`, `minor`, `patch`, `micro`). This is the user-intended level. The next step decides *placement* — the level stays the same even if queue-aware allocation has to advance past a claimed slot.
|
||||
|
||||
3. **Queue-aware version pick (workspace-aware ship, v1.6.4.0+).** Call `bin/gstack-next-version` to see what's already claimed by open PRs + active sibling Conductor worktrees, then render the queue state to the user:
|
||||
|
||||
```bash
|
||||
QUEUE_JSON=$(bun run bin/gstack-next-version \
|
||||
--base <base> \
|
||||
--bump "$BUMP_LEVEL" \
|
||||
--current-version "$BASE_VERSION" 2>/dev/null || echo '{"offline":true}')
|
||||
NEW_VERSION=$(echo "$QUEUE_JSON" | jq -r '.version // empty')
|
||||
CLAIMED_COUNT=$(echo "$QUEUE_JSON" | jq -r '.claimed | length')
|
||||
ACTIVE_SIBLING_COUNT=$(echo "$QUEUE_JSON" | jq -r '.active_siblings | length')
|
||||
OFFLINE=$(echo "$QUEUE_JSON" | jq -r '.offline // false')
|
||||
REASON=$(echo "$QUEUE_JSON" | jq -r '.reason // ""')
|
||||
```
|
||||
|
||||
- If `OFFLINE=true` or the util fails (auth expired, no `gh`/`glab`, network): fall back to local `BUMP_LEVEL` arithmetic (bump `BASE_VERSION` at the chosen level). Print `⚠ workspace-aware ship offline — using local bump only`. Continue.
|
||||
- If `CLAIMED_COUNT > 0`: render the queue table to the user so they can see landing order at a glance:
|
||||
```
|
||||
Queue on <base> (vBASE_VERSION):
|
||||
#<pr> <branch> → v<version> [⚠ collision with #<other>]
|
||||
Active sibling workspaces (WIP, not yet PR'd):
|
||||
<path> → v<version> (committed Nh ago)
|
||||
Your branch will claim: vNEW_VERSION (<reason>)
|
||||
```
|
||||
- If `ACTIVE_SIBLING_COUNT > 0` and any active sibling's VERSION is `>= NEW_VERSION`, use **AskUserQuestion**: "Sibling workspace <path> has v<X> committed <N>h ago but hasn't PR'd yet. Wait for them to ship first, or advance past? A) Advance past (recommended for unrelated work), B) Abort /ship and sync up with sibling first."
|
||||
- Validate `NEW_VERSION` matches `MAJOR.MINOR.PATCH.MICRO`. If util returns an empty or malformed version, fall back to local bump.
|
||||
|
||||
4. **Validate** `NEW_VERSION` and write it to **both** `VERSION` and `package.json`. This block runs only when `STATE: FRESH`.
|
||||
|
||||
@@ -2870,7 +3108,11 @@ gh pr view --json url,number,state -q 'if .state == "OPEN" then "PR #\(.number):
|
||||
glab mr view -F json 2>/dev/null | jq -r 'if .state == "opened" then "MR_EXISTS" else "NO_MR" end' 2>/dev/null || echo "NO_MR"
|
||||
```
|
||||
|
||||
If an **open** PR/MR already exists: **update** the PR body using `gh pr edit --body "..."` (GitHub) or `glab mr update -d "..."` (GitLab). Always regenerate the PR body from scratch using this run's fresh results (test output, coverage audit, review findings, adversarial review, TODOS summary, documentation_section from Step 18). Never reuse stale PR body content from a prior run. Print the existing URL and continue to Step 20.
|
||||
If an **open** PR/MR already exists: **update** the PR body using `gh pr edit --body "..."` (GitHub) or `glab mr update -d "..."` (GitLab). Always regenerate the PR body from scratch using this run's fresh results (test output, coverage audit, review findings, adversarial review, TODOS summary, documentation_section from Step 18). Never reuse stale PR body content from a prior run.
|
||||
|
||||
**Also update the PR title** if the version changed on rerun. PR titles use the workspace-aware format `v<NEW_VERSION> <type>: <summary>` — version ALWAYS first. If the current title's version prefix doesn't match `NEW_VERSION`, run `gh pr edit --title "v$NEW_VERSION <type>: <summary>"` (or the `glab mr update -t ...` equivalent). This keeps the title truthful when Step 12's queue-drift detection rebumps a stale version. If the title has no `v<X.Y.Z.W>` prefix (a custom title kept intentionally), leave the title alone — only rewrite titles that already follow the format.
|
||||
|
||||
Print the existing URL and continue to Step 20.
|
||||
|
||||
If no PR/MR exists: create a pull request (GitHub) or merge request (GitLab) using the platform detected in Step 0.
|
||||
|
||||
@@ -2938,7 +3180,7 @@ you missed it.>
|
||||
**If GitHub:**
|
||||
|
||||
```bash
|
||||
gh pr create --base <base> --title "<type>: <summary>" --body "$(cat <<'EOF'
|
||||
gh pr create --base <base> --title "v$NEW_VERSION <type>: <summary>" --body "$(cat <<'EOF'
|
||||
<PR body from above>
|
||||
EOF
|
||||
)"
|
||||
@@ -2947,7 +3189,7 @@ EOF
|
||||
**If GitLab:**
|
||||
|
||||
```bash
|
||||
glab mr create -b <base> -t "<type>: <summary>" -d "$(cat <<'EOF'
|
||||
glab mr create -b <base> -t "v$NEW_VERSION <type>: <summary>" -d "$(cat <<'EOF'
|
||||
<MR body from above>
|
||||
EOF
|
||||
)"
|
||||
|
||||
Vendored
+487
@@ -0,0 +1,487 @@
|
||||
/**
|
||||
* Overlay-efficacy fixture registry.
|
||||
*
|
||||
* Each fixture defines a reproducible A/B test for one behavioral nudge
|
||||
* embedded in a model-overlays/*.md file. The harness at
|
||||
* test/skill-e2e-overlay-harness.test.ts iterates this registry and runs
|
||||
* `fixture.trials` A/B trials per fixture, asserting `fixture.pass(arms)`.
|
||||
*
|
||||
* Adding a new overlay eval = one entry in this list. The harness handles
|
||||
* arm wiring, concurrency, artifact storage, rate-limit retries, and the
|
||||
* cross-harness diagnostic.
|
||||
*/
|
||||
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import {
|
||||
firstTurnParallelism,
|
||||
type AgentSdkResult,
|
||||
} from '../helpers/agent-sdk-runner';
|
||||
|
||||
const REPO_ROOT = path.resolve(__dirname, '..', '..');
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Types
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export interface OverlayFixture {
|
||||
/** Unique, lowercase/digits/dash only. Used in artifact paths. */
|
||||
id: string;
|
||||
/** Path to the overlay file, relative to repo root. */
|
||||
overlayPath: string;
|
||||
/** API model ID, not the overlay family name. */
|
||||
model: string;
|
||||
/** Integer >= 3. Trials per arm. */
|
||||
trials: number;
|
||||
/** Max concurrent queries for this fixture's arms. Default 3. */
|
||||
concurrency?: number;
|
||||
/** Populate the workspace dir before each trial. */
|
||||
setupWorkspace: (dir: string) => void;
|
||||
/** The prompt the model receives. Non-empty. */
|
||||
userPrompt: string;
|
||||
/** Per-fixture tool allowlist. Omit to use runner default [Read, Glob, Grep, Bash]. */
|
||||
allowedTools?: string[];
|
||||
/** Max turns per trial. Omit to use runner default (5). */
|
||||
maxTurns?: number;
|
||||
/**
|
||||
* Direction of the expected effect. `higher_is_better` = overlay should
|
||||
* increase the metric (e.g. fanout, files touched for literal scope).
|
||||
* `lower_is_better` = overlay should decrease it (e.g. Bash count, turn count).
|
||||
* Used only for cosmetic logging in the test output; `pass` is the actual gate.
|
||||
*/
|
||||
direction?: 'higher_is_better' | 'lower_is_better';
|
||||
/** Compute the per-trial metric from the typed SDK result. */
|
||||
metric: (r: AgentSdkResult) => number;
|
||||
/** Acceptance predicate across all arms' per-trial metrics. */
|
||||
pass: (arms: { overlay: number[]; off: number[] }) => boolean;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Validation
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export function validateFixtures(fixtures: OverlayFixture[]): void {
|
||||
const ids = new Set<string>();
|
||||
for (const f of fixtures) {
|
||||
if (!f.id || !/^[a-z0-9-]+$/.test(f.id)) {
|
||||
throw new Error(
|
||||
`fixture id must be non-empty, lowercase/digits/dash only: ${JSON.stringify(f.id)}`,
|
||||
);
|
||||
}
|
||||
if (ids.has(f.id)) {
|
||||
throw new Error(`duplicate fixture id: ${f.id}`);
|
||||
}
|
||||
ids.add(f.id);
|
||||
|
||||
if (!Number.isInteger(f.trials) || f.trials < 3) {
|
||||
throw new Error(`${f.id}: trials must be an integer >= 3 (got ${f.trials})`);
|
||||
}
|
||||
if (
|
||||
f.concurrency !== undefined &&
|
||||
(!Number.isInteger(f.concurrency) || f.concurrency < 1)
|
||||
) {
|
||||
throw new Error(
|
||||
`${f.id}: concurrency must be an integer >= 1 (got ${f.concurrency})`,
|
||||
);
|
||||
}
|
||||
|
||||
if (!f.model) throw new Error(`${f.id}: model must be non-empty`);
|
||||
if (!f.userPrompt) throw new Error(`${f.id}: userPrompt must be non-empty`);
|
||||
|
||||
if (path.isAbsolute(f.overlayPath) || f.overlayPath.includes('..')) {
|
||||
throw new Error(
|
||||
`${f.id}: overlayPath must be relative and must not contain '..' (got ${f.overlayPath})`,
|
||||
);
|
||||
}
|
||||
const fullPath = path.resolve(REPO_ROOT, f.overlayPath);
|
||||
if (!fs.existsSync(fullPath)) {
|
||||
throw new Error(`${f.id}: overlay file not found at ${f.overlayPath}`);
|
||||
}
|
||||
|
||||
for (const fn of ['setupWorkspace', 'metric', 'pass'] as const) {
|
||||
if (typeof f[fn] !== 'function') {
|
||||
throw new Error(`${f.id}: ${fn} must be a function`);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Metric + predicate helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
function mean(xs: number[]): number {
|
||||
if (xs.length === 0) return 0;
|
||||
return xs.reduce((a, b) => a + b, 0) / xs.length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Standard fanout predicate: overlay mean beats off mean by at least 0.5
|
||||
* parallel tool_use blocks in first turn, AND at least 3 of the overlay
|
||||
* trials emit >= 2 parallel tool_use blocks.
|
||||
*
|
||||
* The combined rule catches both "overlay nudges every trial slightly"
|
||||
* (mean) and "overlay sometimes triggers real fanout" (floor). A single
|
||||
* 0.5 lift with every trial still emitting 1 call would be suspicious;
|
||||
* this predicate rejects it.
|
||||
*/
|
||||
export function fanoutPass(arms: { overlay: number[]; off: number[] }): boolean {
|
||||
const lift = mean(arms.overlay) - mean(arms.off);
|
||||
const floorHits = arms.overlay.filter((n) => n >= 2).length;
|
||||
return lift >= 0.5 && floorHits >= 3;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generic "lower is better" pass predicate: overlay mean should drop the
|
||||
* metric by at least 20% vs baseline. Used for nudges like "effort-match"
|
||||
* (fewer turns) and "dedicated tools vs Bash" (fewer Bash calls).
|
||||
*/
|
||||
export function lowerIsBetter20Pct(arms: { overlay: number[]; off: number[] }): boolean {
|
||||
const meanOff = mean(arms.off);
|
||||
if (meanOff === 0) return mean(arms.overlay) <= meanOff;
|
||||
return mean(arms.overlay) <= meanOff * 0.8;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generic "higher is better" pass predicate: overlay mean should lift the
|
||||
* metric by at least 20% vs baseline. Used for nudges like "literal
|
||||
* interpretation" (more files touched when scope is ambiguous).
|
||||
*/
|
||||
export function higherIsBetter20Pct(arms: { overlay: number[]; off: number[] }): boolean {
|
||||
const meanOff = mean(arms.off);
|
||||
const meanOn = mean(arms.overlay);
|
||||
if (meanOff === 0) return meanOn > 0;
|
||||
return meanOn >= meanOff * 1.2;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Metrics
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Count the total number of Bash tool_use blocks across ALL assistant turns.
|
||||
* Signal for "dedicated tools over Bash" nudge in claude.md.
|
||||
*/
|
||||
export function bashToolCallCount(r: AgentSdkResult): number {
|
||||
return r.toolCalls.filter((c) => c.tool === 'Bash').length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Total turns the session used to complete. Signal for "effort-match the
|
||||
* step" nudge in opus-4-7.md — trivial prompts should complete quickly.
|
||||
*/
|
||||
export function turnsToCompletion(r: AgentSdkResult): number {
|
||||
return r.turnsUsed;
|
||||
}
|
||||
|
||||
/**
|
||||
* Count of unique files the model edited or wrote. Signal for "literal
|
||||
* interpretation" nudge in opus-4-7.md — "fix the tests" with multiple
|
||||
* failures should touch all of them.
|
||||
*/
|
||||
export function uniqueFilesEdited(r: AgentSdkResult): number {
|
||||
const touched = new Set<string>();
|
||||
for (const call of r.toolCalls) {
|
||||
if (call.tool === 'Edit' || call.tool === 'Write' || call.tool === 'MultiEdit') {
|
||||
const input = call.input as { file_path?: string } | null;
|
||||
if (input?.file_path) touched.add(input.file_path);
|
||||
}
|
||||
}
|
||||
return touched.size;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Fixtures
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export const OVERLAY_FIXTURES: OverlayFixture[] = [
|
||||
{
|
||||
id: 'opus-4-7-fanout-toy',
|
||||
overlayPath: 'model-overlays/opus-4-7.md',
|
||||
model: 'claude-opus-4-7',
|
||||
trials: 10,
|
||||
concurrency: 3,
|
||||
setupWorkspace: (dir) => {
|
||||
fs.writeFileSync(path.join(dir, 'alpha.txt'), 'Alpha file: used in module A.\n');
|
||||
fs.writeFileSync(path.join(dir, 'beta.txt'), 'Beta file: used in module B.\n');
|
||||
fs.writeFileSync(path.join(dir, 'gamma.txt'), 'Gamma file: used in module C.\n');
|
||||
},
|
||||
userPrompt:
|
||||
'Read alpha.txt, beta.txt, and gamma.txt and summarize each in one line.',
|
||||
metric: (r) => firstTurnParallelism(r.assistantTurns[0]),
|
||||
pass: fanoutPass,
|
||||
},
|
||||
{
|
||||
id: 'opus-4-7-fanout-realistic',
|
||||
overlayPath: 'model-overlays/opus-4-7.md',
|
||||
model: 'claude-opus-4-7',
|
||||
trials: 10,
|
||||
concurrency: 3,
|
||||
setupWorkspace: (dir) => {
|
||||
fs.writeFileSync(
|
||||
path.join(dir, 'app.ts'),
|
||||
"import { config } from './config';\nimport { util } from './src/util';\n\nexport function main() { return config.name + ':' + util(); }\n",
|
||||
);
|
||||
fs.writeFileSync(
|
||||
path.join(dir, 'config.ts'),
|
||||
"export const config = { name: 'demo', version: 1 };\n",
|
||||
);
|
||||
fs.writeFileSync(
|
||||
path.join(dir, 'README.md'),
|
||||
'# demo project\n\nA small demo. Entry: `app.ts`. Config: `config.ts`.\n',
|
||||
);
|
||||
fs.mkdirSync(path.join(dir, 'src'), { recursive: true });
|
||||
fs.writeFileSync(
|
||||
path.join(dir, 'src', 'util.ts'),
|
||||
"export function util() { return 'util-result'; }\n",
|
||||
);
|
||||
},
|
||||
userPrompt:
|
||||
'Audit this project: read app.ts, config.ts, and README.md, and glob for ' +
|
||||
'every .ts file under src/. Summarize what you find in 3 bullet points.',
|
||||
metric: (r) => firstTurnParallelism(r.assistantTurns[0]),
|
||||
pass: fanoutPass,
|
||||
},
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// claude.md / "Dedicated tools over Bash"
|
||||
// -------------------------------------------------------------------------
|
||||
{
|
||||
id: 'claude-dedicated-tools-vs-bash',
|
||||
overlayPath: 'model-overlays/claude.md',
|
||||
model: 'claude-opus-4-7',
|
||||
trials: 10,
|
||||
concurrency: 3,
|
||||
direction: 'lower_is_better',
|
||||
// 5 files + summary = needs more than default 5 turns. SDK throws
|
||||
// instead of returning a result when it hits the cap.
|
||||
maxTurns: 15,
|
||||
setupWorkspace: (dir) => {
|
||||
fs.mkdirSync(path.join(dir, 'src'), { recursive: true });
|
||||
fs.writeFileSync(path.join(dir, 'src', 'index.ts'), "export const x = 1;\n");
|
||||
fs.writeFileSync(path.join(dir, 'src', 'util.ts'), "export function util() { return 42; }\n");
|
||||
fs.writeFileSync(path.join(dir, 'src', 'types.ts'), "export type Foo = { a: number };\n");
|
||||
fs.writeFileSync(path.join(dir, 'src', 'config.ts'), "export const c = { n: 'demo' };\n");
|
||||
fs.writeFileSync(path.join(dir, 'src', 'api.ts'), "export async function fetchFoo() { return null; }\n");
|
||||
},
|
||||
userPrompt:
|
||||
"List every TypeScript file under src/ and tell me what each exports. " +
|
||||
"You may use any tools available.",
|
||||
// Metric: total Bash tool_use count across the whole session.
|
||||
// The overlay says "prefer Read/Glob/Grep over cat/find/grep shell."
|
||||
// A model following that should emit Glob + Read, not Bash ls/find/cat.
|
||||
metric: bashToolCallCount,
|
||||
pass: lowerIsBetter20Pct,
|
||||
},
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// opus-4-7.md / "Effort-match the step"
|
||||
// -------------------------------------------------------------------------
|
||||
{
|
||||
id: 'opus-4-7-effort-match-trivial',
|
||||
overlayPath: 'model-overlays/opus-4-7.md',
|
||||
model: 'claude-opus-4-7',
|
||||
trials: 10,
|
||||
concurrency: 3,
|
||||
direction: 'lower_is_better',
|
||||
maxTurns: 8,
|
||||
setupWorkspace: (dir) => {
|
||||
fs.writeFileSync(
|
||||
path.join(dir, 'config.json'),
|
||||
'{"name": "demo", "version": "1.0.0"}\n',
|
||||
);
|
||||
},
|
||||
userPrompt: "What's the version in config.json?",
|
||||
// Metric: turns used to reach a result.
|
||||
// The overlay says "simple lookups don't need deep reasoning."
|
||||
// A trivial prompt like this should complete in 2 turns
|
||||
// (Read, respond). Extended deliberation bumps this up.
|
||||
metric: turnsToCompletion,
|
||||
pass: lowerIsBetter20Pct,
|
||||
},
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// opus-4-7.md / "Literal interpretation awareness"
|
||||
// -------------------------------------------------------------------------
|
||||
{
|
||||
id: 'opus-4-7-literal-interpretation',
|
||||
overlayPath: 'model-overlays/opus-4-7.md',
|
||||
model: 'claude-opus-4-7',
|
||||
trials: 10,
|
||||
concurrency: 3,
|
||||
direction: 'higher_is_better',
|
||||
// Allow Edit and Write so the model can fix the tests.
|
||||
allowedTools: ['Read', 'Glob', 'Grep', 'Bash', 'Edit', 'Write'],
|
||||
maxTurns: 15,
|
||||
setupWorkspace: (dir) => {
|
||||
// Three failing test files. The "obvious" one is named first; the
|
||||
// other two require the model to enumerate to notice them.
|
||||
fs.writeFileSync(
|
||||
path.join(dir, 'auth.test.ts'),
|
||||
"import { test, expect } from 'bun:test';\n" +
|
||||
"test('auth returns true', () => { expect(1).toBe(2); });\n",
|
||||
);
|
||||
fs.writeFileSync(
|
||||
path.join(dir, 'billing.test.ts'),
|
||||
"import { test, expect } from 'bun:test';\n" +
|
||||
"test('billing computes total', () => { expect('ten').toBe('10'); });\n",
|
||||
);
|
||||
fs.writeFileSync(
|
||||
path.join(dir, 'notifications.test.ts'),
|
||||
"import { test, expect } from 'bun:test';\n" +
|
||||
"test('notifications fire on save', () => { expect([1,2]).toEqual([1,2,3]); });\n",
|
||||
);
|
||||
fs.writeFileSync(
|
||||
path.join(dir, 'README.md'),
|
||||
"# demo\n\nSmall project with three failing test files.\n",
|
||||
);
|
||||
},
|
||||
userPrompt: "Fix the failing tests.",
|
||||
// Metric: number of unique test files the model edited/wrote.
|
||||
// The overlay says "fix ALL failing tests, not just the obvious one."
|
||||
// Overlay-ON should touch all 3 test files. Overlay-OFF might stop
|
||||
// at the first one after making it pass.
|
||||
metric: uniqueFilesEdited,
|
||||
pass: higherIsBetter20Pct,
|
||||
},
|
||||
|
||||
// =========================================================================
|
||||
// Sonnet 4.6 variants of the Opus-4.7 fixtures.
|
||||
//
|
||||
// Rationale: /claude.md + /opus-4-7.md overlays measured as no-op or
|
||||
// counterproductive on Opus 4.7. Before deleting the whole overlay stack,
|
||||
// check whether weaker Claude models (Sonnet, Haiku) benefit from the same
|
||||
// nudges. Same overlays, same prompts, same metrics, different model ID.
|
||||
// Sonnet is ~4x cheaper than Opus so these 5 add ~$3 to a run.
|
||||
// =========================================================================
|
||||
|
||||
{
|
||||
id: 'opus-4-7-fanout-toy-sonnet',
|
||||
overlayPath: 'model-overlays/opus-4-7.md',
|
||||
model: 'claude-sonnet-4-6',
|
||||
trials: 10,
|
||||
concurrency: 3,
|
||||
setupWorkspace: (dir) => {
|
||||
fs.writeFileSync(path.join(dir, 'alpha.txt'), 'Alpha file: used in module A.\n');
|
||||
fs.writeFileSync(path.join(dir, 'beta.txt'), 'Beta file: used in module B.\n');
|
||||
fs.writeFileSync(path.join(dir, 'gamma.txt'), 'Gamma file: used in module C.\n');
|
||||
},
|
||||
userPrompt:
|
||||
'Read alpha.txt, beta.txt, and gamma.txt and summarize each in one line.',
|
||||
metric: (r) => firstTurnParallelism(r.assistantTurns[0]),
|
||||
pass: fanoutPass,
|
||||
},
|
||||
|
||||
{
|
||||
id: 'opus-4-7-fanout-realistic-sonnet',
|
||||
overlayPath: 'model-overlays/opus-4-7.md',
|
||||
model: 'claude-sonnet-4-6',
|
||||
trials: 10,
|
||||
concurrency: 3,
|
||||
setupWorkspace: (dir) => {
|
||||
fs.writeFileSync(
|
||||
path.join(dir, 'app.ts'),
|
||||
"import { config } from './config';\nimport { util } from './src/util';\n\nexport function main() { return config.name + ':' + util(); }\n",
|
||||
);
|
||||
fs.writeFileSync(
|
||||
path.join(dir, 'config.ts'),
|
||||
"export const config = { name: 'demo', version: 1 };\n",
|
||||
);
|
||||
fs.writeFileSync(
|
||||
path.join(dir, 'README.md'),
|
||||
'# demo project\n\nA small demo. Entry: `app.ts`. Config: `config.ts`.\n',
|
||||
);
|
||||
fs.mkdirSync(path.join(dir, 'src'), { recursive: true });
|
||||
fs.writeFileSync(
|
||||
path.join(dir, 'src', 'util.ts'),
|
||||
"export function util() { return 'util-result'; }\n",
|
||||
);
|
||||
},
|
||||
userPrompt:
|
||||
'Audit this project: read app.ts, config.ts, and README.md, and glob for ' +
|
||||
'every .ts file under src/. Summarize what you find in 3 bullet points.',
|
||||
metric: (r) => firstTurnParallelism(r.assistantTurns[0]),
|
||||
pass: fanoutPass,
|
||||
},
|
||||
|
||||
{
|
||||
id: 'claude-dedicated-tools-vs-bash-sonnet',
|
||||
overlayPath: 'model-overlays/claude.md',
|
||||
model: 'claude-sonnet-4-6',
|
||||
trials: 10,
|
||||
concurrency: 3,
|
||||
direction: 'lower_is_better',
|
||||
maxTurns: 15,
|
||||
setupWorkspace: (dir) => {
|
||||
fs.mkdirSync(path.join(dir, 'src'), { recursive: true });
|
||||
fs.writeFileSync(path.join(dir, 'src', 'index.ts'), "export const x = 1;\n");
|
||||
fs.writeFileSync(path.join(dir, 'src', 'util.ts'), "export function util() { return 42; }\n");
|
||||
fs.writeFileSync(path.join(dir, 'src', 'types.ts'), "export type Foo = { a: number };\n");
|
||||
fs.writeFileSync(path.join(dir, 'src', 'config.ts'), "export const c = { n: 'demo' };\n");
|
||||
fs.writeFileSync(path.join(dir, 'src', 'api.ts'), "export async function fetchFoo() { return null; }\n");
|
||||
},
|
||||
userPrompt:
|
||||
"List every TypeScript file under src/ and tell me what each exports. " +
|
||||
"You may use any tools available.",
|
||||
metric: bashToolCallCount,
|
||||
pass: lowerIsBetter20Pct,
|
||||
},
|
||||
|
||||
{
|
||||
id: 'opus-4-7-effort-match-trivial-sonnet',
|
||||
overlayPath: 'model-overlays/opus-4-7.md',
|
||||
model: 'claude-sonnet-4-6',
|
||||
trials: 10,
|
||||
concurrency: 3,
|
||||
direction: 'lower_is_better',
|
||||
maxTurns: 8,
|
||||
setupWorkspace: (dir) => {
|
||||
fs.writeFileSync(
|
||||
path.join(dir, 'config.json'),
|
||||
'{"name": "demo", "version": "1.0.0"}\n',
|
||||
);
|
||||
},
|
||||
userPrompt: "What's the version in config.json?",
|
||||
metric: turnsToCompletion,
|
||||
pass: lowerIsBetter20Pct,
|
||||
},
|
||||
|
||||
{
|
||||
id: 'opus-4-7-literal-interpretation-sonnet',
|
||||
overlayPath: 'model-overlays/opus-4-7.md',
|
||||
model: 'claude-sonnet-4-6',
|
||||
trials: 10,
|
||||
concurrency: 3,
|
||||
direction: 'higher_is_better',
|
||||
allowedTools: ['Read', 'Glob', 'Grep', 'Bash', 'Edit', 'Write'],
|
||||
maxTurns: 15,
|
||||
setupWorkspace: (dir) => {
|
||||
fs.writeFileSync(
|
||||
path.join(dir, 'auth.test.ts'),
|
||||
"import { test, expect } from 'bun:test';\n" +
|
||||
"test('auth returns true', () => { expect(1).toBe(2); });\n",
|
||||
);
|
||||
fs.writeFileSync(
|
||||
path.join(dir, 'billing.test.ts'),
|
||||
"import { test, expect } from 'bun:test';\n" +
|
||||
"test('billing computes total', () => { expect('ten').toBe('10'); });\n",
|
||||
);
|
||||
fs.writeFileSync(
|
||||
path.join(dir, 'notifications.test.ts'),
|
||||
"import { test, expect } from 'bun:test';\n" +
|
||||
"test('notifications fire on save', () => { expect([1,2]).toEqual([1,2,3]); });\n",
|
||||
);
|
||||
fs.writeFileSync(
|
||||
path.join(dir, 'README.md'),
|
||||
"# demo\n\nSmall project with three failing test files.\n",
|
||||
);
|
||||
},
|
||||
userPrompt: "Fix the failing tests.",
|
||||
metric: uniqueFilesEdited,
|
||||
pass: higherIsBetter20Pct,
|
||||
},
|
||||
];
|
||||
|
||||
// Validate at module load so a broken fixture fails fast at test startup,
|
||||
// not mid-run after burning API dollars.
|
||||
validateFixtures(OVERLAY_FIXTURES);
|
||||
@@ -241,10 +241,11 @@ describe('gen-skill-docs', () => {
|
||||
expect(content).toContain('git branch --show-current');
|
||||
});
|
||||
|
||||
test('tier 2+ skills contain ELI16 simplification rules (AskUserQuestion format)', () => {
|
||||
test('tier 2+ skills contain ELI10 simplification rules (AskUserQuestion format)', () => {
|
||||
// Root SKILL.md is tier 1 (no AskUserQuestion format). Check a tier 2+ skill instead.
|
||||
// v1.7.0.0 Pros/Cons format uses "ELI10 (ALWAYS)" rather than "Simplify (ELI10".
|
||||
const content = fs.readFileSync(path.join(ROOT, 'cso', 'SKILL.md'), 'utf-8');
|
||||
expect(content).toContain('Simplify (ELI10');
|
||||
expect(content).toContain('ELI10');
|
||||
expect(content).toContain('plain English');
|
||||
expect(content).toContain('not function names');
|
||||
});
|
||||
@@ -2773,3 +2774,93 @@ describe('voice-triggers processing', () => {
|
||||
expect(frontmatter).not.toContain('voice-triggers:');
|
||||
});
|
||||
});
|
||||
|
||||
describe('plan-mode handshake (interactive: true) resolver', () => {
|
||||
const INTERACTIVE_SKILLS = [
|
||||
'plan-ceo-review',
|
||||
'plan-eng-review',
|
||||
'plan-design-review',
|
||||
'plan-devex-review',
|
||||
];
|
||||
|
||||
const HANDSHAKE_MARKER = '## Plan Mode Handshake';
|
||||
|
||||
test.each(INTERACTIVE_SKILLS)(
|
||||
'%s (Claude host) SKILL.md contains the handshake section',
|
||||
(skill) => {
|
||||
const content = fs.readFileSync(path.join(ROOT, skill, 'SKILL.md'), 'utf-8');
|
||||
expect(content).toContain(HANDSHAKE_MARKER);
|
||||
expect(content).toContain(
|
||||
'Plan mode is active. The user indicated that they do not want you to execute yet',
|
||||
);
|
||||
},
|
||||
);
|
||||
|
||||
test('handshake is absent from non-interactive Claude skills', () => {
|
||||
const nonInteractive = ['ship', 'review', 'qa', 'office-hours', 'codex', 'retro', 'cso'];
|
||||
for (const skill of nonInteractive) {
|
||||
const content = fs.readFileSync(path.join(ROOT, skill, 'SKILL.md'), 'utf-8');
|
||||
expect(content).not.toContain(HANDSHAKE_MARKER);
|
||||
}
|
||||
});
|
||||
|
||||
test('handshake is absent from non-Claude host outputs when present on disk', () => {
|
||||
// Non-Claude hosts render to hostSubdirs (.agents/, .openclaw/, etc). The
|
||||
// handshake resolver returns '' when ctx.host !== 'claude', so those
|
||||
// outputs must not contain the marker. The current gen-skill-docs layout
|
||||
// prefixes skill names as `gstack-<skill>` under the hostSubdir; older
|
||||
// layouts used `gstack/<skill>` (no prefix). Only stable-present paths
|
||||
// are asserted — older ones may or may not exist per install history.
|
||||
const candidateOutputs = [
|
||||
// Current prefixed layout
|
||||
path.join(ROOT, '.agents', 'skills', 'gstack-plan-ceo-review', 'SKILL.md'),
|
||||
path.join(ROOT, '.openclaw', 'skills', 'gstack-plan-ceo-review', 'SKILL.md'),
|
||||
path.join(ROOT, '.opencode', 'skills', 'gstack-plan-ceo-review', 'SKILL.md'),
|
||||
path.join(ROOT, '.factory', 'skills', 'gstack-plan-ceo-review', 'SKILL.md'),
|
||||
path.join(ROOT, '.hermes', 'skills', 'gstack-plan-ceo-review', 'SKILL.md'),
|
||||
];
|
||||
let checked = 0;
|
||||
for (const out of candidateOutputs) {
|
||||
if (fs.existsSync(out)) {
|
||||
const content = fs.readFileSync(out, 'utf-8');
|
||||
expect(content).not.toContain(HANDSHAKE_MARKER);
|
||||
checked++;
|
||||
}
|
||||
}
|
||||
// At least one non-Claude host's output should exist after a full gen
|
||||
// run; this test is meaningful only if we checked something. If no
|
||||
// non-Claude outputs exist locally, the cross-host guarantee is still
|
||||
// enforced by the resolver's ctx.host check; this test is belt-and-
|
||||
// suspenders and becomes a no-op rather than a false positive.
|
||||
if (checked === 0) {
|
||||
// eslint-disable-next-line no-console
|
||||
console.warn(
|
||||
'plan-mode handshake: no non-Claude host outputs found for cross-host absence check — ' +
|
||||
'run `bun run gen:skill-docs --host all` to populate',
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
test('0C-bis STOP block present in plan-ceo-review/SKILL.md', () => {
|
||||
const content = fs.readFileSync(path.join(ROOT, 'plan-ceo-review', 'SKILL.md'), 'utf-8');
|
||||
const presentIdx = content.indexOf('Present these approach options via AskUserQuestion');
|
||||
const preludeIdx = content.indexOf('### 0D-prelude');
|
||||
expect(presentIdx).toBeGreaterThan(0);
|
||||
expect(preludeIdx).toBeGreaterThan(presentIdx);
|
||||
const between = content.slice(presentIdx, preludeIdx);
|
||||
expect(between).toContain('**STOP.**');
|
||||
expect(between).toContain('Do NOT proceed to Step 0D or 0F until the user responds to 0C-bis');
|
||||
});
|
||||
|
||||
test('handshake resolver is wired BEFORE generateUpgradeCheck in preamble', () => {
|
||||
const content = fs.readFileSync(
|
||||
path.join(ROOT, 'plan-ceo-review', 'SKILL.md'),
|
||||
'utf-8',
|
||||
);
|
||||
const handshakeIdx = content.indexOf(HANDSHAKE_MARKER);
|
||||
const upgradeIdx = content.indexOf('UPGRADE_AVAILABLE');
|
||||
expect(handshakeIdx).toBeGreaterThan(0);
|
||||
expect(upgradeIdx).toBeGreaterThan(0);
|
||||
expect(handshakeIdx).toBeLessThan(upgradeIdx);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -0,0 +1,182 @@
|
||||
// Pure-function tests for bin/gstack-next-version.
|
||||
// Covers the version arithmetic and slot-picking logic. Subprocess paths
|
||||
// (gh/glab/git) are covered by the integration test at the bottom (skipped
|
||||
// when the relevant CLI isn't available).
|
||||
|
||||
import { test, expect, describe } from "bun:test";
|
||||
import {
|
||||
parseVersion,
|
||||
fmtVersion,
|
||||
bumpVersion,
|
||||
cmpVersion,
|
||||
pickNextSlot,
|
||||
markActiveSiblings,
|
||||
} from "../bin/gstack-next-version";
|
||||
|
||||
describe("parseVersion", () => {
|
||||
test("accepts 4-digit semver", () => {
|
||||
expect(parseVersion("1.6.3.0")).toEqual([1, 6, 3, 0]);
|
||||
expect(parseVersion("0.0.0.0")).toEqual([0, 0, 0, 0]);
|
||||
expect(parseVersion("99.99.99.99")).toEqual([99, 99, 99, 99]);
|
||||
});
|
||||
|
||||
test("trims whitespace", () => {
|
||||
expect(parseVersion(" 1.2.3.4 \n")).toEqual([1, 2, 3, 4]);
|
||||
});
|
||||
|
||||
test("rejects malformed", () => {
|
||||
expect(parseVersion("1.2.3")).toBeNull();
|
||||
expect(parseVersion("1.2.3.4.5")).toBeNull();
|
||||
expect(parseVersion("v1.2.3.4")).toBeNull();
|
||||
expect(parseVersion("")).toBeNull();
|
||||
expect(parseVersion("not-a-version")).toBeNull();
|
||||
expect(parseVersion("1.2.3.x")).toBeNull();
|
||||
});
|
||||
});
|
||||
|
||||
describe("bumpVersion", () => {
|
||||
test("major zeros everything right", () => {
|
||||
expect(bumpVersion([1, 6, 3, 0], "major")).toEqual([2, 0, 0, 0]);
|
||||
expect(bumpVersion([1, 6, 3, 7], "major")).toEqual([2, 0, 0, 0]);
|
||||
});
|
||||
test("minor zeros patch+micro", () => {
|
||||
expect(bumpVersion([1, 6, 3, 0], "minor")).toEqual([1, 7, 0, 0]);
|
||||
expect(bumpVersion([1, 6, 3, 7], "minor")).toEqual([1, 7, 0, 0]);
|
||||
});
|
||||
test("patch zeros micro", () => {
|
||||
expect(bumpVersion([1, 6, 3, 0], "patch")).toEqual([1, 6, 4, 0]);
|
||||
expect(bumpVersion([1, 6, 3, 7], "patch")).toEqual([1, 6, 4, 0]);
|
||||
});
|
||||
test("micro increments slot 4", () => {
|
||||
expect(bumpVersion([1, 6, 3, 0], "micro")).toEqual([1, 6, 3, 1]);
|
||||
expect(bumpVersion([1, 6, 3, 7], "micro")).toEqual([1, 6, 3, 8]);
|
||||
});
|
||||
});
|
||||
|
||||
describe("cmpVersion", () => {
|
||||
test("detects order", () => {
|
||||
expect(cmpVersion([1, 6, 3, 0], [1, 6, 3, 0])).toBe(0);
|
||||
expect(cmpVersion([1, 6, 4, 0], [1, 6, 3, 0])).toBeGreaterThan(0);
|
||||
expect(cmpVersion([1, 6, 3, 0], [1, 6, 4, 0])).toBeLessThan(0);
|
||||
expect(cmpVersion([2, 0, 0, 0], [1, 99, 99, 99])).toBeGreaterThan(0);
|
||||
});
|
||||
});
|
||||
|
||||
describe("pickNextSlot (the heart of queue-aware allocation)", () => {
|
||||
const base: [number, number, number, number] = [1, 6, 3, 0];
|
||||
|
||||
test("happy path — no claims, clean bump", () => {
|
||||
const r = pickNextSlot(base, [], "minor");
|
||||
expect(fmtVersion(r.version)).toBe("1.7.0.0");
|
||||
expect(r.reason).toMatch(/no collision/);
|
||||
});
|
||||
|
||||
test("collision — one PR claims the next slot, bump past", () => {
|
||||
const r = pickNextSlot(base, [[1, 7, 0, 0]], "minor");
|
||||
expect(fmtVersion(r.version)).toBe("1.8.0.0");
|
||||
expect(r.reason).toMatch(/bumped past/);
|
||||
});
|
||||
|
||||
test("multi-collision — two PRs claim sequential slots", () => {
|
||||
const r = pickNextSlot(base, [[1, 7, 0, 0], [1, 8, 0, 0]], "minor");
|
||||
expect(fmtVersion(r.version)).toBe("1.9.0.0");
|
||||
});
|
||||
|
||||
test("collision cross-level — queued MINOR bumps past my PATCH", () => {
|
||||
// Queue has 1.7.0.0 (minor), my bump is patch. I should land at 1.7.1.0
|
||||
// (patch relative to the highest claim).
|
||||
const r = pickNextSlot(base, [[1, 7, 0, 0]], "patch");
|
||||
expect(fmtVersion(r.version)).toBe("1.7.1.0");
|
||||
});
|
||||
|
||||
test("claims below base are ignored", () => {
|
||||
const r = pickNextSlot(base, [[1, 5, 0, 0], [1, 6, 2, 0]], "patch");
|
||||
expect(fmtVersion(r.version)).toBe("1.6.4.0");
|
||||
expect(r.reason).toMatch(/no collision/);
|
||||
});
|
||||
|
||||
test("claims equal to base are treated as no-claim", () => {
|
||||
// The caller is expected to pre-filter base-equal claims out, but even if
|
||||
// one slipped through, we don't want to inflate past it.
|
||||
const r = pickNextSlot(base, [], "micro");
|
||||
expect(fmtVersion(r.version)).toBe("1.6.3.1");
|
||||
});
|
||||
|
||||
test("major collision — competing majors", () => {
|
||||
const r = pickNextSlot(base, [[2, 0, 0, 0]], "major");
|
||||
expect(fmtVersion(r.version)).toBe("3.0.0.0");
|
||||
});
|
||||
|
||||
test("unsorted claims still resolve correctly", () => {
|
||||
const r = pickNextSlot(base, [[1, 9, 0, 0], [1, 7, 0, 0], [1, 8, 0, 0]], "minor");
|
||||
expect(fmtVersion(r.version)).toBe("1.10.0.0");
|
||||
});
|
||||
});
|
||||
|
||||
describe("markActiveSiblings", () => {
|
||||
const base: [number, number, number, number] = [1, 6, 3, 0];
|
||||
const now = Math.floor(Date.now() / 1000);
|
||||
|
||||
test("flags siblings that are ahead of base AND recent AND have no PR", () => {
|
||||
const siblings = [
|
||||
{ path: "/a", branch: "feat/alpha", version: "1.7.0.0", last_commit_ts: now - 60, has_open_pr: false, is_active: false },
|
||||
];
|
||||
const r = markActiveSiblings(siblings, base);
|
||||
expect(r[0].is_active).toBe(true);
|
||||
});
|
||||
|
||||
test("does not flag siblings with open PRs (already in the queue)", () => {
|
||||
const siblings = [
|
||||
{ path: "/a", branch: "feat/alpha", version: "1.7.0.0", last_commit_ts: now - 60, has_open_pr: true, is_active: false },
|
||||
];
|
||||
expect(markActiveSiblings(siblings, base)[0].is_active).toBe(false);
|
||||
});
|
||||
|
||||
test("does not flag stale siblings (commit > 24h old)", () => {
|
||||
const siblings = [
|
||||
{ path: "/a", branch: "feat/alpha", version: "1.7.0.0", last_commit_ts: now - 25 * 3600, has_open_pr: false, is_active: false },
|
||||
];
|
||||
expect(markActiveSiblings(siblings, base)[0].is_active).toBe(false);
|
||||
});
|
||||
|
||||
test("does not flag siblings at or below base", () => {
|
||||
const siblings = [
|
||||
{ path: "/a", branch: "feat/alpha", version: "1.6.3.0", last_commit_ts: now - 60, has_open_pr: false, is_active: false },
|
||||
{ path: "/b", branch: "feat/beta", version: "1.5.0.0", last_commit_ts: now - 60, has_open_pr: false, is_active: false },
|
||||
];
|
||||
const r = markActiveSiblings(siblings, base);
|
||||
expect(r[0].is_active).toBe(false);
|
||||
expect(r[1].is_active).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
// Integration smoke — only runs if gh is available and authenticated. Confirms
|
||||
// the CLI executes end-to-end against real APIs without crashing.
|
||||
describe("integration (smoke)", () => {
|
||||
test("CLI runs against real repo and emits parseable JSON", async () => {
|
||||
const proc = Bun.spawnSync([
|
||||
"bun",
|
||||
"run",
|
||||
"./bin/gstack-next-version",
|
||||
"--base",
|
||||
"main",
|
||||
"--bump",
|
||||
"patch",
|
||||
"--current-version",
|
||||
"1.6.3.0",
|
||||
"--workspace-root",
|
||||
"null", // skip sibling scan in CI
|
||||
]);
|
||||
const out = new TextDecoder().decode(proc.stdout);
|
||||
const parsed = JSON.parse(out);
|
||||
expect(parsed).toHaveProperty("version");
|
||||
expect(parseVersion(parsed.version)).not.toBeNull();
|
||||
expect(parsed).toHaveProperty("bump", "patch");
|
||||
expect(parsed).toHaveProperty("host");
|
||||
expect(["github", "gitlab", "unknown"]).toContain(parsed.host);
|
||||
expect(parsed).toHaveProperty("claimed");
|
||||
expect(Array.isArray(parsed.claimed)).toBe(true);
|
||||
expect(parsed).toHaveProperty("siblings");
|
||||
expect(parsed.siblings).toEqual([]); // --workspace-root null disabled scanning
|
||||
});
|
||||
});
|
||||
@@ -0,0 +1,565 @@
|
||||
/**
|
||||
* Claude Agent SDK wrapper for the overlay-efficacy harness.
|
||||
*
|
||||
* This sits alongside session-runner.ts (which drives `claude -p` as a
|
||||
* subprocess) but runs the model via the published @anthropic-ai/claude-agent-sdk
|
||||
* instead. The SDK exposes the same harness primitives Claude Code itself uses,
|
||||
* so overlay-driven behavior change is measured against a closer approximation
|
||||
* of real Claude Code than the `claude -p` subprocess path provides.
|
||||
*
|
||||
* Explicit design rules (from plan review):
|
||||
* - Use SDK-exported SDKMessage types. No `| unknown` union collapse.
|
||||
* - Permission surface is explicit: bypassPermissions + settingSources:[] +
|
||||
* disallowedTools inverse. Without these, the SDK inherits user settings,
|
||||
* project .claude/, and local hooks, and arms are no longer comparable.
|
||||
* - Binary pinning via pathToClaudeCodeExecutable. Resolve with `which claude`
|
||||
* at setup time; the SDK would otherwise use its bundled binary.
|
||||
* - 3-shape rate-limit detection: thrown error, result-message error subtype,
|
||||
* mid-stream SDKRateLimitEvent. All three recover on retry.
|
||||
* - On retry, caller resets workspace via a setupWorkspace callback so
|
||||
* partial Bash side-effects don't contaminate the next attempt.
|
||||
* - Process-level semaphore caps concurrent queries across all callers in
|
||||
* the same bun-test process. Composes with bun's own --concurrent flag.
|
||||
*/
|
||||
|
||||
import {
|
||||
query,
|
||||
type SDKMessage,
|
||||
type SDKAssistantMessage,
|
||||
type SDKResultMessage,
|
||||
type SDKSystemMessage,
|
||||
type PermissionMode,
|
||||
type SettingSource,
|
||||
type Options,
|
||||
type CanUseTool,
|
||||
} from '@anthropic-ai/claude-agent-sdk';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import { execSync } from 'child_process';
|
||||
import type { SkillTestResult } from './session-runner';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Types
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export interface AgentSdkResult {
|
||||
/** Full raw event stream for forensic recovery. */
|
||||
events: SDKMessage[];
|
||||
/** Assistant-typed subset, in order. */
|
||||
assistantTurns: SDKAssistantMessage[];
|
||||
/** Flat tool-call list, in order of emission. */
|
||||
toolCalls: Array<{ tool: string; input: unknown; output: string }>;
|
||||
/** Concatenated assistant text, newline-joined. */
|
||||
output: string;
|
||||
/** 'success' | 'error_during_execution' | 'error_max_turns' | ... */
|
||||
exitReason: string;
|
||||
turnsUsed: number;
|
||||
durationMs: number;
|
||||
firstResponseMs: number;
|
||||
maxInterTurnMs: number;
|
||||
costUsd: number;
|
||||
model: string;
|
||||
sdkVersion: string;
|
||||
/** claude_code_version from the SDK's system/init event (authoritative). */
|
||||
sdkClaudeCodeVersion: string;
|
||||
/** Path to the claude binary we pinned. */
|
||||
resolvedBinaryPath: string;
|
||||
/** browse-error pattern scan for SkillTestResult parity. Always empty here. */
|
||||
browseErrors: string[];
|
||||
}
|
||||
|
||||
/** Signature matching `query()` from the SDK. DI hook for unit tests. */
|
||||
export type QueryProvider = typeof query;
|
||||
|
||||
/** Subset of SDK Options['systemPrompt'] we support. */
|
||||
export type SystemPromptOption =
|
||||
| string
|
||||
| { type: 'preset'; preset: 'claude_code'; append?: string; excludeDynamicSections?: boolean };
|
||||
|
||||
export interface RunAgentSdkOptions {
|
||||
/**
|
||||
* System prompt surface.
|
||||
* - bare string "" -> omit entirely (SDK default: no system prompt)
|
||||
* - bare string "...text..." -> REPLACE default with given text (use sparingly)
|
||||
* - { type:'preset', preset:'claude_code' } -> use Claude Code default
|
||||
* - { type:'preset', preset:'claude_code', append: "..." } -> default + append
|
||||
*
|
||||
* For overlay-efficacy measurement, the preset+append pattern is the right
|
||||
* one: it measures "does adding overlay text to the REAL Claude Code system
|
||||
* prompt change behavior" rather than "does the overlay alone (stripped of
|
||||
* base scaffolding) change behavior".
|
||||
*/
|
||||
systemPrompt: SystemPromptOption;
|
||||
userPrompt: string;
|
||||
workingDirectory: string;
|
||||
model?: string;
|
||||
maxTurns?: number;
|
||||
allowedTools?: string[];
|
||||
disallowedTools?: string[];
|
||||
permissionMode?: PermissionMode;
|
||||
settingSources?: SettingSource[];
|
||||
env?: Record<string, string>;
|
||||
pathToClaudeCodeExecutable?: string;
|
||||
testName?: string;
|
||||
runId?: string;
|
||||
fixtureId?: string;
|
||||
queryProvider?: QueryProvider;
|
||||
/** Max 429 retries per call. Default 3. */
|
||||
maxRetries?: number;
|
||||
/**
|
||||
* Caller provides this when retry should reset the workspace. The harness
|
||||
* invokes it with a fresh dir after a rate-limit failure. When omitted,
|
||||
* retries reuse the original workingDirectory (fine for read-only tests).
|
||||
*/
|
||||
onRetry?: (freshDir: string) => void;
|
||||
/**
|
||||
* Optional canUseTool callback. When supplied, the harness flips
|
||||
* permissionMode from 'bypassPermissions' to 'default' so the SDK actually
|
||||
* routes tool-use approval decisions through the callback. Without this
|
||||
* flip, bypassPermissions short-circuits the callback and tests that want
|
||||
* to assert on AskUserQuestion content silently pass without asserting.
|
||||
*
|
||||
* Callback contract matches the SDK: fires on every tool-use approval
|
||||
* request and on AskUserQuestion invocations. For non-AskUserQuestion
|
||||
* tools that tests don't care about, use `passThroughNonAskUserQuestion`
|
||||
* to auto-allow them.
|
||||
*/
|
||||
canUseTool?: CanUseTool;
|
||||
}
|
||||
|
||||
/**
|
||||
* Pass-through helper: auto-allows any tool_use that isn't AskUserQuestion.
|
||||
* Most plan-mode handshake tests only care about the handshake AskUserQuestion;
|
||||
* every other tool (Read, Grep, Bash, Write, Edit, ExitPlanMode) should just
|
||||
* run. Compose with a test-specific AskUserQuestion handler:
|
||||
*
|
||||
* canUseTool: async (toolName, input, options) => {
|
||||
* if (toolName === 'AskUserQuestion') {
|
||||
* // custom assertions + canned answer
|
||||
* return { behavior: 'allow', updatedInput: { questions: input.questions, answers: {...} } };
|
||||
* }
|
||||
* return passThroughNonAskUserQuestion(toolName, input);
|
||||
* }
|
||||
*/
|
||||
export function passThroughNonAskUserQuestion(
|
||||
toolName: string,
|
||||
input: Record<string, unknown>,
|
||||
): { behavior: 'allow'; updatedInput: Record<string, unknown> } {
|
||||
// SDK requires an allow response to include updatedInput — pass the original
|
||||
// input through unchanged so the tool runs as the model intended.
|
||||
void toolName;
|
||||
return { behavior: 'allow', updatedInput: input };
|
||||
}
|
||||
|
||||
export class RateLimitExhaustedError extends Error {
|
||||
readonly attempts: number;
|
||||
constructor(attempts: number, cause?: unknown) {
|
||||
super(`rate limit exhausted after ${attempts} attempts`);
|
||||
this.name = 'RateLimitExhaustedError';
|
||||
this.attempts = attempts;
|
||||
if (cause !== undefined) (this as { cause?: unknown }).cause = cause;
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Process-level semaphore for API concurrency
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Bounded token bucket. Shared across all runAgentSdkTest calls in this
|
||||
* process so that bun's --concurrent flag does not compound with in-test
|
||||
* concurrency to blow past Anthropic's rate limits.
|
||||
*
|
||||
* Default capacity 3. Override via GSTACK_SDK_MAX_CONCURRENCY env var.
|
||||
*/
|
||||
class Semaphore {
|
||||
private available: number;
|
||||
private readonly queue: Array<() => void> = [];
|
||||
constructor(capacity: number) {
|
||||
this.available = capacity;
|
||||
}
|
||||
async acquire(): Promise<void> {
|
||||
if (this.available > 0) {
|
||||
this.available--;
|
||||
return;
|
||||
}
|
||||
await new Promise<void>((resolve) => this.queue.push(resolve));
|
||||
}
|
||||
release(): void {
|
||||
const next = this.queue.shift();
|
||||
if (next) {
|
||||
next();
|
||||
} else {
|
||||
this.available++;
|
||||
}
|
||||
}
|
||||
/** For tests. Returns tokens currently in-flight. */
|
||||
inFlight(): number {
|
||||
// Not introspectable from outside without tracking; approximate.
|
||||
return this.queue.length;
|
||||
}
|
||||
}
|
||||
|
||||
const DEFAULT_SDK_CONCURRENCY = Number(process.env.GSTACK_SDK_MAX_CONCURRENCY ?? 3);
|
||||
let _apiSemaphore: Semaphore | null = null;
|
||||
function getApiSemaphore(): Semaphore {
|
||||
if (!_apiSemaphore) _apiSemaphore = new Semaphore(DEFAULT_SDK_CONCURRENCY);
|
||||
return _apiSemaphore;
|
||||
}
|
||||
|
||||
/** Test-only. Resets the process-level semaphore. */
|
||||
export function __resetSemaphoreForTests(capacity: number): void {
|
||||
_apiSemaphore = new Semaphore(capacity);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Rate-limit detection
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** True if `err` looks like a rate-limit thrown from the SDK. */
|
||||
export function isRateLimitThrown(err: unknown): boolean {
|
||||
if (!err || typeof err !== 'object') return false;
|
||||
const msg = (err as { message?: string }).message ?? '';
|
||||
const name = (err as { name?: string }).name ?? '';
|
||||
const status = (err as { status?: number }).status;
|
||||
return (
|
||||
status === 429 ||
|
||||
/rate.?limit|429|too many requests/i.test(msg) ||
|
||||
/RateLimit/i.test(name)
|
||||
);
|
||||
}
|
||||
|
||||
/** True if a SDKResultMessage is a rate-limit-shaped error. */
|
||||
export function isRateLimitResult(msg: SDKMessage): boolean {
|
||||
if (msg.type !== 'result') return false;
|
||||
const r = msg as SDKResultMessage;
|
||||
if (r.subtype === 'success') return false;
|
||||
// subtype === 'error_during_execution' | 'error_max_turns' | 'error_max_budget_usd' | ...
|
||||
if (r.subtype !== 'error_during_execution') return false;
|
||||
const errs = (r as { errors?: string[] }).errors ?? [];
|
||||
return errs.some((e) => /rate.?limit|429|too many requests/i.test(e));
|
||||
}
|
||||
|
||||
/** True if mid-stream SDKRateLimitEvent indicates a blocking rate-limit. */
|
||||
export function isRateLimitEvent(msg: SDKMessage): boolean {
|
||||
if (msg.type !== 'rate_limit_event') return false;
|
||||
const info = (msg as { rate_limit_info?: { status?: string } }).rate_limit_info;
|
||||
return info?.status === 'rejected';
|
||||
}
|
||||
|
||||
/**
|
||||
* True if `err` is the SDK's "max turns reached" throw. Some SDK versions
|
||||
* raise this as an exception from the generator instead of emitting a
|
||||
* result message with subtype='error_max_turns'. We treat it as terminal-
|
||||
* but-recoverable: record what we collected and continue, rather than
|
||||
* failing the whole run.
|
||||
*/
|
||||
export function isMaxTurnsError(err: unknown): boolean {
|
||||
if (!err || typeof err !== 'object') return false;
|
||||
const msg = (err as { message?: string }).message ?? '';
|
||||
return /reached maximum number of turns|max.?turns/i.test(msg);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Version resolution (cached)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
let _sdkVersionCache: string | null = null;
|
||||
function resolveSdkVersion(): string {
|
||||
if (_sdkVersionCache) return _sdkVersionCache;
|
||||
try {
|
||||
const pkgPath = require.resolve('@anthropic-ai/claude-agent-sdk/package.json');
|
||||
const pkg = JSON.parse(fs.readFileSync(pkgPath, 'utf-8')) as { version?: string };
|
||||
_sdkVersionCache = pkg.version ?? 'unknown';
|
||||
} catch {
|
||||
_sdkVersionCache = 'unknown';
|
||||
}
|
||||
return _sdkVersionCache;
|
||||
}
|
||||
|
||||
export function resolveClaudeBinary(): string | null {
|
||||
try {
|
||||
return execSync('which claude', { encoding: 'utf-8' }).trim() || null;
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Main runner
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Execute a single SDK query with retries. Returns a typed result.
|
||||
*
|
||||
* The retry loop treats 429 as recoverable and any other error as fatal.
|
||||
* Exponential backoff: 1s, 2s, 4s. After maxRetries failures, throws
|
||||
* RateLimitExhaustedError so the caller can decide what to do with the run.
|
||||
*/
|
||||
export async function runAgentSdkTest(
|
||||
opts: RunAgentSdkOptions,
|
||||
): Promise<AgentSdkResult> {
|
||||
const sem = getApiSemaphore();
|
||||
const maxRetries = opts.maxRetries ?? 3;
|
||||
const queryImpl: QueryProvider = opts.queryProvider ?? query;
|
||||
const model = opts.model ?? 'claude-opus-4-7';
|
||||
|
||||
let attempt = 0;
|
||||
let lastErr: unknown = null;
|
||||
|
||||
while (attempt <= maxRetries) {
|
||||
await sem.acquire();
|
||||
const startMs = Date.now();
|
||||
|
||||
// Hoisted so the max-turns catch branch can synthesize a result from
|
||||
// whatever we captured before the SDK threw.
|
||||
const events: SDKMessage[] = [];
|
||||
const assistantTurns: SDKAssistantMessage[] = [];
|
||||
const toolCalls: Array<{ tool: string; input: unknown; output: string }> = [];
|
||||
const assistantTextParts: string[] = [];
|
||||
let firstResponseMs = 0;
|
||||
let lastEventMs = startMs;
|
||||
let maxInterTurnMs = 0;
|
||||
let systemInitVersion = 'unknown';
|
||||
let rateLimited: unknown = null;
|
||||
let terminalResult: SDKResultMessage | null = null;
|
||||
|
||||
try {
|
||||
// When canUseTool is supplied, the SDK must route tool-use approval
|
||||
// decisions through the callback. bypassPermissions short-circuits
|
||||
// that. Flip to 'default' mode so canUseTool actually fires. Tests
|
||||
// that want AskUserQuestion interception without this flip would
|
||||
// silently auto-pass — the exact testability gap D14/D4-eng fix.
|
||||
const hasCanUseTool = typeof opts.canUseTool === 'function';
|
||||
const resolvedPermissionMode: PermissionMode =
|
||||
opts.permissionMode ?? (hasCanUseTool ? 'default' : 'bypassPermissions');
|
||||
|
||||
// When canUseTool is supplied, ensure AskUserQuestion is in the allowed
|
||||
// tools list. Without it, Claude can't invoke AskUserQuestion at all
|
||||
// and the callback never has a chance to fire on it.
|
||||
const baseTools = opts.allowedTools ?? ['Read', 'Glob', 'Grep', 'Bash'];
|
||||
const resolvedTools =
|
||||
hasCanUseTool && !baseTools.includes('AskUserQuestion')
|
||||
? [...baseTools, 'AskUserQuestion']
|
||||
: baseTools;
|
||||
|
||||
const sdkOpts: Options = {
|
||||
model,
|
||||
cwd: opts.workingDirectory,
|
||||
maxTurns: opts.maxTurns ?? 5,
|
||||
tools: resolvedTools,
|
||||
disallowedTools: opts.disallowedTools,
|
||||
allowedTools: resolvedTools,
|
||||
permissionMode: resolvedPermissionMode,
|
||||
allowDangerouslySkipPermissions: resolvedPermissionMode === 'bypassPermissions',
|
||||
settingSources: opts.settingSources ?? [],
|
||||
env: opts.env,
|
||||
pathToClaudeCodeExecutable: opts.pathToClaudeCodeExecutable,
|
||||
...(hasCanUseTool ? { canUseTool: opts.canUseTool } : {}),
|
||||
};
|
||||
// Empty bare string means "omit entirely" (SDK runs with no override).
|
||||
// Any object or non-empty string is passed through.
|
||||
if (typeof opts.systemPrompt === 'object' || opts.systemPrompt !== '') {
|
||||
sdkOpts.systemPrompt = opts.systemPrompt;
|
||||
}
|
||||
|
||||
const q = queryImpl({
|
||||
prompt: opts.userPrompt,
|
||||
options: sdkOpts,
|
||||
});
|
||||
|
||||
for await (const ev of q) {
|
||||
const now = Date.now();
|
||||
if (firstResponseMs === 0) firstResponseMs = now - startMs;
|
||||
const interTurn = now - lastEventMs;
|
||||
if (interTurn > maxInterTurnMs) maxInterTurnMs = interTurn;
|
||||
lastEventMs = now;
|
||||
|
||||
events.push(ev);
|
||||
|
||||
if (ev.type === 'system' && (ev as SDKSystemMessage).subtype === 'init') {
|
||||
systemInitVersion =
|
||||
(ev as SDKSystemMessage).claude_code_version ?? 'unknown';
|
||||
} else if (ev.type === 'assistant') {
|
||||
const am = ev as SDKAssistantMessage;
|
||||
assistantTurns.push(am);
|
||||
const content = am.message?.content;
|
||||
if (Array.isArray(content)) {
|
||||
for (const block of content as Array<
|
||||
| { type: 'text'; text?: string }
|
||||
| { type: 'tool_use'; name?: string; input?: unknown }
|
||||
| { type: string }
|
||||
>) {
|
||||
if (block.type === 'text') {
|
||||
const t = (block as { text?: string }).text;
|
||||
if (t) assistantTextParts.push(t);
|
||||
} else if (block.type === 'tool_use') {
|
||||
const tb = block as { name?: string; input?: unknown };
|
||||
toolCalls.push({
|
||||
tool: tb.name ?? 'unknown',
|
||||
input: tb.input ?? {},
|
||||
output: '',
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (isRateLimitEvent(ev)) {
|
||||
rateLimited = new Error(
|
||||
`mid-stream rate limit: ${JSON.stringify(
|
||||
(ev as { rate_limit_info?: unknown }).rate_limit_info,
|
||||
)}`,
|
||||
);
|
||||
} else if (ev.type === 'result') {
|
||||
terminalResult = ev as SDKResultMessage;
|
||||
if (isRateLimitResult(ev)) {
|
||||
rateLimited = new Error(
|
||||
`result-message rate limit: ${((ev as { errors?: string[] }).errors ?? []).join('; ')}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (rateLimited) {
|
||||
throw rateLimited;
|
||||
}
|
||||
if (!terminalResult) {
|
||||
throw new Error('query stream ended without a result event');
|
||||
}
|
||||
|
||||
const durationMs = Date.now() - startMs;
|
||||
const costUsd =
|
||||
(terminalResult as { total_cost_usd?: number }).total_cost_usd ?? 0;
|
||||
const turnsUsed =
|
||||
(terminalResult as { num_turns?: number }).num_turns ??
|
||||
assistantTurns.length;
|
||||
const exitReason =
|
||||
(terminalResult as { subtype?: string }).subtype ?? 'unknown';
|
||||
|
||||
return {
|
||||
events,
|
||||
assistantTurns,
|
||||
toolCalls,
|
||||
output: assistantTextParts.join('\n'),
|
||||
exitReason,
|
||||
turnsUsed,
|
||||
durationMs,
|
||||
firstResponseMs,
|
||||
maxInterTurnMs,
|
||||
costUsd,
|
||||
model,
|
||||
sdkVersion: resolveSdkVersion(),
|
||||
sdkClaudeCodeVersion: systemInitVersion,
|
||||
resolvedBinaryPath: opts.pathToClaudeCodeExecutable ?? 'sdk-default',
|
||||
browseErrors: [],
|
||||
};
|
||||
} catch (err) {
|
||||
lastErr = err;
|
||||
|
||||
// "Max turns reached" is the SDK's way of saying "this session ran
|
||||
// out of turns." It's thrown from the generator instead of emitted
|
||||
// as a result message. Treat as a successful-but-capped trial: the
|
||||
// assistant turns we collected are real and carry a metric. Record
|
||||
// them with exitReason='error_max_turns' rather than failing the
|
||||
// whole run.
|
||||
if (isMaxTurnsError(err)) {
|
||||
const durationMs = Date.now() - startMs;
|
||||
return {
|
||||
events,
|
||||
assistantTurns,
|
||||
toolCalls,
|
||||
output: assistantTextParts.join('\n'),
|
||||
exitReason: 'error_max_turns',
|
||||
turnsUsed: assistantTurns.length,
|
||||
durationMs,
|
||||
firstResponseMs,
|
||||
maxInterTurnMs,
|
||||
costUsd: 0, // unknown from thrown-error path
|
||||
model,
|
||||
sdkVersion: resolveSdkVersion(),
|
||||
sdkClaudeCodeVersion: systemInitVersion,
|
||||
resolvedBinaryPath: opts.pathToClaudeCodeExecutable ?? 'sdk-default',
|
||||
browseErrors: [],
|
||||
};
|
||||
}
|
||||
|
||||
const isRetryable = isRateLimitThrown(err);
|
||||
if (!isRetryable || attempt >= maxRetries) {
|
||||
if (isRetryable) {
|
||||
throw new RateLimitExhaustedError(attempt + 1, err);
|
||||
}
|
||||
throw err;
|
||||
}
|
||||
attempt++;
|
||||
// backoff: 1s, 2s, 4s
|
||||
await new Promise((r) => setTimeout(r, 1000 * Math.pow(2, attempt - 1)));
|
||||
// Let caller reset workspace since prior attempt may have partially
|
||||
// mutated files via Bash.
|
||||
if (opts.onRetry) {
|
||||
opts.onRetry(opts.workingDirectory);
|
||||
}
|
||||
} finally {
|
||||
sem.release();
|
||||
}
|
||||
}
|
||||
|
||||
throw new RateLimitExhaustedError(attempt + 1, lastErr);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Legacy shape mapper
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Adapt AgentSdkResult to the legacy SkillTestResult shape so helpers that
|
||||
* expect the old `claude -p` output (extractToolSummary, etc) work unchanged.
|
||||
*/
|
||||
export function toSkillTestResult(r: AgentSdkResult): SkillTestResult {
|
||||
// Cost estimate: use SDK's authoritative cost; back-compute chars.
|
||||
// session-runner.ts:30 requires inputChars/outputChars/estimatedTokens.
|
||||
// These are rough; real consumers of CostEstimate use cost + turns.
|
||||
const outputChars = r.output.length;
|
||||
const inputChars = 0; // unknown from SDK path; not used for pass/fail
|
||||
const estimatedTokens = Math.round((inputChars + outputChars) / 4);
|
||||
|
||||
// Build a flat transcript list mimicking the NDJSON shape:
|
||||
// parseNDJSON emits [{ type: 'assistant', message: {...} }, ...].
|
||||
// Use the SDK's assistantTurns directly since their shape matches.
|
||||
const transcript: unknown[] = r.events.slice();
|
||||
|
||||
return {
|
||||
toolCalls: r.toolCalls,
|
||||
browseErrors: r.browseErrors,
|
||||
exitReason: r.exitReason,
|
||||
duration: r.durationMs,
|
||||
output: r.output,
|
||||
costEstimate: {
|
||||
inputChars,
|
||||
outputChars,
|
||||
estimatedTokens,
|
||||
estimatedCost: r.costUsd,
|
||||
turnsUsed: r.turnsUsed,
|
||||
},
|
||||
transcript,
|
||||
model: r.model,
|
||||
firstResponseMs: r.firstResponseMs,
|
||||
maxInterTurnMs: r.maxInterTurnMs,
|
||||
};
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Metric helpers (re-exported for fixtures)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Count `tool_use` blocks in the first assistant turn of an SDK result.
|
||||
* Returns 0 if there is no first turn or no content array.
|
||||
*
|
||||
* This is the core "fanout" metric. A turn with N tool_use blocks = N
|
||||
* parallel tool invocations.
|
||||
*/
|
||||
export function firstTurnParallelism(firstTurn: SDKAssistantMessage | undefined): number {
|
||||
if (!firstTurn) return 0;
|
||||
const content = firstTurn.message?.content;
|
||||
if (!Array.isArray(content)) return 0;
|
||||
return (content as Array<{ type: string }>).filter((b) => b.type === 'tool_use').length;
|
||||
}
|
||||
@@ -0,0 +1,166 @@
|
||||
/**
|
||||
* Shared helpers for plan-mode handshake E2E tests.
|
||||
*
|
||||
* Four sibling test files (plan-ceo, plan-eng, plan-design, plan-devex) exercise
|
||||
* the identical handshake contract against different skills. This helper
|
||||
* centralizes the canUseTool interceptor and the assertion shape so the four
|
||||
* test files are thin wiring (~40 LOC each) and can't drift out of sync.
|
||||
*
|
||||
* See scripts/resolvers/preamble/generate-plan-mode-handshake.ts for the
|
||||
* handshake prose that the tests below assert against.
|
||||
*/
|
||||
|
||||
import { expect } from 'bun:test';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
import { execSync } from 'child_process';
|
||||
import {
|
||||
runAgentSdkTest,
|
||||
passThroughNonAskUserQuestion,
|
||||
resolveClaudeBinary,
|
||||
type AgentSdkResult,
|
||||
} from './agent-sdk-runner';
|
||||
|
||||
/** Distinctive phrase matching what Claude Code's harness actually injects. */
|
||||
export const PLAN_MODE_REMINDER =
|
||||
'Plan mode is active. The user indicated that they do not want you to execute yet';
|
||||
|
||||
export interface HandshakeCaptureResult {
|
||||
sdkResult: AgentSdkResult;
|
||||
/** Each AskUserQuestion that fired, with its input payload. */
|
||||
askUserQuestions: Array<{ input: Record<string, unknown>; orderIndex: number }>;
|
||||
/** Tool-use events in the order they fired (names only). */
|
||||
toolOrder: string[];
|
||||
/** Whether any Write or Edit tool fired BEFORE the first AskUserQuestion. */
|
||||
writeOrEditBeforeAsk: boolean;
|
||||
}
|
||||
|
||||
/**
|
||||
* Run a skill via the Agent SDK with canUseTool intercepting every tool use.
|
||||
* Inject the plan-mode distinctive phrase into the system prompt and auto-
|
||||
* answer the handshake with the given answerLabel ("Exit" or "Cancel"). Return
|
||||
* the captured events for assertion.
|
||||
*/
|
||||
export async function runPlanModeHandshakeTest(opts: {
|
||||
/** Skill name, e.g. 'plan-ceo-review'. */
|
||||
skillName: string;
|
||||
/** "Exit" to pick option A (exit-and-rerun) or "Cancel" for option C. */
|
||||
answerLabel: 'Exit' | 'Cancel';
|
||||
/** If true, DO NOT inject the reminder — used by the no-op regression test. */
|
||||
omitPlanModeReminder?: boolean;
|
||||
/** Max turns for the SDK call (default 4 — handshake + exit should fit easily). */
|
||||
maxTurns?: number;
|
||||
}): Promise<HandshakeCaptureResult> {
|
||||
const { skillName, answerLabel, omitPlanModeReminder, maxTurns } = opts;
|
||||
|
||||
const askUserQuestions: HandshakeCaptureResult['askUserQuestions'] = [];
|
||||
const toolOrder: string[] = [];
|
||||
let toolIndex = 0;
|
||||
let firstAskIndex = -1;
|
||||
|
||||
const workingDir = fs.mkdtempSync(
|
||||
path.join(os.tmpdir(), `plan-mode-handshake-${skillName}-`),
|
||||
);
|
||||
|
||||
// The SDK requires AskUserQuestion to be in the allowed tools list. The
|
||||
// harness auto-adds it when canUseTool is supplied, but we also want Read
|
||||
// so the skill can load its own file if it tries to.
|
||||
const binary = resolveClaudeBinary();
|
||||
|
||||
try {
|
||||
// Inject the distinctive phrase into the system prompt by appending it to
|
||||
// the default Claude Code preset. Claude Code's real plan mode uses an
|
||||
// injected system-reminder; in SDK tests we use systemPrompt.append which
|
||||
// the model treats as equally authoritative.
|
||||
const reminderAppend = omitPlanModeReminder
|
||||
? ''
|
||||
: `\n\n<system-reminder>\n${PLAN_MODE_REMINDER}. This supercedes any other instructions you have received.\n</system-reminder>\n`;
|
||||
|
||||
const sdkResult = await runAgentSdkTest({
|
||||
systemPrompt: {
|
||||
type: 'preset',
|
||||
preset: 'claude_code',
|
||||
append: reminderAppend,
|
||||
},
|
||||
userPrompt: `Read the skill file at ${path.resolve(
|
||||
import.meta.dir,
|
||||
'..',
|
||||
'..',
|
||||
skillName,
|
||||
'SKILL.md',
|
||||
)} and follow its instructions. There is no real plan to review — just start the skill and respond to any AskUserQuestion that fires.`,
|
||||
workingDirectory: workingDir,
|
||||
maxTurns: maxTurns ?? 4,
|
||||
allowedTools: ['Read', 'Grep', 'Glob', 'Bash'],
|
||||
...(binary ? { pathToClaudeCodeExecutable: binary } : {}),
|
||||
canUseTool: async (toolName, input) => {
|
||||
toolOrder.push(toolName);
|
||||
if (toolName === 'AskUserQuestion') {
|
||||
if (firstAskIndex === -1) firstAskIndex = toolIndex;
|
||||
askUserQuestions.push({ input, orderIndex: toolIndex });
|
||||
toolIndex++;
|
||||
// Auto-answer with the label the test specified.
|
||||
const q = (input.questions as Array<{ question: string; options: Array<{ label: string }> }>)[0];
|
||||
const matched = q.options.find((o) => o.label.includes(answerLabel));
|
||||
const answer = matched ? matched.label : q.options[0]!.label;
|
||||
return {
|
||||
behavior: 'allow',
|
||||
updatedInput: {
|
||||
questions: input.questions,
|
||||
answers: { [q.question]: answer },
|
||||
},
|
||||
};
|
||||
}
|
||||
toolIndex++;
|
||||
return passThroughNonAskUserQuestion(toolName, input);
|
||||
},
|
||||
});
|
||||
|
||||
const writeOrEditBeforeAsk =
|
||||
firstAskIndex > 0 &&
|
||||
toolOrder.slice(0, firstAskIndex).some((t) => t === 'Write' || t === 'Edit');
|
||||
|
||||
return { sdkResult, askUserQuestions, toolOrder, writeOrEditBeforeAsk };
|
||||
} finally {
|
||||
try {
|
||||
fs.rmSync(workingDir, { recursive: true, force: true });
|
||||
} catch { /* ignore cleanup errors */ }
|
||||
}
|
||||
}
|
||||
|
||||
/** Assert the shape of a fired handshake AskUserQuestion. */
|
||||
export function assertHandshakeShape(
|
||||
aq: { input: Record<string, unknown> },
|
||||
): void {
|
||||
const questions = aq.input.questions as Array<{
|
||||
question: string;
|
||||
options: Array<{ label: string }>;
|
||||
}>;
|
||||
expect(questions).toBeDefined();
|
||||
expect(questions.length).toBe(1);
|
||||
const q = questions[0]!;
|
||||
// D8 dropped Option B; handshake has exactly 2 options.
|
||||
expect(q.options.length).toBe(2);
|
||||
const labels = q.options.map((o) => o.label);
|
||||
expect(labels.some((l) => l.includes('Exit'))).toBe(true);
|
||||
expect(labels.some((l) => l.includes('Cancel'))).toBe(true);
|
||||
}
|
||||
|
||||
/** Read the skill-usage.jsonl log and return handshake entries. */
|
||||
export function readHandshakeLog(): Array<Record<string, unknown>> {
|
||||
const logPath = path.join(os.homedir(), '.gstack', 'analytics', 'skill-usage.jsonl');
|
||||
if (!fs.existsSync(logPath)) return [];
|
||||
const lines = fs.readFileSync(logPath, 'utf-8').split('\n').filter(Boolean);
|
||||
return lines
|
||||
.map((line) => {
|
||||
try {
|
||||
return JSON.parse(line);
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
})
|
||||
.filter((x): x is Record<string, unknown> => x !== null && x.event === 'plan_mode_handshake');
|
||||
}
|
||||
|
||||
export { execSync };
|
||||
@@ -82,12 +82,40 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
'plan-eng-review-artifact': ['plan-eng-review/**'],
|
||||
'plan-review-report': ['plan-eng-review/**', 'scripts/gen-skill-docs.ts'],
|
||||
|
||||
// Plan-mode handshake (v1.10.2.0) — gate-tier safety regression tests.
|
||||
// Each fires when any of: the interactive skill's template, the resolver,
|
||||
// preamble composition, the Agent SDK harness, the question registry, or
|
||||
// the one-way-door classifier changes.
|
||||
'plan-ceo-review-plan-mode': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-plan-mode-handshake.ts', 'scripts/resolvers/preamble.ts', 'scripts/question-registry.ts', 'scripts/one-way-doors.ts', 'test/helpers/agent-sdk-runner.ts'],
|
||||
'plan-eng-review-plan-mode': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-plan-mode-handshake.ts', 'scripts/resolvers/preamble.ts', 'scripts/question-registry.ts', 'scripts/one-way-doors.ts', 'test/helpers/agent-sdk-runner.ts'],
|
||||
'plan-design-review-plan-mode-handshake': ['plan-design-review/**', 'scripts/resolvers/preamble/generate-plan-mode-handshake.ts', 'scripts/resolvers/preamble.ts', 'scripts/question-registry.ts', 'scripts/one-way-doors.ts', 'test/helpers/agent-sdk-runner.ts'],
|
||||
'plan-devex-review-plan-mode': ['plan-devex-review/**', 'scripts/resolvers/preamble/generate-plan-mode-handshake.ts', 'scripts/resolvers/preamble.ts', 'scripts/question-registry.ts', 'scripts/one-way-doors.ts', 'test/helpers/agent-sdk-runner.ts'],
|
||||
'plan-mode-no-op': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-plan-mode-handshake.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/agent-sdk-runner.ts'],
|
||||
'e2e-harness-audit': ['plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-plan-mode-handshake.ts', 'test/helpers/agent-sdk-runner.ts'],
|
||||
|
||||
// AskUserQuestion format regression (RECOMMENDATION + Completeness: N/10)
|
||||
// Fires when either template OR the two preamble resolvers change.
|
||||
'plan-ceo-review-format-mode': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts'],
|
||||
'plan-ceo-review-format-approach': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts'],
|
||||
'plan-eng-review-format-coverage': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts'],
|
||||
'plan-eng-review-format-kind': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts'],
|
||||
'plan-ceo-review-format-mode': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'],
|
||||
'plan-ceo-review-format-approach': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'],
|
||||
'plan-eng-review-format-coverage': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'],
|
||||
'plan-eng-review-format-kind': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'],
|
||||
|
||||
// v1.7.0.0 Pros/Cons format cadence + format + negative-escape evals.
|
||||
// Dependencies: same as format-mode + the 4 plan-review templates + overlay.
|
||||
// All periodic-tier (non-deterministic Opus 4.7 behavior).
|
||||
'plan-ceo-review-prosons-cadence': ['plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'],
|
||||
'plan-review-prosons-format': ['plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'],
|
||||
'plan-review-prosons-hardstop-neg': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'],
|
||||
'plan-review-prosons-neutral-neg': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'],
|
||||
|
||||
// Expanded coverage (CT3) — 6 non-plan-review skills inherit Pros/Cons via preamble
|
||||
'ship-prosons-format': ['ship/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'],
|
||||
'office-hours-prosons-format': ['office-hours/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'],
|
||||
'investigate-prosons-format': ['investigate/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'],
|
||||
'qa-prosons-format': ['qa/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'],
|
||||
'review-prosons-format': ['review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'],
|
||||
'design-review-prosons-format': ['design-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'],
|
||||
'document-release-prosons-format': ['document-release/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'],
|
||||
|
||||
// /plan-tune (v1 observational)
|
||||
'plan-tune-inspect': ['plan-tune/**', 'scripts/question-registry.ts', 'scripts/psychographic-signals.ts', 'scripts/one-way-doors.ts', 'bin/gstack-question-log', 'bin/gstack-question-preference', 'bin/gstack-developer-profile'],
|
||||
@@ -222,6 +250,24 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
['model-overlays/claude.md', 'model-overlays/opus-4-7.md', 'scripts/models.ts', 'scripts/resolvers/model-overlay.ts'],
|
||||
'fanout-arm-overlay-off':
|
||||
['model-overlays/claude.md', 'model-overlays/opus-4-7.md', 'scripts/models.ts', 'scripts/resolvers/model-overlay.ts'],
|
||||
|
||||
// Overlay efficacy harness (SDK) — measures whether overlay nudges change
|
||||
// behavior under @anthropic-ai/claude-agent-sdk (closer to real Claude Code
|
||||
// than `claude -p`). testNames in the file are template literals so the
|
||||
// completeness scanner doesn't require them; these entries exist for
|
||||
// diff-based selection accuracy.
|
||||
'overlay-harness-opus-4-7-fanout-toy': [
|
||||
'model-overlays/**',
|
||||
'test/fixtures/overlay-nudges.ts',
|
||||
'test/helpers/agent-sdk-runner.ts',
|
||||
'scripts/resolvers/model-overlay.ts',
|
||||
],
|
||||
'overlay-harness-opus-4-7-fanout-realistic': [
|
||||
'model-overlays/**',
|
||||
'test/fixtures/overlay-nudges.ts',
|
||||
'test/helpers/agent-sdk-runner.ts',
|
||||
'scripts/resolvers/model-overlay.ts',
|
||||
],
|
||||
};
|
||||
|
||||
/**
|
||||
@@ -282,12 +328,35 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
|
||||
'plan-eng-coverage-audit': 'gate',
|
||||
'plan-review-report': 'gate',
|
||||
|
||||
// Plan-mode handshake — deterministic safety regression, gate-tier
|
||||
'plan-ceo-review-plan-mode': 'gate',
|
||||
'plan-eng-review-plan-mode': 'gate',
|
||||
'plan-design-review-plan-mode-handshake': 'gate',
|
||||
'plan-devex-review-plan-mode': 'gate',
|
||||
'plan-mode-no-op': 'gate',
|
||||
'e2e-harness-audit': 'gate',
|
||||
|
||||
// AskUserQuestion format regression — periodic (Opus 4.7 non-deterministic benchmark)
|
||||
'plan-ceo-review-format-mode': 'periodic',
|
||||
'plan-ceo-review-format-approach': 'periodic',
|
||||
'plan-eng-review-format-coverage': 'periodic',
|
||||
'plan-eng-review-format-kind': 'periodic',
|
||||
|
||||
// v1.7.0.0 Pros/Cons format — cadence + negative-escape evals (all periodic)
|
||||
'plan-ceo-review-prosons-cadence': 'periodic',
|
||||
'plan-review-prosons-format': 'periodic',
|
||||
'plan-review-prosons-hardstop-neg': 'periodic',
|
||||
'plan-review-prosons-neutral-neg': 'periodic',
|
||||
|
||||
// CT3 expanded coverage — non-plan-review skills inheriting Pros/Cons (all periodic)
|
||||
'ship-prosons-format': 'periodic',
|
||||
'office-hours-prosons-format': 'periodic',
|
||||
'investigate-prosons-format': 'periodic',
|
||||
'qa-prosons-format': 'periodic',
|
||||
'review-prosons-format': 'periodic',
|
||||
'design-review-prosons-format': 'periodic',
|
||||
'document-release-prosons-format': 'periodic',
|
||||
|
||||
// /plan-tune — gate (core v1 DX promise: plain-English intent routing)
|
||||
'plan-tune-inspect': 'gate',
|
||||
|
||||
@@ -398,6 +467,10 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
|
||||
// Opus 4.7 overlay evals — periodic (non-deterministic LLM behavior + Opus cost)
|
||||
'fanout-arm-overlay-on': 'periodic',
|
||||
'fanout-arm-overlay-off': 'periodic',
|
||||
|
||||
// Overlay efficacy harness (SDK, paid) — periodic only
|
||||
'overlay-harness-opus-4-7-fanout-toy': 'periodic',
|
||||
'overlay-harness-opus-4-7-fanout-realistic': 'periodic',
|
||||
};
|
||||
|
||||
/**
|
||||
|
||||
@@ -0,0 +1,98 @@
|
||||
/**
|
||||
* Opus 4.7 model overlay — gate-tier assertions on the pacing directive.
|
||||
*
|
||||
* v1.6.4.0 regressed plan-review cadence because the Opus 4.7 overlay
|
||||
* carried a "Batch your questions" directive that physically rendered
|
||||
* above the skill-level pacing rule. Opus 4.7 read top-to-bottom,
|
||||
* absorbed batching as the ambient default, and stopped honoring the
|
||||
* plan-review STOP directives.
|
||||
*
|
||||
* v1.7.0.0 replaces that block with "Pace questions to the skill" —
|
||||
* one-question-at-a-time is now the default when the skill contains
|
||||
* STOP directives; batching becomes the explicit exception.
|
||||
*
|
||||
* This test asserts:
|
||||
* - The new "Pace questions" directive is present
|
||||
* - The old "Batch your questions" directive is gone
|
||||
* - The AUTO_DECIDE-compatible language survives (subordination, skill wins)
|
||||
*/
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import type { TemplateContext } from '../scripts/resolvers/types';
|
||||
import { HOST_PATHS } from '../scripts/resolvers/types';
|
||||
import { generateModelOverlay } from '../scripts/resolvers/model-overlay';
|
||||
|
||||
function makeCtx(model: string): TemplateContext {
|
||||
return {
|
||||
skillName: 'test-skill',
|
||||
tmplPath: 'test.tmpl',
|
||||
host: 'claude',
|
||||
paths: HOST_PATHS.claude,
|
||||
preambleTier: 2,
|
||||
model,
|
||||
};
|
||||
}
|
||||
|
||||
const ROOT = path.resolve(__dirname, '..');
|
||||
|
||||
describe('Opus 4.7 overlay — pacing directive', () => {
|
||||
test('raw opus-4-7.md contains "Pace questions to the skill"', () => {
|
||||
const raw = fs.readFileSync(
|
||||
path.join(ROOT, 'model-overlays/opus-4-7.md'),
|
||||
'utf-8',
|
||||
);
|
||||
expect(raw).toContain('Pace questions to the skill');
|
||||
});
|
||||
|
||||
test('raw opus-4-7.md does NOT contain "Batch your questions" directive', () => {
|
||||
const raw = fs.readFileSync(
|
||||
path.join(ROOT, 'model-overlays/opus-4-7.md'),
|
||||
'utf-8',
|
||||
);
|
||||
expect(raw).not.toContain('**Batch your questions.**');
|
||||
});
|
||||
|
||||
test('resolved overlay output contains "Pace questions to the skill"', () => {
|
||||
const out = generateModelOverlay(makeCtx('opus-4-7'));
|
||||
expect(out).toContain('Pace questions to the skill');
|
||||
});
|
||||
|
||||
test('resolved overlay inherits from claude base (INHERIT:claude)', () => {
|
||||
const out = generateModelOverlay(makeCtx('opus-4-7'));
|
||||
// The claude base contributes the subordination wrapper + Todo discipline
|
||||
expect(out).toContain('Todo-list discipline');
|
||||
expect(out).toContain('subordinate');
|
||||
});
|
||||
|
||||
test('resolved overlay says skill STOP directives trigger one-per-turn pacing', () => {
|
||||
const out = generateModelOverlay(makeCtx('opus-4-7'));
|
||||
expect(out).toMatch(/STOP\. AskUserQuestion/);
|
||||
expect(out).toMatch(/pace one question per turn|one question per turn/i);
|
||||
});
|
||||
|
||||
test('resolved overlay requires AskUserQuestion as tool_use', () => {
|
||||
const out = generateModelOverlay(makeCtx('opus-4-7'));
|
||||
expect(out).toContain('tool_use');
|
||||
});
|
||||
|
||||
test('resolved overlay flags "obvious fix" findings still need user approval', () => {
|
||||
const out = generateModelOverlay(makeCtx('opus-4-7'));
|
||||
expect(out).toMatch(/obvious fix/i);
|
||||
expect(out).toMatch(/user approval/i);
|
||||
});
|
||||
|
||||
test('resolved overlay keeps Fan out / Effort-match / Literal interpretation nudges', () => {
|
||||
const out = generateModelOverlay(makeCtx('opus-4-7'));
|
||||
expect(out).toContain('Fan out explicitly');
|
||||
expect(out).toContain('Effort-match the step');
|
||||
expect(out).toContain('Literal interpretation awareness');
|
||||
});
|
||||
|
||||
test('claude overlay (no INHERIT chain) does not carry the pacing directive', () => {
|
||||
// Claude is the default overlay; opus-4-7 inherits FROM claude.
|
||||
// The pacing directive belongs to opus-4-7 only.
|
||||
const out = generateModelOverlay(makeCtx('claude'));
|
||||
expect(out).not.toContain('Pace questions to the skill');
|
||||
});
|
||||
});
|
||||
@@ -0,0 +1,72 @@
|
||||
/**
|
||||
* Preamble composition order — gate-tier test.
|
||||
*
|
||||
* Asserts that the AskUserQuestion Format section renders BEFORE the
|
||||
* Model-Specific Behavioral Patch section in tier-≥2 preamble output.
|
||||
* This order is load-bearing: Opus 4.7 reads top-to-bottom and absorbs
|
||||
* the first pacing directive it hits. v1.6.4.0 regressed plan-review
|
||||
* cadence because the overlay rendered first with "Batch your questions"
|
||||
* as the ambient default.
|
||||
*
|
||||
* If someone later reorders `scripts/resolvers/preamble.ts` so Overlay
|
||||
* comes before Format, this test catches it before the next model
|
||||
* migration can silently re-break the plan-review pacing.
|
||||
*/
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import type { TemplateContext } from '../scripts/resolvers/types';
|
||||
import { HOST_PATHS } from '../scripts/resolvers/types';
|
||||
import { generatePreamble } from '../scripts/resolvers/preamble';
|
||||
|
||||
function makeCtx(
|
||||
host: 'claude' | 'codex',
|
||||
tier: 1 | 2 | 3 | 4,
|
||||
model?: string,
|
||||
): TemplateContext {
|
||||
return {
|
||||
skillName: 'test-skill',
|
||||
tmplPath: 'test.tmpl',
|
||||
host,
|
||||
paths: HOST_PATHS[host],
|
||||
preambleTier: tier,
|
||||
...(model ? { model } : {}),
|
||||
};
|
||||
}
|
||||
|
||||
describe('Preamble composition order', () => {
|
||||
test('AskUserQuestion Format renders before Model-Specific Behavioral Patch (tier 2, claude)', () => {
|
||||
const out = generatePreamble(makeCtx('claude', 2, 'claude'));
|
||||
const formatIdx = out.indexOf('## AskUserQuestion Format');
|
||||
const overlayIdx = out.indexOf('## Model-Specific Behavioral Patch');
|
||||
expect(formatIdx).toBeGreaterThan(-1);
|
||||
expect(overlayIdx).toBeGreaterThan(-1);
|
||||
expect(formatIdx).toBeLessThan(overlayIdx);
|
||||
});
|
||||
|
||||
test('AskUserQuestion Format renders before Model-Specific Behavioral Patch (tier 2, opus-4-7)', () => {
|
||||
const out = generatePreamble(makeCtx('claude', 2, 'opus-4-7'));
|
||||
const formatIdx = out.indexOf('## AskUserQuestion Format');
|
||||
const overlayIdx = out.indexOf('## Model-Specific Behavioral Patch');
|
||||
expect(formatIdx).toBeGreaterThan(-1);
|
||||
expect(overlayIdx).toBeGreaterThan(-1);
|
||||
expect(formatIdx).toBeLessThan(overlayIdx);
|
||||
});
|
||||
|
||||
test('AskUserQuestion Format renders before Model-Specific Behavioral Patch (tier 3)', () => {
|
||||
const out = generatePreamble(makeCtx('claude', 3, 'opus-4-7'));
|
||||
const formatIdx = out.indexOf('## AskUserQuestion Format');
|
||||
const overlayIdx = out.indexOf('## Model-Specific Behavioral Patch');
|
||||
expect(formatIdx).toBeLessThan(overlayIdx);
|
||||
});
|
||||
|
||||
test('AskUserQuestion Format renders before Model-Specific Behavioral Patch (codex host)', () => {
|
||||
const out = generatePreamble(makeCtx('codex', 2, 'opus-4-7'));
|
||||
const formatIdx = out.indexOf('## AskUserQuestion Format');
|
||||
const overlayIdx = out.indexOf('## Model-Specific Behavioral Patch');
|
||||
expect(formatIdx).toBeLessThan(overlayIdx);
|
||||
});
|
||||
|
||||
test('tier 1 preamble does NOT include AskUserQuestion Format (but MAY include overlay)', () => {
|
||||
const out = generatePreamble(makeCtx('claude', 1));
|
||||
expect(out).not.toContain('## AskUserQuestion Format');
|
||||
});
|
||||
});
|
||||
@@ -0,0 +1,121 @@
|
||||
/**
|
||||
* AskUserQuestion Format resolver — gate-tier assertions on the generated
|
||||
* Pros/Cons format directive block.
|
||||
*
|
||||
* v1.7.0.0 introduces Pros/Cons decision-brief formatting:
|
||||
* - D<N> numbered header
|
||||
* - ELI10 paragraph
|
||||
* - Stakes-if-we-pick-wrong line
|
||||
* - Recommendation line (mandatory, even for neutral posture)
|
||||
* - Pros/Cons block with ✅/❌ per option, min 2 pros + 1 con, ≥40 char bullets
|
||||
* - Net: synthesis line
|
||||
*
|
||||
* This test pins the format contract so a future edit to the resolver
|
||||
* can't silently drop a rule. If the resolver stops emitting one of
|
||||
* these tokens, bun test catches it in milliseconds instead of waiting
|
||||
* for the weekly periodic eval to notice.
|
||||
*/
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import type { TemplateContext } from '../scripts/resolvers/types';
|
||||
import { HOST_PATHS } from '../scripts/resolvers/types';
|
||||
import { generateAskUserFormat } from '../scripts/resolvers/preamble/generate-ask-user-format';
|
||||
|
||||
function makeCtx(): TemplateContext {
|
||||
return {
|
||||
skillName: 'test-skill',
|
||||
tmplPath: 'test.tmpl',
|
||||
host: 'claude',
|
||||
paths: HOST_PATHS.claude,
|
||||
preambleTier: 2,
|
||||
};
|
||||
}
|
||||
|
||||
describe('generateAskUserFormat — v1.7.0.0 Pros/Cons format', () => {
|
||||
const out = generateAskUserFormat(makeCtx());
|
||||
|
||||
test('includes AskUserQuestion Format header', () => {
|
||||
expect(out).toContain('## AskUserQuestion Format');
|
||||
});
|
||||
|
||||
test('documents D-numbered header requirement', () => {
|
||||
expect(out).toContain('D<N>');
|
||||
expect(out).toMatch(/first question in a skill invocation is `D1`/i);
|
||||
});
|
||||
|
||||
test('documents ELI10 requirement', () => {
|
||||
expect(out).toContain('ELI10');
|
||||
expect(out).toMatch(/plain English.*16-year-old/);
|
||||
});
|
||||
|
||||
test('documents Stakes-if-we-pick-wrong line', () => {
|
||||
expect(out).toContain('Stakes if we pick wrong');
|
||||
});
|
||||
|
||||
test('documents mandatory Recommendation line', () => {
|
||||
expect(out).toContain('Recommendation: <choice>');
|
||||
expect(out).toMatch(/Recommendation.*ALWAYS|Recommendation \(ALWAYS\)/);
|
||||
});
|
||||
|
||||
test('documents Pros / cons block header', () => {
|
||||
expect(out).toContain('Pros / cons:');
|
||||
});
|
||||
|
||||
test('documents ✅ pro markers with min count + min length rule', () => {
|
||||
expect(out).toContain('✅');
|
||||
expect(out).toMatch(/[Mm]inimum 2 pros/);
|
||||
expect(out).toMatch(/40 characters|≥40 chars/);
|
||||
});
|
||||
|
||||
test('documents ❌ con markers with min count rule', () => {
|
||||
expect(out).toContain('❌');
|
||||
expect(out).toMatch(/1 con per option|minimum.*1 con/i);
|
||||
});
|
||||
|
||||
test('documents hard-stop escape with exact phrase', () => {
|
||||
// "No cons — this is a hard-stop choice" may span a line break in the
|
||||
// rendered resolver text; match across whitespace collapses.
|
||||
expect(out).toMatch(/No cons\s+—\s+this is a\s+hard-stop choice/);
|
||||
});
|
||||
|
||||
test('documents neutral-posture escape preserving (recommended) label', () => {
|
||||
// CT1 resolution: (recommended) label STAYS on default option to preserve
|
||||
// AUTO_DECIDE contract. Neutrality expressed in prose only.
|
||||
expect(out).toMatch(/taste call/i);
|
||||
// `s` flag makes . match newlines — the label + STAYS phrase spans a line break
|
||||
expect(out).toMatch(/\(recommended\)[\s\S]*STAYS|STAYS[\s\S]*\(recommended\)/);
|
||||
expect(out).toMatch(/AUTO_DECIDE/);
|
||||
});
|
||||
|
||||
test('documents Net line for closing synthesis', () => {
|
||||
expect(out).toMatch(/^Net:/m);
|
||||
expect(out).toMatch(/synthesis|tradeoff/i);
|
||||
});
|
||||
|
||||
test('documents Completeness scoring rules (coverage vs kind)', () => {
|
||||
expect(out).toContain('Completeness');
|
||||
expect(out).toMatch(/10 = complete/);
|
||||
expect(out).toMatch(/options differ in kind, not coverage/);
|
||||
});
|
||||
|
||||
test('documents tool_use mandate (rule 11)', () => {
|
||||
expect(out).toMatch(/tool_use/);
|
||||
// "not a question" spans a newline in the rendered text
|
||||
expect(out).toMatch(/not a[\s\S]*question|not[\s\S]*interactive/i);
|
||||
});
|
||||
|
||||
test('includes self-check before emitting', () => {
|
||||
expect(out).toContain('Self-check before emitting');
|
||||
expect(out).toMatch(/D<N> header present/);
|
||||
expect(out).toMatch(/Net line closes/);
|
||||
});
|
||||
|
||||
test('documents D-numbering as model-level not runtime state', () => {
|
||||
// Codex finding #4 caveat: D-numbering is a prompt wish, not a system
|
||||
// guarantee. TemplateContext has no counter. This check pins the caveat.
|
||||
expect(out).toMatch(/model-level instruction|not a runtime counter|count your own/i);
|
||||
});
|
||||
|
||||
test('per-skill override guidance preserved', () => {
|
||||
expect(out).toMatch(/Per-skill instructions may add/);
|
||||
});
|
||||
});
|
||||
@@ -0,0 +1,320 @@
|
||||
/**
|
||||
* Overlay-efficacy harness (periodic tier, paid).
|
||||
*
|
||||
* Measures whether a model-specific overlay nudge actually changes model
|
||||
* behavior when run through the real Claude Agent SDK — the harness
|
||||
* Claude Code itself is built on. This complements test/skill-e2e-opus-47.test.ts
|
||||
* which measures the same thing via `claude -p` subprocess (a different
|
||||
* harness with different prompt composition).
|
||||
*
|
||||
* For each fixture in test/fixtures/overlay-nudges.ts, runs two arms at
|
||||
* `fixture.trials` trials per arm with bounded concurrency:
|
||||
* - overlay-on: SDK systemPrompt = resolved overlay content
|
||||
* - overlay-off: SDK systemPrompt = "" (empty)
|
||||
*
|
||||
* Both arms have no CLAUDE.md, no skills directory, no setting-source
|
||||
* inheritance (settingSources: []). This is the TRUE bare comparison —
|
||||
* the only variable is the overlay text.
|
||||
*
|
||||
* Budget ~$20 per run at 40 trials (2 fixtures × 2 arms × 10 trials).
|
||||
* Gated by EVALS=1 AND EVALS_TIER=periodic. Never runs under test:gate.
|
||||
*/
|
||||
|
||||
import { describe, test, expect, afterAll } from 'bun:test';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
import {
|
||||
runAgentSdkTest,
|
||||
resolveClaudeBinary,
|
||||
type AgentSdkResult,
|
||||
type SystemPromptOption,
|
||||
} from './helpers/agent-sdk-runner';
|
||||
import { EvalCollector, getProjectEvalDir } from './helpers/eval-store';
|
||||
import {
|
||||
OVERLAY_FIXTURES,
|
||||
type OverlayFixture,
|
||||
} from './fixtures/overlay-nudges';
|
||||
import { readOverlay } from '../scripts/resolvers/model-overlay';
|
||||
|
||||
const evalsEnabled = !!process.env.EVALS;
|
||||
const periodicTier = process.env.EVALS_TIER === 'periodic';
|
||||
const shouldRun = evalsEnabled && periodicTier;
|
||||
|
||||
const describeE2E = shouldRun ? describe : describe.skip;
|
||||
// EvalCollector's tier must be 'e2e' | 'llm-judge' per its type signature.
|
||||
// The existing paid evals violate this by passing descriptive names like
|
||||
// 'e2e-opus-47' — a pre-existing pattern that only works because bun-test
|
||||
// runs without strict typechecking. We stay conforming here.
|
||||
const evalCollector = shouldRun ? new EvalCollector('e2e') : null;
|
||||
|
||||
const REPO_ROOT = path.resolve(import.meta.dir, '..');
|
||||
const runId = new Date()
|
||||
.toISOString()
|
||||
.replace(/[:.]/g, '')
|
||||
.replace('T', '-')
|
||||
.slice(0, 15);
|
||||
const TRANSCRIPTS_DIR = path.join(
|
||||
path.dirname(getProjectEvalDir()),
|
||||
'transcripts',
|
||||
`overlay-harness-${runId}`,
|
||||
);
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Per-arm helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
type Arm = 'overlay-on' | 'overlay-off';
|
||||
|
||||
function mkTrialDir(fixtureId: string, arm: Arm, n: number): string {
|
||||
const dir = fs.mkdtempSync(
|
||||
path.join(os.tmpdir(), `overlay-harness-${fixtureId}-${arm}-${n}-`),
|
||||
);
|
||||
return dir;
|
||||
}
|
||||
|
||||
function saveRawTranscript(
|
||||
fixtureId: string,
|
||||
arm: Arm,
|
||||
n: number,
|
||||
result: AgentSdkResult,
|
||||
): void {
|
||||
fs.mkdirSync(TRANSCRIPTS_DIR, { recursive: true });
|
||||
const out = path.join(TRANSCRIPTS_DIR, `${fixtureId}-${arm}-${n}.jsonl`);
|
||||
const lines = result.events.map((e) => JSON.stringify(e));
|
||||
fs.writeFileSync(out, lines.join('\n') + '\n');
|
||||
}
|
||||
|
||||
function overlayContentFor(fixture: OverlayFixture): string {
|
||||
const family = path.basename(fixture.overlayPath, '.md');
|
||||
const resolved = readOverlay(family);
|
||||
if (!resolved) {
|
||||
throw new Error(
|
||||
`fixture ${fixture.id}: resolver returned empty content for ${family}`,
|
||||
);
|
||||
}
|
||||
return resolved;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Per-fixture runner
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
interface ArmResult {
|
||||
metrics: number[];
|
||||
costs: number[];
|
||||
durations: number[];
|
||||
rateLimitExhausted: number;
|
||||
sdkClaudeCodeVersions: Set<string>;
|
||||
}
|
||||
|
||||
async function runArm(
|
||||
fixture: OverlayFixture,
|
||||
arm: Arm,
|
||||
systemPrompt: SystemPromptOption,
|
||||
claudeBinary: string | null,
|
||||
): Promise<ArmResult> {
|
||||
const result: ArmResult = {
|
||||
metrics: [],
|
||||
costs: [],
|
||||
durations: [],
|
||||
rateLimitExhausted: 0,
|
||||
sdkClaudeCodeVersions: new Set(),
|
||||
};
|
||||
|
||||
const trials = fixture.trials;
|
||||
const concurrency = fixture.concurrency ?? 3;
|
||||
|
||||
// Simple bounded executor: run trials in chunks of `concurrency`.
|
||||
// The process-level semaphore in agent-sdk-runner.ts enforces the true cap.
|
||||
let nextTrial = 0;
|
||||
const workers = Array.from({ length: concurrency }, async () => {
|
||||
while (true) {
|
||||
const n = nextTrial++;
|
||||
if (n >= trials) return;
|
||||
|
||||
const dir = mkTrialDir(fixture.id, arm, n);
|
||||
fixture.setupWorkspace(dir);
|
||||
try {
|
||||
const sdkResult = await runAgentSdkTest({
|
||||
systemPrompt,
|
||||
userPrompt: fixture.userPrompt,
|
||||
workingDirectory: dir,
|
||||
model: fixture.model,
|
||||
maxTurns: fixture.maxTurns ?? 5,
|
||||
allowedTools: fixture.allowedTools ?? ['Read', 'Glob', 'Grep', 'Bash'],
|
||||
permissionMode: 'bypassPermissions',
|
||||
settingSources: [],
|
||||
env: { ANTHROPIC_API_KEY: process.env.ANTHROPIC_API_KEY ?? '' },
|
||||
pathToClaudeCodeExecutable: claudeBinary ?? undefined,
|
||||
testName: `${fixture.id}-${arm}-${n}`,
|
||||
runId,
|
||||
fixtureId: fixture.id,
|
||||
onRetry: (_) => {
|
||||
// Reset the workspace before the retry so partial Bash side effects
|
||||
// from the failed attempt don't contaminate.
|
||||
fs.rmSync(dir, { recursive: true, force: true });
|
||||
fs.mkdirSync(dir, { recursive: true });
|
||||
fixture.setupWorkspace(dir);
|
||||
},
|
||||
});
|
||||
|
||||
saveRawTranscript(fixture.id, arm, n, sdkResult);
|
||||
|
||||
const metric = fixture.metric(sdkResult);
|
||||
result.metrics.push(metric);
|
||||
result.costs.push(sdkResult.costUsd);
|
||||
result.durations.push(sdkResult.durationMs);
|
||||
result.sdkClaudeCodeVersions.add(sdkResult.sdkClaudeCodeVersion);
|
||||
|
||||
evalCollector?.addTest({
|
||||
name: `${fixture.id}-${arm}-${n}`,
|
||||
suite: 'overlay-harness',
|
||||
tier: 'e2e',
|
||||
passed: true,
|
||||
duration_ms: sdkResult.durationMs,
|
||||
cost_usd: sdkResult.costUsd,
|
||||
transcript: sdkResult.events,
|
||||
prompt: fixture.userPrompt,
|
||||
output: sdkResult.output,
|
||||
turns_used: sdkResult.turnsUsed,
|
||||
browse_errors: sdkResult.browseErrors,
|
||||
exit_reason: sdkResult.exitReason,
|
||||
model: sdkResult.model,
|
||||
first_response_ms: sdkResult.firstResponseMs,
|
||||
max_inter_turn_ms: sdkResult.maxInterTurnMs,
|
||||
});
|
||||
} catch (err) {
|
||||
if (err instanceof Error && err.name === 'RateLimitExhaustedError') {
|
||||
result.rateLimitExhausted++;
|
||||
// Record a failed trial so the collector captures the attempt.
|
||||
evalCollector?.addTest({
|
||||
name: `${fixture.id}-${arm}-${n}`,
|
||||
suite: 'overlay-harness',
|
||||
tier: 'e2e',
|
||||
passed: false,
|
||||
duration_ms: 0,
|
||||
cost_usd: 0,
|
||||
exit_reason: 'rate_limit_exhausted',
|
||||
error: err.message,
|
||||
});
|
||||
} else {
|
||||
throw err;
|
||||
}
|
||||
} finally {
|
||||
try {
|
||||
fs.rmSync(dir, { recursive: true, force: true });
|
||||
} catch {
|
||||
// best-effort cleanup
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
await Promise.all(workers);
|
||||
return result;
|
||||
}
|
||||
|
||||
function mean(xs: number[]): number {
|
||||
if (xs.length === 0) return 0;
|
||||
return xs.reduce((a, b) => a + b, 0) / xs.length;
|
||||
}
|
||||
|
||||
function sum(xs: number[]): number {
|
||||
return xs.reduce((a, b) => a + b, 0);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Test bodies
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describeE2E('overlay efficacy harness (SDK)', () => {
|
||||
// Resolve binary once
|
||||
const claudeBinary = resolveClaudeBinary();
|
||||
|
||||
if (!claudeBinary) {
|
||||
test.skip(
|
||||
'no local `claude` binary on PATH — cannot pin for harness parity',
|
||||
() => {},
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
for (const fixture of OVERLAY_FIXTURES) {
|
||||
test(
|
||||
`${fixture.id}: overlay-ON vs overlay-OFF, N=${fixture.trials} per arm`,
|
||||
async () => {
|
||||
const overlayText = overlayContentFor(fixture);
|
||||
expect(overlayText.length).toBeGreaterThan(100);
|
||||
|
||||
// Arm composition: both arms use the real Claude Code default system
|
||||
// prompt (preset). Overlay-ON APPENDS the overlay text; overlay-OFF
|
||||
// uses the default alone. This measures the overlay's marginal effect
|
||||
// ON TOP of Claude Code's normal behavioral scaffolding — which is
|
||||
// the only measurement that matches how real Claude Code composes
|
||||
// overlays into its system prompt stack.
|
||||
const [onArm, offArm] = await Promise.all([
|
||||
runArm(
|
||||
fixture,
|
||||
'overlay-on',
|
||||
{ type: 'preset', preset: 'claude_code', append: overlayText },
|
||||
claudeBinary,
|
||||
),
|
||||
runArm(
|
||||
fixture,
|
||||
'overlay-off',
|
||||
{ type: 'preset', preset: 'claude_code' },
|
||||
claudeBinary,
|
||||
),
|
||||
]);
|
||||
|
||||
const arms = {
|
||||
overlay: onArm.metrics,
|
||||
off: offArm.metrics,
|
||||
};
|
||||
|
||||
const meanOn = mean(arms.overlay);
|
||||
const meanOff = mean(arms.off);
|
||||
const lift = meanOn - meanOff;
|
||||
const floorHits = arms.overlay.filter((n) => n >= 2).length;
|
||||
const totalCost = sum(onArm.costs) + sum(offArm.costs);
|
||||
const versionSet = new Set([
|
||||
...onArm.sdkClaudeCodeVersions,
|
||||
...offArm.sdkClaudeCodeVersions,
|
||||
]);
|
||||
|
||||
// Loud output for the next person reading the eval JSON:
|
||||
// eslint-disable-next-line no-console
|
||||
console.log(
|
||||
`\n[${fixture.id}]\n` +
|
||||
` binary: ${claudeBinary}\n` +
|
||||
` claude_code_version(s): ${[...versionSet].join(', ')}\n` +
|
||||
` overlay-ON metrics: [${arms.overlay.join(', ')}] mean=${meanOn.toFixed(2)}\n` +
|
||||
` overlay-OFF metrics: [${arms.off.join(', ')}] mean=${meanOff.toFixed(2)}\n` +
|
||||
` lift: ${lift.toFixed(2)} floor_hits(>=2): ${floorHits}/${fixture.trials}\n` +
|
||||
` rate_limit_exhausted: on=${onArm.rateLimitExhausted} off=${offArm.rateLimitExhausted}\n` +
|
||||
` total_cost_usd: $${totalCost.toFixed(4)}\n` +
|
||||
` transcripts: ${TRANSCRIPTS_DIR}`,
|
||||
);
|
||||
|
||||
// Demand enough trials actually completed to make the assertion
|
||||
// meaningful. If rate-limit exhaustion took out more than half of an
|
||||
// arm, fail loudly rather than pass/fail on a fragment.
|
||||
const minTrials = Math.ceil(fixture.trials / 2);
|
||||
expect(arms.overlay.length).toBeGreaterThanOrEqual(minTrials);
|
||||
expect(arms.off.length).toBeGreaterThanOrEqual(minTrials);
|
||||
|
||||
expect(fixture.pass(arms)).toBe(true);
|
||||
},
|
||||
30 * 60 * 1000, // 30 minute timeout per fixture
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
afterAll(async () => {
|
||||
if (evalCollector) {
|
||||
const filepath = await evalCollector.finalize();
|
||||
// eslint-disable-next-line no-console
|
||||
console.log(`\n[overlay-harness] eval results: ${filepath}`);
|
||||
}
|
||||
});
|
||||
@@ -0,0 +1,40 @@
|
||||
/**
|
||||
* plan-ceo-review plan-mode handshake E2E (gate tier, paid).
|
||||
*
|
||||
* Asserts: when /plan-ceo-review is invoked with the plan-mode distinctive
|
||||
* phrase in the system reminder, the skill fires AskUserQuestion FIRST
|
||||
* (before any Write or Edit), the question has exactly 2 options (A exit,
|
||||
* C cancel), picking "Exit" leads to an orderly exit with no plan file
|
||||
* written.
|
||||
*
|
||||
* Cost: ~$0.50–$1.00 per run. Gated: EVALS=1 EVALS_TIER=gate.
|
||||
* Depends on: scripts/resolvers/preamble/generate-plan-mode-handshake.ts,
|
||||
* test/helpers/agent-sdk-runner.ts (canUseTool extension).
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import {
|
||||
runPlanModeHandshakeTest,
|
||||
assertHandshakeShape,
|
||||
} from './helpers/plan-mode-handshake-helpers';
|
||||
|
||||
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
|
||||
const describeE2E = shouldRun ? describe : describe.skip;
|
||||
|
||||
describeE2E('plan-ceo-review plan-mode handshake (gate)', () => {
|
||||
test('handshake fires before any Write/Edit when plan mode is detected', async () => {
|
||||
const result = await runPlanModeHandshakeTest({
|
||||
skillName: 'plan-ceo-review',
|
||||
answerLabel: 'Exit',
|
||||
});
|
||||
|
||||
// Handshake must have fired at least once.
|
||||
expect(result.askUserQuestions.length).toBeGreaterThanOrEqual(1);
|
||||
// Critically: no Write or Edit fired before the first AskUserQuestion.
|
||||
// This is the bug v1.10.2.0 fixes — plan mode used to allow silent
|
||||
// plan-file writes without any interactive gate.
|
||||
expect(result.writeOrEditBeforeAsk).toBe(false);
|
||||
// Handshake shape: 2 options (Exit/Cancel), Option B dropped per D8.
|
||||
assertHandshakeShape(result.askUserQuestions[0]!);
|
||||
}, 120_000);
|
||||
});
|
||||
@@ -0,0 +1,28 @@
|
||||
/**
|
||||
* plan-design-review plan-mode handshake E2E (gate tier, paid).
|
||||
*
|
||||
* See test/skill-e2e-plan-ceo-plan-mode.test.ts for the shared assertion
|
||||
* contract. This file exercises the same handshake against /plan-design-review.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import {
|
||||
runPlanModeHandshakeTest,
|
||||
assertHandshakeShape,
|
||||
} from './helpers/plan-mode-handshake-helpers';
|
||||
|
||||
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
|
||||
const describeE2E = shouldRun ? describe : describe.skip;
|
||||
|
||||
describeE2E('plan-design-review plan-mode handshake (gate)', () => {
|
||||
test('handshake fires before any Write/Edit when plan mode is detected', async () => {
|
||||
const result = await runPlanModeHandshakeTest({
|
||||
skillName: 'plan-design-review',
|
||||
answerLabel: 'Cancel', // exercise the C-cancel branch instead of A-exit
|
||||
});
|
||||
|
||||
expect(result.askUserQuestions.length).toBeGreaterThanOrEqual(1);
|
||||
expect(result.writeOrEditBeforeAsk).toBe(false);
|
||||
assertHandshakeShape(result.askUserQuestions[0]!);
|
||||
}, 120_000);
|
||||
});
|
||||
@@ -0,0 +1,28 @@
|
||||
/**
|
||||
* plan-devex-review plan-mode handshake E2E (gate tier, paid).
|
||||
*
|
||||
* See test/skill-e2e-plan-ceo-plan-mode.test.ts for the shared assertion
|
||||
* contract. This file exercises the same handshake against /plan-devex-review.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import {
|
||||
runPlanModeHandshakeTest,
|
||||
assertHandshakeShape,
|
||||
} from './helpers/plan-mode-handshake-helpers';
|
||||
|
||||
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
|
||||
const describeE2E = shouldRun ? describe : describe.skip;
|
||||
|
||||
describeE2E('plan-devex-review plan-mode handshake (gate)', () => {
|
||||
test('handshake fires before any Write/Edit when plan mode is detected', async () => {
|
||||
const result = await runPlanModeHandshakeTest({
|
||||
skillName: 'plan-devex-review',
|
||||
answerLabel: 'Exit',
|
||||
});
|
||||
|
||||
expect(result.askUserQuestions.length).toBeGreaterThanOrEqual(1);
|
||||
expect(result.writeOrEditBeforeAsk).toBe(false);
|
||||
assertHandshakeShape(result.askUserQuestions[0]!);
|
||||
}, 120_000);
|
||||
});
|
||||
@@ -0,0 +1,28 @@
|
||||
/**
|
||||
* plan-eng-review plan-mode handshake E2E (gate tier, paid).
|
||||
*
|
||||
* See test/skill-e2e-plan-ceo-plan-mode.test.ts for the shared assertion
|
||||
* contract. This file exercises the same handshake against /plan-eng-review.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import {
|
||||
runPlanModeHandshakeTest,
|
||||
assertHandshakeShape,
|
||||
} from './helpers/plan-mode-handshake-helpers';
|
||||
|
||||
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
|
||||
const describeE2E = shouldRun ? describe : describe.skip;
|
||||
|
||||
describeE2E('plan-eng-review plan-mode handshake (gate)', () => {
|
||||
test('handshake fires before any Write/Edit when plan mode is detected', async () => {
|
||||
const result = await runPlanModeHandshakeTest({
|
||||
skillName: 'plan-eng-review',
|
||||
answerLabel: 'Exit',
|
||||
});
|
||||
|
||||
expect(result.askUserQuestions.length).toBeGreaterThanOrEqual(1);
|
||||
expect(result.writeOrEditBeforeAsk).toBe(false);
|
||||
assertHandshakeShape(result.askUserQuestions[0]!);
|
||||
}, 120_000);
|
||||
});
|
||||
@@ -35,10 +35,25 @@ const evalCollector = createEvalCollector('e2e-plan-format');
|
||||
// Regex predicates applied to captured AskUserQuestion content.
|
||||
// RECOMMENDATION regex is lenient on intervening markdown markers (e.g.
|
||||
// agent writes `**RECOMMENDATION:** Choose` — the `**` closers are benign).
|
||||
const RECOMMENDATION_RE = /RECOMMENDATION:[*\s]*Choose/;
|
||||
// Post v1.7.0.0: "Recommendation:" (mixed-case) is the canonical form per
|
||||
// the Pros/Cons format; accept both cases for backward compatibility.
|
||||
const RECOMMENDATION_RE = /[Rr]ecommendation:[*\s]*Choose/;
|
||||
const COMPLETENESS_RE = /Completeness:\s*\d{1,2}\/10/;
|
||||
const KIND_NOTE_RE = /options differ in kind/i;
|
||||
|
||||
// v1.7.0.0 Pros/Cons format tokens. Tests are additive: existing
|
||||
// RECOMMENDATION / Completeness / kind-note assertions still hold; new
|
||||
// format tokens are asserted ONLY when the capture is from a v1.7+
|
||||
// skill rendering. Presence is optional for backward compatibility during
|
||||
// rollout; the periodic-tier cadence+format eval (see skill-e2e-plan-cadence)
|
||||
// is the strict gate for the new format.
|
||||
const PROS_CONS_HEADER_RE = /Pros\s*\/\s*cons:/i;
|
||||
const PRO_BULLET_RE = /^\s*✅\s+\S/m;
|
||||
const CON_BULLET_RE = /^\s*❌\s+\S/m;
|
||||
const NET_LINE_RE = /^Net:\s+\S/m;
|
||||
const D_NUMBER_RE = /^D\d+\s+—/m;
|
||||
const STAKES_RE = /Stakes if we pick wrong:/i;
|
||||
|
||||
const SAMPLE_PLAN = `# Plan: Add User Dashboard
|
||||
|
||||
## Context
|
||||
|
||||
@@ -0,0 +1,43 @@
|
||||
/**
|
||||
* Plan-mode handshake negative regression (gate tier, paid).
|
||||
*
|
||||
* Asserts: when /plan-ceo-review is invoked WITHOUT the plan-mode distinctive
|
||||
* phrase in the system reminder, the handshake does NOT fire. The skill
|
||||
* should proceed to its normal Step 0 flow. This is the REGRESSION RULE
|
||||
* guardrail — the handshake must be a no-op outside plan mode or it breaks
|
||||
* every existing interactive-review session.
|
||||
*
|
||||
* Cost: ~$0.50 per run. Gated: EVALS=1 EVALS_TIER=gate.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import {
|
||||
runPlanModeHandshakeTest,
|
||||
PLAN_MODE_REMINDER,
|
||||
} from './helpers/plan-mode-handshake-helpers';
|
||||
|
||||
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
|
||||
const describeE2E = shouldRun ? describe : describe.skip;
|
||||
|
||||
describeE2E('plan-mode handshake no-op outside plan mode (gate regression)', () => {
|
||||
test('handshake does NOT fire when distinctive phrase is absent', async () => {
|
||||
const result = await runPlanModeHandshakeTest({
|
||||
skillName: 'plan-ceo-review',
|
||||
answerLabel: 'Exit', // ignored — handshake should never fire
|
||||
omitPlanModeReminder: true,
|
||||
maxTurns: 3, // enough to see Step 0 start, but bounded
|
||||
});
|
||||
|
||||
// The handshake AskUserQuestion should NOT have fired during Step 0 entry.
|
||||
// Other AskUserQuestions may fire later in the skill (e.g., Step 0C-bis),
|
||||
// but they will NOT have the handshake's question text.
|
||||
for (const aq of result.askUserQuestions) {
|
||||
const questions = aq.input.questions as Array<{ question: string }>;
|
||||
for (const q of questions) {
|
||||
// The handshake's question mentions the distinctive phrase in its
|
||||
// prose; a non-handshake AskUserQuestion won't.
|
||||
expect(q.question).not.toContain(PLAN_MODE_REMINDER);
|
||||
}
|
||||
}
|
||||
}, 120_000);
|
||||
});
|
||||
@@ -0,0 +1,352 @@
|
||||
/**
|
||||
* v1.7.0.0 Pros/Cons format regression tests for plan reviews.
|
||||
*
|
||||
* Extends the v1.6.3.0 format harness (skill-e2e-plan-format.test.ts) with
|
||||
* four new cases covering the Pros/Cons decision-brief format:
|
||||
*
|
||||
* 1. Format positive — every AskUserQuestion renders with D<N> / ELI10 /
|
||||
* Stakes / Recommendation / Pros/cons / ✅×2+ / ❌×1+ / Net tokens.
|
||||
* 2. Hard-stop positive — destructive-action question may use the single
|
||||
* "No cons — this is a hard-stop choice" escape.
|
||||
* 3. Hard-stop NEGATIVE (CT2) — plan with genuine tradeoff, model must NOT
|
||||
* dodge to the hard-stop escape. Forces real tradeoff articulation.
|
||||
* 4. Neutral-posture NEGATIVE (CT2) — plan with one clearly-dominant option,
|
||||
* model must emit (recommended) label and concrete recommendation, NOT
|
||||
* "no preference — taste call" dodge.
|
||||
*
|
||||
* Capture pattern matches existing harness: agent writes verbatim
|
||||
* AskUserQuestion text to $OUT_FILE; regex predicates run on the captured
|
||||
* file. Classified periodic (Opus 4.7 non-deterministic).
|
||||
*
|
||||
* FOLLOW-UP (not in v1.7.0.0):
|
||||
* - True cadence eval (3 findings → 3 distinct asks across turns). Current
|
||||
* $OUT_FILE harness captures ONE would-be question per session. Multi-turn
|
||||
* cadence needs new harness support. Filed in TODOs.
|
||||
* - Expanded coverage for /ship /office-hours /investigate /qa /review
|
||||
* /design-review /document-release. Touchfiles entries already exist; eval
|
||||
* cases will land as follow-up PRs per skill.
|
||||
*/
|
||||
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
|
||||
import { runSkillTest } from './helpers/session-runner';
|
||||
import {
|
||||
ROOT, runId,
|
||||
describeIfSelected, testConcurrentIfSelected,
|
||||
logCost, recordE2E,
|
||||
createEvalCollector, finalizeEvalCollector,
|
||||
} from './helpers/e2e-helpers';
|
||||
import { spawnSync } from 'child_process';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
|
||||
const evalCollector = createEvalCollector('e2e-plan-prosons');
|
||||
|
||||
// v1.7.0.0 format tokens
|
||||
const D_NUMBER_RE = /D\d+\s+—/;
|
||||
const ELI10_RE = /ELI10:/i;
|
||||
const STAKES_RE = /Stakes if we pick wrong:/i;
|
||||
const RECOMMENDATION_RE = /[Rr]ecommendation:/;
|
||||
const PROS_CONS_HEADER_RE = /Pros\s*\/\s*cons:/i;
|
||||
const NET_LINE_RE = /^Net:/m;
|
||||
const HARD_STOP_ESCAPE_RE = /✅\s+No cons\s+—\s+this is a hard-stop choice/;
|
||||
const NEUTRAL_POSTURE_RE = /taste call/i;
|
||||
const RECOMMENDED_LABEL_RE = /\(recommended\)/;
|
||||
|
||||
function countChars(text: string, char: string): number {
|
||||
return (text.match(new RegExp(char, 'g')) || []).length;
|
||||
}
|
||||
|
||||
const TRADEOFF_PLAN = `# Plan: Add user dashboard caching
|
||||
|
||||
## Context
|
||||
Dashboard renders in 3s on cold load, 800ms on warm cache. Users complain.
|
||||
|
||||
## Approach options
|
||||
|
||||
### Option A: Redis cache layer (complete)
|
||||
- Add Redis with 5min TTL for dashboard aggregates.
|
||||
- Cold path: compute + cache. Warm path: fetch from cache.
|
||||
- Needs Redis infra, cache invalidation logic for activity updates.
|
||||
- Covers all users, all flows, fails gracefully on cache miss.
|
||||
|
||||
### Option B: In-memory LRU cache (happy path only)
|
||||
- Per-process LRU with 100-entry cap.
|
||||
- No cross-process sharing; cache warms per-pod.
|
||||
- Skips cache invalidation; stale reads up to 5min.
|
||||
|
||||
Both options have real pros and cons. This is a genuine tradeoff.
|
||||
`;
|
||||
|
||||
const HARDSTOP_PLAN = `# Plan: Delete all user sessions
|
||||
|
||||
## Context
|
||||
Security incident. All active sessions need to be terminated immediately.
|
||||
|
||||
## Action
|
||||
Run \`DELETE FROM sessions WHERE TRUE\`. No dry-run mode.
|
||||
|
||||
This is a one-way door. There is no "partial" version.
|
||||
`;
|
||||
|
||||
const DOMINANT_PLAN = `# Plan: Add input validation to signup endpoint
|
||||
|
||||
## Context
|
||||
Signup endpoint currently accepts any email string and any password length.
|
||||
Bug report: users type gibberish, signup succeeds, they can't log in.
|
||||
|
||||
## Options
|
||||
|
||||
### Option A: Full RFC 5322 email validation + min 8-char password + server-side checks
|
||||
- Catches malformed emails, rejects weak passwords, validated on server.
|
||||
- Prevents the reported bug and adjacent bugs.
|
||||
- Standard web practice.
|
||||
|
||||
### Option B: Client-side type="email" only, no password validation
|
||||
- Only catches some browsers' built-in validation.
|
||||
- Attackers bypass by disabling JS.
|
||||
- Does not fix the reported bug.
|
||||
|
||||
Option A clearly dominates on coverage. This is NOT a taste call.
|
||||
`;
|
||||
|
||||
function setupPlanDir(tmpPrefix: string, planContent: string, skillName: string): string {
|
||||
const planDir = fs.mkdtempSync(path.join(os.tmpdir(), tmpPrefix));
|
||||
const run = (cmd: string, args: string[]) =>
|
||||
spawnSync(cmd, args, { cwd: planDir, stdio: 'pipe', timeout: 5000 });
|
||||
|
||||
run('git', ['init', '-b', 'main']);
|
||||
run('git', ['config', 'user.email', 'test@test.com']);
|
||||
run('git', ['config', 'user.name', 'Test']);
|
||||
|
||||
fs.writeFileSync(path.join(planDir, 'plan.md'), planContent);
|
||||
run('git', ['add', '.']);
|
||||
run('git', ['commit', '-m', 'add plan']);
|
||||
|
||||
fs.mkdirSync(path.join(planDir, skillName), { recursive: true });
|
||||
fs.copyFileSync(
|
||||
path.join(ROOT, skillName, 'SKILL.md'),
|
||||
path.join(planDir, skillName, 'SKILL.md'),
|
||||
);
|
||||
|
||||
return planDir;
|
||||
}
|
||||
|
||||
function captureInstruction(outFile: string): string {
|
||||
return `Write the verbatim text of the single AskUserQuestion you would have made to ${outFile} (full text including D<N> header, ELI10, Stakes, Recommendation, Pros/cons, and Net line — the complete rich markdown body). Do NOT call any tool to ask the user. Do NOT paraphrase. This is a format-capture test.`;
|
||||
}
|
||||
|
||||
// --- Case 1: Format positive — all v1.7.0.0 tokens present ---
|
||||
|
||||
describeIfSelected('Plan Prosons — Format Positive', ['plan-review-prosons-format'], () => {
|
||||
let planDir: string;
|
||||
let outFile: string;
|
||||
|
||||
beforeAll(() => {
|
||||
planDir = setupPlanDir('skill-e2e-plan-prosons-format-', TRADEOFF_PLAN, 'plan-ceo-review');
|
||||
outFile = path.join(planDir, 'ask-capture.md');
|
||||
});
|
||||
|
||||
afterAll(() => {
|
||||
try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
testConcurrentIfSelected('plan-review-prosons-format', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read plan-ceo-review/SKILL.md for the review workflow.
|
||||
|
||||
Read plan.md — two cache approaches with real tradeoffs. Pick the architectural approach via AskUserQuestion (Step 0C-bis / Implementation Alternatives). These options differ in coverage.
|
||||
|
||||
${captureInstruction(outFile)}
|
||||
|
||||
After writing the file, stop.`,
|
||||
workingDirectory: planDir,
|
||||
maxTurns: 10,
|
||||
timeout: 240_000,
|
||||
testName: 'plan-review-prosons-format',
|
||||
runId,
|
||||
model: 'claude-opus-4-7',
|
||||
});
|
||||
|
||||
logCost('/plan-review prosons format positive', result);
|
||||
recordE2E(evalCollector, '/plan-review-prosons-format', 'Plan Prosons — Format Positive', result, {
|
||||
passed: ['success', 'error_max_turns'].includes(result.exitReason),
|
||||
});
|
||||
expect(['success', 'error_max_turns']).toContain(result.exitReason);
|
||||
|
||||
expect(fs.existsSync(outFile)).toBe(true);
|
||||
const captured = fs.readFileSync(outFile, 'utf-8');
|
||||
expect(captured.length).toBeGreaterThan(200);
|
||||
|
||||
// Every Pros/Cons token present
|
||||
expect(captured).toMatch(D_NUMBER_RE);
|
||||
expect(captured).toMatch(ELI10_RE);
|
||||
expect(captured).toMatch(STAKES_RE);
|
||||
expect(captured).toMatch(RECOMMENDATION_RE);
|
||||
expect(captured).toMatch(PROS_CONS_HEADER_RE);
|
||||
expect(captured).toMatch(NET_LINE_RE);
|
||||
|
||||
// Pro/con bullet counts: ≥2 ✅ and ≥1 ❌ per option (total ≥4 ✅ and ≥2 ❌ for 2 options)
|
||||
expect(countChars(captured, '✅')).toBeGreaterThanOrEqual(4);
|
||||
expect(countChars(captured, '❌')).toBeGreaterThanOrEqual(2);
|
||||
|
||||
// (recommended) label on one option
|
||||
expect(captured).toMatch(RECOMMENDED_LABEL_RE);
|
||||
}, 300_000);
|
||||
});
|
||||
|
||||
// --- Case 2: Hard-stop escape NEGATIVE (CT2) ---
|
||||
|
||||
describeIfSelected('Plan Prosons — Hard-stop Negative', ['plan-review-prosons-hardstop-neg'], () => {
|
||||
let planDir: string;
|
||||
let outFile: string;
|
||||
|
||||
beforeAll(() => {
|
||||
planDir = setupPlanDir('skill-e2e-plan-prosons-hardstop-neg-', TRADEOFF_PLAN, 'plan-ceo-review');
|
||||
outFile = path.join(planDir, 'ask-capture.md');
|
||||
});
|
||||
|
||||
afterAll(() => {
|
||||
try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
testConcurrentIfSelected('plan-review-prosons-hardstop-neg', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read plan-ceo-review/SKILL.md.
|
||||
|
||||
Read plan.md — this has REAL tradeoffs between Redis and in-memory caching (both have pros and cons). Pick the architectural approach via AskUserQuestion.
|
||||
|
||||
${captureInstruction(outFile)}
|
||||
|
||||
After writing the file, stop.`,
|
||||
workingDirectory: planDir,
|
||||
maxTurns: 10,
|
||||
timeout: 240_000,
|
||||
testName: 'plan-review-prosons-hardstop-neg',
|
||||
runId,
|
||||
model: 'claude-opus-4-7',
|
||||
});
|
||||
|
||||
logCost('/plan-review prosons hard-stop negative', result);
|
||||
recordE2E(evalCollector, '/plan-review-prosons-hardstop-neg', 'Plan Prosons — Hard-stop Negative', result, {
|
||||
passed: ['success', 'error_max_turns'].includes(result.exitReason),
|
||||
});
|
||||
expect(['success', 'error_max_turns']).toContain(result.exitReason);
|
||||
|
||||
expect(fs.existsSync(outFile)).toBe(true);
|
||||
const captured = fs.readFileSync(outFile, 'utf-8');
|
||||
expect(captured.length).toBeGreaterThan(200);
|
||||
|
||||
// Genuine tradeoff — must NOT dodge to hard-stop escape.
|
||||
expect(captured).not.toMatch(HARD_STOP_ESCAPE_RE);
|
||||
// Must have real pros and cons (≥2 ✅ + ≥1 ❌ per option)
|
||||
expect(countChars(captured, '✅')).toBeGreaterThanOrEqual(4);
|
||||
expect(countChars(captured, '❌')).toBeGreaterThanOrEqual(2);
|
||||
}, 300_000);
|
||||
});
|
||||
|
||||
// --- Case 3: Neutral-posture NEGATIVE (CT2) ---
|
||||
|
||||
describeIfSelected('Plan Prosons — Neutral-posture Negative', ['plan-review-prosons-neutral-neg'], () => {
|
||||
let planDir: string;
|
||||
let outFile: string;
|
||||
|
||||
beforeAll(() => {
|
||||
planDir = setupPlanDir('skill-e2e-plan-prosons-neutral-neg-', DOMINANT_PLAN, 'plan-ceo-review');
|
||||
outFile = path.join(planDir, 'ask-capture.md');
|
||||
});
|
||||
|
||||
afterAll(() => {
|
||||
try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
testConcurrentIfSelected('plan-review-prosons-neutral-neg', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read plan-ceo-review/SKILL.md.
|
||||
|
||||
Read plan.md — Option A dominates Option B on coverage. This is NOT a taste call. Pick the approach via AskUserQuestion (Step 0C-bis / Implementation Alternatives — coverage-differentiated, so Completeness: N/10 applies).
|
||||
|
||||
${captureInstruction(outFile)}
|
||||
|
||||
After writing the file, stop.`,
|
||||
workingDirectory: planDir,
|
||||
maxTurns: 10,
|
||||
timeout: 240_000,
|
||||
testName: 'plan-review-prosons-neutral-neg',
|
||||
runId,
|
||||
model: 'claude-opus-4-7',
|
||||
});
|
||||
|
||||
logCost('/plan-review prosons neutral negative', result);
|
||||
recordE2E(evalCollector, '/plan-review-prosons-neutral-neg', 'Plan Prosons — Neutral Negative', result, {
|
||||
passed: ['success', 'error_max_turns'].includes(result.exitReason),
|
||||
});
|
||||
expect(['success', 'error_max_turns']).toContain(result.exitReason);
|
||||
|
||||
expect(fs.existsSync(outFile)).toBe(true);
|
||||
const captured = fs.readFileSync(outFile, 'utf-8');
|
||||
expect(captured.length).toBeGreaterThan(200);
|
||||
|
||||
// One option dominates — must NOT use "taste call" neutral-posture dodge.
|
||||
expect(captured).not.toMatch(NEUTRAL_POSTURE_RE);
|
||||
// (recommended) label MUST be present on the dominant option.
|
||||
expect(captured).toMatch(RECOMMENDED_LABEL_RE);
|
||||
// Recommendation line must contain "because" (concrete reason, not "no preference")
|
||||
expect(captured).toMatch(/[Rr]ecommendation:.*because/);
|
||||
}, 300_000);
|
||||
});
|
||||
|
||||
// --- Case 4: Hard-stop POSITIVE (escape allowed when legitimately one-sided) ---
|
||||
|
||||
describeIfSelected('Plan Prosons — Hard-stop Positive', ['plan-ceo-review-prosons-cadence'], () => {
|
||||
let planDir: string;
|
||||
let outFile: string;
|
||||
|
||||
beforeAll(() => {
|
||||
planDir = setupPlanDir('skill-e2e-plan-prosons-hardstop-pos-', HARDSTOP_PLAN, 'plan-ceo-review');
|
||||
outFile = path.join(planDir, 'ask-capture.md');
|
||||
});
|
||||
|
||||
afterAll(() => {
|
||||
try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
testConcurrentIfSelected('plan-ceo-review-prosons-cadence', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read plan-ceo-review/SKILL.md.
|
||||
|
||||
Read plan.md — this is a destructive one-way action (terminate all sessions). Ask the user to confirm via AskUserQuestion. This is a legitimate hard-stop choice — the hard-stop escape (\`✅ No cons — this is a hard-stop choice\`) is allowed here because there is no meaningful alternative besides doing or not doing the action.
|
||||
|
||||
${captureInstruction(outFile)}
|
||||
|
||||
After writing the file, stop.`,
|
||||
workingDirectory: planDir,
|
||||
maxTurns: 10,
|
||||
timeout: 240_000,
|
||||
testName: 'plan-ceo-review-prosons-cadence',
|
||||
runId,
|
||||
model: 'claude-opus-4-7',
|
||||
});
|
||||
|
||||
logCost('/plan-review prosons hard-stop positive', result);
|
||||
recordE2E(evalCollector, '/plan-ceo-review-prosons-cadence', 'Plan Prosons — Hard-stop Positive', result, {
|
||||
passed: ['success', 'error_max_turns'].includes(result.exitReason),
|
||||
});
|
||||
expect(['success', 'error_max_turns']).toContain(result.exitReason);
|
||||
|
||||
expect(fs.existsSync(outFile)).toBe(true);
|
||||
const captured = fs.readFileSync(outFile, 'utf-8');
|
||||
expect(captured.length).toBeGreaterThan(100);
|
||||
|
||||
// Format scaffolding still required
|
||||
expect(captured).toMatch(PROS_CONS_HEADER_RE);
|
||||
// Hard-stop escape is ACCEPTED here (destructive one-way action)
|
||||
// Either the escape is used OR real pros/cons are present — both are valid.
|
||||
const hasEscape = HARD_STOP_ESCAPE_RE.test(captured);
|
||||
const hasProsAndCons = countChars(captured, '✅') >= 1 && countChars(captured, '❌') >= 1;
|
||||
expect(hasEscape || hasProsAndCons).toBe(true);
|
||||
}, 300_000);
|
||||
});
|
||||
|
||||
afterAll(async () => {
|
||||
await finalizeEvalCollector(evalCollector);
|
||||
});
|
||||
@@ -566,10 +566,21 @@ describe('v0.4.1 preamble features', () => {
|
||||
const skillsWithPreamble = [...tier1Skills, ...tier2PlusSkills];
|
||||
|
||||
for (const skill of tier2PlusSkills) {
|
||||
test(`${skill} contains RECOMMENDATION format`, () => {
|
||||
test(`${skill} contains AskUserQuestion Pros/Cons format`, () => {
|
||||
const content = fs.readFileSync(path.join(ROOT, skill), 'utf-8');
|
||||
expect(content).toContain('RECOMMENDATION: Choose');
|
||||
// v1.7.0.0 Pros/Cons format tokens. The preamble resolver
|
||||
// (generate-ask-user-format.ts) injects all of these into every
|
||||
// tier-2+ skill. Drop any of them and the test catches it on the
|
||||
// next `bun test` run.
|
||||
expect(content).toContain('AskUserQuestion');
|
||||
expect(content).toContain('Pros / cons:');
|
||||
expect(content).toContain('Recommendation: <choice>');
|
||||
expect(content).toContain('Net:');
|
||||
expect(content).toContain('ELI10');
|
||||
expect(content).toContain('Stakes if we pick wrong:');
|
||||
// Concrete format markers must be documented in the resolver text
|
||||
expect(content).toMatch(/✅/);
|
||||
expect(content).toMatch(/❌/);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
+10
-2
@@ -85,8 +85,16 @@ describe('selectTests', () => {
|
||||
expect(result.selected).toContain('codex-offered-ceo-review');
|
||||
expect(result.selected).toContain('plan-ceo-review-format-mode');
|
||||
expect(result.selected).toContain('plan-ceo-review-format-approach');
|
||||
expect(result.selected.length).toBe(8);
|
||||
expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 8);
|
||||
// v1.10.2.0 plan-mode handshake entries also depend on plan-ceo-review/**
|
||||
expect(result.selected).toContain('plan-ceo-review-plan-mode');
|
||||
expect(result.selected).toContain('plan-mode-no-op');
|
||||
expect(result.selected).toContain('e2e-harness-audit');
|
||||
expect(result.selected).toContain('plan-ceo-review-prosons-cadence');
|
||||
expect(result.selected).toContain('plan-review-prosons-format');
|
||||
expect(result.selected).toContain('plan-review-prosons-hardstop-neg');
|
||||
expect(result.selected).toContain('plan-review-prosons-neutral-neg');
|
||||
expect(result.selected.length).toBe(15);
|
||||
expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 15);
|
||||
});
|
||||
|
||||
test('global touchfile triggers ALL tests', () => {
|
||||
|
||||
Reference in New Issue
Block a user