mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-05 21:25:27 +02:00
merge: integrate origin/main (v1.1.3.0) — /checkpoint → /context-save + /context-restore rename
Main shipped v1.1.3.0 fixing Claude Code's native /checkpoint alias shadowing gstack's skill. The old /checkpoint directory is gone, replaced by context-save/ and context-restore/. Storage path (~/.gstack/projects/$SLUG/checkpoints/) is unchanged, so existing saved contexts still load. Conflicts: - VERSION / package.json: kept 1.2.0.0 (above main's 1.1.3.0) - CHANGELOG: preserved 1.2.0.0 at top, inserted 1.1.3.0 below - scripts/resolvers/preamble.ts: same pattern as prior merges — main's side edited the monolithic file inline; I kept the submodule composition root intact (main's inline changes don't apply to this shape) Ported my continuous-checkpoint and context-health submodule prose to reference the new skill names: - generate-continuous-checkpoint.ts: "/checkpoint resume" → "/context-restore" - generate-context-health.ts: "/checkpoint" → "/context-save" Also updated user-facing prose in: - CHANGELOG.md (1.2.0.0 entry): "/checkpoint resume" → "/context-restore (formerly /checkpoint resume pre-v1.1.3)" - README.md Continuous checkpoint section: same rename Storage paths in generate-context-recovery.ts (`$_PROJ/checkpoints/`) left untouched — per main's v1.1.3.0 notes, the storage directory name stays `checkpoints/` to preserve backward-compat with saved files. Touchfiles.ts auto-merged cleanly — main's context-save-writes-file and context-restore-loads-latest replaced my old checkpoint-save-resume entry. Regenerated SKILL.md files. Ship golden fixtures refreshed. 423 tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,349 @@
|
||||
/**
|
||||
* Tier-2 hardening tests for context-save + context-restore.
|
||||
*
|
||||
* These exercise the exact bash snippets from the SKILL.md templates,
|
||||
* without spawning claude -p. Free tier, runs in milliseconds.
|
||||
*
|
||||
* Covers the hardening work from commit 3df8ea86:
|
||||
* - Bash-side title sanitizer (allowlist a-z0-9.-, cap 60, default "untitled")
|
||||
* - Collision-safe filenames (random suffix on same-second double-save)
|
||||
* - head -20 cap on the restore-flow directory listing
|
||||
* - Migration HOME unset guard
|
||||
* - Empty-set "NO_CHECKPOINTS" fallback
|
||||
*/
|
||||
|
||||
import { describe, test, expect, beforeEach, afterEach } from 'bun:test';
|
||||
import { spawnSync } from 'child_process';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
|
||||
// Repo root — assumes this test file sits one directory below it (TODO confirm layout).
const ROOT = path.resolve(import.meta.dir, '..');
|
||||
|
||||
// The exact sanitize+collision bash used by context-save/SKILL.md Step 4.
// Kept in sync with context-save/SKILL.md.tmpl. If the template changes
// this helper out of alignment, the title-sanitize tests fail — intended.
//
// Pipeline: lowercase → squeeze whitespace to '-' → strip to the a-z0-9.-
// allowlist → cap at 60 chars, with "untitled" as the fallback for an empty
// result. On a same-second filename collision, a 4-char random suffix is
// appended (PID hex as the /dev/urandom fallback). Emits TITLE_SLUG= and
// FILE= lines for parseKV().
const TITLE_BASH = `
RAW="\${TITLE_RAW:-untitled}"
TITLE_SLUG=$(printf '%s' "$RAW" | tr '[:upper:]' '[:lower:]' | tr -s ' \\t' '-' | tr -cd 'a-z0-9.-' | cut -c1-60)
TITLE_SLUG="\${TITLE_SLUG:-untitled}"
FILE="\${CHECKPOINT_DIR}/\${TIMESTAMP}-\${TITLE_SLUG}.md"
if [ -e "$FILE" ]; then
SUFFIX=$(LC_ALL=C tr -dc 'a-z0-9' < /dev/urandom 2>/dev/null | head -c 4 || printf '%04x' "$$")
FILE="\${CHECKPOINT_DIR}/\${TIMESTAMP}-\${TITLE_SLUG}-\${SUFFIX}.md"
fi
echo "TITLE_SLUG=$TITLE_SLUG"
echo "FILE=$FILE"
`;
|
||||
|
||||
// The exact find + sort + head used by context-restore/SKILL.md Step 1.
// Prints "NO_CHECKPOINTS" when the directory is missing or holds no *.md
// files; otherwise lists at most 20 files, newest-first by filename
// (sort -r on the timestamp-prefixed names — deliberately NOT mtime).
const RESTORE_FIND_BASH = `
if [ ! -d "$CHECKPOINT_DIR" ]; then
echo "NO_CHECKPOINTS"
else
FILES=$(find "$CHECKPOINT_DIR" -maxdepth 1 -name "*.md" -type f 2>/dev/null | sort -r | head -20)
if [ -z "$FILES" ]; then
echo "NO_CHECKPOINTS"
else
echo "$FILES"
fi
fi
`;
|
||||
|
||||
function runBash(script: string, env: Record<string, string>): { stdout: string; stderr: string; exitCode: number } {
|
||||
const result = spawnSync('bash', ['-c', script], {
|
||||
env: { ...process.env, ...env },
|
||||
stdio: ['ignore', 'pipe', 'pipe'],
|
||||
timeout: 5000,
|
||||
});
|
||||
return {
|
||||
stdout: result.stdout.toString(),
|
||||
stderr: result.stderr.toString(),
|
||||
exitCode: result.status ?? 1,
|
||||
};
|
||||
}
|
||||
|
||||
function parseKV(stdout: string): Record<string, string> {
|
||||
const out: Record<string, string> = {};
|
||||
for (const line of stdout.split('\n')) {
|
||||
const eq = line.indexOf('=');
|
||||
if (eq > 0) out[line.slice(0, eq)] = line.slice(eq + 1);
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
// ─── Title sanitizer ───────────────────────────────────────────────────────
|
||||
|
||||
// Each test drives the exact TITLE_BASH snippet with a hostile or edge-case
// TITLE_RAW and inspects the TITLE_SLUG / FILE key-value output.
describe('context-save: title sanitizer', () => {
  let tmp: string; // fresh throwaway checkpoint dir per test
  beforeEach(() => { tmp = fs.mkdtempSync(path.join(os.tmpdir(), 'ctx-san-')); });
  // Cleanup is best-effort: a failed rm must not fail the run.
  afterEach(() => { try { fs.rmSync(tmp, { recursive: true, force: true }); } catch {} });

  test('shell metachars stripped to allowlist', () => {
    // Command-injection shaped input: none of the metachars may survive.
    const kv = parseKV(runBash(TITLE_BASH, {
      TITLE_RAW: '$(rm -rf /) `whoami` ; echo pwned',
      CHECKPOINT_DIR: tmp,
      TIMESTAMP: '20260419-120000',
    }).stdout);
    expect(kv.TITLE_SLUG).toMatch(/^[a-z0-9.-]*$/);
    expect(kv.TITLE_SLUG).not.toContain('$');
    expect(kv.TITLE_SLUG).not.toContain('(');
    expect(kv.TITLE_SLUG).not.toContain(';');
    expect(kv.TITLE_SLUG).not.toContain('`');
  });

  test('path traversal attempt stripped', () => {
    const kv = parseKV(runBash(TITLE_BASH, {
      TITLE_RAW: '../../../etc/passwd',
      CHECKPOINT_DIR: tmp,
      TIMESTAMP: '20260419-120000',
    }).stdout);
    expect(kv.TITLE_SLUG).not.toContain('/');
    // Slashes stripped, dots retained — result is contained within the
    // checkpoint directory (no path escape possible). The exact number of dots
    // depends on the input; what matters is the file stays inside $CHECKPOINT_DIR.
    expect(kv.FILE.startsWith(`${tmp}/`)).toBe(true);
    expect(path.dirname(kv.FILE)).toBe(tmp);
  });

  test('uppercase lowercased', () => {
    const kv = parseKV(runBash(TITLE_BASH, {
      TITLE_RAW: 'Wintermute Progress',
      CHECKPOINT_DIR: tmp,
      TIMESTAMP: '20260419-120000',
    }).stdout);
    expect(kv.TITLE_SLUG).toBe('wintermute-progress');
  });

  test('whitespace collapsed to single hyphen', () => {
    // tr -s squeezes runs of spaces/tabs into one hyphen.
    const kv = parseKV(runBash(TITLE_BASH, {
      TITLE_RAW: 'foo bar\t\tbaz',
      CHECKPOINT_DIR: tmp,
      TIMESTAMP: '20260419-120000',
    }).stdout);
    expect(kv.TITLE_SLUG).toBe('foo-bar-baz');
  });

  test('length capped at 60 chars', () => {
    const kv = parseKV(runBash(TITLE_BASH, {
      TITLE_RAW: 'a'.repeat(200),
      CHECKPOINT_DIR: tmp,
      TIMESTAMP: '20260419-120000',
    }).stdout);
    expect(kv.TITLE_SLUG.length).toBe(60);
  });

  test('empty title falls back to "untitled"', () => {
    const kv = parseKV(runBash(TITLE_BASH, {
      TITLE_RAW: '',
      CHECKPOINT_DIR: tmp,
      TIMESTAMP: '20260419-120000',
    }).stdout);
    expect(kv.TITLE_SLUG).toBe('untitled');
  });

  test('only-special-chars title falls back to "untitled"', () => {
    // Every char is stripped by the allowlist → second :-untitled default kicks in.
    const kv = parseKV(runBash(TITLE_BASH, {
      TITLE_RAW: '!@#$%^&*()+=<>?',
      CHECKPOINT_DIR: tmp,
      TIMESTAMP: '20260419-120000',
    }).stdout);
    expect(kv.TITLE_SLUG).toBe('untitled');
  });

  test('unicode stripped to ASCII allowlist', () => {
    const kv = parseKV(runBash(TITLE_BASH, {
      TITLE_RAW: '日本語 emoji 🚀 test',
      CHECKPOINT_DIR: tmp,
      TIMESTAMP: '20260419-120000',
    }).stdout);
    expect(kv.TITLE_SLUG).toMatch(/^[a-z0-9.-]*$/);
    // Must contain the ASCII words that survived
    expect(kv.TITLE_SLUG).toContain('emoji');
    expect(kv.TITLE_SLUG).toContain('test');
  });

  test('numbers + dots + hyphens preserved', () => {
    // Version-style titles pass through unchanged.
    const kv = parseKV(runBash(TITLE_BASH, {
      TITLE_RAW: 'v1.0.1-release-notes',
      CHECKPOINT_DIR: tmp,
      TIMESTAMP: '20260419-120000',
    }).stdout);
    expect(kv.TITLE_SLUG).toBe('v1.0.1-release-notes');
  });
});
|
||||
|
||||
// ─── Filename collision handling ───────────────────────────────────────────
|
||||
|
||||
// Collision handling: same timestamp + same title must never overwrite a
// prior save (append-only contract); a random suffix disambiguates instead.
describe('context-save: filename collision', () => {
  let tmp: string; // fresh throwaway checkpoint dir per test
  beforeEach(() => { tmp = fs.mkdtempSync(path.join(os.tmpdir(), 'ctx-col-')); });
  afterEach(() => { try { fs.rmSync(tmp, { recursive: true, force: true }); } catch {} });

  test('first save with title uses predictable path', () => {
    // No pre-existing file → no suffix; path is TIMESTAMP-TITLE.md exactly.
    const kv = parseKV(runBash(TITLE_BASH, {
      TITLE_RAW: 'foo',
      CHECKPOINT_DIR: tmp,
      TIMESTAMP: '20260419-120000',
    }).stdout);
    expect(kv.FILE).toBe(`${tmp}/20260419-120000-foo.md`);
  });

  test('second save same-second same-title gets random suffix', () => {
    // Pre-seed: file already exists at the predictable path.
    fs.writeFileSync(`${tmp}/20260419-120000-foo.md`, 'prior save');
    const kv = parseKV(runBash(TITLE_BASH, {
      TITLE_RAW: 'foo',
      CHECKPOINT_DIR: tmp,
      TIMESTAMP: '20260419-120000',
    }).stdout);
    // Path must differ (append-only contract).
    expect(kv.FILE).not.toBe(`${tmp}/20260419-120000-foo.md`);
    // Suffix format: base-XXXX.md where XXXX matches the suffix allowlist.
    // (tmp's '/' and '.' are escaped so the path is matched literally.)
    expect(kv.FILE).toMatch(new RegExp(`^${tmp.replace(/[/.]/g, '\\$&')}/20260419-120000-foo-[a-z0-9]+\\.md$`));
  });

  test('collision suffix preserves append-only — prior file intact', () => {
    const priorPath = `${tmp}/20260419-120000-foo.md`;
    fs.writeFileSync(priorPath, 'critical prior save');
    const kv = parseKV(runBash(TITLE_BASH, {
      TITLE_RAW: 'foo',
      CHECKPOINT_DIR: tmp,
      TIMESTAMP: '20260419-120000',
    }).stdout);
    // Write a new file at the collision-safe path.
    fs.writeFileSync(kv.FILE, 'new save');
    // Prior file must still exist and be untouched.
    expect(fs.readFileSync(priorPath, 'utf-8')).toBe('critical prior save');
    expect(fs.readFileSync(kv.FILE, 'utf-8')).toBe('new save');
    // Directory should have exactly 2 files.
    expect(fs.readdirSync(tmp).length).toBe(2);
  });

  test('different titles same second — no collision, no suffix', () => {
    fs.writeFileSync(`${tmp}/20260419-120000-foo.md`, 'first save');
    const kv = parseKV(runBash(TITLE_BASH, {
      TITLE_RAW: 'bar',
      CHECKPOINT_DIR: tmp,
      TIMESTAMP: '20260419-120000',
    }).stdout);
    // Different title → predictable path, no suffix.
    expect(kv.FILE).toBe(`${tmp}/20260419-120000-bar.md`);
  });
});
|
||||
|
||||
// ─── Restore flow: head-20 cap + empty-set ─────────────────────────────────
|
||||
|
||||
// Restore listing: missing/empty dirs report NO_CHECKPOINTS, at most 20
// entries are returned, and ordering is by filename prefix — never mtime.
describe('context-restore: find + sort + head cap', () => {
  let tmp: string; // fresh throwaway checkpoint dir per test
  beforeEach(() => { tmp = fs.mkdtempSync(path.join(os.tmpdir(), 'ctx-rest-')); });
  afterEach(() => { try { fs.rmSync(tmp, { recursive: true, force: true }); } catch {} });

  test('missing directory → NO_CHECKPOINTS', () => {
    const out = runBash(RESTORE_FIND_BASH, {
      CHECKPOINT_DIR: `${tmp}/nonexistent`,
    }).stdout;
    expect(out.trim()).toBe('NO_CHECKPOINTS');
  });

  test('empty directory → NO_CHECKPOINTS', () => {
    const out = runBash(RESTORE_FIND_BASH, {
      CHECKPOINT_DIR: tmp,
    }).stdout;
    expect(out.trim()).toBe('NO_CHECKPOINTS');
  });

  test('directory with non-.md files → NO_CHECKPOINTS', () => {
    // Only *.md counts as a checkpoint; stray files are ignored.
    fs.writeFileSync(`${tmp}/not-a-save.txt`, 'noise');
    fs.writeFileSync(`${tmp}/.DS_Store`, 'macos');
    const out = runBash(RESTORE_FIND_BASH, {
      CHECKPOINT_DIR: tmp,
    }).stdout;
    expect(out.trim()).toBe('NO_CHECKPOINTS');
  });

  test('50 .md files → only 20 returned, newest first by filename', () => {
    // Seed 50 files with monotonically increasing timestamps.
    for (let i = 0; i < 50; i++) {
      const ts = `20260419-${String(120000 + i).padStart(6, '0')}`;
      fs.writeFileSync(`${tmp}/${ts}-file${i}.md`, `content ${i}`);
    }
    const out = runBash(RESTORE_FIND_BASH, {
      CHECKPOINT_DIR: tmp,
    }).stdout;
    const lines = out.trim().split('\n').filter(Boolean);
    expect(lines.length).toBe(20);
    // sort -r → newest first by filename. Highest timestamps (files 30-49).
    expect(lines[0]).toContain('file49');
    expect(lines[19]).toContain('file30');
  });

  test('sort is by filename prefix, NOT mtime', () => {
    // Older filename, newer mtime. Sort -r must still put newer filename first.
    const olderByFilename = `${tmp}/20260101-120000-old.md`;
    const newerByFilename = `${tmp}/20260419-120000-new.md`;
    fs.writeFileSync(olderByFilename, 'old content');
    fs.writeFileSync(newerByFilename, 'new content');
    // Scramble mtimes: older filename gets newer mtime.
    const now = Math.floor(Date.now() / 1000);
    fs.utimesSync(olderByFilename, now, now);
    fs.utimesSync(newerByFilename, now - 86400 * 30, now - 86400 * 30);

    const out = runBash(RESTORE_FIND_BASH, {
      CHECKPOINT_DIR: tmp,
    }).stdout;
    const lines = out.trim().split('\n').filter(Boolean);
    expect(lines[0]).toBe(newerByFilename);
    expect(lines[1]).toBe(olderByFilename);
  });

  test('no listing-cwd fallback when empty (macOS xargs ls gotcha)', () => {
    // On macOS, `find ... | xargs ls -1t` with zero results falls back to
    // listing the current working directory. Our find|sort|head pattern must
    // NOT have that behavior. Running from a dir with many .md files.
    const out = runBash(RESTORE_FIND_BASH, {
      CHECKPOINT_DIR: tmp,
      // Intentionally: working directory is the gstack repo which has many .md files.
    }).stdout;
    expect(out.trim()).toBe('NO_CHECKPOINTS');
    // Must NOT contain any .md filename from cwd.
    expect(out).not.toContain('SKILL.md');
    expect(out).not.toContain('README.md');
  });
});
|
||||
|
||||
// ─── Migration HOME guard ──────────────────────────────────────────────────
|
||||
|
||||
// The migration script must refuse to run when HOME is unset or empty —
// otherwise path construction like "$HOME/.claude/..." would resolve against
// "/" and could delete the wrong tree.
describe('migration v1.1.3.0: HOME guard', () => {
  // NOTE(review): tmp is created/removed but unused by both tests below —
  // candidate for removal; confirm no fixture dependency before deleting.
  let tmp: string;
  const MIGRATION = path.join(ROOT, 'gstack-upgrade', 'migrations', 'v1.1.3.0.sh');

  beforeEach(() => { tmp = fs.mkdtempSync(path.join(os.tmpdir(), 'ctx-home-')); });
  afterEach(() => { try { fs.rmSync(tmp, { recursive: true, force: true }); } catch {} });

  test('HOME unset → exits 0 with diagnostic, no filesystem changes', () => {
    // Create a file that would be wiped by an HOME="" bug: /.claude/skills/gstack/checkpoint
    // (not actually writable by the test, but we verify the script doesn't TRY).
    // Spawn without HOME in env.
    const env = { PATH: process.env.PATH || '/usr/bin:/bin' } as Record<string, string>;
    const result = spawnSync('bash', [MIGRATION], {
      env,
      stdio: ['ignore', 'pipe', 'pipe'],
      timeout: 5000,
    });
    // Guard is a benign no-op: exit 0, diagnostic on stderr.
    expect(result.status).toBe(0);
    expect(result.stderr.toString()).toContain('HOME is unset');
  });

  test('HOME="" → exits 0 with diagnostic', () => {
    const result = spawnSync('bash', [MIGRATION], {
      env: { HOME: '', PATH: process.env.PATH || '/usr/bin:/bin' },
      stdio: ['ignore', 'pipe', 'pipe'],
      timeout: 5000,
    });
    expect(result.status).toBe(0);
    expect(result.stderr.toString()).toContain('HOME is unset or empty');
    // Critical: no stdout (no "Removed stale" messages — nothing touched).
    expect(result.stdout.toString().trim()).toBe('');
  });
});
|
||||
+2
-2
@@ -611,7 +611,7 @@ Skill: </skill-name-if-running>
|
||||
- Background discipline — do NOT announce each commit to the user. They can see
|
||||
`git log` whenever they want.
|
||||
|
||||
**When `/checkpoint resume` runs,** it parses `[gstack-context]` blocks from WIP
|
||||
**When `/context-restore` runs,** it parses `[gstack-context]` blocks from WIP
|
||||
commits on the current branch to reconstruct session state. When `/ship` runs, it
|
||||
filter-squashes WIP commits only (preserving non-WIP commits) via
|
||||
`git rebase --autosquash` so the PR contains clean bisectable commits.
|
||||
@@ -629,7 +629,7 @@ During long-running skill sessions, periodically write a brief `[PROGRESS]` summ
|
||||
|
||||
If you notice you're going in circles — repeating the same diagnostic, re-reading the
|
||||
same file, or trying variants of a failed fix — STOP and reassess. Consider escalating
|
||||
or calling /checkpoint to save progress and start fresh.
|
||||
or calling /context-save to save progress and start fresh.
|
||||
|
||||
This is a soft nudge, not a measurable feature. No thresholds, no enforcement. The
|
||||
goal is self-awareness during long sessions. If the session stays short, skip it.
|
||||
|
||||
+2
-2
@@ -600,7 +600,7 @@ Skill: </skill-name-if-running>
|
||||
- Background discipline — do NOT announce each commit to the user. They can see
|
||||
`git log` whenever they want.
|
||||
|
||||
**When `/checkpoint resume` runs,** it parses `[gstack-context]` blocks from WIP
|
||||
**When `/context-restore` runs,** it parses `[gstack-context]` blocks from WIP
|
||||
commits on the current branch to reconstruct session state. When `/ship` runs, it
|
||||
filter-squashes WIP commits only (preserving non-WIP commits) via
|
||||
`git rebase --autosquash` so the PR contains clean bisectable commits.
|
||||
@@ -618,7 +618,7 @@ During long-running skill sessions, periodically write a brief `[PROGRESS]` summ
|
||||
|
||||
If you notice you're going in circles — repeating the same diagnostic, re-reading the
|
||||
same file, or trying variants of a failed fix — STOP and reassess. Consider escalating
|
||||
or calling /checkpoint to save progress and start fresh.
|
||||
or calling /context-save to save progress and start fresh.
|
||||
|
||||
This is a soft nudge, not a measurable feature. No thresholds, no enforcement. The
|
||||
goal is self-awareness during long sessions. If the session stays short, skip it.
|
||||
|
||||
+2
-2
@@ -602,7 +602,7 @@ Skill: </skill-name-if-running>
|
||||
- Background discipline — do NOT announce each commit to the user. They can see
|
||||
`git log` whenever they want.
|
||||
|
||||
**When `/checkpoint resume` runs,** it parses `[gstack-context]` blocks from WIP
|
||||
**When `/context-restore` runs,** it parses `[gstack-context]` blocks from WIP
|
||||
commits on the current branch to reconstruct session state. When `/ship` runs, it
|
||||
filter-squashes WIP commits only (preserving non-WIP commits) via
|
||||
`git rebase --autosquash` so the PR contains clean bisectable commits.
|
||||
@@ -620,7 +620,7 @@ During long-running skill sessions, periodically write a brief `[PROGRESS]` summ
|
||||
|
||||
If you notice you're going in circles — repeating the same diagnostic, re-reading the
|
||||
same file, or trying variants of a failed fix — STOP and reassess. Consider escalating
|
||||
or calling /checkpoint to save progress and start fresh.
|
||||
or calling /context-save to save progress and start fresh.
|
||||
|
||||
This is a soft nudge, not a measurable feature. No thresholds, no enforcement. The
|
||||
goal is self-awareness during long sessions. If the session stays short, skip it.
|
||||
|
||||
@@ -126,6 +126,10 @@ export async function runSkillTest(options: {
|
||||
runId?: string;
|
||||
/** Model to use. Defaults to claude-sonnet-4-6 (overridable via EVALS_MODEL env). */
|
||||
model?: string;
|
||||
/** Extra env vars merged into the spawned claude -p process. Useful for
|
||||
* per-test GSTACK_HOME overrides so the test doesn't have to spell out
|
||||
* env setup in the prompt itself. */
|
||||
env?: Record<string, string>;
|
||||
}): Promise<SkillTestResult> {
|
||||
const {
|
||||
prompt,
|
||||
@@ -135,6 +139,7 @@ export async function runSkillTest(options: {
|
||||
timeout = 120_000,
|
||||
testName,
|
||||
runId,
|
||||
env: extraEnv,
|
||||
} = options;
|
||||
const model = options.model ?? process.env.EVALS_MODEL ?? 'claude-sonnet-4-6';
|
||||
|
||||
@@ -171,6 +176,7 @@ export async function runSkillTest(options: {
|
||||
|
||||
const proc = Bun.spawn(['sh', '-c', `cat "${promptFile}" | claude ${args.map(a => `"${a}"`).join(' ')}`], {
|
||||
cwd: workingDirectory,
|
||||
env: extraEnv ? { ...process.env, ...extraEnv } : undefined,
|
||||
stdout: 'pipe',
|
||||
stderr: 'pipe',
|
||||
});
|
||||
|
||||
@@ -113,10 +113,24 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
// Learnings
|
||||
'learnings-show': ['learn/**', 'bin/gstack-learnings-search', 'bin/gstack-learnings-log', 'scripts/resolvers/learnings.ts'],
|
||||
|
||||
// Session Intelligence (timeline, context recovery, checkpoint)
|
||||
'timeline-event-flow': ['bin/gstack-timeline-log', 'bin/gstack-timeline-read'],
|
||||
'context-recovery-artifacts': ['scripts/resolvers/preamble.ts', 'bin/gstack-timeline-log', 'bin/gstack-slug', 'learn/**'],
|
||||
'checkpoint-save-resume': ['checkpoint/**', 'bin/gstack-slug'],
|
||||
// Session Intelligence (timeline, context recovery, /context-save + /context-restore)
|
||||
'timeline-event-flow': ['bin/gstack-timeline-log', 'bin/gstack-timeline-read'],
|
||||
'context-recovery-artifacts': ['scripts/resolvers/preamble.ts', 'bin/gstack-timeline-log', 'bin/gstack-slug', 'learn/**'],
|
||||
'context-save-writes-file': ['context-save/**', 'bin/gstack-slug'],
|
||||
'context-restore-loads-latest': ['context-restore/**', 'bin/gstack-slug'],
|
||||
|
||||
// Context skills E2E (live-fire, Skill-tool routing path) — see
|
||||
// test/skill-e2e-context-skills.test.ts. These are periodic-tier because
|
||||
// each one spawns claude -p and costs ~$0.20-$0.40. Collectively they
|
||||
// verify the thing the /checkpoint → /context-save rename was for.
|
||||
'context-save-routing': ['context-save/**', 'scripts/resolvers/preamble.ts'],
|
||||
'context-save-then-restore-roundtrip': ['context-save/**', 'context-restore/**', 'bin/gstack-slug'],
|
||||
'context-restore-fragment-match': ['context-restore/**'],
|
||||
'context-restore-empty-state': ['context-restore/**'],
|
||||
'context-restore-list-delegates': ['context-restore/**'],
|
||||
'context-restore-legacy-compat': ['context-restore/**'],
|
||||
'context-save-list-current-branch': ['context-save/**'],
|
||||
'context-save-list-all-branches': ['context-save/**'],
|
||||
|
||||
// Document-release
|
||||
'document-release': ['document-release/**'],
|
||||
@@ -262,9 +276,20 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
|
||||
'codex-offered-eng-review': 'gate',
|
||||
|
||||
// Session Intelligence — gate for data flow, periodic for agent integration
|
||||
'timeline-event-flow': 'gate', // Binary data flow (no LLM needed)
|
||||
'context-recovery-artifacts': 'gate', // Preamble reads seeded artifacts
|
||||
'checkpoint-save-resume': 'gate', // Checkpoint round-trip
|
||||
'timeline-event-flow': 'gate', // Binary data flow (no LLM needed)
|
||||
'context-recovery-artifacts': 'gate', // Preamble reads seeded artifacts
|
||||
'context-save-writes-file': 'gate', // /context-save writes a file
|
||||
'context-restore-loads-latest': 'gate', // Cross-branch newest-by-filename restore
|
||||
|
||||
// Context skills live-fire — periodic (each test spawns claude -p, ~$0.20-$0.40)
|
||||
'context-save-routing': 'periodic', // Proves /context-save routes via Skill tool
|
||||
'context-save-then-restore-roundtrip': 'periodic', // Full cycle in one session
|
||||
'context-restore-fragment-match': 'periodic', // /context-restore <fragment>
|
||||
'context-restore-empty-state': 'periodic', // Graceful zero-saves message
|
||||
'context-restore-list-delegates': 'periodic', // /context-restore list redirect
|
||||
'context-restore-legacy-compat': 'periodic', // Pre-rename files still load
|
||||
'context-save-list-current-branch': 'periodic', // Default branch filter
|
||||
'context-save-list-all-branches': 'periodic', // --all flag
|
||||
|
||||
// Ship — gate (end-to-end ship path)
|
||||
'ship-base-branch': 'gate',
|
||||
|
||||
@@ -0,0 +1,147 @@
|
||||
import { describe, test, expect, beforeEach, afterEach } from 'bun:test';
|
||||
import { spawnSync } from 'child_process';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
|
||||
// Repo root (assumes this file sits one level below it — TODO confirm) and
// the absolute path of the v1.1.3.0 migration script under test.
const ROOT = path.resolve(import.meta.dir, '..');
const MIGRATION = path.join(ROOT, 'gstack-upgrade', 'migrations', 'v1.1.3.0.sh');
|
||||
|
||||
function runMigration(tmpHome: string): { exitCode: number; stdout: string; stderr: string } {
|
||||
const result = spawnSync('bash', [MIGRATION], {
|
||||
env: { ...process.env, HOME: tmpHome },
|
||||
stdio: ['ignore', 'pipe', 'pipe'],
|
||||
timeout: 10_000,
|
||||
});
|
||||
return {
|
||||
exitCode: result.status ?? 1,
|
||||
stdout: result.stdout.toString(),
|
||||
stderr: result.stderr.toString(),
|
||||
};
|
||||
}
|
||||
|
||||
function setupFakeGstackRoot(tmpHome: string): string {
|
||||
// A real target that the gstack symlink can resolve into.
|
||||
const gstackDir = path.join(tmpHome, '.claude', 'skills', 'gstack');
|
||||
fs.mkdirSync(path.join(gstackDir, 'checkpoint'), { recursive: true });
|
||||
fs.writeFileSync(path.join(gstackDir, 'checkpoint', 'SKILL.md'), '# fake gstack checkpoint\n');
|
||||
return gstackDir;
|
||||
}
|
||||
|
||||
// Ownership guard: the migration may only remove a top-level /checkpoint
// install that it can prove is gstack-owned (symlink resolving into the
// gstack dir, or a dir whose SKILL.md symlinks into it). Anything
// user-owned is preserved. Scenarios A-G cover each shape.
describe('migration v1.1.3.0 — checkpoint ownership guard', () => {
  let tmpHome: string; // fake HOME per test; migration runs entirely inside it

  beforeEach(() => {
    tmpHome = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-migration-ownership-'));
  });

  afterEach(() => {
    try { fs.rmSync(tmpHome, { recursive: true, force: true }); } catch {}
  });

  test('scenario A: directory symlink into gstack → removed', () => {
    setupFakeGstackRoot(tmpHome);
    const skillsDir = path.join(tmpHome, '.claude', 'skills');
    const gstackCheckpoint = path.join(skillsDir, 'gstack', 'checkpoint');
    const topLevel = path.join(skillsDir, 'checkpoint');
    fs.symlinkSync(gstackCheckpoint, topLevel);

    const result = runMigration(tmpHome);
    expect(result.exitCode).toBe(0);
    expect(fs.existsSync(topLevel)).toBe(false);
    // Also removes the gstack-owned inner copy (Shape 2 cleanup).
    expect(fs.existsSync(gstackCheckpoint)).toBe(false);
    expect(result.stdout).toContain('Removed stale /checkpoint symlink');
  });

  test('scenario B: directory with SKILL.md symlinked into gstack → removed', () => {
    setupFakeGstackRoot(tmpHome);
    const skillsDir = path.join(tmpHome, '.claude', 'skills');
    const gstackSKILL = path.join(skillsDir, 'gstack', 'checkpoint', 'SKILL.md');
    const topLevel = path.join(skillsDir, 'checkpoint');
    fs.mkdirSync(topLevel, { recursive: true });
    fs.symlinkSync(gstackSKILL, path.join(topLevel, 'SKILL.md'));

    const result = runMigration(tmpHome);
    expect(result.exitCode).toBe(0);
    expect(fs.existsSync(topLevel)).toBe(false);
    expect(result.stdout).toContain('Removed stale /checkpoint install directory');
  });

  test('scenario C: user-owned regular directory with custom content → preserved', () => {
    setupFakeGstackRoot(tmpHome);
    const skillsDir = path.join(tmpHome, '.claude', 'skills');
    const topLevel = path.join(skillsDir, 'checkpoint');
    fs.mkdirSync(topLevel, { recursive: true });
    // User's own custom skill: regular file, not a symlink.
    fs.writeFileSync(path.join(topLevel, 'SKILL.md'), '# my custom /checkpoint\n');
    fs.writeFileSync(path.join(topLevel, 'extra.txt'), 'user content\n');

    const result = runMigration(tmpHome);
    expect(result.exitCode).toBe(0);
    expect(fs.existsSync(topLevel)).toBe(true);
    expect(fs.existsSync(path.join(topLevel, 'SKILL.md'))).toBe(true);
    expect(fs.existsSync(path.join(topLevel, 'extra.txt'))).toBe(true);
    expect(result.stdout).toContain('Leaving');
    expect(result.stdout).toContain('not a gstack-owned install');
  });

  test('scenario D: symlink pointing outside gstack → preserved', () => {
    setupFakeGstackRoot(tmpHome);
    const skillsDir = path.join(tmpHome, '.claude', 'skills');
    const topLevel = path.join(skillsDir, 'checkpoint');
    // User's own skill elsewhere on the filesystem.
    const userSkillDir = path.join(tmpHome, 'my-own-skill');
    fs.mkdirSync(userSkillDir, { recursive: true });
    fs.writeFileSync(path.join(userSkillDir, 'SKILL.md'), '# my custom /checkpoint\n');
    fs.symlinkSync(userSkillDir, topLevel);

    const result = runMigration(tmpHome);
    expect(result.exitCode).toBe(0);
    expect(fs.existsSync(topLevel)).toBe(true);
    // The user's underlying dir is untouched.
    expect(fs.existsSync(path.join(userSkillDir, 'SKILL.md'))).toBe(true);
    expect(result.stdout).toContain('Leaving');
    expect(result.stdout).toContain('outside gstack');
  });

  test('scenario E: nothing to do → no-op exit 0 (idempotent)', () => {
    // No checkpoint install at all. First run: nothing removed.
    setupFakeGstackRoot(tmpHome);
    // Delete the inner gstack/checkpoint to simulate post-upgrade state.
    fs.rmSync(path.join(tmpHome, '.claude', 'skills', 'gstack', 'checkpoint'), { recursive: true, force: true });

    const result1 = runMigration(tmpHome);
    expect(result1.exitCode).toBe(0);

    // Second run: still exit 0, still no-op.
    const result2 = runMigration(tmpHome);
    expect(result2.exitCode).toBe(0);
  });

  test('scenario F: gstack not installed → no-op exit 0', () => {
    // No ~/.claude/skills/gstack/ at all. Also no checkpoint install.
    fs.mkdirSync(path.join(tmpHome, '.claude', 'skills'), { recursive: true });

    const result = runMigration(tmpHome);
    expect(result.exitCode).toBe(0);
  });

  test('scenario G: SKILL.md is a symlink pointing outside gstack → preserved', () => {
    setupFakeGstackRoot(tmpHome);
    const skillsDir = path.join(tmpHome, '.claude', 'skills');
    const topLevel = path.join(skillsDir, 'checkpoint');
    fs.mkdirSync(topLevel, { recursive: true });
    // A directory containing SKILL.md that's a symlink pointing outside gstack.
    const externalSkill = path.join(tmpHome, 'external', 'SKILL.md');
    fs.mkdirSync(path.dirname(externalSkill), { recursive: true });
    fs.writeFileSync(externalSkill, '# external skill\n');
    fs.symlinkSync(externalSkill, path.join(topLevel, 'SKILL.md'));

    const result = runMigration(tmpHome);
    expect(result.exitCode).toBe(0);
    expect(fs.existsSync(topLevel)).toBe(true);
    expect(fs.existsSync(path.join(topLevel, 'SKILL.md'))).toBe(true);
    expect(result.stdout).toContain('Leaving');
  });
});
|
||||
@@ -0,0 +1,228 @@
|
||||
/**
|
||||
* Collision Sentinel — insurance policy against upstream slash-command collisions.
|
||||
*
|
||||
* History: in April 2026 Claude Code shipped /checkpoint as a native alias
|
||||
* for /rewind, silently shadowing the gstack /checkpoint skill. Users
|
||||
* typed /checkpoint expecting to save state; agents routed to the built-in
|
||||
* or confabulated "this is a built-in you need to type directly" and nothing
|
||||
* was saved. We found out from users, not from tests.
|
||||
*
|
||||
* This file is the "never again" test. It enumerates every gstack skill name
|
||||
* from every SKILL.md.tmpl file in the repo and cross-checks against a
|
||||
* per-host list of known built-in slash commands. If any gstack skill name
|
||||
* collides with a host built-in, this test fails and names the collision.
|
||||
*
|
||||
* Maintenance: when Claude Code (or any other host we support) ships a new
|
||||
* built-in slash command, add the name to the host's KNOWN_BUILTINS list
|
||||
* below. If a gstack skill needs to coexist with a built-in anyway (e.g.,
|
||||
* we decide the semantic overlap is acceptable), add it to
|
||||
* KNOWN_COLLISIONS_TOLERATED with a written justification.
|
||||
*
|
||||
* Free tier. ~50ms runtime.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
|
||||
const ROOT = path.resolve(import.meta.dir, '..');
|
||||
|
||||
// ─── Host built-in registries ──────────────────────────────────────────────
|
||||
//
|
||||
// One const per host we support. Names are the slash-command identifier WITHOUT
|
||||
// the leading slash. Keep sorted alphabetically within each host so diffs are
|
||||
// reviewable. Cite the source (docs URL, release notes, or "observed") in the
|
||||
// comment next to each entry — future maintainers need to know why an entry
|
||||
// is on the list.
|
||||
|
||||
const KNOWN_BUILTINS: Record<string, string[]> = {
|
||||
'claude-code': [
|
||||
// Slash commands observed in 'claude --help' or cited in docs as of 2026-04.
|
||||
// Sources:
|
||||
// https://code.claude.com/docs/en/checkpointing
|
||||
// https://claudelog.com/mechanics/rewind/
|
||||
// claude --help output
|
||||
// Claude Code skill list dumps from live sessions
|
||||
'agents', // Agent config
|
||||
'bare', // Minimal mode
|
||||
'checkpoint', // Alias of /rewind (the collision that started this file)
|
||||
'clear', // Clear the conversation
|
||||
'compact', // Context compaction
|
||||
'config', // Config UI
|
||||
'context', // Context usage display
|
||||
'continue', // --continue / resume last conversation
|
||||
'cost', // Cost display
|
||||
'exit', // Exit shell
|
||||
'help', // Help
|
||||
'init', // Initialize a new CLAUDE.md file
|
||||
'mcp', // MCP server config
|
||||
'model', // Model selection
|
||||
'permissions', // Permission config
|
||||
'plan', // Plan mode toggle (also Shift+Tab)
|
||||
'quit', // Quit
|
||||
'review', // Review a pull request (BUILT-IN shipped in 2026)
|
||||
'rewind', // Conversation rewind
|
||||
'security-review', // Security audit of pending changes
|
||||
'stats', // Session stats
|
||||
'usage', // API usage stats
|
||||
],
|
||||
// Add codex/kiro/opencode/slate/cursor/openclaw/hermes/factory/gbrain
|
||||
// built-in lists when we encounter collisions. Claude Code is the primary
|
||||
// shadow risk because it's the biggest audience and ships the most
|
||||
// frequently; other hosts collide less often.
|
||||
// TODO: codex CLI built-ins (login, logout, exec, review, etc. — but we
|
||||
// invoke codex from gstack, we don't install skills INTO codex the same
|
||||
// way, so this is lower priority).
|
||||
};
|
||||
|
||||
// Collisions we know about and have consciously decided to tolerate. The
|
||||
// justification is mandatory — reviewers need the context next time the
|
||||
// user reports confusion, and blind additions to this map should fail code
|
||||
// review.
|
||||
const KNOWN_COLLISIONS_TOLERATED: Record<string, string> = {
|
||||
// skill name → one-line justification + action plan
|
||||
'review': 'gstack /review (pre-landing diff analysis) pre-dates the Claude Code built-in /review (Review a pull request). The gstack skill is much richer (SQL safety, LLM trust boundary, specialist dispatch). Watch for user confusion reports and consider renaming to /diff-review or /pre-land if the collision bites. TODO: track user-reported incidents in TODOS.md.',
|
||||
};
|
||||
|
||||
// Generic-verb watchlist: skill names that are single common verbs, which
|
||||
// are at higher risk of being claimed by a future host built-in. Advisory
|
||||
// only — the test prints a warning but doesn't fail. If a name here stops
|
||||
// being safe, move it to the appropriate host's KNOWN_BUILTINS list.
|
||||
const GENERIC_VERB_WATCHLIST = [
|
||||
'save', 'load', 'run', 'test', 'build', 'deploy',
|
||||
'fork', 'branch', 'commit', 'push', 'pull', 'merge', 'rebase',
|
||||
'start', 'stop', 'restart', 'reset', 'pause', 'resume',
|
||||
'show', 'list', 'find', 'search', 'view',
|
||||
'create', 'delete', 'remove', 'update', 'rename',
|
||||
'login', 'logout', 'auth',
|
||||
];
|
||||
|
||||
// ─── Enumerator ────────────────────────────────────────────────────────────
|
||||
|
||||
interface GstackSkill {
|
||||
name: string;
|
||||
templatePath: string;
|
||||
}
|
||||
|
||||
function enumerateGstackSkills(): GstackSkill[] {
|
||||
const skills: GstackSkill[] = [];
|
||||
// Scan one level deep for */SKILL.md.tmpl plus root SKILL.md.tmpl.
|
||||
const candidates = [
|
||||
path.join(ROOT, 'SKILL.md.tmpl'),
|
||||
...fs.readdirSync(ROOT, { withFileTypes: true })
|
||||
.filter((d) => d.isDirectory())
|
||||
.map((d) => path.join(ROOT, d.name, 'SKILL.md.tmpl')),
|
||||
];
|
||||
for (const tmpl of candidates) {
|
||||
if (!fs.existsSync(tmpl)) continue;
|
||||
const content = fs.readFileSync(tmpl, 'utf-8');
|
||||
// Parse the 'name:' field from YAML frontmatter.
|
||||
const frontmatter = content.match(/^---\n([\s\S]+?)\n---/);
|
||||
if (!frontmatter) continue;
|
||||
const nameMatch = frontmatter[1].match(/^name:\s*(\S+)/m);
|
||||
if (!nameMatch) continue;
|
||||
skills.push({ name: nameMatch[1].trim(), templatePath: tmpl });
|
||||
}
|
||||
return skills;
|
||||
}
|
||||
|
||||
// ─── Tests ─────────────────────────────────────────────────────────────────
|
||||
|
||||
describe('skill-collision-sentinel', () => {
|
||||
const skills = enumerateGstackSkills();
|
||||
|
||||
test('at least one skill is discovered (sanity)', () => {
|
||||
// If this fails, the enumerator broke, not the collision check.
|
||||
expect(skills.length).toBeGreaterThan(10);
|
||||
});
|
||||
|
||||
test('no duplicate skill names within gstack', () => {
|
||||
const seen = new Map<string, string>();
|
||||
const dupes: string[] = [];
|
||||
for (const { name, templatePath } of skills) {
|
||||
if (seen.has(name)) {
|
||||
dupes.push(`${name} appears in both ${seen.get(name)} and ${templatePath}`);
|
||||
} else {
|
||||
seen.set(name, templatePath);
|
||||
}
|
||||
}
|
||||
if (dupes.length > 0) {
|
||||
throw new Error(`Duplicate skill names:\n ${dupes.join('\n ')}`);
|
||||
}
|
||||
});
|
||||
|
||||
// Hard check: no gstack skill name collides with a known host built-in
|
||||
// unless the collision is explicitly tolerated. This is the test that
|
||||
// would have caught the /checkpoint bug in April 2026.
|
||||
for (const [host, builtins] of Object.entries(KNOWN_BUILTINS)) {
|
||||
test(`no skill name collides with a ${host} built-in (or has written justification)`, () => {
|
||||
const builtinSet = new Set(builtins);
|
||||
const collisions: Array<{ skill: string; builtin: string }> = [];
|
||||
for (const { name } of skills) {
|
||||
if (builtinSet.has(name) && !(name in KNOWN_COLLISIONS_TOLERATED)) {
|
||||
collisions.push({ skill: name, builtin: name });
|
||||
}
|
||||
}
|
||||
if (collisions.length > 0) {
|
||||
const msg = collisions.map(c =>
|
||||
` /${c.skill} collides with ${host} built-in /${c.builtin}.\n` +
|
||||
` Fix: rename the gstack skill (precedent: /checkpoint → /context-save+/context-restore),\n` +
|
||||
` OR add an entry to KNOWN_COLLISIONS_TOLERATED with a written justification.`
|
||||
).join('\n\n');
|
||||
throw new Error(`Found ${collisions.length} unresolved collision(s) with ${host} built-ins:\n\n${msg}`);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Every KNOWN_COLLISIONS_TOLERATED entry must correspond to a real skill
|
||||
// AND a real built-in. Prevents the exception list from rotting with
|
||||
// stale entries after a rename.
|
||||
test('KNOWN_COLLISIONS_TOLERATED entries are all still active collisions', () => {
|
||||
const skillNames = new Set(skills.map(s => s.name));
|
||||
const allBuiltins = new Set<string>();
|
||||
for (const list of Object.values(KNOWN_BUILTINS)) {
|
||||
for (const name of list) allBuiltins.add(name);
|
||||
}
|
||||
const stale: string[] = [];
|
||||
for (const name of Object.keys(KNOWN_COLLISIONS_TOLERATED)) {
|
||||
if (!skillNames.has(name)) {
|
||||
stale.push(` "${name}" is in KNOWN_COLLISIONS_TOLERATED but no gstack skill has that name — remove the exception`);
|
||||
} else if (!allBuiltins.has(name)) {
|
||||
stale.push(` "${name}" is in KNOWN_COLLISIONS_TOLERATED but no host's KNOWN_BUILTINS lists it — remove the exception`);
|
||||
}
|
||||
}
|
||||
if (stale.length > 0) {
|
||||
throw new Error(`Stale tolerance entries:\n${stale.join('\n')}`);
|
||||
}
|
||||
});
|
||||
|
||||
// Self-check: the /checkpoint rename actually landed. If someone reverts
|
||||
// the rename by accident, this catches it.
|
||||
test('the /checkpoint collision that started this file is actually resolved', () => {
|
||||
const names = new Set(skills.map(s => s.name));
|
||||
expect(names.has('checkpoint')).toBe(false);
|
||||
// And the replacements exist.
|
||||
expect(names.has('context-save')).toBe(true);
|
||||
expect(names.has('context-restore')).toBe(true);
|
||||
});
|
||||
|
||||
// Advisory: print a warning for any skill whose name is a generic verb.
|
||||
// Doesn't fail — just informs reviewers.
|
||||
test('advisory: generic-verb watchlist (informational)', () => {
|
||||
const watchlist = new Set(GENERIC_VERB_WATCHLIST);
|
||||
const flagged: string[] = [];
|
||||
for (const { name } of skills) {
|
||||
if (watchlist.has(name)) flagged.push(name);
|
||||
}
|
||||
if (flagged.length > 0) {
|
||||
console.log(
|
||||
`\n⚠️ advisory: ${flagged.length} skill(s) use generic verbs that may be at risk ` +
|
||||
`of future host built-in collisions: ${flagged.map(n => `/${n}`).join(', ')}\n` +
|
||||
` These are NOT current collisions — they're names to watch. If any become ` +
|
||||
`taken, the per-host test above will fail.\n`
|
||||
);
|
||||
}
|
||||
// Test always passes — this is advisory.
|
||||
expect(true).toBe(true);
|
||||
});
|
||||
});
|
||||
@@ -70,31 +70,58 @@ Add a new /greet skill that prints a welcome message.
|
||||
// If Codex is unavailable on the test machine, the skill should print
// [codex-unavailable] and still complete the Claude subagent half.
//
// NOTE(merge fix): the previous text contained BOTH sides of an unresolved
// merge — duplicate `const` declarations for out/claudeVoiceFired/
// codexVoiceFired/codexUnavailable/reachedPhase1, old AND new runSkillTest
// option keys (name/workdir/timeoutMs alongside testName/workingDirectory/
// timeout), and doubled logCost/recordE2E calls — which cannot compile
// ("Cannot redeclare block-scoped variable"). Resolved here to the newer,
// hardened side (transcript-scoped matching, helper-based logging).
const result = await runSkillTest({
  testName: 'autoplan-dual-voice',
  workingDirectory: workDir,
  prompt: `/autoplan ${planPath}`,
  evalCollector,
  timeout: 300_000, // 5 min
  // /autoplan spawns subagents and calls codex via Bash; it needs the
  // full tool set to get past Phase 1. Bash+Read+Write alone wasn't
  // enough — the skill stalled trying to invoke Agent/Skill.
  allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob', 'Agent', 'Skill'],
  maxTurns: 30,
  runId,
});

// Accept EITHER outcome as success:
//   (a) Both voices produced output (ideal case)
//   (b) Codex unavailable + Claude voice produced output (graceful degrade)
//
// Search ONLY the tool-call structure — NOT the prompt string that went in.
// Matching against the full transcript is risky because the prompt itself
// contains "plan-ceo-review" and other marker strings that would produce
// false positives regardless of skill behavior. Filter to tool_use /
// tool_result content + assistant messages emitted DURING execution.
const transcript = Array.isArray(result.transcript) ? result.transcript : [];
const executionContent = transcript
  .filter((entry: any) => entry && (entry.type === 'tool_use' || entry.type === 'tool_result' || entry.role === 'assistant'))
  .map((entry: any) => JSON.stringify(entry))
  .join('\n');
const out = (result.output ?? '') + '\n' + executionContent;

// Claude voice: require evidence of a dispatched Agent subagent, not
// merely the literal string "Agent(" (which could appear in any text).
// Task/Agent tool_use entries have name:"Agent" or subagent_type:"..."
const claudeVoiceFired = /"name":\s*"Agent"|"subagent_type":\s*"[^"]/.test(out) ||
  /Claude\s+(CEO|subagent)\s+(review|complete|finished)|claude-subagent\s/i.test(out);
// Codex voice: require evidence of codex CLI invocation (command string in
// a Bash tool_use), not prompt-text mentions.
const codexVoiceFired = /"command":\s*"[^"]*codex\s+(exec|review)/.test(out) ||
  /CODEX SAYS\s*\(/i.test(out);
// Unavailable markers: explicit probe-failure strings emitted by the skill.
const codexUnavailable = /\[codex-unavailable\]|AUTH_FAILED\b|CODEX_NOT_AVAILABLE\b|codex_cli_missing|Codex CLI not found/i.test(out);

expect(claudeVoiceFired).toBe(true);
expect(codexVoiceFired || codexUnavailable).toBe(true);

// Hang protection: require phase COMPLETION evidence, not name mentions.
// "Phase 1 complete" or a phase-transition marker, not "plan-ceo-review"
// as a bare string (which appears in the prompt itself).
const reachedPhase1 = /Phase\s+1\s+(complete|done|finished)|CEO\s+Review\s+(complete|done|approved)|Strategy\s*&\s*Scope\s+(complete|done)|Phase\s+2\s+(started|begin)/i.test(out);
expect(reachedPhase1).toBe(true);

logCost('autoplan-dual-voice', result);
recordE2E(evalCollector, 'autoplan-dual-voice', 'Autoplan dual-voice E2E', result, {
  passed: claudeVoiceFired && (codexVoiceFired || codexUnavailable) && reachedPhase1,
});
},
330_000, // per-test timeout slightly > spawn timeout so cleanup can run
);
|
||||
|
||||
@@ -0,0 +1,514 @@
|
||||
/**
|
||||
* Tier-1 live-fire E2E for /context-save and /context-restore.
|
||||
*
|
||||
* These spawn `claude -p "/context-save ..."` with the Skill tool enabled
|
||||
* and the skill installed in the workdir's .claude/skills/. Unlike the
|
||||
* older hand-fed-section tests, these exercise the ROUTING path — the
|
||||
* exact thing that broke with the /checkpoint name collision and the
|
||||
* whole reason this rename exists. If /context-save stops routing to
|
||||
* the skill (e.g., upstream ships a built-in by that name), these fail.
|
||||
*
|
||||
* Periodic tier. ~$0.20-$0.40 per test, ~$2 total per run.
|
||||
*/
|
||||
|
||||
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
|
||||
import { runSkillTest } from './helpers/session-runner';
|
||||
import {
|
||||
ROOT, runId, evalsEnabled,
|
||||
describeIfSelected, testConcurrentIfSelected,
|
||||
logCost, recordE2E,
|
||||
createEvalCollector, finalizeEvalCollector,
|
||||
} from './helpers/e2e-helpers';
|
||||
import { spawnSync } from 'child_process';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
|
||||
const evalCollector = createEvalCollector('e2e-context-skills');
|
||||
|
||||
// Shared install helper: copy both skill files + bin scripts + routing CLAUDE.md
|
||||
// into a tmp workdir. Matches the pattern from skill-routing-e2e.test.ts so
|
||||
// claude -p discovers the skills via .claude/skills/ auto-scan.
|
||||
function setupWorkdir(suffix: string): { workDir: string; gstackHome: string; slug: string } {
|
||||
const workDir = fs.mkdtempSync(path.join(os.tmpdir(), `skill-e2e-ctx-${suffix}-`));
|
||||
const gstackHome = path.join(workDir, '.gstack-home');
|
||||
|
||||
const run = (cmd: string, args: string[]) =>
|
||||
spawnSync(cmd, args, { cwd: workDir, stdio: 'pipe', timeout: 5000 });
|
||||
run('git', ['init', '-b', 'main']);
|
||||
run('git', ['config', 'user.email', 'test@test.com']);
|
||||
run('git', ['config', 'user.name', 'Test']);
|
||||
fs.writeFileSync(path.join(workDir, 'app.ts'), 'console.log("hello");\n');
|
||||
run('git', ['add', '.']);
|
||||
run('git', ['commit', '-m', 'initial']);
|
||||
|
||||
// Install skills into .claude/skills/ for claude -p auto-discovery.
|
||||
const skillsDir = path.join(workDir, '.claude', 'skills');
|
||||
for (const skill of ['context-save', 'context-restore']) {
|
||||
const destDir = path.join(skillsDir, skill);
|
||||
fs.mkdirSync(destDir, { recursive: true });
|
||||
fs.copyFileSync(path.join(ROOT, skill, 'SKILL.md'), path.join(destDir, 'SKILL.md'));
|
||||
}
|
||||
|
||||
// Install the bin scripts referenced by the preamble.
|
||||
const binDir = path.join(workDir, 'bin');
|
||||
fs.mkdirSync(binDir, { recursive: true });
|
||||
for (const script of [
|
||||
'gstack-timeline-log', 'gstack-timeline-read', 'gstack-slug',
|
||||
'gstack-learnings-log', 'gstack-learnings-search',
|
||||
'gstack-update-check', 'gstack-config', 'gstack-repo-mode',
|
||||
]) {
|
||||
const src = path.join(ROOT, 'bin', script);
|
||||
if (fs.existsSync(src)) {
|
||||
fs.copyFileSync(src, path.join(binDir, script));
|
||||
fs.chmodSync(path.join(binDir, script), 0o755);
|
||||
}
|
||||
}
|
||||
|
||||
// Routing CLAUDE.md: explicit instruction to always use the Skill tool.
|
||||
fs.writeFileSync(path.join(workDir, 'CLAUDE.md'), `# Project Instructions
|
||||
|
||||
## Skill routing
|
||||
|
||||
When the user's request matches an available skill, ALWAYS invoke it using the Skill
|
||||
tool as your FIRST action. Do NOT answer directly, do NOT use other tools first.
|
||||
|
||||
Key routing rules:
|
||||
- Save progress, save state, save my work → invoke context-save
|
||||
- Resume, where was I, pick up where I left off → invoke context-restore
|
||||
|
||||
Environment:
|
||||
- Use GSTACK_HOME="${gstackHome}" for all gstack bin scripts.
|
||||
- The bin scripts are at ./bin/ (relative to this directory).
|
||||
- The skill files are at ./.claude/skills/context-save/SKILL.md and
|
||||
./.claude/skills/context-restore/SKILL.md.
|
||||
`);
|
||||
|
||||
const slug = path.basename(workDir).replace(/[^a-zA-Z0-9._-]/g, '');
|
||||
return { workDir, gstackHome, slug };
|
||||
}
|
||||
|
||||
// Helper: seed a saved-context file into the storage dir.
|
||||
function seedSave(gstackHome: string, slug: string, filename: string, frontmatter: Record<string, string>, body: string) {
|
||||
const dir = path.join(gstackHome, 'projects', slug, 'checkpoints');
|
||||
fs.mkdirSync(dir, { recursive: true });
|
||||
const fm = '---\n' + Object.entries(frontmatter).map(([k, v]) => `${k}: ${v}`).join('\n') + '\n---\n';
|
||||
fs.writeFileSync(path.join(dir, filename), fm + body);
|
||||
}
|
||||
|
||||
// Helper: extract the list of Skill tool invocations from the transcript.
|
||||
function skillCalls(result: { toolCalls: Array<{ tool: string; input: any }> }): string[] {
|
||||
return result.toolCalls
|
||||
.filter((tc) => tc.tool === 'Skill')
|
||||
.map((tc) => tc.input?.skill || '')
|
||||
.filter(Boolean);
|
||||
}
|
||||
|
||||
// Build a broader assertion surface: final assistant message + every tool
|
||||
// input and output. The agent often finishes with a tool call instead of a
|
||||
// text response, leaving result.output as an empty string — but the data we
|
||||
// want to assert on (skill invocation args, bash stdout like NO_CHECKPOINTS,
|
||||
// file paths) is all present in the transcript. Search there too.
|
||||
function fullOutputSurface(result: {
|
||||
output?: string;
|
||||
transcript?: any[];
|
||||
toolCalls?: Array<{ tool: string; input: any; output: string }>;
|
||||
}): string {
|
||||
const parts: string[] = [];
|
||||
if (result.output) parts.push(result.output);
|
||||
for (const tc of result.toolCalls || []) {
|
||||
parts.push(JSON.stringify(tc.input || {}));
|
||||
if (tc.output) parts.push(tc.output);
|
||||
}
|
||||
// Also stringify transcript for tool_result / user-message content that
|
||||
// isn't surfaced via toolCalls (e.g., Bash stdout echoed back).
|
||||
for (const entry of result.transcript || []) {
|
||||
try { parts.push(JSON.stringify(entry)); } catch { /* skip */ }
|
||||
}
|
||||
return parts.join('\n');
|
||||
}
|
||||
|
||||
// ────────────────────────────────────────────────────────────────────────
|
||||
// Live-fire E2E suite
|
||||
// ────────────────────────────────────────────────────────────────────────
|
||||
|
||||
describeIfSelected('Context Skills E2E (live-fire)', [
|
||||
'context-save-routing',
|
||||
'context-save-then-restore-roundtrip',
|
||||
'context-restore-fragment-match',
|
||||
'context-restore-empty-state',
|
||||
'context-restore-list-delegates',
|
||||
'context-restore-legacy-compat',
|
||||
'context-save-list-current-branch',
|
||||
'context-save-list-all-branches',
|
||||
], () => {
|
||||
afterAll(() => { finalizeEvalCollector(evalCollector); });
|
||||
|
||||
// ── 1. Routing: /context-save actually invokes the Skill tool ────────
|
||||
testConcurrentIfSelected('context-save-routing', async () => {
|
||||
const { workDir, gstackHome, slug } = setupWorkdir('routing');
|
||||
|
||||
// Prompt pattern: the slash command + explicit "invoke via Skill tool"
|
||||
// instruction. The GSTACK_HOME / ./bin bash setup that used to be in
|
||||
// the prompt now comes via env:. Prompt without the Skill-tool hint
|
||||
// causes the agent to interpret /context-save as a shell token and
|
||||
// skip Skill routing entirely — which defeats this test's purpose.
|
||||
const result = await runSkillTest({
|
||||
prompt: `Run /context-save wintermute progress. Invoke via the Skill tool. Do NOT use AskUserQuestion.`,
|
||||
workingDirectory: workDir,
|
||||
env: { GSTACK_HOME: gstackHome },
|
||||
maxTurns: 12,
|
||||
allowedTools: ['Skill', 'Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob'],
|
||||
timeout: 120_000,
|
||||
testName: 'context-save-routing',
|
||||
runId,
|
||||
});
|
||||
|
||||
logCost('context-save-routing', result);
|
||||
|
||||
const invokedSkills = skillCalls(result);
|
||||
const routedToContextSave = invokedSkills.includes('context-save');
|
||||
// File should also be written to the storage dir.
|
||||
const checkpointDir = path.join(gstackHome, 'projects', slug, 'checkpoints');
|
||||
const files = fs.existsSync(checkpointDir) ? fs.readdirSync(checkpointDir).filter((f) => f.endsWith('.md')) : [];
|
||||
const exitOk = ['success', 'error_max_turns'].includes(result.exitReason);
|
||||
|
||||
recordE2E(evalCollector, 'context-save routes via Skill tool', 'Context Skills E2E', result, {
|
||||
passed: exitOk && routedToContextSave && files.length > 0,
|
||||
});
|
||||
|
||||
expect(exitOk).toBe(true);
|
||||
expect(routedToContextSave).toBe(true);
|
||||
expect(files.length).toBeGreaterThan(0);
|
||||
try { fs.rmSync(workDir, { recursive: true, force: true }); } catch {}
|
||||
}, 180_000);
|
||||
|
||||
// ── 2. Round-trip: save then restore in the same session ─────────────
|
||||
testConcurrentIfSelected('context-save-then-restore-roundtrip', async () => {
|
||||
const { workDir, gstackHome, slug } = setupWorkdir('roundtrip');
|
||||
const magicMarker = 'wintermute-roundtrip-MX7FQZ';
|
||||
|
||||
// Stage a change so /context-save has something to capture.
|
||||
fs.writeFileSync(path.join(workDir, 'feature.ts'), `// ${magicMarker}\nexport const X = 1;\n`);
|
||||
spawnSync('git', ['add', 'feature.ts'], { cwd: workDir, stdio: 'pipe', timeout: 5000 });
|
||||
|
||||
const result = await runSkillTest({
|
||||
prompt: `Two steps:
|
||||
1. Run /context-save ${magicMarker} — invoke via the Skill tool.
|
||||
2. Run /context-restore — invoke via the Skill tool. Report what it loaded.
|
||||
Do NOT use AskUserQuestion.`,
|
||||
workingDirectory: workDir,
|
||||
env: { GSTACK_HOME: gstackHome },
|
||||
maxTurns: 25,
|
||||
allowedTools: ['Skill', 'Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob'],
|
||||
timeout: 240_000,
|
||||
testName: 'context-save-then-restore-roundtrip',
|
||||
runId,
|
||||
});
|
||||
|
||||
logCost('context-save-then-restore-roundtrip', result);
|
||||
|
||||
const invokedSkills = skillCalls(result);
|
||||
const bothRouted = invokedSkills.includes('context-save') && invokedSkills.includes('context-restore');
|
||||
const checkpointDir = path.join(gstackHome, 'projects', slug, 'checkpoints');
|
||||
const files = fs.existsSync(checkpointDir) ? fs.readdirSync(checkpointDir).filter((f) => f.endsWith('.md')) : [];
|
||||
// Broader surface — agent may stop at restore's Skill call without
|
||||
// echoing the marker into result.output. The marker is also in the
|
||||
// Skill tool input (we passed it as the save title) and in the
|
||||
// file content that restore reads.
|
||||
const restoreMentionsTitle = fullOutputSurface(result).toLowerCase().includes(magicMarker.toLowerCase());
|
||||
const exitOk = ['success', 'error_max_turns'].includes(result.exitReason);
|
||||
|
||||
recordE2E(evalCollector, 'save-then-restore round-trip', 'Context Skills E2E', result, {
|
||||
passed: exitOk && bothRouted && files.length > 0 && restoreMentionsTitle,
|
||||
});
|
||||
|
||||
expect(exitOk).toBe(true);
|
||||
expect(bothRouted).toBe(true);
|
||||
expect(files.length).toBeGreaterThan(0);
|
||||
expect(restoreMentionsTitle).toBe(true);
|
||||
try { fs.rmSync(workDir, { recursive: true, force: true }); } catch {}
|
||||
}, 240_000);
|
||||
|
||||
// ── 3. /context-restore <fragment> loads the matching save ───────────
|
||||
testConcurrentIfSelected('context-restore-fragment-match', async () => {
|
||||
const { workDir, gstackHome, slug } = setupWorkdir('fragment');
|
||||
|
||||
// Seed three saves with distinct titles.
|
||||
seedSave(gstackHome, slug, '20260101-120000-alpha-feature.md',
|
||||
{ status: 'in-progress', branch: 'feat/alpha', timestamp: '2026-01-01T12:00:00Z' },
|
||||
'## Working on: alpha feature\n\n### Summary\nAlpha content FRAGMATCH_ALPHA_BUILD\n');
|
||||
seedSave(gstackHome, slug, '20260202-120000-middle-payments.md',
|
||||
{ status: 'in-progress', branch: 'feat/payments', timestamp: '2026-02-02T12:00:00Z' },
|
||||
'## Working on: middle payments\n\n### Summary\nPayments content FRAGMATCH_PAYMENTS_BUILD\n');
|
||||
seedSave(gstackHome, slug, '20260303-120000-omega-release.md',
|
||||
{ status: 'in-progress', branch: 'feat/omega', timestamp: '2026-03-03T12:00:00Z' },
|
||||
'## Working on: omega release\n\n### Summary\nOmega content FRAGMATCH_OMEGA_BUILD\n');
|
||||
|
||||
const result = await runSkillTest({
|
||||
prompt: `Run /context-restore payments — load the saved context whose title contains "payments". Invoke via the Skill tool. Report what was loaded. Do NOT use AskUserQuestion.`,
|
||||
workingDirectory: workDir,
|
||||
env: { GSTACK_HOME: gstackHome },
|
||||
maxTurns: 10,
|
||||
allowedTools: ['Skill', 'Bash', 'Read', 'Grep', 'Glob'],
|
||||
timeout: 120_000,
|
||||
testName: 'context-restore-fragment-match',
|
||||
runId,
|
||||
});
|
||||
|
||||
logCost('context-restore-fragment-match', result);
|
||||
|
||||
// Broader surface — agent may stop at Skill call without echoing the
|
||||
// body marker. The payments file's body is in tool outputs (Read/Bash).
|
||||
const out = fullOutputSurface(result);
|
||||
const loadedPayments = out.includes('FRAGMATCH_PAYMENTS_BUILD');
|
||||
const didNotLoadOthers = !out.includes('FRAGMATCH_ALPHA_BUILD') && !out.includes('FRAGMATCH_OMEGA_BUILD');
|
||||
const routedToRestore = skillCalls(result).includes('context-restore');
|
||||
const exitOk = ['success', 'error_max_turns'].includes(result.exitReason);
|
||||
|
||||
recordE2E(evalCollector, 'context-restore <fragment> match', 'Context Skills E2E', result, {
|
||||
passed: exitOk && routedToRestore && loadedPayments && didNotLoadOthers,
|
||||
});
|
||||
|
||||
expect(exitOk).toBe(true);
|
||||
expect(routedToRestore).toBe(true);
|
||||
expect(loadedPayments).toBe(true);
|
||||
expect(didNotLoadOthers).toBe(true);
|
||||
try { fs.rmSync(workDir, { recursive: true, force: true }); } catch {}
|
||||
}, 180_000);
|
||||
|
||||
// ── 4. /context-restore with zero saves → graceful empty-state ───────
|
||||
testConcurrentIfSelected('context-restore-empty-state', async () => {
|
||||
const { workDir, gstackHome, slug } = setupWorkdir('empty');
|
||||
// Ensure the storage dir is empty or missing — setupWorkdir doesn't seed.
|
||||
const checkpointDir = path.join(gstackHome, 'projects', slug, 'checkpoints');
|
||||
expect(fs.existsSync(checkpointDir)).toBe(false);
|
||||
|
||||
const result = await runSkillTest({
|
||||
prompt: `Run /context-restore — there are no saved contexts yet. Invoke via the Skill tool. Do NOT use AskUserQuestion.`,
|
||||
workingDirectory: workDir,
|
||||
env: { GSTACK_HOME: gstackHome },
|
||||
maxTurns: 8,
|
||||
allowedTools: ['Skill', 'Bash', 'Read', 'Grep', 'Glob'],
|
||||
timeout: 90_000,
|
||||
testName: 'context-restore-empty-state',
|
||||
runId,
|
||||
});
|
||||
|
||||
logCost('context-restore-empty-state', result);
|
||||
|
||||
// Build broad surface: agent often stops after a tool call with no final
|
||||
// text, so result.output is empty string. The bash "NO_CHECKPOINTS" echo
|
||||
// is in tool outputs; the "no saved contexts yet" phrase may only appear
|
||||
// in tool inputs / transcript entries.
|
||||
const out = fullOutputSurface(result);
|
||||
const gracefulMessage = /no saved context|no contexts? yet|nothing to restore|NO_CHECKPOINTS/i.test(out);
|
||||
const noCrash = !/error|exception|undefined/i.test(out) || gracefulMessage; // mention of "error" in the graceful message is fine
|
||||
const routedToRestore = skillCalls(result).includes('context-restore');
|
||||
const exitOk = ['success', 'error_max_turns'].includes(result.exitReason);
|
||||
|
||||
recordE2E(evalCollector, 'context-restore empty state', 'Context Skills E2E', result, {
|
||||
passed: exitOk && routedToRestore && gracefulMessage && noCrash,
|
||||
});
|
||||
|
||||
expect(exitOk).toBe(true);
|
||||
expect(routedToRestore).toBe(true);
|
||||
expect(gracefulMessage).toBe(true);
|
||||
try { fs.rmSync(workDir, { recursive: true, force: true }); } catch {}
|
||||
}, 150_000);
|
||||
|
||||
// ── 5. /context-restore list redirects to /context-save list ─────────
|
||||
testConcurrentIfSelected('context-restore-list-delegates', async () => {
|
||||
const { workDir, gstackHome, slug } = setupWorkdir('delegates');
|
||||
seedSave(gstackHome, slug, '20260101-120000-seed.md',
|
||||
{ status: 'in-progress', branch: 'main', timestamp: '2026-01-01T12:00:00Z' },
|
||||
'## Working on: seed\n');
|
||||
|
||||
const result = await runSkillTest({
|
||||
prompt: `Run /context-restore list. Invoke via the Skill tool. Do NOT use AskUserQuestion.`,
|
||||
workingDirectory: workDir,
|
||||
env: { GSTACK_HOME: gstackHome },
|
||||
maxTurns: 8,
|
||||
allowedTools: ['Skill', 'Bash', 'Read', 'Grep', 'Glob'],
|
||||
timeout: 90_000,
|
||||
testName: 'context-restore-list-delegates',
|
||||
runId,
|
||||
});
|
||||
|
||||
logCost('context-restore-list-delegates', result);
|
||||
|
||||
// Broader surface — agent sometimes stops after the Skill call without
|
||||
// producing text output. The "use /context-save list" hint may only
|
||||
// appear in tool inputs / transcript.
|
||||
const out = fullOutputSurface(result);
|
||||
const mentionsSaveList = /context-save list/i.test(out);
|
||||
const routedToRestore = skillCalls(result).includes('context-restore');
|
||||
const exitOk = ['success', 'error_max_turns'].includes(result.exitReason);
|
||||
|
||||
recordE2E(evalCollector, 'context-restore list delegates', 'Context Skills E2E', result, {
|
||||
passed: exitOk && routedToRestore && mentionsSaveList,
|
||||
});
|
||||
|
||||
expect(exitOk).toBe(true);
|
||||
expect(routedToRestore).toBe(true);
|
||||
expect(mentionsSaveList).toBe(true);
|
||||
try { fs.rmSync(workDir, { recursive: true, force: true }); } catch {}
|
||||
}, 150_000);
|
||||
|
||||
// ── 6. Legacy compat: pre-rename save files still load ───────────────
|
||||
testConcurrentIfSelected('context-restore-legacy-compat', async () => {
|
||||
const { workDir, gstackHome, slug } = setupWorkdir('legacy');
|
||||
|
||||
// Seed a save file in the pre-rename format (exactly how old /checkpoint
|
||||
// wrote them). The storage dir name is still "checkpoints/" — kept for
|
||||
// exactly this reason.
|
||||
seedSave(gstackHome, slug, '20260301-120000-legacy-pre-rename-work.md',
|
||||
{
|
||||
status: 'in-progress',
|
||||
branch: 'feat/pre-rename',
|
||||
timestamp: '2026-03-01T12:00:00Z',
|
||||
session_duration_s: '3600',
|
||||
},
|
||||
'## Working on: legacy pre-rename work\n\n### Summary\nWork saved by OLD_CHECKPOINT_SKILL_LEGACYCOMPAT before the rename.\n\n### Remaining Work\n1. Item from the before-times.\n');
|
||||
|
||||
const result = await runSkillTest({
|
||||
prompt: `Run /context-restore — load the most recent saved context. Invoke via the Skill tool. Report the content of the loaded file. Do NOT use AskUserQuestion.`,
|
||||
workingDirectory: workDir,
|
||||
env: { GSTACK_HOME: gstackHome },
|
||||
maxTurns: 8,
|
||||
allowedTools: ['Skill', 'Bash', 'Read', 'Grep', 'Glob'],
|
||||
timeout: 120_000,
|
||||
testName: 'context-restore-legacy-compat',
|
||||
runId,
|
||||
});
|
||||
|
||||
logCost('context-restore-legacy-compat', result);
|
||||
|
||||
// Check for ANY evidence the legacy file was loaded. The agent may
|
||||
// paraphrase the summary OR stop at a tool call without text output,
|
||||
// so require at least ONE of:
|
||||
// (a) the unique body marker (verbatim pass-through)
|
||||
// (b) the title phrase "legacy pre-rename work"
|
||||
// (c) the filename or its timestamp prefix
|
||||
// (d) the branch name "feat/pre-rename"
|
||||
// Search across the full transcript, not just result.output.
|
||||
const out = fullOutputSurface(result);
|
||||
const loadedLegacy =
|
||||
out.includes('OLD_CHECKPOINT_SKILL_LEGACYCOMPAT') ||
|
||||
/legacy.+pre-rename/i.test(out) ||
|
||||
/20260301-120000-legacy/i.test(out) ||
|
||||
/feat\/pre-rename/i.test(out) ||
|
||||
/pre-rename/i.test(out);
|
||||
const routedToRestore = skillCalls(result).includes('context-restore');
|
||||
const exitOk = ['success', 'error_max_turns'].includes(result.exitReason);
|
||||
|
||||
recordE2E(evalCollector, 'legacy /checkpoint file loads via /context-restore', 'Context Skills E2E', result, {
|
||||
passed: exitOk && routedToRestore && loadedLegacy,
|
||||
});
|
||||
|
||||
expect(exitOk).toBe(true);
|
||||
expect(routedToRestore).toBe(true);
|
||||
expect(loadedLegacy).toBe(true);
|
||||
try { fs.rmSync(workDir, { recursive: true, force: true }); } catch {}
|
||||
}, 180_000);
|
||||
|
||||
// ── 7. /context-save list: default filters to current branch ─────────
|
||||
testConcurrentIfSelected('context-save-list-current-branch', async () => {
|
||||
const { workDir, gstackHome, slug } = setupWorkdir('list-current');
|
||||
|
||||
// Seed 3 files on 3 different branches. Current branch is "main".
|
||||
seedSave(gstackHome, slug, '20260101-120000-main-work.md',
|
||||
{ status: 'in-progress', branch: 'main', timestamp: '2026-01-01T12:00:00Z' },
|
||||
'## Working on: main work LISTCURR_MAIN_TOKEN\n');
|
||||
seedSave(gstackHome, slug, '20260202-120000-feat-alpha.md',
|
||||
{ status: 'in-progress', branch: 'feat/alpha', timestamp: '2026-02-02T12:00:00Z' },
|
||||
'## Working on: alpha LISTCURR_ALPHA_TOKEN\n');
|
||||
seedSave(gstackHome, slug, '20260303-120000-feat-beta.md',
|
||||
{ status: 'in-progress', branch: 'feat/beta', timestamp: '2026-03-03T12:00:00Z' },
|
||||
'## Working on: beta LISTCURR_BETA_TOKEN\n');
|
||||
|
||||
const result = await runSkillTest({
|
||||
prompt: `Run /context-save list — list saved contexts for the CURRENT branch only (default, no --all). Invoke via the Skill tool. The current branch is "main". Do NOT use AskUserQuestion.`,
|
||||
workingDirectory: workDir,
|
||||
env: { GSTACK_HOME: gstackHome },
|
||||
maxTurns: 10,
|
||||
allowedTools: ['Skill', 'Bash', 'Read', 'Grep', 'Glob'],
|
||||
timeout: 120_000,
|
||||
testName: 'context-save-list-current-branch',
|
||||
runId,
|
||||
});
|
||||
|
||||
logCost('context-save-list-current-branch', result);
|
||||
|
||||
// Broad surface: the list output may only appear in bash tool_result
|
||||
// entries (find output, file reads) rather than the agent's final text.
|
||||
const out = fullOutputSurface(result);
|
||||
// Must show the main-branch save. Hide the other branches' saves.
|
||||
// Match by filename timestamp (stable, unambiguous) plus a looser
|
||||
// prose check.
|
||||
const showsMain = /20260101-120000|main-work/.test(out);
|
||||
const hidesAlpha = !/20260202-120000/.test(out);
|
||||
const hidesBeta = !/20260303-120000/.test(out);
|
||||
const routed = skillCalls(result).includes('context-save');
|
||||
const exitOk = ['success', 'error_max_turns'].includes(result.exitReason);
|
||||
|
||||
recordE2E(evalCollector, 'context-save list (current branch default)', 'Context Skills E2E', result, {
|
||||
passed: exitOk && routed && showsMain && hidesAlpha && hidesBeta,
|
||||
});
|
||||
|
||||
expect(exitOk).toBe(true);
|
||||
expect(routed).toBe(true);
|
||||
expect(showsMain).toBe(true);
|
||||
expect(hidesAlpha).toBe(true);
|
||||
expect(hidesBeta).toBe(true);
|
||||
try { fs.rmSync(workDir, { recursive: true, force: true }); } catch {}
|
||||
}, 180_000);
|
||||
|
||||
// ── 8. /context-save list --all: shows every branch ──────────────────
|
||||
testConcurrentIfSelected('context-save-list-all-branches', async () => {
|
||||
const { workDir, gstackHome, slug } = setupWorkdir('list-all');
|
||||
|
||||
seedSave(gstackHome, slug, '20260101-120000-main-work.md',
|
||||
{ status: 'in-progress', branch: 'main', timestamp: '2026-01-01T12:00:00Z' },
|
||||
'## Working on: main LISTALL_MAIN_TOKEN\n');
|
||||
seedSave(gstackHome, slug, '20260202-120000-feat-alpha.md',
|
||||
{ status: 'in-progress', branch: 'feat/alpha', timestamp: '2026-02-02T12:00:00Z' },
|
||||
'## Working on: alpha LISTALL_ALPHA_TOKEN\n');
|
||||
seedSave(gstackHome, slug, '20260303-120000-feat-beta.md',
|
||||
{ status: 'in-progress', branch: 'feat/beta', timestamp: '2026-03-03T12:00:00Z' },
|
||||
'## Working on: beta LISTALL_BETA_TOKEN\n');
|
||||
|
||||
const result = await runSkillTest({
|
||||
prompt: `Run /context-save list --all — list saved contexts from ALL branches (not just the current one). Invoke via the Skill tool. Report the full list. Do NOT use AskUserQuestion.`,
|
||||
workingDirectory: workDir,
|
||||
env: { GSTACK_HOME: gstackHome },
|
||||
maxTurns: 10,
|
||||
allowedTools: ['Skill', 'Bash', 'Read', 'Grep', 'Glob'],
|
||||
timeout: 120_000,
|
||||
testName: 'context-save-list-all-branches',
|
||||
runId,
|
||||
});
|
||||
|
||||
logCost('context-save-list-all-branches', result);
|
||||
|
||||
// Broad surface — same rationale as list-current-branch: the list output
|
||||
// may only be in bash tool_result, not in the agent's final text.
|
||||
const out = fullOutputSurface(result);
|
||||
const filesShown = [
|
||||
/20260101-120000/.test(out),
|
||||
/20260202-120000/.test(out),
|
||||
/20260303-120000/.test(out),
|
||||
].filter(Boolean).length;
|
||||
const routed = skillCalls(result).includes('context-save');
|
||||
const exitOk = ['success', 'error_max_turns'].includes(result.exitReason);
|
||||
|
||||
recordE2E(evalCollector, 'context-save list --all', 'Context Skills E2E', result, {
|
||||
passed: exitOk && routed && filesShown === 3,
|
||||
});
|
||||
|
||||
expect(exitOk).toBe(true);
|
||||
expect(routed).toBe(true);
|
||||
expect(filesShown).toBe(3);
|
||||
try { fs.rmSync(workDir, { recursive: true, force: true }); } catch {}
|
||||
}, 180_000);
|
||||
});
|
||||
@@ -15,10 +15,11 @@ const evalCollector = createEvalCollector('e2e-session-intelligence');
|
||||
|
||||
// --- Session Intelligence E2E ---
|
||||
// Tests the core contract: timeline events flow in, context recovery flows out,
|
||||
// checkpoints round-trip.
|
||||
// /context-save + /context-restore round-trip.
|
||||
|
||||
describeIfSelected('Session Intelligence E2E', [
|
||||
'timeline-event-flow', 'context-recovery-artifacts', 'checkpoint-save-resume',
|
||||
'timeline-event-flow', 'context-recovery-artifacts',
|
||||
'context-save-writes-file', 'context-restore-loads-latest',
|
||||
], () => {
|
||||
let workDir: string;
|
||||
let gstackHome: string;
|
||||
@@ -194,28 +195,28 @@ IMPORTANT:
|
||||
console.log(`Context recovery: artifacts=${foundArtifacts}, lastSession=${foundLastSession}, timeline=${foundTimeline}`);
|
||||
}, 180_000);
|
||||
|
||||
// --- Test 3: Checkpoint save and resume ---
|
||||
// Run /checkpoint save via claude -p, verify file created. Then run /checkpoint resume
|
||||
// and verify it reads the checkpoint back.
|
||||
testConcurrentIfSelected('checkpoint-save-resume', async () => {
|
||||
// --- Test 3: /context-save writes a file ---
|
||||
// Hand-feed the save section of context-save/SKILL.md to claude -p and verify
|
||||
// a file gets written to the project's checkpoints dir with valid frontmatter.
|
||||
testConcurrentIfSelected('context-save-writes-file', async () => {
|
||||
const projectDir = path.join(gstackHome, 'projects', slug);
|
||||
fs.mkdirSync(path.join(projectDir, 'checkpoints'), { recursive: true });
|
||||
|
||||
// Copy the /checkpoint skill
|
||||
copyDirSync(path.join(ROOT, 'checkpoint'), path.join(workDir, 'checkpoint'));
|
||||
// Copy the /context-save skill
|
||||
copyDirSync(path.join(ROOT, 'context-save'), path.join(workDir, 'context-save'));
|
||||
|
||||
// Add a staged change so /checkpoint has something to capture
|
||||
// Add a staged change so /context-save has something to capture
|
||||
fs.writeFileSync(path.join(workDir, 'feature.ts'), 'export function newFeature() { return true; }\n');
|
||||
spawnSync('git', ['add', 'feature.ts'], { cwd: workDir, stdio: 'pipe', timeout: 5000 });
|
||||
|
||||
// Extract the checkpoint save section from the skill template
|
||||
const full = fs.readFileSync(path.join(ROOT, 'checkpoint', 'SKILL.md'), 'utf-8');
|
||||
const saveStart = full.indexOf('## Save');
|
||||
const resumeStart = full.indexOf('## Resume');
|
||||
const saveSection = full.slice(saveStart, resumeStart > saveStart ? resumeStart : undefined);
|
||||
// Extract the save section from the skill template (before the List section)
|
||||
const full = fs.readFileSync(path.join(ROOT, 'context-save', 'SKILL.md'), 'utf-8');
|
||||
const saveStart = full.indexOf('## Save flow');
|
||||
const listStart = full.indexOf('## List flow');
|
||||
const saveSection = full.slice(saveStart, listStart > saveStart ? listStart : undefined);
|
||||
|
||||
const result = await runSkillTest({
|
||||
prompt: `You are testing the /checkpoint skill. Follow these instructions to save a checkpoint.
|
||||
prompt: `You are testing the /context-save skill. Follow these instructions to save a context file.
|
||||
|
||||
${saveSection.slice(0, 2000)}
|
||||
|
||||
@@ -223,7 +224,7 @@ IMPORTANT:
|
||||
- Use GSTACK_HOME="${gstackHome}" as an environment variable when running bin scripts.
|
||||
- The bin scripts are at ./bin/ (relative to this directory), not at ~/.claude/skills/gstack/bin/.
|
||||
Replace any references to ~/.claude/skills/gstack/bin/ with ./bin/ when running commands.
|
||||
- Save the checkpoint to ${projectDir}/checkpoints/ with a filename like "20260401-test-checkpoint.md".
|
||||
- Save the file to ${projectDir}/checkpoints/ with a filename like "20260401-test-context.md".
|
||||
- Include YAML frontmatter with status, branch, and timestamp.
|
||||
- Include a summary of what's being worked on (you can see from git status).
|
||||
- Do NOT use AskUserQuestion.`,
|
||||
@@ -231,38 +232,134 @@ IMPORTANT:
|
||||
maxTurns: 10,
|
||||
allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob'],
|
||||
timeout: 120_000,
|
||||
testName: 'checkpoint-save-resume',
|
||||
testName: 'context-save-writes-file',
|
||||
runId,
|
||||
});
|
||||
|
||||
logCost('checkpoint save', result);
|
||||
logCost('context-save', result);
|
||||
|
||||
// Check that a checkpoint file was created
|
||||
// Check that a context file was created
|
||||
const checkpointDir = path.join(projectDir, 'checkpoints');
|
||||
const checkpointFiles = fs.existsSync(checkpointDir)
|
||||
const files = fs.existsSync(checkpointDir)
|
||||
? fs.readdirSync(checkpointDir).filter(f => f.endsWith('.md'))
|
||||
: [];
|
||||
|
||||
const exitOk = ['success', 'error_max_turns'].includes(result.exitReason);
|
||||
const checkpointCreated = checkpointFiles.length > 0;
|
||||
const fileCreated = files.length > 0;
|
||||
|
||||
let checkpointContent = '';
|
||||
if (checkpointCreated) {
|
||||
checkpointContent = fs.readFileSync(path.join(checkpointDir, checkpointFiles[0]), 'utf-8');
|
||||
let fileContent = '';
|
||||
if (fileCreated) {
|
||||
fileContent = fs.readFileSync(path.join(checkpointDir, files[0]), 'utf-8');
|
||||
}
|
||||
|
||||
// Verify checkpoint has expected structure
|
||||
const hasYamlFrontmatter = checkpointContent.includes('---') && checkpointContent.includes('status:');
|
||||
const hasBranch = checkpointContent.includes('branch:') || checkpointContent.includes('main');
|
||||
const hasYamlFrontmatter = fileContent.includes('---') && fileContent.includes('status:');
|
||||
const hasBranch = fileContent.includes('branch:') || fileContent.includes('main');
|
||||
|
||||
recordE2E(evalCollector, 'checkpoint save-resume', 'Session Intelligence E2E', result, {
|
||||
passed: exitOk && checkpointCreated && hasYamlFrontmatter,
|
||||
recordE2E(evalCollector, 'context-save writes file', 'Session Intelligence E2E', result, {
|
||||
passed: exitOk && fileCreated && hasYamlFrontmatter,
|
||||
});
|
||||
|
||||
expect(exitOk).toBe(true);
|
||||
expect(checkpointCreated).toBe(true);
|
||||
expect(fileCreated).toBe(true);
|
||||
expect(hasYamlFrontmatter).toBe(true);
|
||||
|
||||
console.log(`Checkpoint: ${checkpointFiles.length} files created, YAML frontmatter: ${hasYamlFrontmatter}, branch: ${hasBranch}`);
|
||||
console.log(`context-save: ${files.length} files created, YAML frontmatter: ${hasYamlFrontmatter}, branch: ${hasBranch}`);
|
||||
}, 180_000);
|
||||
|
||||
// --- Test 4: /context-restore loads the newest file across branches ---
|
||||
// Seed two saved-context files with different YYYYMMDD-HHMMSS prefixes and
|
||||
// different branches in their frontmatter. Hand-feed the restore section to
|
||||
// claude -p. Verify the agent identifies the newer file (by filename prefix)
|
||||
// and presents its content, regardless of the current branch.
|
||||
testConcurrentIfSelected('context-restore-loads-latest', async () => {
|
||||
const projectDir = path.join(gstackHome, 'projects', slug);
|
||||
const checkpointDir = path.join(projectDir, 'checkpoints');
|
||||
fs.mkdirSync(checkpointDir, { recursive: true });
|
||||
|
||||
// Copy the /context-restore skill
|
||||
copyDirSync(path.join(ROOT, 'context-restore'), path.join(workDir, 'context-restore'));
|
||||
|
||||
// Seed two files: older on branch-a (title "old-work"), newer on branch-b
|
||||
// (title "newer-wintermute-work"). Current branch (main) matches neither.
|
||||
const olderFile = path.join(checkpointDir, '20260101-120000-old-work.md');
|
||||
const newerFile = path.join(checkpointDir, '20260202-130000-newer-wintermute-work.md');
|
||||
fs.writeFileSync(olderFile, `---
|
||||
status: in-progress
|
||||
branch: branch-a
|
||||
timestamp: 2026-01-01T12:00:00-07:00
|
||||
---
|
||||
|
||||
## Working on: old work
|
||||
|
||||
### Summary
|
||||
This is older work on branch-a.
|
||||
|
||||
### Remaining Work
|
||||
1. Should NOT be loaded by default restore.
|
||||
`);
|
||||
fs.writeFileSync(newerFile, `---
|
||||
status: in-progress
|
||||
branch: branch-b
|
||||
timestamp: 2026-02-02T13:00:00-07:00
|
||||
---
|
||||
|
||||
## Working on: newer wintermute work
|
||||
|
||||
### Summary
|
||||
This is the newest saved context. Cross-branch restore should load THIS file.
|
||||
|
||||
### Remaining Work
|
||||
1. Finish the wintermute integration.
|
||||
`);
|
||||
|
||||
// Deliberately scramble mtimes so filesystem mtime DISAGREES with filename
|
||||
// prefix — this proves we're using filename ordering, not ls -1t.
|
||||
const pastOlderMtime = Math.floor(Date.now() / 1000); // now (newest mtime)
|
||||
const pastNewerMtime = pastOlderMtime - 60 * 60 * 24 * 30; // 30 days ago
|
||||
fs.utimesSync(olderFile, pastOlderMtime, pastOlderMtime);
|
||||
fs.utimesSync(newerFile, pastNewerMtime, pastNewerMtime);
|
||||
|
||||
// Extract the restore-flow section from the skill template
|
||||
const full = fs.readFileSync(path.join(ROOT, 'context-restore', 'SKILL.md'), 'utf-8');
|
||||
const restoreStart = full.indexOf('## Restore flow');
|
||||
const importantStart = full.indexOf('## Important Rules', restoreStart);
|
||||
const restoreSection = full.slice(restoreStart, importantStart > restoreStart ? importantStart : undefined);
|
||||
|
||||
const result = await runSkillTest({
|
||||
prompt: `You are testing the /context-restore skill. Follow these instructions to restore the most recent saved context.
|
||||
|
||||
${restoreSection.slice(0, 2500)}
|
||||
|
||||
IMPORTANT:
|
||||
- Use GSTACK_HOME="${gstackHome}" as an environment variable when running bin scripts.
|
||||
- The bin scripts are at ./bin/ (relative to this directory), not at ~/.claude/skills/gstack/bin/.
|
||||
- Look in ${checkpointDir} for saved context files.
|
||||
- Current branch is "main" — do NOT filter by current branch. Load across all branches.
|
||||
- The newest file by YYYYMMDD-HHMMSS prefix is the canonical "most recent". Filesystem mtime has been scrambled — do not use it.
|
||||
- Do NOT use AskUserQuestion. Just present the content of the newest file.`,
|
||||
workingDirectory: workDir,
|
||||
maxTurns: 8,
|
||||
allowedTools: ['Bash', 'Read', 'Grep', 'Glob'],
|
||||
timeout: 120_000,
|
||||
testName: 'context-restore-loads-latest',
|
||||
runId,
|
||||
});
|
||||
|
||||
logCost('context-restore', result);
|
||||
|
||||
const output = result.output ?? '';
|
||||
const loadedNewer = output.includes('newer wintermute work') || output.includes('wintermute integration');
|
||||
const loadedOlder = output.includes('old work') && !output.includes('newer');
|
||||
const exitOk = ['success', 'error_max_turns'].includes(result.exitReason);
|
||||
|
||||
recordE2E(evalCollector, 'context-restore loads latest', 'Session Intelligence E2E', result, {
|
||||
passed: exitOk && loadedNewer && !loadedOlder,
|
||||
});
|
||||
|
||||
expect(exitOk).toBe(true);
|
||||
expect(loadedNewer).toBe(true);
|
||||
expect(loadedOlder).toBe(false);
|
||||
|
||||
console.log(`context-restore: loadedNewer=${loadedNewer}, loadedOlder=${loadedOlder}`);
|
||||
}, 180_000);
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user