merge: incorporate origin/main into community-mode branch

Main advanced from 0.12.0.0 to 0.12.5.0 (voice directive, deploy
dry-run, smarter browsing, headed mode, full commit coverage, codex
hang fixes). Our branch had a stale 0.12.0.0 entry for community mode.

Conflicts resolved:
- VERSION/package.json: take main's 0.12.5.0
- CHANGELOG: take main's entries; our community-mode entry will be rewritten at ship time
- gen-skill-docs.ts: removed duplicate slug functions (main moved to resolvers/utility.ts)
- touchfiles.ts: removed duplicate review-plan-completion tier entry
- All 21 SKILL.md files: regenerated from templates (never resolve generated files manually)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-03-26 19:21:47 -06:00
89 changed files with 10224 additions and 644 deletions
+51
View File
@@ -152,6 +152,24 @@ describe('gen-skill-docs', () => {
}
});
test('every Codex SKILL.md description stays under 900-char warning threshold', () => {
// Soft ceiling enforced by this test; descriptions longer than this are
// flagged before they ever hit the hard generator limit.
const WARN_THRESHOLD = 900;
const agentsDir = path.join(ROOT, '.agents', 'skills');
// Generated-skills tree may be absent on a fresh checkout — nothing to check.
if (!fs.existsSync(agentsDir)) return;
const violations: string[] = [];
for (const entry of fs.readdirSync(agentsDir, { withFileTypes: true })) {
if (!entry.isDirectory()) continue;
const skillMd = path.join(agentsDir, entry.name, 'SKILL.md');
if (!fs.existsSync(skillMd)) continue;
const content = fs.readFileSync(skillMd, 'utf-8');
const description = extractDescription(content);
if (description.length > WARN_THRESHOLD) {
// Report against WARN_THRESHOLD — the limit this test actually enforces.
// The old message quoted MAX_SKILL_DESCRIPTION_LENGTH and could claim a
// positive "remaining" count for a description that already failed here.
violations.push(`${entry.name}: ${description.length} chars (warn threshold ${WARN_THRESHOLD}, over by ${description.length - WARN_THRESHOLD})`);
}
}
// Empty-array assertion so the failure output lists every offender at once.
expect(violations).toEqual([]);
});
test('package.json version matches VERSION file', () => {
const pkg = JSON.parse(fs.readFileSync(path.join(ROOT, 'package.json'), 'utf-8'));
const version = fs.readFileSync(path.join(ROOT, 'VERSION'), 'utf-8').trim();
@@ -333,6 +351,39 @@ describe('BASE_BRANCH_DETECT resolver', () => {
// Resolver must speak generically of "the base branch" rather than
// hard-coding a branch name such as main/master.
test('resolver output uses "the base branch" phrasing', () => {
expect(shipContent).toContain('the base branch');
});
// GitLab parity: generated ship content should reference the glab CLI.
test('resolver output contains GitLab CLI commands', () => {
expect(shipContent).toContain('glab');
});
// Pure-git fallback must exist for environments without a forge CLI.
test('resolver output contains git-native fallback', () => {
expect(shipContent).toContain('git symbolic-ref');
});
// Case-insensitive: "GitLab", "gitlab", etc. all satisfy the check.
test('resolver output mentions GitLab platform', () => {
expect(shipContent).toMatch(/gitlab/i);
});
});
// Spot-checks that the generated retro and ship SKILL.md files include
// GitLab-specific content (MR extraction, glab commands, CI file checks).
describe('GitLab support in generated skills', () => {
// Load a generated skill document relative to the repo root.
const readSkillDoc = (skill: string): string =>
fs.readFileSync(path.join(ROOT, skill, 'SKILL.md'), 'utf-8');
const retroDoc = readSkillDoc('retro');
const shipDoc = readSkillDoc('ship');
test('retro contains GitLab MR number extraction', () => {
expect(retroDoc).toContain('[#!]');
});
test('retro uses BASE_BRANCH_DETECT (contains glab)', () => {
expect(retroDoc).toContain('glab');
});
test('ship contains glab mr create', () => {
expect(shipDoc).toContain('glab mr create');
});
test('ship checks .gitlab-ci.yml', () => {
expect(shipDoc).toContain('.gitlab-ci.yml');
});
});
/**
+16 -5
View File
@@ -79,6 +79,9 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
// Ship
'ship-base-branch': ['ship/**', 'bin/gstack-repo-mode'],
'ship-local-workflow': ['ship/**', 'scripts/gen-skill-docs.ts'],
'review-dashboard-via': ['ship/**', 'scripts/resolvers/review.ts', 'codex/**', 'autoplan/**', 'land-and-deploy/**'],
'ship-plan-completion': ['ship/**', 'scripts/gen-skill-docs.ts'],
'ship-plan-verification': ['ship/**', 'scripts/gen-skill-docs.ts'],
// Retro
'retro': ['retro/**'],
@@ -131,10 +134,12 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
'gstack-upgrade-happy-path': ['gstack-upgrade/**'],
// Deploy skills
'land-and-deploy-workflow': ['land-and-deploy/**', 'scripts/gen-skill-docs.ts'],
'canary-workflow': ['canary/**', 'browse/src/**'],
'benchmark-workflow': ['benchmark/**', 'browse/src/**'],
'setup-deploy-workflow': ['setup-deploy/**', 'scripts/gen-skill-docs.ts'],
'land-and-deploy-workflow': ['land-and-deploy/**', 'scripts/gen-skill-docs.ts'],
'land-and-deploy-first-run': ['land-and-deploy/**', 'scripts/gen-skill-docs.ts', 'bin/gstack-slug'],
'land-and-deploy-review-gate': ['land-and-deploy/**', 'bin/gstack-review-read'],
'canary-workflow': ['canary/**', 'browse/src/**'],
'benchmark-workflow': ['benchmark/**', 'browse/src/**'],
'setup-deploy-workflow': ['setup-deploy/**', 'scripts/gen-skill-docs.ts'],
// Autoplan
'autoplan-core': ['autoplan/**', 'plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**'],
@@ -184,6 +189,8 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
'review-base-branch': 'gate',
'review-design-lite': 'periodic', // 4/7 threshold is subjective
'review-coverage-audit': 'gate',
'review-plan-completion': 'gate',
'review-dashboard-via': 'gate',
// Office Hours
'office-hours-spec-review': 'gate',
@@ -210,7 +217,6 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
'ship-triage': 'gate',
'ship-plan-completion': 'gate',
'ship-plan-verification': 'gate',
'review-plan-completion': 'gate',
// Retro — gate for cheap branch detection, periodic for full Opus retro
'retro': 'periodic',
@@ -250,6 +256,8 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
// Deploy skills
'land-and-deploy-workflow': 'gate',
'land-and-deploy-first-run': 'gate',
'land-and-deploy-review-gate': 'gate',
'canary-workflow': 'gate',
'benchmark-workflow': 'gate',
'setup-deploy-workflow': 'gate',
@@ -313,6 +321,9 @@ export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
'retro/SKILL.md instructions': ['retro/SKILL.md', 'retro/SKILL.md.tmpl'],
'qa-only/SKILL.md workflow': ['qa-only/SKILL.md', 'qa-only/SKILL.md.tmpl'],
'gstack-upgrade/SKILL.md upgrade flow': ['gstack-upgrade/SKILL.md', 'gstack-upgrade/SKILL.md.tmpl'],
// Voice directive
'voice directive tone': ['scripts/resolvers/preamble.ts', 'review/SKILL.md', 'review/SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
};
/**
+155
View File
@@ -85,6 +85,161 @@ Do NOT use AskUserQuestion. Do NOT run gh or fly commands.`,
}, 180_000);
});
// --- Land-and-Deploy First-Run E2E ---
// E2E: exercises the /land-and-deploy first-run path (no land-deploy-confirmed
// marker) and asserts the skill writes a dry-run validation report.
describeIfSelected('Land-and-Deploy first-run E2E', ['land-and-deploy-first-run'], () => {
// Temp git repo the skill runs against; created in beforeAll, removed in afterAll.
let firstRunDir: string;
beforeAll(() => {
firstRunDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-land-first-run-'));
// Helper: run a command inside the temp repo with a hard 5s timeout.
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: firstRunDir, stdio: 'pipe', timeout: 5000 });
run('git', ['init', '-b', 'main']);
run('git', ['config', 'user.email', 'test@test.com']);
run('git', ['config', 'user.name', 'Test']);
fs.writeFileSync(path.join(firstRunDir, 'app.ts'), 'export function hello() { return "world"; }\n');
// fly.toml marks the repo as a Fly.io target so platform detection has something to find.
fs.writeFileSync(path.join(firstRunDir, 'fly.toml'), 'app = "first-run-app"\n\n[http_service]\n internal_port = 3000\n');
run('git', ['add', '.']);
run('git', ['commit', '-m', 'initial']);
// One feature commit on a branch, mirroring a typical pre-deploy state.
run('git', ['checkout', '-b', 'feat/first-deploy']);
fs.writeFileSync(path.join(firstRunDir, 'app.ts'), 'export function hello() { return "first deploy"; }\n');
run('git', ['add', '.']);
run('git', ['commit', '-m', 'feat: first deploy']);
// Make the skill's own docs available inside the sandbox repo.
copyDirSync(path.join(ROOT, 'land-and-deploy'), path.join(firstRunDir, 'land-and-deploy'));
});
afterAll(() => {
// Best-effort cleanup; ignore failures (e.g. directory already gone).
try { fs.rmSync(firstRunDir, { recursive: true, force: true }); } catch {}
});
testConcurrentIfSelected('land-and-deploy-first-run', async () => {
const result = await runSkillTest({
prompt: `Read land-and-deploy/SKILL.md for the /land-and-deploy skill instructions.
You are on branch feat/first-deploy. This is the FIRST TIME running /land-and-deploy
for this project there is NO land-deploy-confirmed file.
This repo has a fly.toml with app = "first-run-app", indicating a Fly.io deployment.
IMPORTANT: There is NO remote and NO GitHub PR you cannot run gh commands.
Instead, simulate the Step 1.5 first-run dry-run validation:
1. Detect that this is a FIRST_RUN (no land-deploy-confirmed file)
2. Detect the deploy platform from fly.toml (Fly.io, app = first-run-app)
3. Infer the production URL (https://first-run-app.fly.dev)
4. Build the DEPLOY INFRASTRUCTURE VALIDATION table showing:
- Platform detected
- Command validation results (simulated as all passing)
- Staging detection results (none expected)
- What will happen steps
5. Write the dry-run report to .gstack/deploy-reports/dry-run-validation.md
Do NOT use AskUserQuestion. Do NOT run gh or fly commands.
Just demonstrate the first-run dry-run output.`,
workingDirectory: firstRunDir,
maxTurns: 20,
allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob'],
timeout: 120_000,
testName: 'land-and-deploy-first-run',
runId,
});
logCost('/land-and-deploy first-run', result);
recordE2E(evalCollector, '/land-and-deploy first-run', 'Land-and-Deploy first-run E2E', result);
expect(result.exitReason).toBe('success');
// Verify dry-run report was created
const reportDir = path.join(firstRunDir, '.gstack', 'deploy-reports');
expect(fs.existsSync(reportDir)).toBe(true);
// Check report content mentions platform detection
const reportFiles = fs.readdirSync(reportDir);
expect(reportFiles.length).toBeGreaterThan(0);
const reportContent = fs.readFileSync(path.join(reportDir, reportFiles[0]), 'utf-8');
// Either the platform name or the app name counts as evidence of detection.
const hasPlatform = reportContent.toLowerCase().includes('fly') || reportContent.toLowerCase().includes('first-run-app');
expect(hasPlatform).toBe(true);
}, 180_000);
});
// --- Land-and-Deploy Review Gate E2E ---
// E2E: exercises the review-staleness gate (Steps 3.5a / 3.5a-bis) of
// /land-and-deploy with no review logs present, expecting a "NOT RUN" report.
describeIfSelected('Land-and-Deploy review gate E2E', ['land-and-deploy-review-gate'], () => {
// Temp git repo for the gate scenario; created in beforeAll, removed in afterAll.
let reviewDir: string;
beforeAll(() => {
reviewDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-land-review-'));
// Helper: run a command inside the temp repo with a hard 5s timeout.
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: reviewDir, stdio: 'pipe', timeout: 5000 });
run('git', ['init', '-b', 'main']);
run('git', ['config', 'user.email', 'test@test.com']);
run('git', ['config', 'user.name', 'Test']);
fs.writeFileSync(path.join(reviewDir, 'app.ts'), 'export function hello() { return "world"; }\n');
run('git', ['add', '.']);
run('git', ['commit', '-m', 'initial']);
// Create 6 more commits to make any review stale
for (let i = 1; i <= 6; i++) {
fs.writeFileSync(path.join(reviewDir, `file${i}.ts`), `export const x${i} = ${i};\n`);
run('git', ['add', '.']);
run('git', ['commit', '-m', `feat: add file${i}`]);
}
// Make the skill's own docs available inside the sandbox repo.
copyDirSync(path.join(ROOT, 'land-and-deploy'), path.join(reviewDir, 'land-and-deploy'));
});
afterAll(() => {
// Best-effort cleanup; ignore failures.
try { fs.rmSync(reviewDir, { recursive: true, force: true }); } catch {}
});
testConcurrentIfSelected('land-and-deploy-review-gate', async () => {
const result = await runSkillTest({
prompt: `Read land-and-deploy/SKILL.md for the /land-and-deploy skill instructions.
Focus on Step 3.5a and Step 3.5a-bis (the review staleness check and inline review offer).
This repo has 6 commits since the initial commit. There are NO review logs
(gstack-review-read would return NO_REVIEWS).
Simulate what the readiness gate would show:
1. Run gstack-review-read equivalent (simulate NO_REVIEWS output)
2. Determine review staleness: Eng Review should be "NOT RUN"
3. Note that Step 3.5a-bis would offer an inline review
4. Write a simulated readiness report to .gstack/deploy-reports/readiness-report.md
showing the review status as NOT RUN with the inline review offer text
Do NOT use AskUserQuestion. Do NOT run gh commands.
Show what the readiness gate output would look like.`,
workingDirectory: reviewDir,
maxTurns: 15,
allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob'],
timeout: 120_000,
testName: 'land-and-deploy-review-gate',
runId,
});
logCost('/land-and-deploy review-gate', result);
recordE2E(evalCollector, '/land-and-deploy review-gate', 'Land-and-Deploy review gate E2E', result);
expect(result.exitReason).toBe('success');
// Verify readiness report was created
const reportDir = path.join(reviewDir, '.gstack', 'deploy-reports');
expect(fs.existsSync(reportDir)).toBe(true);
const reportFiles = fs.readdirSync(reportDir);
expect(reportFiles.length).toBeGreaterThan(0);
const reportContent = fs.readFileSync(path.join(reportDir, reportFiles[0]), 'utf-8');
// Should mention review status
const hasReviewMention = reportContent.toLowerCase().includes('review') ||
reportContent.toLowerCase().includes('not run');
expect(hasReviewMention).toBe(true);
}, 180_000);
});
// --- Canary skill E2E ---
describeIfSelected('Canary skill E2E', ['canary-workflow'], () => {
+113
View File
@@ -529,6 +529,119 @@ Analyze the git history and produce the narrative report as described in the SKI
}, 420_000);
});
// --- Review Dashboard Via Attribution E2E ---
// E2E: verifies the ship Review Readiness Dashboard shows source attribution
// ("via" field) and does not raise a blocking gate when Eng Review is clean.
// Uses a mock gstack-review-read script so no real review logs are needed.
describeIfSelected('Review Dashboard Via Attribution', ['review-dashboard-via'], () => {
// Temp repo + mock-bin directory; created in beforeAll, removed in afterAll.
let dashDir: string;
beforeAll(() => {
dashDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-dashboard-via-'));
// Helper: run a command (default cwd = the temp repo) with a 5s timeout.
const run = (cmd: string, args: string[], cwd = dashDir) =>
spawnSync(cmd, args, { cwd, stdio: 'pipe', timeout: 5000 });
// Create git repo with feature branch
run('git', ['init', '-b', 'main']);
run('git', ['config', 'user.email', 'test@test.com']);
run('git', ['config', 'user.name', 'Test']);
fs.writeFileSync(path.join(dashDir, 'app.ts'), 'console.log("v1");\n');
run('git', ['add', 'app.ts']);
run('git', ['commit', '-m', 'initial']);
run('git', ['checkout', '-b', 'feature/dashboard-test']);
fs.writeFileSync(path.join(dashDir, 'app.ts'), 'console.log("v2");\n');
run('git', ['add', 'app.ts']);
run('git', ['commit', '-m', 'feat: update']);
// Get HEAD commit for review entries
const headResult = spawnSync('git', ['rev-parse', '--short', 'HEAD'], { cwd: dashDir, stdio: 'pipe' });
const commit = headResult.stdout.toString().trim();
// Pre-populate review log with autoplan-sourced entries
// gstack-review-read reads from ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl
// For the test, we'll write a mock gstack-review-read script that returns our test data
const timestamp = new Date().toISOString().replace(/\.\d{3}Z$/, 'Z');
// Three JSONL entries: two autoplan-attributed reviews plus a codex entry
// that should populate the Outside Voice row. All pinned to current HEAD.
const reviewData = [
`{"skill":"plan-eng-review","timestamp":"${timestamp}","status":"clean","unresolved":0,"critical_gaps":0,"issues_found":0,"mode":"FULL_REVIEW","via":"autoplan","commit":"${commit}"}`,
`{"skill":"plan-ceo-review","timestamp":"${timestamp}","status":"clean","unresolved":0,"critical_gaps":0,"mode":"SELECTIVE_EXPANSION","via":"autoplan","commit":"${commit}"}`,
`{"skill":"codex-plan-review","timestamp":"${timestamp}","status":"clean","source":"codex","commit":"${commit}"}`,
].join('\n');
// Write a mock gstack-review-read that returns our test data
const mockBinDir = path.join(dashDir, '.mock-bin');
fs.mkdirSync(mockBinDir, { recursive: true });
// The splice below turns each JSONL line into its own `echo '...'` shell
// command; entries are double-quote-only JSON, so single-quoting is safe.
fs.writeFileSync(path.join(mockBinDir, 'gstack-review-read'), [
'#!/usr/bin/env bash',
`echo '${reviewData.split('\n').join("'\necho '")}'`,
'echo "---CONFIG---"',
'echo "false"',
'echo "---HEAD---"',
`echo "${commit}"`,
].join('\n'));
fs.chmodSync(path.join(mockBinDir, 'gstack-review-read'), 0o755);
// Copy ship skill
fs.copyFileSync(path.join(ROOT, 'ship', 'SKILL.md'), path.join(dashDir, 'ship-SKILL.md'));
});
afterAll(() => {
// Best-effort cleanup; ignore failures.
try { fs.rmSync(dashDir, { recursive: true, force: true }); } catch {}
});
testConcurrentIfSelected('review-dashboard-via', async () => {
const mockBinDir = path.join(dashDir, '.mock-bin');
const result = await runSkillTest({
prompt: `Read ship-SKILL.md. You only need to run the Review Readiness Dashboard section.
Instead of running ~/.claude/skills/gstack/bin/gstack-review-read, run this mock: ${mockBinDir}/gstack-review-read
Parse the output and display the dashboard table. Pay attention to:
1. The "via" field in entries show source attribution (e.g., "via /autoplan")
2. The codex-plan-review entry it should populate the Outside Voice row
3. Since Eng Review IS clear, there should be NO gate blocking just display the dashboard
Skip the preamble, lake intro, telemetry, and all other ship steps.
Write the dashboard output to ${dashDir}/dashboard-output.md`,
workingDirectory: dashDir,
maxTurns: 12,
timeout: 90_000,
testName: 'review-dashboard-via',
runId,
});
logCost('/ship dashboard-via', result);
recordE2E(evalCollector, '/ship review dashboard via attribution', 'Dashboard via field', result);
expect(result.exitReason).toBe('success');
// Check dashboard output for via attribution
const dashPath = path.join(dashDir, 'dashboard-output.md');
// Fold conversation output and every tool-call output into one haystack.
const allOutput = [
result.output || '',
...result.toolCalls.map(tc => tc.output || ''),
].join('\n').toLowerCase();
// Verify via attribution appears somewhere (conversation or file)
let dashContent = '';
if (fs.existsSync(dashPath)) {
dashContent = fs.readFileSync(dashPath, 'utf-8').toLowerCase();
}
const combined = allOutput + dashContent;
// Should mention autoplan attribution
expect(combined).toMatch(/autoplan/);
// Should show eng review as CLEAR (it has a clean entry)
expect(combined).toMatch(/clear/i);
// Should NOT contain AskUserQuestion gate (no blocking)
const gateQuestions = result.toolCalls.filter(tc =>
tc.tool === 'mcp__conductor__AskUserQuestion' ||
(tc.tool === 'AskUserQuestion')
);
// Ship dashboard should not gate when eng review is clear
expect(gateQuestions).toHaveLength(0);
}, 120_000);
});
// Module-level afterAll — finalize eval collector after all tests complete
afterAll(async () => {
await finalizeEvalCollector(evalCollector);
+63
View File
@@ -778,6 +778,69 @@ describeIfSelected('Other skill evals', [
}, 30_000);
});
// Voice directive eval — tests that the voice section produces the right tone
// Voice directive eval — tests that the voice section produces the right tone
// by extracting the "## Voice" section from a generated SKILL.md and scoring
// it 1-5 on five dimensions with an LLM judge; each must be >= 4 to pass.
describeIfSelected('Voice directive eval', ['voice directive tone'], () => {
testIfSelected('voice directive tone', async () => {
const t0 = Date.now();
// Read a tier 2+ skill to get the full voice directive in context
const content = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
const voiceStart = content.indexOf('## Voice');
if (voiceStart === -1) {
throw new Error('Voice section not found in review/SKILL.md. Was preamble.ts regenerated?');
}
// Section ends at the next H2 heading; if Voice is the last section
// (indexOf returns -1), fall back to a fixed 3000-char slice.
const voiceEnd = content.indexOf('\n## ', voiceStart + 1);
const voiceSection = content.slice(voiceStart, voiceEnd > 0 ? voiceEnd : voiceStart + 3000);
// Judge returns a JSON object with the five 1-5 scores plus free-text reasoning.
const result = await callJudge<{
directness: number;
concreteness: number;
avoids_corporate: number;
avoids_ai_vocabulary: number;
connects_user_outcomes: number;
reasoning: string;
}>(`You are evaluating a voice directive for an AI coding assistant framework called GStack.
Score each dimension 1-5 where 5 is excellent:
1. directness: Does it instruct the agent to be direct, lead with the point, take positions?
2. concreteness: Does it instruct the agent to name specific files, commands, line numbers, real numbers?
3. avoids_corporate: Does it explicitly ban corporate/formal/academic tone and provide alternatives?
4. avoids_ai_vocabulary: Does it ban AI-tell words and phrases with specific lists?
5. connects_user_outcomes: Does it instruct the agent to connect technical work to real user experience?
Return JSON only:
{"directness": N, "concreteness": N, "avoids_corporate": N, "avoids_ai_vocabulary": N, "connects_user_outcomes": N, "reasoning": "..."}
THE VOICE DIRECTIVE:
${voiceSection}`);
console.log('Voice directive scores:', JSON.stringify(result, null, 2));
// Record the judged result (scores + reasoning) for the eval report,
// mirroring the hard assertions below: pass only if all dimensions >= 4.
evalCollector?.addTest({
name: 'voice directive tone',
suite: 'Voice directive eval',
tier: 'llm-judge',
passed: result.directness >= 4 && result.concreteness >= 4 && result.avoids_corporate >= 4
&& result.avoids_ai_vocabulary >= 4 && result.connects_user_outcomes >= 4,
duration_ms: Date.now() - t0,
cost_usd: 0.02,
judge_scores: {
directness: result.directness,
concreteness: result.concreteness,
avoids_corporate: result.avoids_corporate,
avoids_ai_vocabulary: result.avoids_ai_vocabulary,
connects_user_outcomes: result.connects_user_outcomes,
},
judge_reasoning: result.reasoning,
});
expect(result.directness).toBeGreaterThanOrEqual(4);
expect(result.concreteness).toBeGreaterThanOrEqual(4);
expect(result.avoids_corporate).toBeGreaterThanOrEqual(4);
expect(result.avoids_ai_vocabulary).toBeGreaterThanOrEqual(4);
expect(result.connects_user_outcomes).toBeGreaterThanOrEqual(4);
}, 30_000);
});
// Module-level afterAll — finalize eval collector after all tests complete
afterAll(async () => {
if (evalCollector) {
+2 -7
View File
@@ -1325,7 +1325,7 @@ describe('Codex skill', () => {
expect(content).toContain('fall back to the Claude adversarial subagent');
// Review log uses new skill name
expect(content).toContain('adversarial-review');
expect(content).toContain('xhigh');
expect(content).toContain('reasoning_effort="high"');
expect(content).toContain('ADVERSARIAL REVIEW SYNTHESIS');
});
@@ -1335,7 +1335,7 @@ describe('Codex skill', () => {
expect(content).toContain('< 50');
expect(content).toContain('200+');
expect(content).toContain('adversarial-review');
expect(content).toContain('xhigh');
expect(content).toContain('reasoning_effort="high"');
expect(content).toContain('Investigate and fix');
});
@@ -1369,11 +1369,6 @@ describe('Codex skill', () => {
expect(content).toContain('Persist Eng Review result');
});
test('/ship gate suggests /review or /plan-eng-review when Eng Review is missing', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('Abort — run /review or /plan-eng-review first');
});
test('Review Readiness Dashboard includes Adversarial Review row', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('Adversarial');