Merge branch 'main' into garrytan/team-supabase-store

Resolved 4 conflicts:
- scripts/gen-skill-docs.ts: kept ARTIFACT_SETUP + added main's new
  resolvers (SPEC_REVIEW_LOOP, DESIGN_SKETCH, BENEFITS_FROM,
  CODEX_REVIEW_STEP). Updated codex review-log to use new paths.
- ship/SKILL.md.tmpl: adopted {{CODEX_REVIEW_STEP}} macro from main
- test/skill-e2e.test.ts: added main's new E2E tests (office-hours
  spec review, plan-ceo benefits-from) + kept our E2E isolation cleanup

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-03-21 09:29:05 -07:00
96 changed files with 17868 additions and 342 deletions
+185
View File
@@ -0,0 +1,185 @@
/**
* Codex CLI E2E tests — verify skills work when invoked by Codex.
*
* Spawns `codex exec` with skills installed in a temp HOME, parses JSONL
* output, and validates structured results. Follows the same pattern as
* skill-e2e.test.ts but adapted for Codex CLI.
*
* Prerequisites:
* - `codex` binary installed (npm install -g @openai/codex)
* - Codex authenticated via ~/.codex/ config (no OPENAI_API_KEY env var needed)
* - EVALS=1 env var set (same gate as Claude E2E tests)
*
* Skips gracefully when prerequisites are not met.
*/
import { describe, test, expect, afterAll } from 'bun:test';
import { runCodexSkill, parseCodexJSONL, installSkillToTempHome } from './helpers/codex-session-runner';
import type { CodexResult } from './helpers/codex-session-runner';
import { EvalCollector } from './helpers/eval-store';
import type { EvalTestEntry } from './helpers/eval-store';
import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
// Parent of the directory that contains this test file.
const ROOT = path.resolve(import.meta.dir, '..');

// --- Prerequisites check ---

/**
 * True when a `codex` executable can be resolved on PATH.
 *
 * Uses Bun's built-in PATH lookup instead of spawning the external `which`
 * program: no subprocess is started, the check does not depend on `which`
 * being installed, and there is no catch-all that could hide other errors.
 */
const CODEX_AVAILABLE = Bun.which('codex') !== null;
// EVALS must be set to a non-empty value for any test in this file to run.
const evalsEnabled = Boolean(process.env.EVALS);

// The suite runs only when both gates are open: codex is installed AND EVALS is set.
// Note: Codex uses its own auth from ~/.codex/ config — no OPENAI_API_KEY env var needed.
const SKIP = !(CODEX_AVAILABLE && evalsEnabled);
const describeCodex = SKIP ? describe.skip : describe;

// Explain the skip only when the user opted in via EVALS but the binary is missing.
// When EVALS is unset we stay silent — same as Claude E2E tests, EVALS=1 required.
if (evalsEnabled && !CODEX_AVAILABLE) {
  process.stderr.write('\nCodex E2E: SKIPPED — codex binary not found (install: npm i -g @openai/codex)\n');
}
// --- Diff-based test selection ---

// Touchfile patterns for each Codex E2E test, keyed by test name
// (same pattern as E2E_TOUCHFILES).
const CODEX_E2E_TOUCHFILES: Record<string, string[]> = {
  'codex-discover-skill': ['codex/**', '.agents/skills/**', 'test/helpers/codex-session-runner.ts'],
  'codex-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'codex/**', 'test/helpers/codex-session-runner.ts'],
};

// Names of the tests chosen by diff-based selection; null means "run everything".
let selectedTests: string[] | null = null;

// Selection is attempted only when evals are on and EVALS_ALL does not force a full run.
const diffSelectionActive = evalsEnabled && !process.env.EVALS_ALL;
if (diffSelectionActive) {
  // Base branch precedence: explicit EVALS_BASE, then auto-detection, then 'main'.
  const base = process.env.EVALS_BASE || detectBaseBranch(ROOT) || 'main';
  const changed = getChangedFiles(base, ROOT);

  // An empty change list (e.g., on main branch) leaves selectedTests null -> run all.
  if (changed.length !== 0) {
    const { selected, skipped, reason } = selectTests(changed, CODEX_E2E_TOUCHFILES, GLOBAL_TOUCHFILES);
    selectedTests = selected;

    const totalTests = Object.keys(CODEX_E2E_TOUCHFILES).length;
    process.stderr.write(`\nCodex E2E selection (${reason}): ${selected.length}/${totalTests} tests\n`);
    if (skipped.length > 0) {
      process.stderr.write(` Skipped: ${skipped.join(', ')}\n`);
    }
    process.stderr.write('\n');
  }
}
/**
 * Register `fn` as a test named `testName`.
 *
 * The test is registered as skipped when diff-based selection produced a list
 * that does not include `testName`; a null selection means every test runs.
 */
function testIfSelected(testName: string, fn: () => Promise<void>, timeout: number) {
  const excluded = selectedTests !== null && !selectedTests.includes(testName);
  if (excluded) {
    test.skip(testName, fn, timeout);
    return;
  }
  test(testName, fn, timeout);
}
// --- Eval result collector ---
// SKIP is already true whenever EVALS is unset, so SKIP alone decides whether
// a collector is created.
const evalCollector = SKIP ? null : new EvalCollector('e2e-codex');

/**
 * Add one Codex E2E outcome to the eval collector (no-op when there is no collector).
 *
 * @param name   test name to record
 * @param result result object returned by the Codex session runner
 * @param passed verdict to store for this test
 */
function recordCodexE2E(name: string, result: CodexResult, passed: boolean) {
  if (!evalCollector) return;

  const { durationMs, output, toolCalls, exitCode } = result;
  evalCollector.addTest({
    name,
    suite: 'codex-e2e',
    tier: 'e2e',
    passed,
    duration_ms: durationMs,
    cost_usd: 0, // Codex doesn't report cost in the same way; tokens are tracked
    output: output?.slice(0, 2000), // keep at most the first 2000 characters
    turns_used: toolCalls.length, // approximate: tool calls as turns
    exit_reason: exitCode === 0 ? 'success' : `exit_code_${exitCode}`,
  });
}
/**
 * Log a one-line usage summary for a Codex E2E run: token count, number of
 * tool calls, and duration rounded to whole seconds.
 */
function logCodexCost(label: string, result: CodexResult) {
  const { durationMs, tokens, toolCalls } = result;
  const seconds = Math.round(durationMs / 1000);
  console.log(`${label}: ${tokens} tokens, ${toolCalls.length} tool calls, ${seconds}s`);
}
// Once all tests in this file are done, finalize the eval collector (if one was created).
afterAll(async () => {
  await evalCollector?.finalize();
});
// --- Tests ---
describeCodex('Codex E2E', () => {
  // Any one of these words in the (lower-cased) output counts as referencing the skill.
  const SKILL_KEYWORDS = ['review', 'gstack', 'skill'];

  // Any one of these words counts as review-like content.
  // ('issue' also covers the phrase 'no issues'.)
  const REVIEW_KEYWORDS = ['finding', 'issue', 'review', 'change', 'diff', 'clean', 'p1', 'p2'];

  /** True when `text`, lower-cased, contains at least one of `keywords`. */
  const mentionsAny = (text: string, keywords: string[]) => {
    const lowered = text.toLowerCase();
    return keywords.some((keyword) => lowered.includes(keyword));
  };

  testIfSelected('codex-discover-skill', async () => {
    // Install gstack-review skill to a temp HOME and ask Codex to list skills
    const skillDir = path.join(ROOT, '.agents', 'skills', 'gstack-review');
    const result = await runCodexSkill({
      skillDir,
      prompt: 'List any skills or instructions you have available. Just list the names.',
      timeoutMs: 60_000,
      cwd: ROOT,
      skillName: 'gstack-review',
    });
    logCodexCost('codex-discover-skill', result);

    // Treat a missing output as empty so the result is still recorded before
    // the assertions below report the failure.
    const output = result.output ?? '';
    const mentionsSkill = mentionsAny(output, SKILL_KEYWORDS);

    // Record the same verdict the assertions enforce, so the eval store never
    // shows a pass for a run that fails the content check.
    const passed = result.exitCode === 0 && output.length > 0 && mentionsSkill;
    recordCodexE2E('codex-discover-skill', result, passed);

    expect(result.exitCode).toBe(0);
    // Codex should have produced some output
    expect(output.length).toBeGreaterThan(0);
    // The output should reference the skill name in some form
    expect(mentionsSkill).toBe(true);
  }, 120_000);

  testIfSelected('codex-review-findings', async () => {
    // Install gstack-review skill and ask Codex to review the current repo
    const skillDir = path.join(ROOT, '.agents', 'skills', 'gstack-review');
    const result = await runCodexSkill({
      skillDir,
      prompt: 'Run the gstack-review skill on this repository. Review the current branch diff and report your findings.',
      timeoutMs: 540_000,
      cwd: ROOT,
      skillName: 'gstack-review',
    });
    logCodexCost('codex-review-findings', result);

    const output = result.output ?? '';
    const hasReviewContent = mentionsAny(output, REVIEW_KEYWORDS);

    // Recorded verdict mirrors all three assertions below.
    const passed = result.exitCode === 0 && output.length > 50 && hasReviewContent;
    recordCodexE2E('codex-review-findings', result, passed);

    expect(result.exitCode).toBe(0);
    // Should produce structured review-like output of non-trivial length
    expect(output.length).toBeGreaterThan(50);
    // Review output should contain some review-like content
    expect(hasReviewContent).toBe(true);
  }, 600_000);
});
+173
View File
@@ -0,0 +1,173 @@
/**
* Gemini CLI E2E tests — verify skills work when invoked by Gemini CLI.
*
* Spawns `gemini -p` with stream-json output in the repo root (where
* .agents/skills/ already exists), parses JSONL events, and validates
* structured results. Follows the same pattern as codex-e2e.test.ts.
*
* Prerequisites:
* - `gemini` binary installed (npm install -g @google/gemini-cli)
* - Gemini authenticated via ~/.gemini/ config or GEMINI_API_KEY env var
* - EVALS=1 env var set (same gate as Claude E2E tests)
*
* Skips gracefully when prerequisites are not met.
*/
import { describe, test, expect, afterAll } from 'bun:test';
import { runGeminiSkill } from './helpers/gemini-session-runner';
import type { GeminiResult } from './helpers/gemini-session-runner';
import { EvalCollector } from './helpers/eval-store';
import { selectTests, detectBaseBranch, getChangedFiles, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
import * as path from 'path';
// Parent of the directory that contains this test file.
const ROOT = path.resolve(import.meta.dir, '..');

// --- Prerequisites check ---

/**
 * True when a `gemini` executable can be resolved on PATH.
 *
 * Uses Bun's built-in PATH lookup instead of spawning the external `which`
 * program: no subprocess is started, the check does not depend on `which`
 * being installed, and there is no catch-all that could hide other errors.
 */
const GEMINI_AVAILABLE = Bun.which('gemini') !== null;
// EVALS must be set to a non-empty value for any test in this file to run.
const evalsEnabled = Boolean(process.env.EVALS);

// The suite runs only when both gates are open: gemini is installed AND EVALS is set.
const SKIP = !(GEMINI_AVAILABLE && evalsEnabled);
const describeGemini = SKIP ? describe.skip : describe;

// Explain the skip only when the user opted in via EVALS but the binary is missing.
// When EVALS is unset we stay silent — same as Claude E2E tests, EVALS=1 required.
if (evalsEnabled && !GEMINI_AVAILABLE) {
  process.stderr.write('\nGemini E2E: SKIPPED — gemini binary not found (install: npm i -g @google/gemini-cli)\n');
}
// --- Diff-based test selection ---

// Touchfile patterns for each Gemini E2E test, keyed by test name
// (same pattern as Codex E2E).
const GEMINI_E2E_TOUCHFILES: Record<string, string[]> = {
  'gemini-discover-skill': ['.agents/skills/**', 'test/helpers/gemini-session-runner.ts'],
  'gemini-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'test/helpers/gemini-session-runner.ts'],
};

// Names of the tests chosen by diff-based selection; null means "run everything".
let selectedTests: string[] | null = null;

// Selection is attempted only when evals are on and EVALS_ALL does not force a full run.
const diffSelectionActive = evalsEnabled && !process.env.EVALS_ALL;
if (diffSelectionActive) {
  // Base branch precedence: explicit EVALS_BASE, then auto-detection, then 'main'.
  const base = process.env.EVALS_BASE || detectBaseBranch(ROOT) || 'main';
  const changed = getChangedFiles(base, ROOT);

  // An empty change list (e.g., on main branch) leaves selectedTests null -> run all.
  if (changed.length !== 0) {
    const { selected, skipped, reason } = selectTests(changed, GEMINI_E2E_TOUCHFILES, GLOBAL_TOUCHFILES);
    selectedTests = selected;

    const totalTests = Object.keys(GEMINI_E2E_TOUCHFILES).length;
    process.stderr.write(`\nGemini E2E selection (${reason}): ${selected.length}/${totalTests} tests\n`);
    if (skipped.length > 0) {
      process.stderr.write(` Skipped: ${skipped.join(', ')}\n`);
    }
    process.stderr.write('\n');
  }
}
/**
 * Register `fn` as a test named `testName`.
 *
 * The test is registered as skipped when diff-based selection produced a list
 * that does not include `testName`; a null selection means every test runs.
 */
function testIfSelected(testName: string, fn: () => Promise<void>, timeout: number) {
  const excluded = selectedTests !== null && !selectedTests.includes(testName);
  if (excluded) {
    test.skip(testName, fn, timeout);
    return;
  }
  test(testName, fn, timeout);
}
// --- Eval result collector ---
// SKIP is already true whenever EVALS is unset, so SKIP alone decides whether
// a collector is created.
const evalCollector = SKIP ? null : new EvalCollector('e2e-gemini');

/**
 * Add one Gemini E2E outcome to the eval collector (no-op when there is no collector).
 *
 * @param name   test name to record
 * @param result result object returned by the Gemini session runner
 * @param passed verdict to store for this test
 */
function recordGeminiE2E(name: string, result: GeminiResult, passed: boolean) {
  if (!evalCollector) return;

  const { durationMs, output, toolCalls, exitCode } = result;
  evalCollector.addTest({
    name,
    suite: 'gemini-e2e',
    tier: 'e2e',
    passed,
    duration_ms: durationMs,
    cost_usd: 0, // Gemini doesn't report cost in USD; tokens are tracked
    output: output?.slice(0, 2000), // keep at most the first 2000 characters
    turns_used: toolCalls.length, // approximate: tool calls as turns
    exit_reason: exitCode === 0 ? 'success' : `exit_code_${exitCode}`,
  });
}
/**
 * Log a one-line usage summary for a Gemini E2E run: token count, number of
 * tool calls, and duration rounded to whole seconds.
 */
function logGeminiCost(label: string, result: GeminiResult) {
  const { durationMs, tokens, toolCalls } = result;
  const seconds = Math.round(durationMs / 1000);
  console.log(`${label}: ${tokens} tokens, ${toolCalls.length} tool calls, ${seconds}s`);
}
// Once all tests in this file are done, finalize the eval collector (if one was created).
afterAll(async () => {
  await evalCollector?.finalize();
});
// --- Tests ---
describeGemini('Gemini E2E', () => {
  // Any one of these words in the (lower-cased) output counts as referencing skills.
  const SKILL_KEYWORDS = ['review', 'gstack', 'skill'];

  // Any one of these words counts as review-like content.
  // ('issue' also covers the phrase 'no issues'.)
  const REVIEW_KEYWORDS = ['finding', 'issue', 'review', 'change', 'diff', 'clean', 'p1', 'p2'];

  /** True when `text`, lower-cased, contains at least one of `keywords`. */
  const mentionsAny = (text: string, keywords: string[]) => {
    const lowered = text.toLowerCase();
    return keywords.some((keyword) => lowered.includes(keyword));
  };

  testIfSelected('gemini-discover-skill', async () => {
    // Run Gemini in the repo root where .agents/skills/ exists
    const result = await runGeminiSkill({
      prompt: 'List any skills or instructions you have available. Just list the names.',
      timeoutMs: 60_000,
      cwd: ROOT,
    });
    logGeminiCost('gemini-discover-skill', result);

    // Treat a missing output as empty so the result is still recorded before
    // the assertions below report the failure.
    const output = result.output ?? '';
    const mentionsSkill = mentionsAny(output, SKILL_KEYWORDS);

    // Record the same verdict the assertions enforce, so the eval store never
    // shows a pass for a run that fails the content check.
    const passed = result.exitCode === 0 && output.length > 0 && mentionsSkill;
    recordGeminiE2E('gemini-discover-skill', result, passed);

    expect(result.exitCode).toBe(0);
    // Gemini should have produced some output
    expect(output.length).toBeGreaterThan(0);
    // The output should reference skills in some form
    expect(mentionsSkill).toBe(true);
  }, 120_000);

  testIfSelected('gemini-review-findings', async () => {
    // Run gstack-review skill via Gemini on this repo
    const result = await runGeminiSkill({
      prompt: 'Run the gstack-review skill on this repository. Review the current branch diff and report your findings.',
      timeoutMs: 540_000,
      cwd: ROOT,
    });
    logGeminiCost('gemini-review-findings', result);

    const output = result.output ?? '';
    const hasReviewContent = mentionsAny(output, REVIEW_KEYWORDS);

    // Recorded verdict mirrors all three assertions below.
    const passed = result.exitCode === 0 && output.length > 50 && hasReviewContent;
    recordGeminiE2E('gemini-review-findings', result, passed);

    expect(result.exitCode).toBe(0);
    // Should produce structured review-like output of non-trivial length
    expect(output.length).toBeGreaterThan(50);
    // Review output should contain some review-like content
    expect(hasReviewContent).toBe(true);
  }, 600_000);
});
+480 -23
View File
@@ -6,6 +6,22 @@ import * as path from 'path';
// Parent of the directory that contains this test file.
const ROOT = path.resolve(import.meta.dir, '..');

// Dynamic template discovery — matches the generator's findTemplates() behavior.
// New skills automatically get test coverage without updating a static list.
//
// Result: the repo root (as dir '.', name 'root gstack') when ROOT/SKILL.md.tmpl
// exists, followed by every non-hidden, non-node_modules directory directly
// under ROOT that contains a SKILL.md.tmpl.
const ALL_SKILLS: Array<{ dir: string; name: string }> = (() => {
  const hasTemplate = (...segments: string[]) =>
    fs.existsSync(path.join(ROOT, ...segments, 'SKILL.md.tmpl'));

  const rootSkill = hasTemplate() ? [{ dir: '.', name: 'root gstack' }] : [];

  const subdirSkills = fs
    .readdirSync(ROOT, { withFileTypes: true })
    .filter((entry) => entry.isDirectory() && !entry.name.startsWith('.') && entry.name !== 'node_modules')
    .filter((entry) => hasTemplate(entry.name))
    .map((entry) => ({ dir: entry.name, name: entry.name }));

  return [...rootSkill, ...subdirSkills];
})();
describe('gen-skill-docs', () => {
test('generated SKILL.md contains all command categories', () => {
const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
@@ -56,29 +72,6 @@ describe('gen-skill-docs', () => {
}
});
// All skills that must have templates — single source of truth.
// Each entry pairs a directory (joined onto ROOT by the tests; '.' is the repo
// root itself) with a display name.
// NOTE(review): a dynamically discovered ALL_SKILLS appears earlier in this
// view — presumably this static list is the version being replaced; confirm.
const ALL_SKILLS = [
{ dir: '.', name: 'root gstack' },
{ dir: 'browse', name: 'browse' },
{ dir: 'qa', name: 'qa' },
{ dir: 'qa-only', name: 'qa-only' },
{ dir: 'review', name: 'review' },
{ dir: 'ship', name: 'ship' },
{ dir: 'plan-ceo-review', name: 'plan-ceo-review' },
{ dir: 'plan-eng-review', name: 'plan-eng-review' },
{ dir: 'retro', name: 'retro' },
{ dir: 'setup-browser-cookies', name: 'setup-browser-cookies' },
{ dir: 'gstack-upgrade', name: 'gstack-upgrade' },
{ dir: 'plan-design-review', name: 'plan-design-review' },
{ dir: 'design-review', name: 'design-review' },
{ dir: 'design-consultation', name: 'design-consultation' },
{ dir: 'document-release', name: 'document-release' },
{ dir: 'careful', name: 'careful' },
{ dir: 'freeze', name: 'freeze' },
{ dir: 'guard', name: 'guard' },
{ dir: 'unfreeze', name: 'unfreeze' },
];
test('every skill has a SKILL.md.tmpl template', () => {
for (const skill of ALL_SKILLS) {
const tmplPath = path.join(ROOT, skill.dir, 'SKILL.md.tmpl');
@@ -422,3 +415,467 @@ describe('REVIEW_DASHBOARD resolver', () => {
expect(content).not.toContain('Review Chaining');
});
});
// --- {{SPEC_REVIEW_LOOP}} resolver tests ---
describe('SPEC_REVIEW_LOOP resolver', () => {
  // office-hours/SKILL.md, read once and shared by every test in this suite.
  const officeHoursDoc = fs.readFileSync(path.join(ROOT, 'office-hours', 'SKILL.md'), 'utf-8');

  test('contains all 5 review dimensions', () => {
    ['Completeness', 'Consistency', 'Clarity', 'Scope', 'Feasibility'].forEach((dimension) => {
      expect(officeHoursDoc).toContain(dimension);
    });
  });

  // Each remaining test makes a single expectation against the document:
  // a string must appear verbatim, a RegExp must match.
  const singleChecks: Array<[string, string | RegExp]> = [
    ['references Agent tool for subagent dispatch', /Agent.*tool/i],
    ['specifies max 3 iterations', /3.*iteration|maximum.*3/i],
    ['includes quality score', 'quality score'],
    ['includes metrics path', 'spec-review.jsonl'],
    ['includes convergence guard', /[Cc]onvergence/],
    ['includes graceful failure handling', /skip.*review|unavailable/i],
  ];

  for (const [title, needle] of singleChecks) {
    test(title, () => {
      if (typeof needle === 'string') {
        expect(officeHoursDoc).toContain(needle);
      } else {
        expect(officeHoursDoc).toMatch(needle);
      }
    });
  }
});
// --- {{DESIGN_SKETCH}} resolver tests ---
describe('DESIGN_SKETCH resolver', () => {
  // office-hours/SKILL.md, read once and shared by every test in this suite.
  const officeHoursDoc = fs.readFileSync(path.join(ROOT, 'office-hours', 'SKILL.md'), 'utf-8');

  /** Register a test asserting that `text` appears verbatim in the document. */
  const expectsText = (title: string, text: string) => {
    test(title, () => {
      expect(officeHoursDoc).toContain(text);
    });
  };

  /** Register a test asserting that `pattern` matches somewhere in the document. */
  const expectsPattern = (title: string, pattern: RegExp) => {
    test(title, () => {
      expect(officeHoursDoc).toMatch(pattern);
    });
  };

  expectsText('references DESIGN.md for design system constraints', 'DESIGN.md');
  expectsPattern('contains wireframe or sketch terminology', /wireframe|sketch/i);
  expectsText('references browse binary for rendering', '$B goto');
  expectsText('references screenshot capture', '$B screenshot');
  expectsPattern('specifies rough aesthetic', /[Rr]ough|hand-drawn/);
  expectsPattern('includes skip conditions', /no UI component|skip/i);
});
// --- {{BENEFITS_FROM}} resolver tests ---
describe('BENEFITS_FROM resolver', () => {
  /** Read <dir>/SKILL.md under the repo root. */
  const readSkillDoc = (dir: string) => fs.readFileSync(path.join(ROOT, dir, 'SKILL.md'), 'utf-8');

  // Heading text whose presence marks the prerequisite offer.
  const OFFER_HEADING = 'Prerequisite Skill Offer';

  const ceoContent = readSkillDoc('plan-ceo-review');
  const engContent = readSkillDoc('plan-eng-review');

  test('plan-ceo-review contains prerequisite skill offer', () => {
    expect(ceoContent).toContain(OFFER_HEADING);
    expect(ceoContent).toContain('/office-hours');
  });

  test('plan-eng-review contains prerequisite skill offer', () => {
    expect(engContent).toContain(OFFER_HEADING);
    expect(engContent).toContain('/office-hours');
  });

  test('offer includes graceful decline', () => {
    expect(ceoContent).toContain('No worries');
  });

  test('skills without benefits-from do NOT have prerequisite offer', () => {
    // qa is the example checked here; it must not carry the offer.
    expect(readSkillDoc('qa')).not.toContain(OFFER_HEADING);
  });
});
// ─── Codex Generation Tests ─────────────────────────────────
describe('Codex generation (--host codex)', () => {
  const AGENTS_DIR = path.join(ROOT, '.agents', 'skills');

  /** Read .agents/skills/<codexName>/SKILL.md. */
  const readCodexSkill = (codexName: string) =>
    fs.readFileSync(path.join(AGENTS_DIR, codexName, 'SKILL.md'), 'utf-8');

  /** Read <dir>/SKILL.md under the repo root (the Claude-side output). */
  const readClaudeSkill = (dir: string) => fs.readFileSync(path.join(ROOT, dir, 'SKILL.md'), 'utf-8');

  /** True when the given path below .agents/skills/ exists. */
  const agentsPathExists = (...segments: string[]) => fs.existsSync(path.join(AGENTS_DIR, ...segments));

  /** Offset of the first '\n---' at or after index 4, or -1 when there is none. */
  const frontmatterEnd = (content: string) => content.indexOf('\n---', 4);

  /**
   * Text between index 4 and the closing fence.
   * Assumes the file opens with the 4-character '---\n' fence; only the
   * frontmatter test below actually asserts that.
   */
  const frontmatterOf = (content: string) => content.slice(4, frontmatterEnd(content));

  /** Run the generator in --dry-run mode for `host` from the repo root, capturing output. */
  const dryRun = (host: string) =>
    Bun.spawnSync(['bun', 'run', 'scripts/gen-skill-docs.ts', '--host', host, '--dry-run'], {
      cwd: ROOT,
      stdout: 'pipe',
      stderr: 'pipe',
    });

  // Dynamic discovery of expected Codex skills: all templates except /codex.
  // The root template maps to 'gstack'; a subdirectory maps to 'gstack-<dir>'
  // unless its name already starts with 'gstack-'.
  const CODEX_SKILLS: Array<{ dir: string; codexName: string }> = [
    ...(fs.existsSync(path.join(ROOT, 'SKILL.md.tmpl')) ? [{ dir: '.', codexName: 'gstack' }] : []),
    ...fs
      .readdirSync(ROOT, { withFileTypes: true })
      .filter((entry) => entry.isDirectory() && !entry.name.startsWith('.') && entry.name !== 'node_modules')
      .filter((entry) => entry.name !== 'codex') // /codex is excluded from Codex output
      .filter((entry) => fs.existsSync(path.join(ROOT, entry.name, 'SKILL.md.tmpl')))
      .map((entry) => ({
        dir: entry.name,
        codexName: entry.name.startsWith('gstack-') ? entry.name : `gstack-${entry.name}`,
      })),
  ];

  test('--host codex generates correct output paths', () => {
    CODEX_SKILLS.forEach(({ codexName }) => {
      expect(agentsPathExists(codexName, 'SKILL.md')).toBe(true);
    });
  });

  test('codexSkillName mapping: root is gstack, others are gstack-{dir}', () => {
    // Root → gstack
    expect(agentsPathExists('gstack', 'SKILL.md')).toBe(true);
    // Subdirectories → gstack-{dir}
    expect(agentsPathExists('gstack-review', 'SKILL.md')).toBe(true);
    expect(agentsPathExists('gstack-ship', 'SKILL.md')).toBe(true);
    // gstack-upgrade doesn't double-prefix
    expect(agentsPathExists('gstack-upgrade', 'SKILL.md')).toBe(true);
    // No double-prefix: gstack-gstack-upgrade must NOT exist
    expect(agentsPathExists('gstack-gstack-upgrade', 'SKILL.md')).toBe(false);
  });

  test('Codex frontmatter has ONLY name + description', () => {
    CODEX_SKILLS.forEach(({ codexName }) => {
      const doc = readCodexSkill(codexName);
      expect(doc.startsWith('---\n')).toBe(true);
      const closingFence = frontmatterEnd(doc);
      expect(closingFence).toBeGreaterThan(0);
      const header = doc.slice(4, closingFence);
      // Must have name and description
      expect(header).toContain('name:');
      expect(header).toContain('description:');
      // Must NOT have allowed-tools, version, or hooks
      expect(header).not.toContain('allowed-tools:');
      expect(header).not.toContain('version:');
      expect(header).not.toContain('hooks:');
    });
  });

  test('no .claude/skills/ in Codex output', () => {
    CODEX_SKILLS.forEach(({ codexName }) => {
      expect(readCodexSkill(codexName)).not.toContain('.claude/skills');
    });
  });

  test('no ~/.claude/ paths in Codex output', () => {
    CODEX_SKILLS.forEach(({ codexName }) => {
      expect(readCodexSkill(codexName)).not.toContain('~/.claude/');
    });
  });

  test('/codex skill excluded from Codex output', () => {
    expect(agentsPathExists('gstack-codex', 'SKILL.md')).toBe(false);
    expect(agentsPathExists('gstack-codex')).toBe(false);
  });

  test('Codex review step stripped from Codex-host ship and review', () => {
    // Same two phrases must be absent from both skills; ship is checked first.
    for (const codexName of ['gstack-ship', 'gstack-review']) {
      const doc = readCodexSkill(codexName);
      expect(doc).not.toContain('codex review --base');
      expect(doc).not.toContain('Investigate and fix');
    }
  });

  test('--host codex --dry-run freshness', () => {
    const run = dryRun('codex');
    expect(run.exitCode).toBe(0);
    const report = run.stdout.toString();
    // Every Codex skill should be FRESH
    CODEX_SKILLS.forEach(({ codexName }) => {
      expect(report).toContain(`FRESH: .agents/skills/${codexName}/SKILL.md`);
    });
    expect(report).not.toContain('STALE');
  });

  test('--host agents alias produces same output as --host codex', () => {
    const codexRun = dryRun('codex');
    const agentsRun = dryRun('agents');
    expect(codexRun.exitCode).toBe(0);
    expect(agentsRun.exitCode).toBe(0);
    // Both should produce the same output (same FRESH lines)
    expect(codexRun.stdout.toString()).toBe(agentsRun.stdout.toString());
  });

  test('multiline descriptions preserved in Codex output', () => {
    // office-hours has a multiline description — verify it survives the frontmatter transform
    const header = frontmatterOf(readCodexSkill('gstack-office-hours'));
    // Description should span multiple lines (block scalar): count the indented lines
    const indentedLines = header.split('\n').filter((line) => line.startsWith(' '));
    expect(indentedLines.length).toBeGreaterThan(1);
    // Verify key phrases survived
    expect(header).toContain('YC Office Hours');
  });

  test('hook skills have safety prose and no hooks: in frontmatter', () => {
    for (const skillName of ['gstack-careful', 'gstack-freeze', 'gstack-guard']) {
      const doc = readCodexSkill(skillName);
      // Must have safety advisory prose
      expect(doc).toContain('Safety Advisory');
      // Must NOT have hooks: in frontmatter
      expect(frontmatterOf(doc)).not.toContain('hooks:');
    }
  });

  test('all Codex SKILL.md files have auto-generated header', () => {
    CODEX_SKILLS.forEach(({ codexName }) => {
      const doc = readCodexSkill(codexName);
      expect(doc).toContain('AUTO-GENERATED from SKILL.md.tmpl');
      expect(doc).toContain('Regenerate: bun run gen:skill-docs');
    });
  });

  test('Codex preamble uses codex paths', () => {
    // Check a skill that has a preamble (review is a good candidate)
    const doc = readCodexSkill('gstack-review');
    expect(doc).toContain('~/.codex/skills/gstack');
    expect(doc).toContain('.agents/skills/gstack');
  });

  // ─── Path rewriting regression tests ─────────────────────────

  test('sidecar paths point to .agents/skills/gstack/review/ (not gstack-review/)', () => {
    // Regression: gen-skill-docs rewrote .claude/skills/review → .agents/skills/gstack-review
    // but setup puts sidecars under .agents/skills/gstack/review/. Must match setup layout.
    const doc = readCodexSkill('gstack-review');
    // Correct: references to sidecar files use gstack/review/ path
    expect(doc).toContain('.agents/skills/gstack/review/checklist.md');
    expect(doc).toContain('.agents/skills/gstack/review/design-checklist.md');
    // Wrong: must NOT reference gstack-review/checklist.md (file doesn't exist there)
    expect(doc).not.toContain('.agents/skills/gstack-review/checklist.md');
    expect(doc).not.toContain('.agents/skills/gstack-review/design-checklist.md');
  });

  test('sidecar paths in ship skill point to gstack/review/ for pre-landing review', () => {
    const doc = readCodexSkill('gstack-ship');
    // Ship references the review checklist in its pre-landing review step;
    // the path checks apply only when it mentions a checklist at all.
    if (!doc.includes('checklist.md')) return;
    expect(doc).toContain('.agents/skills/gstack/review/');
    expect(doc).not.toContain('.agents/skills/gstack-review/checklist');
  });

  test('greptile-triage sidecar path is correct', () => {
    const doc = readCodexSkill('gstack-review');
    // Checked only when the skill mentions greptile-triage at all.
    if (!doc.includes('greptile-triage')) return;
    expect(doc).toContain('.agents/skills/gstack/review/greptile-triage.md');
    expect(doc).not.toContain('.agents/skills/gstack-review/greptile-triage');
  });

  test('all four path rewrite rules produce correct output', () => {
    // Test each of the 4 path rewrite rules individually
    const doc = readCodexSkill('gstack-review');
    // Rule 1: ~/.claude/skills/gstack → ~/.codex/skills/gstack
    expect(doc).not.toContain('~/.claude/skills/gstack');
    expect(doc).toContain('~/.codex/skills/gstack');
    // Rule 2: .claude/skills/gstack → .agents/skills/gstack
    expect(doc).not.toContain('.claude/skills/gstack');
    // Rule 3: .claude/skills/review → .agents/skills/gstack/review
    expect(doc).not.toContain('.claude/skills/review');
    // Rule 4: .claude/skills → .agents/skills (catch-all)
    expect(doc).not.toContain('.claude/skills');
  });

  test('path rewrite rules apply to all Codex skills with sidecar references', () => {
    // Verify across ALL generated skills, not just review
    CODEX_SKILLS.forEach(({ codexName }) => {
      const doc = readCodexSkill(codexName);
      // No skill should reference Claude paths
      expect(doc).not.toContain('~/.claude/skills');
      expect(doc).not.toContain('.claude/skills');
      // If a skill references checklist.md, it must use the correct sidecar path.
      // NOTE(review): this guard also skips every skill that mentions
      // design-checklist.md, so those skills are not covered by the check below.
      const mentionsOnlyPlainChecklist = doc.includes('checklist.md') && !doc.includes('design-checklist.md');
      if (mentionsOnlyPlainChecklist) {
        expect(doc).not.toContain('gstack-review/checklist.md');
      }
    });
  });

  // ─── Claude output regression guard ─────────────────────────

  test('Claude output unchanged: review skill still uses .claude/skills/ paths', () => {
    // Codex changes must NOT affect Claude output
    const doc = readClaudeSkill('review');
    expect(doc).toContain('.claude/skills/review/checklist.md');
    expect(doc).toContain('~/.claude/skills/gstack');
    // Must NOT contain Codex paths
    expect(doc).not.toContain('.agents/skills');
    expect(doc).not.toContain('~/.codex/');
  });

  test('Claude output unchanged: ship skill still uses .claude/skills/ paths', () => {
    const doc = readClaudeSkill('ship');
    expect(doc).toContain('~/.claude/skills/gstack');
    expect(doc).not.toContain('.agents/skills');
    expect(doc).not.toContain('~/.codex/');
  });

  test('Claude output unchanged: all Claude skills have zero Codex paths', () => {
    ALL_SKILLS.forEach(({ dir }) => {
      const doc = readClaudeSkill(dir);
      expect(doc).not.toContain('~/.codex/');
      expect(doc).not.toContain('.agents/skills');
    });
  });
});
// ─── Setup script validation ─────────────────────────────────
// These tests verify the setup script's install layout matches
// what the generator produces — catching the bug where setup
// installed Claude-format source dirs for Codex users.
describe('setup script validation', () => {
  const setupContent = fs.readFileSync(path.join(ROOT, 'setup'), 'utf-8');

  /**
   * Return the setup text from the first `startMarker` up to (not including)
   * the first `endMarker`.
   *
   * Asserts that both markers exist and appear in that order. Without this a
   * missing marker makes indexOf return -1, and slice() then silently yields
   * an empty or oversized section, so the test fails misleadingly or passes
   * without checking anything.
   */
  function sectionBetween(startMarker: string, endMarker: string): string {
    const start = setupContent.indexOf(startMarker);
    const end = setupContent.indexOf(endMarker);
    expect(start).toBeGreaterThanOrEqual(0);
    expect(end).toBeGreaterThan(start);
    return setupContent.slice(start, end);
  }

  /**
   * Return the setup text from `<fnName>()` up to the first '}' found at or
   * after the first `tailMarker` that follows it.
   *
   * The '}' search starts at the marker itself, so a marker that ends in '}'
   * (such as 'linked[@]}') terminates the slice at its own brace.
   * Asserts that the function name and the marker are both present.
   */
  function shellFunctionBody(fnName: string, tailMarker: string): string {
    const fnStart = setupContent.indexOf(`${fnName}()`);
    expect(fnStart).toBeGreaterThanOrEqual(0);
    const markerAt = setupContent.indexOf(tailMarker, fnStart);
    expect(markerAt).toBeGreaterThanOrEqual(0);
    const fnEnd = setupContent.indexOf('}', markerAt);
    expect(fnEnd).toBeGreaterThan(fnStart);
    return setupContent.slice(fnStart, fnEnd);
  }

  test('setup has separate link functions for Claude and Codex', () => {
    expect(setupContent).toContain('link_claude_skill_dirs');
    expect(setupContent).toContain('link_codex_skill_dirs');
    // Old unified function must not exist
    expect(setupContent).not.toMatch(/^link_skill_dirs\(\)/m);
  });

  test('Claude install uses link_claude_skill_dirs', () => {
    // The Claude install section (section 4) should use the Claude function
    const claudeSection = sectionBetween('# 4. Install for Claude', '# 5. Install for Codex');
    expect(claudeSection).toContain('link_claude_skill_dirs');
    expect(claudeSection).not.toContain('link_codex_skill_dirs');
  });

  test('Codex install uses link_codex_skill_dirs', () => {
    // The Codex install section (section 5) should use the Codex function
    const codexSection = sectionBetween('# 5. Install for Codex', '# 6. Create');
    expect(codexSection).toContain('link_codex_skill_dirs');
    expect(codexSection).not.toContain('link_claude_skill_dirs');
  });

  test('link_codex_skill_dirs reads from .agents/skills/', () => {
    // The Codex link function must reference .agents/skills for generated Codex skills
    const fnBody = shellFunctionBody('link_codex_skill_dirs', 'linked[@]}');
    expect(fnBody).toContain('.agents/skills');
    expect(fnBody).toContain('gstack*');
  });

  test('link_claude_skill_dirs creates relative symlinks', () => {
    // Claude links should be relative: ln -snf "gstack/skill_name"
    const fnBody = shellFunctionBody('link_claude_skill_dirs', 'linked[@]}');
    expect(fnBody).toContain('ln -snf "gstack/$skill_name"');
  });

  test('setup supports --host auto|claude|codex', () => {
    expect(setupContent).toContain('--host');
    expect(setupContent).toContain('claude|codex|auto');
  });

  test('auto mode detects claude and codex binaries', () => {
    expect(setupContent).toContain('command -v claude');
    expect(setupContent).toContain('command -v codex');
  });

  test('create_agents_sidecar links runtime assets', () => {
    // Sidecar must link bin, browse, review, qa
    const fnBody = shellFunctionBody('create_agents_sidecar', 'done');
    for (const asset of ['bin', 'browse', 'review', 'qa']) {
      expect(fnBody).toContain(asset);
    }
  });
});
describe('telemetry', () => {
  // Read a generated document below ROOT as UTF-8 text.
  const readDoc = (...segments: string[]): string =>
    fs.readFileSync(path.join(ROOT, ...segments), 'utf-8');

  // Assert that every marker occurs in the document, checking them in the listed order.
  const expectMarkers = (doc: string, markers: string[]): void => {
    for (const marker of markers) {
      expect(doc).toContain(marker);
    }
  };

  test('generated SKILL.md contains telemetry start block', () => {
    expectMarkers(readDoc('SKILL.md'), [
      '_TEL_START',
      '_SESSION_ID',
      'TELEMETRY:',
      'TEL_PROMPTED:',
      'gstack-config get telemetry',
    ]);
  });

  test('generated SKILL.md contains telemetry opt-in prompt', () => {
    expectMarkers(readDoc('SKILL.md'), [
      '.telemetry-prompted',
      'Help gstack get better',
      'gstack-config set telemetry community',
      'gstack-config set telemetry anonymous',
      'gstack-config set telemetry off',
    ]);
  });

  test('generated SKILL.md contains telemetry epilogue', () => {
    expectMarkers(readDoc('SKILL.md'), [
      'Telemetry (run last)',
      'gstack-telemetry-log',
      '_TEL_END',
      '_TEL_DUR',
      'SKILL_NAME',
      'OUTCOME',
      'PLAN MODE EXCEPTION',
    ]);
  });

  test('generated SKILL.md contains pending marker handling', () => {
    expectMarkers(readDoc('SKILL.md'), ['.pending', '_pending_finalize']);
  });

  test('telemetry blocks appear in all skill files that use PREAMBLE', () => {
    const skills = ['qa', 'ship', 'review', 'plan-ceo-review', 'plan-eng-review', 'retro'];
    for (const skill of skills) {
      // Skills whose SKILL.md is absent are skipped rather than failed.
      if (!fs.existsSync(path.join(ROOT, skill, 'SKILL.md'))) continue;
      expectMarkers(readDoc(skill, 'SKILL.md'), ['_TEL_START', 'Telemetry (run last)']);
    }
  });
});
+282
View File
@@ -0,0 +1,282 @@
/**
* Codex CLI subprocess runner for skill E2E testing.
*
* Spawns `codex exec` as a completely independent process, parses its JSONL
* output, and returns structured results. Follows the same pattern as
* session-runner.ts but adapted for the Codex CLI.
*
* Key differences from Claude session-runner:
* - Uses `codex exec` instead of `claude -p`
* - Output is JSONL with different event types (item.completed, turn.completed, thread.started)
* - Uses `--json` flag instead of `--output-format stream-json`
* - Needs temp HOME with skill installed at ~/.codex/skills/{skillName}/SKILL.md
*/
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
// --- Interfaces ---
/**
 * Structured result of one `codex exec` run, as returned by runCodexSkill.
 */
export interface CodexResult {
output: string; // Full agent message text; 'SKIP: codex binary not found' when the CLI is missing
reasoning: string[]; // [codex thinking] blocks
toolCalls: string[]; // [codex ran] commands
tokens: number; // Total tokens used
exitCode: number; // Process exit code; 124 on timeout, -1 when the codex binary is not found
durationMs: number; // Wall clock time
sessionId: string | null; // Thread ID for session continuity
rawLines: string[]; // Raw JSONL lines for debugging
}
// --- JSONL parser (ported from Python in codex/SKILL.md.tmpl) ---
export interface ParsedCodexJSONL {
  output: string;
  reasoning: string[];
  toolCalls: string[];
  tokens: number;
  sessionId: string | null;
}

/**
 * Turn raw `codex exec --json` output lines into a structured summary.
 * Pure: performs no I/O and mutates nothing outside its own locals.
 *
 * Recognized events:
 *   - thread.started → `thread_id` becomes the session ID (a later event overrides an earlier one)
 *   - item.completed → `reasoning` / `agent_message` text and `command_execution` commands
 *   - turn.completed → `usage.input_tokens + usage.output_tokens`, summed over all turns
 *
 * Blank lines, lines that are not valid JSON, and unrecognized event types are ignored.
 *
 * @param lines Raw JSONL lines, one event per entry.
 * @returns Agent messages joined with newlines, plus reasoning, commands, tokens and session ID.
 */
export function parseCodexJSONL(lines: string[]): ParsedCodexJSONL {
  const messages: string[] = [];
  const reasoning: string[] = [];
  const toolCalls: string[] = [];
  let tokens = 0;
  let sessionId: string | null = null;

  for (const raw of lines) {
    if (raw.trim() === '') continue;

    try {
      const event = JSON.parse(raw);

      switch (event.type || '') {
        case 'thread.started': {
          const threadId = event.thread_id || '';
          if (threadId) sessionId = threadId;
          break;
        }

        case 'item.completed': {
          const item = event.item;
          if (!item) break;

          const kind = item.type || '';
          const text = item.text || '';
          if (kind === 'command_execution') {
            const command = item.command || '';
            if (command) toolCalls.push(command);
          } else if (text) {
            // Only these two item kinds carry text we keep.
            if (kind === 'reasoning') reasoning.push(text);
            else if (kind === 'agent_message') messages.push(text);
          }
          break;
        }

        case 'turn.completed': {
          const usage = event.usage || {};
          tokens += (usage.input_tokens || 0) + (usage.output_tokens || 0);
          break;
        }
      }
    } catch {
      // Malformed JSON (or a bare `null`) — drop the line and keep going.
    }
  }

  return { output: messages.join('\n'), reasoning, toolCalls, tokens, sessionId };
}
// --- Skill installation helper ---
/**
 * Place a skill's SKILL.md where Codex looks for it inside a (temporary) HOME:
 * `<home>/.codex/skills/<skillName>/SKILL.md`.
 *
 * The destination directory is always created. The copy itself is skipped when
 * `<skillDir>/SKILL.md` does not exist.
 *
 * @param skillDir  Directory containing the source SKILL.md.
 * @param skillName Directory name to install the skill under.
 * @param tempHome  HOME directory to install into; a fresh `codex-e2e-*` temp
 *                  directory is created when omitted (or empty).
 * @returns The HOME path that was used. The caller is responsible for removing it.
 */
export function installSkillToTempHome(
  skillDir: string,
  skillName: string,
  tempHome?: string,
): string {
  const home = tempHome ? tempHome : fs.mkdtempSync(path.join(os.tmpdir(), 'codex-e2e-'));

  const installDir = path.join(home, '.codex', 'skills', skillName);
  fs.mkdirSync(installDir, { recursive: true });

  const source = path.join(skillDir, 'SKILL.md');
  if (!fs.existsSync(source)) {
    return home;
  }

  fs.copyFileSync(source, path.join(installDir, 'SKILL.md'));
  return home;
}
// --- Main runner ---
/**
 * Run a Codex skill via `codex exec` and return structured results.
 *
 * Creates a temp HOME, installs the skill into it, copies the real `~/.codex/`
 * entries (everything except `skills/`) so Codex can authenticate, spawns
 * `codex exec <prompt> --json -s <sandbox>`, streams stdout as JSONL, and parses
 * the collected lines with parseCodexJSONL. The temp HOME is removed on exit.
 *
 * If the `codex` binary is not found, returns a result with
 * `output: 'SKIP: codex binary not found'` and `exitCode: -1` without spawning.
 * If the process is still running after `timeoutMs`, it is killed and the
 * result's `exitCode` is 124.
 *
 * @param opts.skillDir  Path to skill directory containing SKILL.md.
 * @param opts.prompt    What to ask Codex to do with the skill.
 * @param opts.timeoutMs Kill the process after this many ms (default 300000).
 * @param opts.cwd       Working directory for codex (default: `skillDir`).
 * @param opts.skillName Name to install the skill under (default: basename of `skillDir`).
 * @param opts.sandbox   Value for the `-s` flag (default: 'read-only').
 * @returns Parsed output, tool calls, token count, exit code, duration and raw lines.
 */
export async function runCodexSkill(opts: {
  skillDir: string;    // Path to skill directory containing SKILL.md
  prompt: string;      // What to ask Codex to do with the skill
  timeoutMs?: number;  // Default 300000 (5 min)
  cwd?: string;        // Working directory
  skillName?: string;  // Skill name for installation (default: dirname)
  sandbox?: string;    // Sandbox mode (default: 'read-only')
}): Promise<CodexResult> {
  const {
    skillDir,
    prompt,
    timeoutMs = 300_000,
    cwd,
    skillName,
    sandbox = 'read-only',
  } = opts;
  const startTime = Date.now();
  const name = skillName || path.basename(skillDir) || 'gstack';

  // Check if codex binary exists
  const whichResult = Bun.spawnSync(['which', 'codex']);
  if (whichResult.exitCode !== 0) {
    return {
      output: 'SKIP: codex binary not found',
      reasoning: [],
      toolCalls: [],
      tokens: 0,
      exitCode: -1,
      durationMs: Date.now() - startTime,
      sessionId: null,
      rawLines: [],
    };
  }

  // Set up temp HOME with skill installed
  const tempHome = fs.mkdtempSync(path.join(os.tmpdir(), 'codex-e2e-'));
  const realHome = os.homedir();
  try {
    installSkillToTempHome(skillDir, name, tempHome);

    // Copy the real Codex config so codex can authenticate from the temp HOME.
    // Codex stores auth in ~/.codex/ — we need the config but not the skills
    // (we install our own test skills above).
    const realCodexConfig = path.join(realHome, '.codex');
    const tempCodexDir = path.join(tempHome, '.codex');
    if (fs.existsSync(realCodexConfig)) {
      // (skills/ is already set up by installSkillToTempHome)
      const entries = fs.readdirSync(realCodexConfig);
      for (const entry of entries) {
        if (entry === 'skills') continue; // don't clobber our test skills
        const src = path.join(realCodexConfig, entry);
        const dst = path.join(tempCodexDir, entry);
        if (!fs.existsSync(dst)) {
          fs.cpSync(src, dst, { recursive: true });
        }
      }
    }

    // Build codex exec command
    const args = ['exec', prompt, '--json', '-s', sandbox];

    // Spawn codex with temp HOME so it discovers our installed skill
    const proc = Bun.spawn(['codex', ...args], {
      cwd: cwd || skillDir,
      stdout: 'pipe',
      stderr: 'pipe',
      env: {
        ...process.env,
        HOME: tempHome,
      },
    });

    // Race against timeout
    let timedOut = false;
    const timeoutId = setTimeout(() => {
      timedOut = true;
      proc.kill();
    }, timeoutMs);

    try {
      // Stream and collect JSONL from stdout
      const collectedLines: string[] = [];
      const stderrPromise = new Response(proc.stderr).text();
      const reader = proc.stdout.getReader();
      const decoder = new TextDecoder();
      let buf = '';
      try {
        while (true) {
          const { done, value } = await reader.read();
          if (done) break;
          buf += decoder.decode(value, { stream: true });
          const lines = buf.split('\n');
          buf = lines.pop() || '';
          for (const line of lines) {
            if (!line.trim()) continue;
            collectedLines.push(line);
            // Real-time progress to stderr
            try {
              const event = JSON.parse(line);
              if (event.type === 'item.completed' && event.item) {
                const item = event.item;
                if (item.type === 'command_execution' && item.command) {
                  const elapsed = Math.round((Date.now() - startTime) / 1000);
                  process.stderr.write(`  [codex ${elapsed}s] ran: ${item.command.slice(0, 100)}\n`);
                } else if (item.type === 'agent_message' && item.text) {
                  const elapsed = Math.round((Date.now() - startTime) / 1000);
                  process.stderr.write(`  [codex ${elapsed}s] message: ${item.text.slice(0, 100)}\n`);
                }
              }
            } catch { /* skip — parseCodexJSONL will handle it later */ }
          }
        }
      } catch { /* stream read error — fall through to exit code handling */ }

      // End the streaming decode so bytes still buffered inside the decoder
      // (e.g. a multi-byte character split across chunks) are not dropped.
      buf += decoder.decode();

      // Flush remaining buffer
      if (buf.trim()) {
        collectedLines.push(buf);
      }

      const stderr = await stderrPromise;
      const exitCode = await proc.exited;
      const durationMs = Date.now() - startTime;

      // Parse all collected JSONL lines
      const parsed = parseCodexJSONL(collectedLines);

      // Log stderr if non-empty (may contain auth errors, etc.)
      if (stderr.trim()) {
        process.stderr.write(`  [codex stderr] ${stderr.trim().slice(0, 200)}\n`);
      }

      return {
        output: parsed.output,
        reasoning: parsed.reasoning,
        toolCalls: parsed.toolCalls,
        tokens: parsed.tokens,
        exitCode: timedOut ? 124 : exitCode,
        durationMs,
        sessionId: parsed.sessionId,
        rawLines: collectedLines,
      };
    } finally {
      // Always disarm the kill timer, including when an await above rejects,
      // so it cannot keep running after this function has returned.
      clearTimeout(timeoutId);
    }
  } finally {
    // Clean up temp HOME
    try { fs.rmSync(tempHome, { recursive: true, force: true }); } catch { /* non-fatal */ }
  }
}
+104
View File
@@ -0,0 +1,104 @@
import { describe, test, expect } from 'bun:test';
import { parseGeminiJSONL } from './gemini-session-runner';
// Fixture: actual Gemini CLI stream-json output with tool use
const FIXTURE_LINES = [
'{"type":"init","timestamp":"2026-03-20T15:14:46.455Z","session_id":"test-session-123","model":"auto-gemini-3"}',
'{"type":"message","timestamp":"2026-03-20T15:14:46.456Z","role":"user","content":"list the files"}',
'{"type":"message","timestamp":"2026-03-20T15:14:49.650Z","role":"assistant","content":"I will list the files.","delta":true}',
'{"type":"tool_use","timestamp":"2026-03-20T15:14:49.690Z","tool_name":"run_shell_command","tool_id":"cmd_1","parameters":{"command":"ls"}}',
'{"type":"tool_result","timestamp":"2026-03-20T15:14:49.931Z","tool_id":"cmd_1","status":"success","output":"file1.ts\\nfile2.ts"}',
'{"type":"message","timestamp":"2026-03-20T15:14:51.945Z","role":"assistant","content":"Here are the files.","delta":true}',
'{"type":"result","timestamp":"2026-03-20T15:14:52.030Z","status":"success","stats":{"total_tokens":27147,"input_tokens":26928,"output_tokens":87,"cached":0,"duration_ms":5575,"tool_calls":1}}',
];
describe('parseGeminiJSONL', () => {
test('extracts session ID from init event', () => {
const parsed = parseGeminiJSONL(FIXTURE_LINES);
expect(parsed.sessionId).toBe('test-session-123');
});
test('concatenates assistant message deltas into output', () => {
const parsed = parseGeminiJSONL(FIXTURE_LINES);
expect(parsed.output).toBe('I will list the files.Here are the files.');
});
test('ignores user messages', () => {
const lines = [
'{"type":"message","role":"user","content":"this should be ignored"}',
'{"type":"message","role":"assistant","content":"this should be kept","delta":true}',
];
const parsed = parseGeminiJSONL(lines);
expect(parsed.output).toBe('this should be kept');
});
test('extracts tool names from tool_use events', () => {
const parsed = parseGeminiJSONL(FIXTURE_LINES);
expect(parsed.toolCalls).toHaveLength(1);
expect(parsed.toolCalls[0]).toBe('run_shell_command');
});
test('extracts total tokens from result stats', () => {
const parsed = parseGeminiJSONL(FIXTURE_LINES);
expect(parsed.tokens).toBe(27147);
});
test('skips malformed lines without throwing', () => {
const lines = [
'{"type":"init","session_id":"ok"}',
'this is not json',
'{"type":"message","role":"assistant","content":"hello","delta":true}',
'{incomplete json',
'{"type":"result","status":"success","stats":{"total_tokens":100}}',
];
const parsed = parseGeminiJSONL(lines);
expect(parsed.sessionId).toBe('ok');
expect(parsed.output).toBe('hello');
expect(parsed.tokens).toBe(100);
});
test('skips empty and whitespace-only lines', () => {
const lines = [
'',
' ',
'{"type":"init","session_id":"s1"}',
'\t',
'{"type":"result","status":"success","stats":{"total_tokens":50}}',
];
const parsed = parseGeminiJSONL(lines);
expect(parsed.sessionId).toBe('s1');
expect(parsed.tokens).toBe(50);
});
test('handles empty input', () => {
const parsed = parseGeminiJSONL([]);
expect(parsed.output).toBe('');
expect(parsed.toolCalls).toHaveLength(0);
expect(parsed.tokens).toBe(0);
expect(parsed.sessionId).toBeNull();
});
test('handles missing fields gracefully', () => {
const lines = [
'{"type":"init"}', // no session_id
'{"type":"message","role":"assistant"}', // no content
'{"type":"tool_use"}', // no tool_name
'{"type":"result","status":"success"}', // no stats
];
const parsed = parseGeminiJSONL(lines);
expect(parsed.sessionId).toBeNull();
expect(parsed.output).toBe('');
expect(parsed.toolCalls).toHaveLength(0);
expect(parsed.tokens).toBe(0);
});
test('handles multiple tool_use events', () => {
const lines = [
'{"type":"tool_use","tool_name":"run_shell_command","tool_id":"cmd_1","parameters":{"command":"ls"}}',
'{"type":"tool_use","tool_name":"read_file","tool_id":"cmd_2","parameters":{"path":"foo.ts"}}',
'{"type":"tool_use","tool_name":"run_shell_command","tool_id":"cmd_3","parameters":{"command":"cat bar.ts"}}',
];
const parsed = parseGeminiJSONL(lines);
expect(parsed.toolCalls).toEqual(['run_shell_command', 'read_file', 'run_shell_command']);
});
});
+201
View File
@@ -0,0 +1,201 @@
/**
* Gemini CLI subprocess runner for skill E2E testing.
*
* Spawns `gemini -p` as an independent process, parses its stream-json
* output, and returns structured results. Follows the same pattern as
* codex-session-runner.ts but adapted for the Gemini CLI.
*
* Key differences from Codex session-runner:
* - Uses `gemini -p` instead of `codex exec`
* - Output is NDJSON with event types: init, message, tool_use, tool_result, result
* - Uses `--output-format stream-json --yolo` instead of `--json -s read-only`
* - No temp HOME needed — Gemini discovers skills from `.agents/skills/` in cwd
* - Message events are streamed with `delta: true` — must concatenate
*/
import * as path from 'path';
// --- Interfaces ---
/**
 * Structured result of one `gemini -p` run, as returned by runGeminiSkill.
 */
export interface GeminiResult {
output: string; // Full assistant message text (concatenated deltas); 'SKIP: gemini binary not found' when the CLI is missing
toolCalls: string[]; // Tool names from tool_use events
tokens: number; // Total tokens used
exitCode: number; // Process exit code; 124 on timeout, -1 when the gemini binary is not found
durationMs: number; // Wall clock time
sessionId: string | null; // Session ID from init event
rawLines: string[]; // Raw JSONL lines for debugging
}
// --- JSONL parser ---
export interface ParsedGeminiJSONL {
  output: string;
  toolCalls: string[];
  tokens: number;
  sessionId: string | null;
}

/**
 * Turn raw `gemini -p --output-format stream-json` lines into a structured summary.
 * Pure: performs no I/O and mutates nothing outside its own locals.
 *
 * Recognized events:
 *   - init     → `session_id` becomes the session ID
 *   - message  → `content` of every assistant-role message is appended to the output
 *                (the `delta` flag is not inspected); other roles are ignored
 *   - tool_use → `tool_name` is recorded
 *   - result   → `stats.total_tokens` replaces the token count (the last result event wins)
 *
 * `tool_result` events, blank lines, and lines that are not valid JSON are ignored.
 *
 * @param lines Raw JSONL lines, one event per entry.
 * @returns Assistant text concatenated without a separator, tool names, tokens and session ID.
 */
export function parseGeminiJSONL(lines: string[]): ParsedGeminiJSONL {
  const chunks: string[] = [];
  const toolCalls: string[] = [];
  let tokens = 0;
  let sessionId: string | null = null;

  for (const raw of lines) {
    if (raw.trim() === '') continue;

    try {
      const event = JSON.parse(raw);

      switch (event.type || '') {
        case 'init': {
          const sid = event.session_id || '';
          if (sid) sessionId = sid;
          break;
        }

        case 'message':
          if (event.role === 'assistant' && event.content) {
            chunks.push(event.content);
          }
          break;

        case 'tool_use': {
          const toolName = event.tool_name || '';
          if (toolName) toolCalls.push(toolName);
          break;
        }

        case 'result': {
          const stats = event.stats || {};
          tokens = stats.total_tokens || 0;
          break;
        }
      }
    } catch {
      // Malformed JSON (or a bare `null`) — drop the line and keep going.
    }
  }

  return { output: chunks.join(''), toolCalls, tokens, sessionId };
}
// --- Main runner ---
/**
 * Run a prompt via `gemini -p` and return structured results.
 *
 * Spawns `gemini -p <prompt> --output-format stream-json --yolo`, streams stdout
 * as JSONL, and parses the collected lines with parseGeminiJSONL. The process
 * inherits the current environment (no temp HOME is set up).
 *
 * If the `gemini` binary is not found, returns a result with
 * `output: 'SKIP: gemini binary not found'` and `exitCode: -1` without spawning.
 * If the process is still running after `timeoutMs`, it is killed and the
 * result's `exitCode` is 124.
 *
 * @param opts.prompt    What to ask Gemini.
 * @param opts.timeoutMs Kill the process after this many ms (default 300000).
 * @param opts.cwd       Working directory (where .agents/skills/ lives); defaults to `process.cwd()`.
 * @returns Parsed output, tool names, token count, exit code, duration and raw lines.
 */
export async function runGeminiSkill(opts: {
  prompt: string;      // What to ask Gemini
  timeoutMs?: number;  // Default 300000 (5 min)
  cwd?: string;        // Working directory (where .agents/skills/ lives)
}): Promise<GeminiResult> {
  const {
    prompt,
    timeoutMs = 300_000,
    cwd,
  } = opts;
  const startTime = Date.now();

  // Check if gemini binary exists
  const whichResult = Bun.spawnSync(['which', 'gemini']);
  if (whichResult.exitCode !== 0) {
    return {
      output: 'SKIP: gemini binary not found',
      toolCalls: [],
      tokens: 0,
      exitCode: -1,
      durationMs: Date.now() - startTime,
      sessionId: null,
      rawLines: [],
    };
  }

  // Build gemini command
  const args = ['-p', prompt, '--output-format', 'stream-json', '--yolo'];

  // Spawn gemini — uses real HOME for auth, cwd for skill discovery
  const proc = Bun.spawn(['gemini', ...args], {
    cwd: cwd || process.cwd(),
    stdout: 'pipe',
    stderr: 'pipe',
  });

  // Race against timeout
  let timedOut = false;
  const timeoutId = setTimeout(() => {
    timedOut = true;
    proc.kill();
  }, timeoutMs);

  try {
    // Stream and collect JSONL from stdout
    const collectedLines: string[] = [];
    const stderrPromise = new Response(proc.stderr).text();
    const reader = proc.stdout.getReader();
    const decoder = new TextDecoder();
    let buf = '';
    try {
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        buf += decoder.decode(value, { stream: true });
        const lines = buf.split('\n');
        buf = lines.pop() || '';
        for (const line of lines) {
          if (!line.trim()) continue;
          collectedLines.push(line);
          // Real-time progress to stderr
          try {
            const event = JSON.parse(line);
            if (event.type === 'tool_use' && event.tool_name) {
              const elapsed = Math.round((Date.now() - startTime) / 1000);
              process.stderr.write(`  [gemini ${elapsed}s] tool: ${event.tool_name}\n`);
            } else if (event.type === 'message' && event.role === 'assistant' && event.content) {
              const elapsed = Math.round((Date.now() - startTime) / 1000);
              process.stderr.write(`  [gemini ${elapsed}s] message: ${event.content.slice(0, 100)}\n`);
            }
          } catch { /* skip — parseGeminiJSONL will handle it later */ }
        }
      }
    } catch { /* stream read error — fall through to exit code handling */ }

    // End the streaming decode so bytes still buffered inside the decoder
    // (e.g. a multi-byte character split across chunks) are not dropped.
    buf += decoder.decode();

    // Flush remaining buffer
    if (buf.trim()) {
      collectedLines.push(buf);
    }

    const stderr = await stderrPromise;
    const exitCode = await proc.exited;
    const durationMs = Date.now() - startTime;

    // Parse all collected JSONL lines
    const parsed = parseGeminiJSONL(collectedLines);

    // Log stderr if non-empty (may contain auth errors, etc.)
    if (stderr.trim()) {
      process.stderr.write(`  [gemini stderr] ${stderr.trim().slice(0, 200)}\n`);
    }

    return {
      output: parsed.output,
      toolCalls: parsed.toolCalls,
      tokens: parsed.tokens,
      exitCode: timedOut ? 124 : exitCode,
      durationMs,
      sessionId: parsed.sessionId,
      rawLines: collectedLines,
    };
  } finally {
    // Always disarm the kill timer, including when an await above rejects,
    // so it cannot keep running after this function has returned.
    clearTimeout(timeoutId);
  }
}
+19 -1
View File
@@ -57,9 +57,13 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
'review-base-branch': ['review/**'],
'review-design-lite': ['review/**', 'test/fixtures/review-eval-design-slop.*'],
// Office Hours
'office-hours-spec-review': ['office-hours/**', 'scripts/gen-skill-docs.ts'],
// Plan reviews
'plan-ceo-review': ['plan-ceo-review/**'],
'plan-ceo-review-selective': ['plan-ceo-review/**'],
'plan-ceo-review-benefits': ['plan-ceo-review/**', 'scripts/gen-skill-docs.ts'],
'plan-eng-review': ['plan-eng-review/**'],
'plan-eng-review-artifact': ['plan-eng-review/**'],
@@ -73,9 +77,17 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
// Document-release
'document-release': ['document-release/**'],
// Codex
// Codex (Claude E2E — tests /codex skill via Claude)
'codex-review': ['codex/**'],
// Codex E2E (tests skills via Codex CLI)
'codex-discover-skill': ['codex/**', '.agents/skills/**', 'test/helpers/codex-session-runner.ts'],
'codex-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'codex/**', 'test/helpers/codex-session-runner.ts'],
// Gemini E2E (tests skills via Gemini CLI)
'gemini-discover-skill': ['.agents/skills/**', 'test/helpers/gemini-session-runner.ts'],
'gemini-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'test/helpers/gemini-session-runner.ts'],
// QA bootstrap
'qa-bootstrap': ['qa/**', 'browse/src/**', 'ship/**'],
@@ -136,6 +148,10 @@ export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
'design-review/SKILL.md fix loop': ['design-review/SKILL.md', 'design-review/SKILL.md.tmpl'],
'design-consultation/SKILL.md research': ['design-consultation/SKILL.md', 'design-consultation/SKILL.md.tmpl'],
// Office Hours
'office-hours/SKILL.md spec review': ['office-hours/SKILL.md', 'office-hours/SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'office-hours/SKILL.md design sketch': ['office-hours/SKILL.md', 'office-hours/SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
// Other skills
'retro/SKILL.md instructions': ['retro/SKILL.md', 'retro/SKILL.md.tmpl'],
'qa-only/SKILL.md workflow': ['qa-only/SKILL.md', 'qa-only/SKILL.md.tmpl'],
@@ -147,6 +163,8 @@ export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
*/
export const GLOBAL_TOUCHFILES = [
'test/helpers/session-runner.ts',
'test/helpers/codex-session-runner.ts',
'test/helpers/gemini-session-runner.ts',
'test/helpers/eval-store.ts',
'test/helpers/llm-judge.ts',
'scripts/gen-skill-docs.ts',
+122
View File
@@ -2910,6 +2910,128 @@ Write the full output (including the GATE verdict) to ${codexDir}/codex-output.m
}, 360_000);
});
// --- Office Hours Spec Review E2E ---
// E2E check that an agent can read office-hours/SKILL.md and summarize its spec review loop.
// (describeIfSelected, runSkillTest, logCost, recordE2E and runId are defined outside this view.)
describeIfSelected('Office Hours Spec Review E2E', ['office-hours-spec-review'], () => {
let ohDir: string;
beforeAll(() => {
// Throwaway git repo in the OS temp dir with a single commit.
ohDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-oh-spec-'));
// Helper for git commands; its return value is discarded, so a failing command is not detected here.
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: ohDir, stdio: 'pipe', timeout: 5000 });
run('git', ['init', '-b', 'main']);
run('git', ['config', 'user.email', 'test@test.com']);
run('git', ['config', 'user.name', 'Test']);
fs.writeFileSync(path.join(ohDir, 'README.md'), '# Test Project\n');
run('git', ['add', '.']);
run('git', ['commit', '-m', 'init']);
// Copy office-hours skill
fs.mkdirSync(path.join(ohDir, 'office-hours'), { recursive: true });
fs.copyFileSync(
path.join(ROOT, 'office-hours', 'SKILL.md'),
path.join(ohDir, 'office-hours', 'SKILL.md'),
);
});
afterAll(() => {
// Best-effort cleanup; removal errors are ignored.
try { fs.rmSync(ohDir, { recursive: true, force: true }); } catch {}
});
test('/office-hours SKILL.md contains spec review loop', async () => {
const result = await runSkillTest({
prompt: `Read office-hours/SKILL.md. I want to understand the spec review loop.
Summarize what the "Spec Review Loop" section does — specifically:
1. How many dimensions does the reviewer check?
2. What tool is used to dispatch the reviewer?
3. What's the maximum number of iterations?
4. What metrics are tracked?
Write your summary to ${ohDir}/spec-review-summary.md`,
workingDirectory: ohDir,
maxTurns: 8,
timeout: 120_000,
testName: 'office-hours-spec-review',
runId,
});
logCost('/office-hours spec review', result);
recordE2E('/office-hours-spec-review', 'Office Hours Spec Review E2E', result);
expect(result.exitReason).toBe('success');
// The content assertions below run only if the agent actually wrote the summary file;
// when it is missing, the test passes on exitReason alone.
const summaryPath = path.join(ohDir, 'spec-review-summary.md');
if (fs.existsSync(summaryPath)) {
const summary = fs.readFileSync(summaryPath, 'utf-8').toLowerCase();
// Verify the agent understood the key concepts
// (the first pattern is an alternation: any single dimension name is enough to match)
expect(summary).toMatch(/5.*dimension|dimension.*5|completeness|consistency|clarity|scope|feasibility/);
expect(summary).toMatch(/agent|subagent/);
expect(summary).toMatch(/3.*iteration|iteration.*3|maximum.*3/);
}
}, 180_000);
});
// --- Plan CEO Review Benefits-From E2E ---
// E2E check that an agent can read plan-ceo-review/SKILL.md and summarize its /office-hours prerequisite offer.
// (describeIfSelected, runSkillTest, logCost, recordE2E and runId are defined outside this view.)
describeIfSelected('Plan CEO Review Benefits-From E2E', ['plan-ceo-review-benefits'], () => {
let benefitsDir: string;
beforeAll(() => {
// Throwaway git repo in the OS temp dir with a single commit.
benefitsDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-benefits-'));
// Helper for git commands; its return value is discarded, so a failing command is not detected here.
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: benefitsDir, stdio: 'pipe', timeout: 5000 });
run('git', ['init', '-b', 'main']);
run('git', ['config', 'user.email', 'test@test.com']);
run('git', ['config', 'user.name', 'Test']);
fs.writeFileSync(path.join(benefitsDir, 'README.md'), '# Test Project\n');
run('git', ['add', '.']);
run('git', ['commit', '-m', 'init']);
// Copy plan-ceo-review skill
fs.mkdirSync(path.join(benefitsDir, 'plan-ceo-review'), { recursive: true });
fs.copyFileSync(
path.join(ROOT, 'plan-ceo-review', 'SKILL.md'),
path.join(benefitsDir, 'plan-ceo-review', 'SKILL.md'),
);
});
afterAll(() => {
// Best-effort cleanup; removal errors are ignored.
try { fs.rmSync(benefitsDir, { recursive: true, force: true }); } catch {}
});
test('/plan-ceo-review SKILL.md contains prerequisite skill offer', async () => {
const result = await runSkillTest({
prompt: `Read plan-ceo-review/SKILL.md. Search for sections about "Prerequisite" or "office-hours" or "design doc found".
Summarize what happens when no design doc is found — specifically:
1. Is /office-hours offered as a prerequisite?
2. What options does the user get?
3. Is there a mid-session detection for when the user seems lost?
Write your summary to ${benefitsDir}/benefits-summary.md`,
workingDirectory: benefitsDir,
maxTurns: 8,
timeout: 120_000,
testName: 'plan-ceo-review-benefits',
runId,
});
logCost('/plan-ceo-review benefits-from', result);
recordE2E('/plan-ceo-review-benefits', 'Plan CEO Review Benefits-From E2E', result);
expect(result.exitReason).toBe('success');
// The content assertions below run only if the agent actually wrote the summary file;
// when it is missing, the test passes on exitReason alone.
const summaryPath = path.join(benefitsDir, 'benefits-summary.md');
if (fs.existsSync(summaryPath)) {
const summary = fs.readFileSync(summaryPath, 'utf-8').toLowerCase();
// Verify the agent understood the skill chaining
// (/office.hours/ uses '.' so any single character between the words matches, e.g. "office-hours")
expect(summary).toMatch(/office.hours/);
// summary is already lower-cased, so the `i` flag has no effect here.
expect(summary).toMatch(/design doc|no design/i);
}
}, 180_000);
});
// Module-level afterAll — finalize eval collector and clean up E2E isolation
afterAll(async () => {
if (evalCollector) {
+158 -4
View File
@@ -630,6 +630,59 @@ describe('office-hours skill structure', () => {
test('contains builder operating principles', () => {
expect(content).toContain('Delight is the currency');
});
// Spec Review Loop (Phase 5.5)
test('contains spec review loop', () => {
expect(content).toContain('Spec Review Loop');
});
test('contains adversarial review dimensions', () => {
for (const dim of ['Completeness', 'Consistency', 'Clarity', 'Scope', 'Feasibility']) {
expect(content).toContain(dim);
}
});
test('contains subagent dispatch instruction', () => {
expect(content).toMatch(/Agent.*tool|subagent/i);
});
test('contains max 3 iterations', () => {
expect(content).toMatch(/3.*iteration|maximum.*3/i);
});
test('contains quality score', () => {
expect(content).toContain('quality score');
});
test('contains spec review metrics path', () => {
expect(content).toContain('spec-review.jsonl');
});
test('contains convergence guard', () => {
expect(content).toMatch(/convergence/i);
});
// Visual Sketch (Phase 4.5)
test('contains visual sketch section', () => {
expect(content).toContain('Visual Sketch');
});
test('contains wireframe generation', () => {
expect(content).toMatch(/wireframe|sketch/i);
});
test('contains DESIGN.md awareness', () => {
expect(content).toContain('DESIGN.md');
});
test('contains browse rendering', () => {
expect(content).toContain('$B goto');
expect(content).toContain('$B screenshot');
});
test('contains rough aesthetic instruction', () => {
expect(content).toMatch(/rough|hand-drawn/i);
});
});
describe('investigate skill structure', () => {
@@ -842,6 +895,22 @@ describe('CEO review mode validation', () => {
expect(content).toContain('HOLD SCOPE');
expect(content).toContain('REDUCTION');
});
// Skill chaining (benefits-from)
test('contains prerequisite skill offer for office-hours', () => {
expect(content).toContain('Prerequisite Skill Offer');
expect(content).toContain('/office-hours');
});
test('contains mid-session detection', () => {
expect(content).toContain('Mid-session detection');
expect(content).toMatch(/still figuring out|seems lost/i);
});
// Spec review on CEO plans
test('contains spec review loop for CEO plan documents', () => {
expect(content).toContain('Spec Review Loop');
});
});
// --- gstack-slug helper ---
@@ -1174,18 +1243,36 @@ describe('Codex skill', () => {
expect(content).toContain('mktemp');
});
test('codex integration in /review offers second opinion', () => {
test('codex integration in /review has config-driven review step', () => {
const content = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
expect(content).toContain('Codex second opinion');
expect(content).toContain('Codex review');
expect(content).toContain('codex_reviews');
expect(content).toContain('codex review');
expect(content).toContain('adversarial');
expect(content).toContain('xhigh');
expect(content).toContain('Investigate and fix');
expect(content).toContain('CROSS-MODEL');
});
test('codex integration in /ship offers review gate', () => {
test('codex integration in /ship has config-driven review step', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('Codex');
expect(content).toContain('Codex review');
expect(content).toContain('codex_reviews');
expect(content).toContain('codex review');
expect(content).toContain('codex-review');
expect(content).toContain('xhigh');
expect(content).toContain('Investigate and fix');
});
test('codex-host ship/review do NOT contain codex review step', () => {
const shipContent = fs.readFileSync(path.join(ROOT, '.agents', 'skills', 'gstack-ship', 'SKILL.md'), 'utf-8');
expect(shipContent).not.toContain('codex review --base');
expect(shipContent).not.toContain('Investigate and fix');
const reviewContent = fs.readFileSync(path.join(ROOT, '.agents', 'skills', 'gstack-review', 'SKILL.md'), 'utf-8');
expect(reviewContent).not.toContain('codex review --base');
expect(reviewContent).not.toContain('codex_reviews');
expect(reviewContent).not.toContain('Investigate and fix');
});
test('codex integration in /plan-eng-review offers plan critique', () => {
@@ -1244,3 +1331,70 @@ describe('Skill trigger phrases', () => {
});
}
});
// ─── Codex Skill Validation ──────────────────────────────────
describe('Codex skill validation', () => {
  const AGENTS_DIR = path.join(ROOT, '.agents', 'skills');

  // Names of the skill directories (or symlinks) under AGENTS_DIR. Plain files such
  // as an OS-generated .DS_Store are excluded so they cannot fail the naming checks,
  // matching the directory filter used for Claude skill discovery below.
  const listCodexSkillDirs = (): string[] =>
    fs.readdirSync(AGENTS_DIR, { withFileTypes: true })
      .filter((entry) => entry.isDirectory() || entry.isSymbolicLink())
      .map((entry) => entry.name);

  // Discover all Claude skills with templates (except /codex which is Claude-only)
  const CLAUDE_SKILLS_WITH_TEMPLATES = (() => {
    const skills: string[] = [];
    for (const entry of fs.readdirSync(ROOT, { withFileTypes: true })) {
      if (!entry.isDirectory() || entry.name.startsWith('.') || entry.name === 'node_modules') continue;
      if (entry.name === 'codex') continue; // Claude-only skill
      if (fs.existsSync(path.join(ROOT, entry.name, 'SKILL.md.tmpl'))) {
        skills.push(entry.name);
      }
    }
    return skills;
  })();

  test('all skills (except /codex) have both Claude and Codex variants', () => {
    for (const skillDir of CLAUDE_SKILLS_WITH_TEMPLATES) {
      // Claude variant
      const claudeMd = path.join(ROOT, skillDir, 'SKILL.md');
      expect(fs.existsSync(claudeMd)).toBe(true);
      // Codex variant (directory names already prefixed with gstack- are kept as-is)
      const codexName = skillDir.startsWith('gstack-') ? skillDir : `gstack-${skillDir}`;
      const codexMd = path.join(AGENTS_DIR, codexName, 'SKILL.md');
      expect(fs.existsSync(codexMd)).toBe(true);
    }
    // Root template has both too
    expect(fs.existsSync(path.join(ROOT, 'SKILL.md'))).toBe(true);
    expect(fs.existsSync(path.join(AGENTS_DIR, 'gstack', 'SKILL.md'))).toBe(true);
  });

  test('/codex skill is Claude-only — no Codex variant', () => {
    // Claude variant should exist
    expect(fs.existsSync(path.join(ROOT, 'codex', 'SKILL.md'))).toBe(true);
    // Codex variant must NOT exist
    expect(fs.existsSync(path.join(AGENTS_DIR, 'gstack-codex', 'SKILL.md'))).toBe(false);
  });

  test('Codex skill names follow gstack-{name} convention', () => {
    for (const dir of listCodexSkillDirs()) {
      // Every directory should start with gstack
      expect(dir.startsWith('gstack')).toBe(true);
      // Root is just 'gstack', others are 'gstack-{name}'
      if (dir !== 'gstack') {
        expect(dir.startsWith('gstack-')).toBe(true);
      }
    }
  });

  test('$B commands in Codex SKILL.md files are valid browse commands', () => {
    for (const dir of listCodexSkillDirs()) {
      const skillMd = path.join(AGENTS_DIR, dir, 'SKILL.md');
      if (!fs.existsSync(skillMd)) continue;
      const content = fs.readFileSync(skillMd, 'utf-8');
      // Only validate if the skill contains $B commands
      if (!content.includes('$B ')) continue;
      const result = validateSkill(skillMd);
      expect(result.invalid).toHaveLength(0);
    }
  });
});
+278
View File
@@ -0,0 +1,278 @@
import { describe, test, expect, beforeEach, afterEach } from 'bun:test';
import { execSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
// Repository root: the parent of the directory containing this test file.
const ROOT = path.resolve(import.meta.dir, '..');
// Directory holding the gstack-* executables these tests invoke.
const BIN = path.join(ROOT, 'bin');
// Each test gets a fresh temp directory for GSTACK_STATE_DIR
// (assigned in beforeEach, removed in afterEach).
let tmpDir: string;
/**
 * Runs a shell command synchronously from the repo root and returns its
 * trimmed stdout.
 *
 * The child inherits the current process environment, with GSTACK_STATE_DIR
 * set to the per-test temp dir and GSTACK_DIR set to the repo root; entries
 * in `env` are applied last and therefore override all of those.
 *
 * @param cmd  Shell command line to execute.
 * @param env  Extra environment variables for this invocation.
 * @returns    The command's stdout with surrounding whitespace removed.
 */
function run(cmd: string, env: Record<string, string> = {}): string {
  const childEnv = { ...process.env, GSTACK_STATE_DIR: tmpDir, GSTACK_DIR: ROOT, ...env };
  const stdout = execSync(cmd, {
    timeout: 10000,
    encoding: 'utf-8',
    env: childEnv,
    cwd: ROOT,
  });
  return stdout.trim();
}
/**
 * Sets one config value by invoking `gstack-config set <key> <value>`
 * through `run`, so it uses the same per-test environment.
 */
function setConfig(key: string, value: string) {
  const command = [`${BIN}/gstack-config`, 'set', key, value].join(' ');
  run(command);
}
/**
 * Returns the non-empty lines of `analytics/skill-usage.jsonl` inside the
 * current temp dir, or an empty array when the file does not exist.
 */
function readJsonl(): string[] {
  const logPath = path.join(tmpDir, 'analytics', 'skill-usage.jsonl');
  if (!fs.existsSync(logPath)) {
    return [];
  }
  const contents = fs.readFileSync(logPath, 'utf-8');
  return contents
    .trim()
    .split('\n')
    .filter(line => line.length > 0);
}
/**
 * Parses every line returned by `readJsonl()` as JSON and returns the
 * resulting values in file order. Throws if any line is not valid JSON.
 */
function parseJsonl(): any[] {
  const events: any[] = [];
  for (const rawLine of readJsonl()) {
    events.push(JSON.parse(rawLine));
  }
  return events;
}
// Before each test: create a unique temp directory and publish it via tmpDir.
beforeEach(() => {
  const prefix = path.join(os.tmpdir(), 'gstack-tel-');
  tmpDir = fs.mkdtempSync(prefix);
});

// After each test: delete that directory and everything in it
// (force: a missing directory is not an error).
afterEach(() => {
  fs.rmSync(tmpDir, { force: true, recursive: true });
});
/**
 * Tests for the `gstack-telemetry-log` binary: which telemetry tiers write
 * to the local JSONL file, and which fields the written events carry.
 */
describe('gstack-telemetry-log', () => {
  // Invokes the logger binary with the given flag string.
  const logEvent = (flags: string): string => run(`${BIN}/gstack-telemetry-log ${flags}`);

  test('appends valid JSONL when tier=anonymous', () => {
    setConfig('telemetry', 'anonymous');
    logEvent('--skill qa --duration 142 --outcome success --session-id test-123');

    const logged = parseJsonl();
    expect(logged).toHaveLength(1);

    const event = logged[0];
    expect(event.v).toBe(1);
    expect(event.skill).toBe('qa');
    expect(event.duration_s).toBe(142);
    expect(event.outcome).toBe('success');
    expect(event.session_id).toBe('test-123');
    expect(event.event_type).toBe('skill_run');
    expect(event.os).toBeTruthy();
    expect(event.gstack_version).toBeTruthy();
  });

  test('produces no output when tier=off', () => {
    setConfig('telemetry', 'off');
    logEvent('--skill ship --duration 30 --outcome success --session-id test-456');

    const lines = readJsonl();
    expect(lines).toHaveLength(0);
  });

  test('defaults to off for invalid tier value', () => {
    setConfig('telemetry', 'invalid_value');
    logEvent('--skill ship --duration 30 --outcome success --session-id test-789');

    const lines = readJsonl();
    expect(lines).toHaveLength(0);
  });

  test('includes installation_id for community tier', () => {
    setConfig('telemetry', 'community');
    logEvent('--skill review --duration 100 --outcome success --session-id comm-123');

    const logged = parseJsonl();
    expect(logged).toHaveLength(1);
    // installation_id should be a SHA-256 hash (64 hex chars)
    expect(logged[0].installation_id).toMatch(/^[a-f0-9]{64}$/);
  });

  test('installation_id is null for anonymous tier', () => {
    setConfig('telemetry', 'anonymous');
    logEvent('--skill qa --duration 50 --outcome success --session-id anon-123');

    const logged = parseJsonl();
    expect(logged[0].installation_id).toBeNull();
  });

  test('includes error_class when provided', () => {
    setConfig('telemetry', 'anonymous');
    logEvent('--skill browse --duration 10 --outcome error --error-class timeout --session-id err-123');

    const logged = parseJsonl();
    expect(logged[0].error_class).toBe('timeout');
    expect(logged[0].outcome).toBe('error');
  });

  test('handles missing duration gracefully', () => {
    setConfig('telemetry', 'anonymous');
    logEvent('--skill qa --outcome success --session-id nodur-123');

    const logged = parseJsonl();
    expect(logged[0].duration_s).toBeNull();
  });

  test('supports event_type flag', () => {
    setConfig('telemetry', 'anonymous');
    logEvent('--event-type upgrade_prompted --skill "" --outcome success --session-id up-123');

    const logged = parseJsonl();
    expect(logged[0].event_type).toBe('upgrade_prompted');
  });

  test('includes local-only fields (_repo_slug, _branch)', () => {
    setConfig('telemetry', 'anonymous');
    logEvent('--skill qa --duration 50 --outcome success --session-id local-123');

    const logged = parseJsonl();
    // The local JSONL record is expected to carry both underscore-prefixed keys.
    for (const localKey of ['_repo_slug', '_branch']) {
      expect(logged[0]).toHaveProperty(localKey);
    }
  });

  test('creates analytics directory if missing', () => {
    // Make sure the analytics dir is absent before anything runs.
    const analyticsDir = path.join(tmpDir, 'analytics');
    if (fs.existsSync(analyticsDir)) {
      fs.rmSync(analyticsDir, { recursive: true });
    }

    setConfig('telemetry', 'anonymous');
    logEvent('--skill qa --duration 50 --outcome success --session-id mkdir-123');

    expect(fs.existsSync(analyticsDir)).toBe(true);
    expect(readJsonl()).toHaveLength(1);
  });
});
/**
 * Tests for how `gstack-telemetry-log` treats `.pending-<session>` marker
 * files found in the analytics directory.
 */
describe('.pending marker', () => {
  // Creates the analytics dir, writes a `.pending-<sessionId>` marker with
  // the given raw payload, and returns the marker's path.
  const seedPending = (sessionId: string, payload: string): string => {
    const analyticsDir = path.join(tmpDir, 'analytics');
    fs.mkdirSync(analyticsDir, { recursive: true });
    const markerPath = path.join(analyticsDir, `.pending-${sessionId}`);
    fs.writeFileSync(markerPath, payload);
    return markerPath;
  };

  // Logs a successful 50s `qa` run under the given session id.
  const logQaRun = (sessionId: string): string =>
    run(`${BIN}/gstack-telemetry-log --skill qa --duration 50 --outcome success --session-id ${sessionId}`);

  test('finalizes stale .pending from another session as outcome:unknown', () => {
    setConfig('telemetry', 'anonymous');
    // Fake marker left behind by a different session.
    seedPending(
      'old-123',
      '{"skill":"old-skill","ts":"2026-03-18T00:00:00Z","session_id":"old-123","gstack_version":"0.6.4"}'
    );

    // Logging under a DIFFERENT session should finalize the old marker.
    logQaRun('new-456');

    const logged = parseJsonl();
    expect(logged).toHaveLength(2);

    // First event: the finalized pending marker.
    const finalized = logged[0];
    expect(finalized.skill).toBe('old-skill');
    expect(finalized.outcome).toBe('unknown');
    expect(finalized.session_id).toBe('old-123');

    // Second event: the run just logged.
    const fresh = logged[1];
    expect(fresh.skill).toBe('qa');
    expect(fresh.outcome).toBe('success');
  });

  test('.pending-SESSION file is removed after finalization', () => {
    setConfig('telemetry', 'anonymous');
    const markerPath = seedPending(
      'stale-session',
      '{"skill":"stale","ts":"2026-03-18T00:00:00Z","session_id":"stale-session","gstack_version":"v"}'
    );

    logQaRun('new-456');

    expect(fs.existsSync(markerPath)).toBe(false);
  });

  test('does not finalize own session pending marker', () => {
    setConfig('telemetry', 'anonymous');
    // Marker belongs to the same session id used for the run below.
    seedPending(
      'same-session',
      '{"skill":"in-flight","ts":"2026-03-18T00:00:00Z","session_id":"same-session","gstack_version":"v"}'
    );

    logQaRun('same-session');

    // Only the new event is expected; the session's own marker is not finalized.
    const logged = parseJsonl();
    expect(logged).toHaveLength(1);
    expect(logged[0].skill).toBe('qa');
  });

  test('tier=off still clears own session pending', () => {
    setConfig('telemetry', 'off');
    const markerPath = seedPending(
      'off-123',
      '{"skill":"stale","ts":"2026-03-18T00:00:00Z","session_id":"off-123","gstack_version":"v"}'
    );

    logQaRun('off-123');

    expect(fs.existsSync(markerPath)).toBe(false);
    // But no JSONL entries since tier=off
    expect(readJsonl()).toHaveLength(0);
  });
});
/**
 * Tests for the `gstack-analytics` binary's text output, fed by events
 * written through `gstack-telemetry-log`.
 */
describe('gstack-analytics', () => {
  test('shows "no data" for empty JSONL', () => {
    const dashboard = run(`${BIN}/gstack-analytics`);
    expect(dashboard).toContain('no data');
  });

  test('renders usage dashboard with events', () => {
    setConfig('telemetry', 'anonymous');

    // Two successful qa runs and one failed ship run.
    const loggedRuns = [
      '--skill qa --duration 120 --outcome success --session-id a-1',
      '--skill qa --duration 60 --outcome success --session-id a-2',
      '--skill ship --duration 30 --outcome error --error-class timeout --session-id a-3',
    ];
    for (const flags of loggedRuns) {
      run(`${BIN}/gstack-telemetry-log ${flags}`);
    }

    const dashboard = run(`${BIN}/gstack-analytics all`);
    const expectedFragments = ['/qa', '/ship', '2 runs', '1 runs', 'Success rate: 66%', 'Errors: 1'];
    for (const fragment of expectedFragments) {
      expect(dashboard).toContain(fragment);
    }
  });

  test('filters by time window', () => {
    setConfig('telemetry', 'anonymous');
    run(`${BIN}/gstack-telemetry-log --skill qa --duration 60 --outcome success --session-id t-1`);

    const weekly = run(`${BIN}/gstack-analytics 7d`);
    for (const fragment of ['/qa', 'last 7 days']) {
      expect(weekly).toContain(fragment);
    }
  });
});
/**
 * Tests that `gstack-telemetry-sync` prints nothing when there is nothing
 * to sync.
 */
describe('gstack-telemetry-sync', () => {
  const syncBinary = `${BIN}/gstack-telemetry-sync`;

  test('exits silently with no endpoint configured', () => {
    // Default: GSTACK_TELEMETRY_ENDPOINT is not set → exit 0
    const stdout = run(syncBinary);
    expect(stdout).toBe('');
  });

  test('exits silently with no JSONL file', () => {
    // An endpoint is configured here, but the fresh temp dir has no JSONL file.
    const endpointEnv = { GSTACK_TELEMETRY_ENDPOINT: 'http://localhost:9999' };
    const stdout = run(syncBinary, endpointEnv);
    expect(stdout).toBe('');
  });
});
/**
 * Tests for the `gstack-community-dashboard` binary with and without
 * Supabase configuration available.
 */
describe('gstack-community-dashboard', () => {
  const dashboardBinary = `${BIN}/gstack-community-dashboard`;

  test('shows unconfigured message when no Supabase config available', () => {
    // Point GSTACK_DIR at the temp dir (no supabase/config.sh there) and
    // blank out both Supabase variables.
    const unconfiguredEnv = {
      GSTACK_DIR: tmpDir,
      GSTACK_SUPABASE_URL: '',
      GSTACK_SUPABASE_ANON_KEY: '',
    };
    const stdout = run(dashboardBinary, unconfiguredEnv);

    for (const fragment of ['Supabase not configured', 'gstack-analytics']) {
      expect(stdout).toContain(fragment);
    }
  });

  test('connects to Supabase when config exists', () => {
    // Use the real GSTACK_DIR which has supabase/config.sh
    // NOTE(review): presumably this reaches the network — confirm it is safe for offline/CI runs.
    const stdout = run(dashboardBinary);

    expect(stdout).toContain('gstack community dashboard');
    // Should not show "not configured" since config.sh exists
    expect(stdout).not.toContain('Supabase not configured');
  });
});
+3 -2
View File
@@ -78,8 +78,9 @@ describe('selectTests', () => {
const result = selectTests(['plan-ceo-review/SKILL.md'], E2E_TOUCHFILES);
expect(result.selected).toContain('plan-ceo-review');
expect(result.selected).toContain('plan-ceo-review-selective');
expect(result.selected.length).toBe(2);
expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 2);
expect(result.selected).toContain('plan-ceo-review-benefits');
expect(result.selected.length).toBe(3);
expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 3);
});
test('global touchfile triggers ALL tests', () => {