mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-05 13:15:24 +02:00
merge: incorporate origin/main into community-mode branch
Main advanced from 0.12.0.0 to 0.12.5.0 (voice directive, deploy dry-run, smarter browsing, headed mode, full commit coverage, codex hang fixes). Our branch had a stale 0.12.0.0 entry for community mode.

Conflicts resolved:
- VERSION/package.json: take main's 0.12.5.0
- CHANGELOG: take main's entries; our community-mode entry rewrites at ship
- gen-skill-docs.ts: removed duplicate slug functions (main moved to resolvers/utility.ts)
- touchfiles.ts: removed duplicate review-plan-completion tier entry
- All 21 SKILL.md files: regenerated from templates (never resolve generated files manually)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -152,6 +152,24 @@ describe('gen-skill-docs', () => {
|
||||
}
|
||||
});
|
||||
|
||||
test('every Codex SKILL.md description stays under 900-char warning threshold', () => {
  // Warning threshold checked by this test; the failure message reports the
  // distance to MAX_SKILL_DESCRIPTION_LENGTH (presumably the hard limit).
  const WARN_THRESHOLD = 900;
  const agentsDir = path.join(ROOT, '.agents', 'skills');

  // Nothing to verify when the skills directory is absent.
  if (!fs.existsSync(agentsDir)) return;

  // One message per skill directory whose SKILL.md description exceeds the threshold.
  const violations: string[] = fs
    .readdirSync(agentsDir, { withFileTypes: true })
    .filter((entry) => entry.isDirectory())
    .map((entry) => ({ name: entry.name, skillMd: path.join(agentsDir, entry.name, 'SKILL.md') }))
    .filter(({ skillMd }) => fs.existsSync(skillMd))
    .map(({ name, skillMd }) => ({
      name,
      length: extractDescription(fs.readFileSync(skillMd, 'utf-8')).length,
    }))
    .filter(({ length }) => length > WARN_THRESHOLD)
    .map(({ name, length }) =>
      `${name}: ${length} chars (limit ${MAX_SKILL_DESCRIPTION_LENGTH}, ${MAX_SKILL_DESCRIPTION_LENGTH - length} remaining)`);

  expect(violations).toEqual([]);
});
|
||||
|
||||
test('package.json version matches VERSION file', () => {
|
||||
const pkg = JSON.parse(fs.readFileSync(path.join(ROOT, 'package.json'), 'utf-8'));
|
||||
const version = fs.readFileSync(path.join(ROOT, 'VERSION'), 'utf-8').trim();
|
||||
@@ -333,6 +351,39 @@ describe('BASE_BRANCH_DETECT resolver', () => {
|
||||
// Each pair is [test title, substring that shipContent must contain].
const requiredShipSnippets: Array<[string, string]> = [
  ['resolver output uses "the base branch" phrasing', 'the base branch'],
  ['resolver output contains GitLab CLI commands', 'glab'],
  ['resolver output contains git-native fallback', 'git symbolic-ref'],
];

for (const [title, snippet] of requiredShipSnippets) {
  test(title, () => {
    expect(shipContent).toContain(snippet);
  });
}

// Case-insensitive on purpose: any capitalisation of "GitLab" satisfies the check.
test('resolver output mentions GitLab platform', () => {
  expect(shipContent).toMatch(/gitlab/i);
});
|
||||
});
|
||||
|
||||
describe('GitLab support in generated skills', () => {
  // Reads <ROOT>/<skill>/SKILL.md as UTF-8 text; runs while the suite is being registered.
  const readSkill = (skill: string): string =>
    fs.readFileSync(path.join(ROOT, skill, 'SKILL.md'), 'utf-8');

  const retroContent = readSkill('retro');
  const shipSkillContent = readSkill('ship');

  test('retro contains GitLab MR number extraction', () => {
    // '[#!]' is the literal character class expected in the retro skill text.
    expect(retroContent).toContain('[#!]');
  });

  test('retro uses BASE_BRANCH_DETECT (contains glab)', () => {
    expect(retroContent).toContain('glab');
  });

  test('ship contains glab mr create', () => {
    expect(shipSkillContent).toContain('glab mr create');
  });

  test('ship checks .gitlab-ci.yml', () => {
    expect(shipSkillContent).toContain('.gitlab-ci.yml');
  });
});
|
||||
|
||||
/**
|
||||
|
||||
@@ -79,6 +79,9 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
// Ship
|
||||
'ship-base-branch': ['ship/**', 'bin/gstack-repo-mode'],
|
||||
'ship-local-workflow': ['ship/**', 'scripts/gen-skill-docs.ts'],
|
||||
'review-dashboard-via': ['ship/**', 'scripts/resolvers/review.ts', 'codex/**', 'autoplan/**', 'land-and-deploy/**'],
|
||||
'ship-plan-completion': ['ship/**', 'scripts/gen-skill-docs.ts'],
|
||||
'ship-plan-verification': ['ship/**', 'scripts/gen-skill-docs.ts'],
|
||||
|
||||
// Retro
|
||||
'retro': ['retro/**'],
|
||||
@@ -131,10 +134,12 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
'gstack-upgrade-happy-path': ['gstack-upgrade/**'],
|
||||
|
||||
// Deploy skills
|
||||
'land-and-deploy-workflow': ['land-and-deploy/**', 'scripts/gen-skill-docs.ts'],
|
||||
'canary-workflow': ['canary/**', 'browse/src/**'],
|
||||
'benchmark-workflow': ['benchmark/**', 'browse/src/**'],
|
||||
'setup-deploy-workflow': ['setup-deploy/**', 'scripts/gen-skill-docs.ts'],
|
||||
'land-and-deploy-workflow': ['land-and-deploy/**', 'scripts/gen-skill-docs.ts'],
|
||||
'land-and-deploy-first-run': ['land-and-deploy/**', 'scripts/gen-skill-docs.ts', 'bin/gstack-slug'],
|
||||
'land-and-deploy-review-gate': ['land-and-deploy/**', 'bin/gstack-review-read'],
|
||||
'canary-workflow': ['canary/**', 'browse/src/**'],
|
||||
'benchmark-workflow': ['benchmark/**', 'browse/src/**'],
|
||||
'setup-deploy-workflow': ['setup-deploy/**', 'scripts/gen-skill-docs.ts'],
|
||||
|
||||
// Autoplan
|
||||
'autoplan-core': ['autoplan/**', 'plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**'],
|
||||
@@ -184,6 +189,8 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
|
||||
'review-base-branch': 'gate',
|
||||
'review-design-lite': 'periodic', // 4/7 threshold is subjective
|
||||
'review-coverage-audit': 'gate',
|
||||
'review-plan-completion': 'gate',
|
||||
'review-dashboard-via': 'gate',
|
||||
|
||||
// Office Hours
|
||||
'office-hours-spec-review': 'gate',
|
||||
@@ -210,7 +217,6 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
|
||||
'ship-triage': 'gate',
|
||||
'ship-plan-completion': 'gate',
|
||||
'ship-plan-verification': 'gate',
|
||||
'review-plan-completion': 'gate',
|
||||
|
||||
// Retro — gate for cheap branch detection, periodic for full Opus retro
|
||||
'retro': 'periodic',
|
||||
@@ -250,6 +256,8 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
|
||||
|
||||
// Deploy skills
|
||||
'land-and-deploy-workflow': 'gate',
|
||||
'land-and-deploy-first-run': 'gate',
|
||||
'land-and-deploy-review-gate': 'gate',
|
||||
'canary-workflow': 'gate',
|
||||
'benchmark-workflow': 'gate',
|
||||
'setup-deploy-workflow': 'gate',
|
||||
@@ -313,6 +321,9 @@ export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
|
||||
'retro/SKILL.md instructions': ['retro/SKILL.md', 'retro/SKILL.md.tmpl'],
|
||||
'qa-only/SKILL.md workflow': ['qa-only/SKILL.md', 'qa-only/SKILL.md.tmpl'],
|
||||
'gstack-upgrade/SKILL.md upgrade flow': ['gstack-upgrade/SKILL.md', 'gstack-upgrade/SKILL.md.tmpl'],
|
||||
|
||||
// Voice directive
|
||||
'voice directive tone': ['scripts/resolvers/preamble.ts', 'review/SKILL.md', 'review/SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
};
|
||||
|
||||
/**
|
||||
|
||||
@@ -85,6 +85,161 @@ Do NOT use AskUserQuestion. Do NOT run gh or fly commands.`,
|
||||
}, 180_000);
|
||||
});
|
||||
|
||||
// --- Land-and-Deploy First-Run E2E ---
|
||||
|
||||
describeIfSelected('Land-and-Deploy first-run E2E', ['land-and-deploy-first-run'], () => {
  let firstRunDir: string;

  // Fixture: a throwaway git repo with an initial commit on `main` (app.ts + fly.toml),
  // a `feat/first-deploy` branch carrying one change, and a copy of the
  // land-and-deploy skill directory for the agent to read.
  beforeAll(() => {
    firstRunDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-land-first-run-'));
    const run = (cmd: string, args: string[]) =>
      spawnSync(cmd, args, { cwd: firstRunDir, stdio: 'pipe', timeout: 5000 });

    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);

    fs.writeFileSync(path.join(firstRunDir, 'app.ts'), 'export function hello() { return "world"; }\n');
    fs.writeFileSync(path.join(firstRunDir, 'fly.toml'), 'app = "first-run-app"\n\n[http_service]\n internal_port = 3000\n');
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'initial']);

    run('git', ['checkout', '-b', 'feat/first-deploy']);
    fs.writeFileSync(path.join(firstRunDir, 'app.ts'), 'export function hello() { return "first deploy"; }\n');
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'feat: first deploy']);

    copyDirSync(path.join(ROOT, 'land-and-deploy'), path.join(firstRunDir, 'land-and-deploy'));
  });

  afterAll(() => {
    // Best-effort cleanup: a leftover temp directory must not fail the suite.
    try { fs.rmSync(firstRunDir, { recursive: true, force: true }); } catch {}
  });

  testConcurrentIfSelected('land-and-deploy-first-run', async () => {
    const result = await runSkillTest({
      prompt: `Read land-and-deploy/SKILL.md for the /land-and-deploy skill instructions.

You are on branch feat/first-deploy. This is the FIRST TIME running /land-and-deploy
for this project — there is NO land-deploy-confirmed file.

This repo has a fly.toml with app = "first-run-app", indicating a Fly.io deployment.

IMPORTANT: There is NO remote and NO GitHub PR — you cannot run gh commands.
Instead, simulate the Step 1.5 first-run dry-run validation:
1. Detect that this is a FIRST_RUN (no land-deploy-confirmed file)
2. Detect the deploy platform from fly.toml (Fly.io, app = first-run-app)
3. Infer the production URL (https://first-run-app.fly.dev)
4. Build the DEPLOY INFRASTRUCTURE VALIDATION table showing:
- Platform detected
- Command validation results (simulated as all passing)
- Staging detection results (none expected)
- What will happen steps
5. Write the dry-run report to .gstack/deploy-reports/dry-run-validation.md

Do NOT use AskUserQuestion. Do NOT run gh or fly commands.
Just demonstrate the first-run dry-run output.`,
      workingDirectory: firstRunDir,
      maxTurns: 20,
      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob'],
      timeout: 120_000,
      testName: 'land-and-deploy-first-run',
      runId,
    });

    logCost('/land-and-deploy first-run', result);
    recordE2E(evalCollector, '/land-and-deploy first-run', 'Land-and-Deploy first-run E2E', result);
    expect(result.exitReason).toBe('success');

    // Verify dry-run report was created
    const reportDir = path.join(firstRunDir, '.gstack', 'deploy-reports');
    expect(fs.existsSync(reportDir)).toBe(true);

    // Inspect every regular file in the report directory. Reading only the first
    // directory entry depends on listing order and throws EISDIR when that entry
    // happens to be a sub-directory the agent created.
    const reportFiles = fs
      .readdirSync(reportDir, { withFileTypes: true })
      .filter((entry) => entry.isFile())
      .map((entry) => entry.name);
    expect(reportFiles.length).toBeGreaterThan(0);

    // Check report content mentions platform detection
    const reportContent = reportFiles
      .map((name) => fs.readFileSync(path.join(reportDir, name), 'utf-8'))
      .join('\n')
      .toLowerCase();
    const hasPlatform = reportContent.includes('fly') || reportContent.includes('first-run-app');
    expect(hasPlatform).toBe(true);
  }, 180_000);
});
|
||||
|
||||
// --- Land-and-Deploy Review Gate E2E ---
|
||||
|
||||
describeIfSelected('Land-and-Deploy review gate E2E', ['land-and-deploy-review-gate'], () => {
  let reviewDir: string;

  // Fixture: a throwaway git repo on `main` with an initial commit followed by six
  // single-file commits, plus a copy of the land-and-deploy skill directory.
  beforeAll(() => {
    reviewDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-land-review-'));
    const run = (cmd: string, args: string[]) =>
      spawnSync(cmd, args, { cwd: reviewDir, stdio: 'pipe', timeout: 5000 });

    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);

    fs.writeFileSync(path.join(reviewDir, 'app.ts'), 'export function hello() { return "world"; }\n');
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'initial']);

    // Create 6 more commits to make any review stale
    for (let i = 1; i <= 6; i++) {
      fs.writeFileSync(path.join(reviewDir, `file${i}.ts`), `export const x${i} = ${i};\n`);
      run('git', ['add', '.']);
      run('git', ['commit', '-m', `feat: add file${i}`]);
    }

    copyDirSync(path.join(ROOT, 'land-and-deploy'), path.join(reviewDir, 'land-and-deploy'));
  });

  afterAll(() => {
    // Best-effort cleanup: a leftover temp directory must not fail the suite.
    try { fs.rmSync(reviewDir, { recursive: true, force: true }); } catch {}
  });

  testConcurrentIfSelected('land-and-deploy-review-gate', async () => {
    const result = await runSkillTest({
      prompt: `Read land-and-deploy/SKILL.md for the /land-and-deploy skill instructions.

Focus on Step 3.5a and Step 3.5a-bis (the review staleness check and inline review offer).

This repo has 6 commits since the initial commit. There are NO review logs
(gstack-review-read would return NO_REVIEWS).

Simulate what the readiness gate would show:
1. Run gstack-review-read equivalent (simulate NO_REVIEWS output)
2. Determine review staleness: Eng Review should be "NOT RUN"
3. Note that Step 3.5a-bis would offer an inline review
4. Write a simulated readiness report to .gstack/deploy-reports/readiness-report.md
showing the review status as NOT RUN with the inline review offer text

Do NOT use AskUserQuestion. Do NOT run gh commands.
Show what the readiness gate output would look like.`,
      workingDirectory: reviewDir,
      maxTurns: 15,
      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob'],
      timeout: 120_000,
      testName: 'land-and-deploy-review-gate',
      runId,
    });

    logCost('/land-and-deploy review-gate', result);
    recordE2E(evalCollector, '/land-and-deploy review-gate', 'Land-and-Deploy review gate E2E', result);
    expect(result.exitReason).toBe('success');

    // Verify readiness report was created
    const reportDir = path.join(reviewDir, '.gstack', 'deploy-reports');
    expect(fs.existsSync(reportDir)).toBe(true);

    // Inspect every regular file in the report directory. Reading only the first
    // directory entry depends on listing order and throws EISDIR when that entry
    // happens to be a sub-directory the agent created.
    const reportFiles = fs
      .readdirSync(reportDir, { withFileTypes: true })
      .filter((entry) => entry.isFile())
      .map((entry) => entry.name);
    expect(reportFiles.length).toBeGreaterThan(0);

    const reportContent = reportFiles
      .map((name) => fs.readFileSync(path.join(reportDir, name), 'utf-8'))
      .join('\n')
      .toLowerCase();
    // Should mention review status
    const hasReviewMention = reportContent.includes('review') ||
      reportContent.includes('not run');
    expect(hasReviewMention).toBe(true);
  }, 180_000);
});
|
||||
|
||||
// --- Canary skill E2E ---
|
||||
|
||||
describeIfSelected('Canary skill E2E', ['canary-workflow'], () => {
|
||||
|
||||
@@ -529,6 +529,119 @@ Analyze the git history and produce the narrative report as described in the SKI
|
||||
}, 420_000);
|
||||
});
|
||||
|
||||
// --- Review Dashboard Via Attribution E2E ---
|
||||
|
||||
describeIfSelected('Review Dashboard Via Attribution', ['review-dashboard-via'], () => {
  let dashDir: string;
  // Directory holding the mock gstack-review-read script; set in beforeAll.
  let mockBinDir: string;

  beforeAll(() => {
    dashDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-dashboard-via-'));
    mockBinDir = path.join(dashDir, '.mock-bin');
    const git = (...args: string[]) =>
      spawnSync('git', args, { cwd: dashDir, stdio: 'pipe', timeout: 5000 });

    // Git repo with one commit on main and one on a feature branch.
    git('init', '-b', 'main');
    git('config', 'user.email', 'test@test.com');
    git('config', 'user.name', 'Test');

    const appFile = path.join(dashDir, 'app.ts');
    fs.writeFileSync(appFile, 'console.log("v1");\n');
    git('add', 'app.ts');
    git('commit', '-m', 'initial');

    git('checkout', '-b', 'feature/dashboard-test');
    fs.writeFileSync(appFile, 'console.log("v2");\n');
    git('add', 'app.ts');
    git('commit', '-m', 'feat: update');

    // Short HEAD hash, embedded in every mock review entry below.
    const commit = spawnSync('git', ['rev-parse', '--short', 'HEAD'], { cwd: dashDir, stdio: 'pipe' })
      .stdout.toString().trim();

    // The real gstack-review-read reads ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl.
    // The test substitutes a mock script that echoes these entries instead.
    // Timestamp is the current time in ISO form with the milliseconds dropped.
    const timestamp = new Date().toISOString().replace(/\.\d{3}Z$/, 'Z');
    const reviewLines = [
      `{"skill":"plan-eng-review","timestamp":"${timestamp}","status":"clean","unresolved":0,"critical_gaps":0,"issues_found":0,"mode":"FULL_REVIEW","via":"autoplan","commit":"${commit}"}`,
      `{"skill":"plan-ceo-review","timestamp":"${timestamp}","status":"clean","unresolved":0,"critical_gaps":0,"mode":"SELECTIVE_EXPANSION","via":"autoplan","commit":"${commit}"}`,
      `{"skill":"codex-plan-review","timestamp":"${timestamp}","status":"clean","source":"codex","commit":"${commit}"}`,
    ];

    // Mock script: one single-quoted echo per review entry, then the CONFIG and HEAD sections.
    const mockScript = [
      '#!/usr/bin/env bash',
      ...reviewLines.map((line) => `echo '${line}'`),
      'echo "---CONFIG---"',
      'echo "false"',
      'echo "---HEAD---"',
      `echo "${commit}"`,
    ].join('\n');

    const mockScriptPath = path.join(mockBinDir, 'gstack-review-read');
    fs.mkdirSync(mockBinDir, { recursive: true });
    fs.writeFileSync(mockScriptPath, mockScript);
    fs.chmodSync(mockScriptPath, 0o755);

    // Copy ship skill
    fs.copyFileSync(path.join(ROOT, 'ship', 'SKILL.md'), path.join(dashDir, 'ship-SKILL.md'));
  });

  afterAll(() => {
    // Best-effort cleanup of the temp repo.
    try { fs.rmSync(dashDir, { recursive: true, force: true }); } catch {}
  });

  testConcurrentIfSelected('review-dashboard-via', async () => {
    const result = await runSkillTest({
      prompt: `Read ship-SKILL.md. You only need to run the Review Readiness Dashboard section.

Instead of running ~/.claude/skills/gstack/bin/gstack-review-read, run this mock: ${mockBinDir}/gstack-review-read

Parse the output and display the dashboard table. Pay attention to:
1. The "via" field in entries — show source attribution (e.g., "via /autoplan")
2. The codex-plan-review entry — it should populate the Outside Voice row
3. Since Eng Review IS clear, there should be NO gate blocking — just display the dashboard

Skip the preamble, lake intro, telemetry, and all other ship steps.
Write the dashboard output to ${dashDir}/dashboard-output.md`,
      workingDirectory: dashDir,
      maxTurns: 12,
      timeout: 90_000,
      testName: 'review-dashboard-via',
      runId,
    });

    logCost('/ship dashboard-via', result);
    recordE2E(evalCollector, '/ship review dashboard via attribution', 'Dashboard via field', result);
    expect(result.exitReason).toBe('success');

    // Evidence may be in the conversation, in tool output, or in the written file,
    // so all three are lowercased and searched together.
    const transcript = [result.output || '', ...result.toolCalls.map(tc => tc.output || '')]
      .join('\n')
      .toLowerCase();
    const dashPath = path.join(dashDir, 'dashboard-output.md');
    const dashContent = fs.existsSync(dashPath)
      ? fs.readFileSync(dashPath, 'utf-8').toLowerCase()
      : '';
    const combined = transcript + dashContent;

    // Should mention autoplan attribution
    expect(combined).toMatch(/autoplan/);
    // Should show eng review as CLEAR (it has a clean entry)
    expect(combined).toMatch(/clear/i);

    // Ship dashboard should not gate when eng review is clear: no AskUserQuestion calls.
    const gateTools = ['mcp__conductor__AskUserQuestion', 'AskUserQuestion'];
    const gateQuestions = result.toolCalls.filter(tc => gateTools.includes(tc.tool));
    expect(gateQuestions).toHaveLength(0);
  }, 120_000);
});
|
||||
|
||||
// Module-level afterAll — finalize eval collector after all tests complete
|
||||
afterAll(async () => {
|
||||
await finalizeEvalCollector(evalCollector);
|
||||
|
||||
@@ -778,6 +778,69 @@ describeIfSelected('Other skill evals', [
|
||||
}, 30_000);
|
||||
});
|
||||
|
||||
// Voice directive eval — tests that the voice section produces the right tone
|
||||
// Voice directive eval — an LLM judge scores the "## Voice" section of review/SKILL.md.
describeIfSelected('Voice directive eval', ['voice directive tone'], () => {
  testIfSelected('voice directive tone', async () => {
    // Shape of the JSON the judge is asked to return.
    type VoiceScores = {
      directness: number;
      concreteness: number;
      avoids_corporate: number;
      avoids_ai_vocabulary: number;
      connects_user_outcomes: number;
      reasoning: string;
    };
    // Dimensions that must each score at least MIN_SCORE, in assertion order.
    const dimensions = [
      'directness',
      'concreteness',
      'avoids_corporate',
      'avoids_ai_vocabulary',
      'connects_user_outcomes',
    ] as const;
    const MIN_SCORE = 4;

    const t0 = Date.now();

    // Read a tier 2+ skill to get the full voice directive in context
    const content = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
    const voiceStart = content.indexOf('## Voice');
    if (voiceStart === -1) {
      throw new Error('Voice section not found in review/SKILL.md. Was preamble.ts regenerated?');
    }
    // The section runs to the next "## " heading; without one, take 3000 characters.
    const voiceEnd = content.indexOf('\n## ', voiceStart + 1);
    const sliceEnd = voiceEnd > 0 ? voiceEnd : voiceStart + 3000;
    const voiceSection = content.slice(voiceStart, sliceEnd);

    const result = await callJudge<VoiceScores>(`You are evaluating a voice directive for an AI coding assistant framework called GStack.
Score each dimension 1-5 where 5 is excellent:

1. directness: Does it instruct the agent to be direct, lead with the point, take positions?
2. concreteness: Does it instruct the agent to name specific files, commands, line numbers, real numbers?
3. avoids_corporate: Does it explicitly ban corporate/formal/academic tone and provide alternatives?
4. avoids_ai_vocabulary: Does it ban AI-tell words and phrases with specific lists?
5. connects_user_outcomes: Does it instruct the agent to connect technical work to real user experience?

Return JSON only:
{"directness": N, "concreteness": N, "avoids_corporate": N, "avoids_ai_vocabulary": N, "connects_user_outcomes": N, "reasoning": "..."}

THE VOICE DIRECTIVE:
${voiceSection}`);

    console.log('Voice directive scores:', JSON.stringify(result, null, 2));

    // Record the outcome before asserting, so a failing score is still collected.
    evalCollector?.addTest({
      name: 'voice directive tone',
      suite: 'Voice directive eval',
      tier: 'llm-judge',
      passed: dimensions.every((dimension) => result[dimension] >= MIN_SCORE),
      duration_ms: Date.now() - t0,
      cost_usd: 0.02,
      judge_scores: {
        directness: result.directness,
        concreteness: result.concreteness,
        avoids_corporate: result.avoids_corporate,
        avoids_ai_vocabulary: result.avoids_ai_vocabulary,
        connects_user_outcomes: result.connects_user_outcomes,
      },
      judge_reasoning: result.reasoning,
    });

    for (const dimension of dimensions) {
      expect(result[dimension]).toBeGreaterThanOrEqual(MIN_SCORE);
    }
  }, 30_000);
});
|
||||
|
||||
// Module-level afterAll — finalize eval collector after all tests complete
|
||||
afterAll(async () => {
|
||||
if (evalCollector) {
|
||||
|
||||
@@ -1325,7 +1325,7 @@ describe('Codex skill', () => {
|
||||
expect(content).toContain('fall back to the Claude adversarial subagent');
|
||||
// Review log uses new skill name
|
||||
expect(content).toContain('adversarial-review');
|
||||
expect(content).toContain('xhigh');
|
||||
expect(content).toContain('reasoning_effort="high"');
|
||||
expect(content).toContain('ADVERSARIAL REVIEW SYNTHESIS');
|
||||
});
|
||||
|
||||
@@ -1335,7 +1335,7 @@ describe('Codex skill', () => {
|
||||
expect(content).toContain('< 50');
|
||||
expect(content).toContain('200+');
|
||||
expect(content).toContain('adversarial-review');
|
||||
expect(content).toContain('xhigh');
|
||||
expect(content).toContain('reasoning_effort="high"');
|
||||
expect(content).toContain('Investigate and fix');
|
||||
});
|
||||
|
||||
@@ -1369,11 +1369,6 @@ describe('Codex skill', () => {
|
||||
expect(content).toContain('Persist Eng Review result');
|
||||
});
|
||||
|
||||
test('/ship gate suggests /review or /plan-eng-review when Eng Review is missing', () => {
|
||||
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
|
||||
expect(content).toContain('Abort — run /review or /plan-eng-review first');
|
||||
});
|
||||
|
||||
test('Review Readiness Dashboard includes Adversarial Review row', () => {
|
||||
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
|
||||
expect(content).toContain('Adversarial');
|
||||
|
||||
Reference in New Issue
Block a user