test: add triage resolver unit tests, plan-eng coverage audit E2E, and triage E2E

- TEST_FAILURE_TRIAGE resolver: 6 unit tests verifying all triage steps (T1-T4),
  REPO_MODE branching, and safety default for ambiguous failures
- plan-eng-coverage-audit E2E: tests /plan-eng-review coverage audit codepath
  (gap identified during eng review — existed on neither branch)
- ship-triage E2E: planted-bug fixture with in-branch (truncate null) and
  pre-existing (divide-by-zero) failures; verifies correct classification
- Touchfile entries for diff-based test selection

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-03-21 09:31:29 -07:00
parent 0394b8052d
commit fe3b5a30dc
3 changed files with 285 additions and 0 deletions
+40
View File
@@ -520,6 +520,46 @@ describe('TEST_COVERAGE_AUDIT placeholders', () => {
});
});
// --- {{TEST_FAILURE_TRIAGE}} resolver tests ---
describe('TEST_FAILURE_TRIAGE resolver', () => {
const shipSkill = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
test('contains all 4 triage steps', () => {
expect(shipSkill).toContain('Step T1: Classify each failure');
expect(shipSkill).toContain('Step T2: Handle in-branch failures');
expect(shipSkill).toContain('Step T3: Handle pre-existing failures');
expect(shipSkill).toContain('Step T4: Execute the chosen action');
});
test('T1 includes classification criteria (in-branch vs pre-existing)', () => {
expect(shipSkill).toContain('In-branch');
expect(shipSkill).toContain('Likely pre-existing');
expect(shipSkill).toContain('git diff origin/');
});
test('T3 branches on REPO_MODE (solo vs collaborative)', () => {
expect(shipSkill).toContain('REPO_MODE');
expect(shipSkill).toContain('solo');
expect(shipSkill).toContain('collaborative');
});
test('solo mode offers fix-now, TODO, and skip options', () => {
expect(shipSkill).toContain('Investigate and fix now');
expect(shipSkill).toContain('Add as P0 TODO');
expect(shipSkill).toContain('Skip');
});
test('collaborative mode offers blame + assign option', () => {
expect(shipSkill).toContain('Blame + assign GitHub issue');
expect(shipSkill).toContain('gh issue create');
});
test('defaults ambiguous failures to in-branch (safety)', () => {
expect(shipSkill).toContain('When ambiguous, default to in-branch');
});
});
// --- {{SPEC_REVIEW_LOOP}} resolver tests ---
describe('SPEC_REVIEW_LOOP resolver', () => {
+2
View File
@@ -94,6 +94,8 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
// Coverage audit (shared fixture) + triage
'ship-coverage-audit': ['ship/**', 'test/fixtures/coverage-audit-fixture.ts', 'bin/gstack-repo-mode'],
'review-coverage-audit': ['review/**', 'test/fixtures/coverage-audit-fixture.ts'],
'plan-eng-coverage-audit': ['plan-eng-review/**', 'test/fixtures/coverage-audit-fixture.ts'],
'ship-triage': ['ship/**', 'bin/gstack-repo-mode'],
// Design
'design-consultation-core': ['design-consultation/**'],
+243
View File
@@ -2858,6 +2858,249 @@ Output the diagram directly.`,
}, 180_000);
});
// --- Plan Eng Review Coverage Audit E2E ---
describeIfSelected('Plan Eng Review Coverage Audit E2E', ['plan-eng-coverage-audit'], () => {
let planCoverageDir: string;
beforeAll(() => {
planCoverageDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-plan-coverage-'));
// Copy plan-eng-review skill files
copyDirSync(path.join(ROOT, 'plan-eng-review'), path.join(planCoverageDir, 'plan-eng-review'));
// Use shared fixture for billing project with coverage gaps
const { createCoverageAuditFixture } = require('./fixtures/coverage-audit-fixture');
createCoverageAuditFixture(planCoverageDir);
});
afterAll(() => {
try { fs.rmSync(planCoverageDir, { recursive: true, force: true }); } catch {}
});
test('/plan-eng-review coverage audit traces plan codepaths', async () => {
const result = await runSkillTest({
prompt: `Read the file plan-eng-review/SKILL.md for the plan review workflow instructions.
You are on the feature/billing branch. The base branch is main.
This is a test project — there is no remote, no PR to create.
ONLY run the Test Coverage Audit section from the plan review workflow.
Skip all other steps (architecture, code quality, performance, etc.).
The source code is in ${planCoverageDir}/src/billing.ts.
Existing tests are in ${planCoverageDir}/test/billing.test.ts.
Produce the ASCII coverage diagram showing which code paths are tested and which have gaps.
Output the diagram directly.`,
workingDirectory: planCoverageDir,
maxTurns: 15,
allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Glob', 'Grep'],
timeout: 120_000,
testName: 'plan-eng-coverage-audit',
runId,
});
logCost('/plan-eng-review coverage audit', result);
recordE2E('/plan-eng-review coverage audit', 'Plan Eng Review Coverage Audit E2E', result, {
passed: result.exitReason === 'success',
});
expect(result.exitReason).toBe('success');
// Check output contains coverage diagram elements
const output = result.output || '';
const outputLower = output.toLowerCase();
const hasGap = outputLower.includes('gap') || outputLower.includes('no test');
const hasTested = outputLower.includes('tested') || output.includes('✓') || output.includes('★');
const hasCoverage = outputLower.includes('coverage') || outputLower.includes('paths tested');
console.log(`Output has GAP markers: ${hasGap}`);
console.log(`Output has TESTED markers: ${hasTested}`);
console.log(`Output has coverage summary: ${hasCoverage}`);
// The agent MUST produce a coverage diagram with gap and tested markers
expect(hasGap || hasTested).toBe(true);
// At minimum, the agent should have read the source and test files
const readCalls = result.toolCalls.filter(tc => tc.tool === 'Read');
expect(readCalls.length).toBeGreaterThan(0);
}, 180_000);
});
// --- Triage E2E ---
describeIfSelected('Test Failure Triage E2E', ['ship-triage'], () => {
let triageDir: string;
beforeAll(() => {
triageDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-triage-'));
// Copy ship skill files
copyDirSync(path.join(ROOT, 'ship'), path.join(triageDir, 'ship'));
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: triageDir, stdio: 'pipe', timeout: 5000 });
// Init git repo
run('git', ['init', '-b', 'main']);
run('git', ['config', 'user.email', 'test@test.com']);
run('git', ['config', 'user.name', 'Test']);
// Create a project with a pre-existing test failure on main
fs.writeFileSync(path.join(triageDir, 'package.json'), JSON.stringify({
name: 'triage-test-app',
version: '1.0.0',
scripts: { test: 'node test/run.js' },
}, null, 2));
fs.mkdirSync(path.join(triageDir, 'src'), { recursive: true });
fs.mkdirSync(path.join(triageDir, 'test'), { recursive: true });
// Source with a bug that exists on main (pre-existing)
fs.writeFileSync(path.join(triageDir, 'src', 'math.js'), `
module.exports = {
add: (a, b) => a + b,
divide: (a, b) => a / b, // BUG: no zero-division check (pre-existing)
};
`);
// Test file that catches the pre-existing bug
fs.writeFileSync(path.join(triageDir, 'test', 'math.test.js'), `
const { add, divide } = require('../src/math');
// This test passes
if (add(2, 3) !== 5) { console.error('FAIL: add(2,3) should be 5'); process.exit(1); }
console.log('PASS: add');
// This test FAILS — pre-existing bug (divide by zero returns Infinity, not an error)
try {
const result = divide(10, 0);
if (result === Infinity) { console.error('FAIL: divide(10,0) should throw, got Infinity'); process.exit(1); }
} catch(e) {
console.log('PASS: divide zero check');
}
`);
// Test runner
fs.writeFileSync(path.join(triageDir, 'test', 'run.js'), `
require('./math.test.js');
require('./string.test.js');
`);
// Commit on main with the pre-existing bug
run('git', ['add', '.']);
run('git', ['commit', '-m', 'initial: math utils with tests']);
// Create feature branch
run('git', ['checkout', '-b', 'feature/string-utils']);
// Add new code with a new bug (in-branch)
fs.writeFileSync(path.join(triageDir, 'src', 'string.js'), `
module.exports = {
capitalize: (s) => s.charAt(0).toUpperCase() + s.slice(1),
reverse: (s) => s.split('').reverse().join(''),
truncate: (s, len) => s.substring(0, len), // BUG: no null check (in-branch)
};
`);
// Add test that catches the in-branch bug
fs.writeFileSync(path.join(triageDir, 'test', 'string.test.js'), `
const { capitalize, reverse, truncate } = require('../src/string');
if (capitalize('hello') !== 'Hello') { console.error('FAIL: capitalize'); process.exit(1); }
console.log('PASS: capitalize');
if (reverse('abc') !== 'cba') { console.error('FAIL: reverse'); process.exit(1); }
console.log('PASS: reverse');
// This test FAILS — in-branch bug (null input causes TypeError)
try {
truncate(null, 5);
console.log('PASS: truncate null');
} catch(e) {
console.error('FAIL: truncate(null, 5) threw: ' + e.message);
process.exit(1);
}
`);
run('git', ['add', '.']);
run('git', ['commit', '-m', 'feat: add string utilities']);
});
afterAll(() => {
try { fs.rmSync(triageDir, { recursive: true, force: true }); } catch {}
});
test('/ship triage correctly classifies in-branch vs pre-existing failures', async () => {
const result = await runSkillTest({
prompt: `Read the file ship/SKILL.md for the ship workflow instructions.
You are on the feature/string-utils branch. The base branch is main.
This is a test project — there is no remote, no PR to create.
Run the tests first:
\`\`\`bash
cd ${triageDir} && node test/run.js
\`\`\`
The tests will fail. Now run ONLY the Test Failure Ownership Triage (Steps T1-T4) from the ship workflow.
For each failing test, classify it as:
- **In-branch**: caused by changes on this branch (feature/string-utils)
- **Pre-existing**: existed before this branch (present on main)
Use git diff origin/main...HEAD (or git diff main...HEAD since there's no remote) to determine which files changed on this branch.
Output your classification for each failure clearly, labeling each as "IN-BRANCH" or "PRE-EXISTING" with your reasoning.
This is a solo repo (REPO_MODE=solo). For pre-existing failures, recommend fixing now.`,
workingDirectory: triageDir,
maxTurns: 20,
allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Glob', 'Grep'],
timeout: 180_000,
testName: 'ship-triage',
runId,
});
logCost('/ship triage', result);
const output = result.output || '';
const outputLower = output.toLowerCase();
// The triage should identify the string/truncate failure as in-branch
const hasInBranch = outputLower.includes('in-branch') || outputLower.includes('in branch') || outputLower.includes('introduced');
// The triage should identify the math/divide failure as pre-existing
const hasPreExisting = outputLower.includes('pre-existing') || outputLower.includes('pre existing') || outputLower.includes('existed before');
console.log(`Output identifies IN-BRANCH failures: ${hasInBranch}`);
console.log(`Output identifies PRE-EXISTING failures: ${hasPreExisting}`);
// Check that the string/truncate bug is classified as in-branch
const mentionsTruncate = outputLower.includes('truncate') || outputLower.includes('string');
const mentionsDivide = outputLower.includes('divide') || outputLower.includes('math');
console.log(`Mentions truncate/string (in-branch bug): ${mentionsTruncate}`);
console.log(`Mentions divide/math (pre-existing bug): ${mentionsDivide}`);
recordE2E('/ship triage', 'Test Failure Triage E2E', result, {
passed: result.exitReason === 'success' && hasInBranch && hasPreExisting,
has_in_branch_classification: hasInBranch,
has_pre_existing_classification: hasPreExisting,
mentions_truncate: mentionsTruncate,
mentions_divide: mentionsDivide,
});
expect(result.exitReason).toBe('success');
// Must classify at least one failure as in-branch AND one as pre-existing
expect(hasInBranch).toBe(true);
expect(hasPreExisting).toBe(true);
// Must mention the specific bugs
expect(mentionsTruncate).toBe(true);
expect(mentionsDivide).toBe(true);
}, 240_000);
});
// --- Codex skill E2E ---
describeIfSelected('Codex skill E2E', ['codex-review'], () => {