mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-02 11:45:20 +02:00
9ce9e10aae
Per user feedback: don't shorten AskUserQuestion to AUQ — the abbreviation reads as cryptic. Apply across all the new code from this branch: - Rename test/skill-e2e-auq-format-compliance.test.ts → test/skill-e2e-ask-user-question-format-compliance.test.ts - Touchfile entry auq-format-pty → ask-user-question-format-pty (touchfiles.ts + matching assertion in touchfiles.test.ts) - Function rename navigateToModeAuq → navigateToModeAskUserQuestion - Variable auqVisible → askUserQuestionVisible - Outcome literal 'real_auq' → 'real_question' - All comments + JSDoc + CHANGELOG entry write AskUserQuestion in full - "AUQs" plural → "AskUserQuestions" No behavior change. 49/49 free tests still pass. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
291 lines
9.8 KiB
TypeScript
291 lines
9.8 KiB
TypeScript
/**
|
||
* Unit tests for two helpers added alongside the new real-PTY E2E tests:
|
||
*
|
||
* - parseNumberedOptions(visible)
|
||
* Parses `❯ 1.` / ` 2.` numbered-option lines out of TTY text.
|
||
* Used by the AskUserQuestion format-compliance and mode-routing tests to look
|
||
* up an option index by its label without hard-coding positions.
|
||
*
|
||
* - findBudgetRegressions / assertNoBudgetRegression(comparison)
|
||
* Computes which tests grew >2× in tool calls or turns vs the prior
|
||
* eval run. Used by the budget-regression test.
|
||
*
|
||
* Free, deterministic, runs under `bun test`.
|
||
*/
|
||
|
||
import { describe, test, expect } from 'bun:test';
|
||
import { parseNumberedOptions } from './helpers/claude-pty-runner';
|
||
import {
|
||
assertNoBudgetRegression,
|
||
findBudgetRegressions,
|
||
type ComparisonResult,
|
||
type TestDelta,
|
||
} from './helpers/eval-store';
|
||
|
||
// --- parseNumberedOptions ---
|
||
|
||
describe('parseNumberedOptions', () => {
|
||
test('returns [] for empty input', () => {
|
||
expect(parseNumberedOptions('')).toEqual([]);
|
||
});
|
||
|
||
test('returns [] when no numbered list is rendered', () => {
|
||
expect(parseNumberedOptions('just some prose with no list')).toEqual([]);
|
||
});
|
||
|
||
test('parses a basic 3-option list with cursor on first', () => {
|
||
const visible = [
|
||
'Some prompt prose above.',
|
||
'',
|
||
'❯ 1. HOLD SCOPE',
|
||
' 2. SCOPE EXPANSION',
|
||
' 3. SELECTIVE EXPANSION',
|
||
'',
|
||
].join('\n');
|
||
expect(parseNumberedOptions(visible)).toEqual([
|
||
{ index: 1, label: 'HOLD SCOPE' },
|
||
{ index: 2, label: 'SCOPE EXPANSION' },
|
||
{ index: 3, label: 'SELECTIVE EXPANSION' },
|
||
]);
|
||
});
|
||
|
||
test('parses cursor on a non-first option', () => {
|
||
const visible = [
|
||
' 1. Option A',
|
||
'❯ 2. Option B',
|
||
' 3. Option C',
|
||
].join('\n');
|
||
const opts = parseNumberedOptions(visible);
|
||
expect(opts.map(o => o.index)).toEqual([1, 2, 3]);
|
||
expect(opts.map(o => o.label)).toEqual(['Option A', 'Option B', 'Option C']);
|
||
});
|
||
|
||
test('handles 9 options (max single-digit)', () => {
|
||
const lines = ['❯ 1. one'];
|
||
for (let i = 2; i <= 9; i++) lines.push(` ${i}. opt${i}`);
|
||
const opts = parseNumberedOptions(lines.join('\n'));
|
||
expect(opts.length).toBe(9);
|
||
expect(opts[8]).toEqual({ index: 9, label: 'opt9' });
|
||
});
|
||
|
||
test('truncates at first sequence gap', () => {
|
||
// Real bug shape: prose contains "1. blah" and "2. blah" then a real
|
||
// option list shows up later. We only return the consecutive run that
|
||
// starts at 1.
|
||
const visible = [
|
||
'❯ 1. Real option',
|
||
' 2. Other real option',
|
||
'some prose',
|
||
' 4. Stray number',
|
||
].join('\n');
|
||
expect(parseNumberedOptions(visible)).toEqual([
|
||
{ index: 1, label: 'Real option' },
|
||
{ index: 2, label: 'Other real option' },
|
||
]);
|
||
});
|
||
|
||
test('returns [] when sequence does not start at 1', () => {
|
||
const visible = [' 3. orphan', ' 4. orphan'].join('\n');
|
||
expect(parseNumberedOptions(visible)).toEqual([]);
|
||
});
|
||
|
||
test('returns [] for a single option (need at least 2 to be a real list)', () => {
|
||
expect(parseNumberedOptions('❯ 1. lonely')).toEqual([]);
|
||
});
|
||
|
||
test('preserves trailing markers on labels (e.g. recommended)', () => {
|
||
const visible = [
|
||
'❯ 1. Cover all 4 modes (recommended)',
|
||
' 2. Just HOLD + EXPANSION',
|
||
].join('\n');
|
||
const opts = parseNumberedOptions(visible);
|
||
expect(opts[0]!.label).toContain('(recommended)');
|
||
});
|
||
|
||
test('only matches the most recent list when buffer is large', () => {
|
||
// First (stale) list, then >4KB of intervening text, then the real list.
|
||
// parseNumberedOptions reads only the last 4KB, so the stale list is
|
||
// dropped — this is the desired behavior for tests that re-open the
|
||
// session and want the current prompt only.
|
||
const stale = ['❯ 1. STALE_A', ' 2. STALE_B'].join('\n');
|
||
const filler = 'x'.repeat(5000);
|
||
const fresh = ['❯ 1. FRESH_A', ' 2. FRESH_B'].join('\n');
|
||
const visible = stale + '\n' + filler + '\n' + fresh;
|
||
const opts = parseNumberedOptions(visible);
|
||
expect(opts.map(o => o.label)).toEqual(['FRESH_A', 'FRESH_B']);
|
||
});
|
||
|
||
test('anchors on LAST cursor when both stale and fresh fit in the tail', () => {
|
||
// Both lists fit in the same 4KB tail (small buffer). The granted
|
||
// permission dialog options come first, the real AskUserQuestion comes second.
|
||
// We must return the FRESH options, not the STALE ones.
|
||
const visible = [
|
||
'❯ 1. STALE_grant',
|
||
' 2. STALE_deny',
|
||
'some narration the agent printed after we granted',
|
||
'and a few more lines of bash output',
|
||
'❯ 1. FRESH_keep',
|
||
' 2. FRESH_drop',
|
||
].join('\n');
|
||
const opts = parseNumberedOptions(visible);
|
||
expect(opts.map(o => o.label)).toEqual(['FRESH_keep', 'FRESH_drop']);
|
||
});
|
||
|
||
test('falls back to last `1.` if cursor is not currently rendered on option 1', () => {
|
||
// The user pressed Down, so cursor is on option 2; but the parser
|
||
// should still return options 1+2 by anchoring on the last `1.` line.
|
||
const visible = [
|
||
' 1. Option A',
|
||
'❯ 2. Option B',
|
||
' 3. Option C',
|
||
].join('\n');
|
||
const opts = parseNumberedOptions(visible);
|
||
expect(opts.map(o => o.label)).toEqual(['Option A', 'Option B', 'Option C']);
|
||
});
|
||
});
|
||
|
||
// --- findBudgetRegressions / assertNoBudgetRegression ---
|
||
|
||
function makeDelta(
|
||
name: string,
|
||
beforeTools: Record<string, number>,
|
||
afterTools: Record<string, number>,
|
||
beforeTurns?: number,
|
||
afterTurns?: number,
|
||
): TestDelta {
|
||
return {
|
||
name,
|
||
before: { passed: true, cost_usd: 0, tool_summary: beforeTools, turns_used: beforeTurns },
|
||
after: { passed: true, cost_usd: 0, tool_summary: afterTools, turns_used: afterTurns },
|
||
status_change: 'unchanged',
|
||
};
|
||
}
|
||
|
||
function makeComparison(deltas: TestDelta[]): ComparisonResult {
|
||
return {
|
||
before_file: '/tmp/before.json',
|
||
after_file: '/tmp/after.json',
|
||
before_branch: 'main',
|
||
after_branch: 'feat/x',
|
||
before_timestamp: '2025-01-01T00:00:00Z',
|
||
after_timestamp: '2025-01-02T00:00:00Z',
|
||
deltas,
|
||
total_cost_delta: 0,
|
||
total_duration_delta: 0,
|
||
improved: 0,
|
||
regressed: 0,
|
||
unchanged: deltas.length,
|
||
tool_count_before: 0,
|
||
tool_count_after: 0,
|
||
};
|
||
}
|
||
|
||
describe('findBudgetRegressions', () => {
|
||
test('empty comparison → no regressions', () => {
|
||
expect(findBudgetRegressions(makeComparison([]))).toEqual([]);
|
||
});
|
||
|
||
test('no regression when after ≤ 2× before for tools', () => {
|
||
const c = makeComparison([
|
||
makeDelta('a', { Bash: 10 }, { Bash: 19 }), // 1.9× — under cap
|
||
]);
|
||
expect(findBudgetRegressions(c)).toEqual([]);
|
||
});
|
||
|
||
test('flags >2× tool growth', () => {
|
||
const c = makeComparison([
|
||
makeDelta('a', { Bash: 10, Read: 5 }, { Bash: 25, Read: 12 }), // 15→37 = 2.47×
|
||
]);
|
||
const regs = findBudgetRegressions(c);
|
||
expect(regs.length).toBe(1);
|
||
expect(regs[0]!.metric).toBe('tools');
|
||
expect(regs[0]!.before).toBe(15);
|
||
expect(regs[0]!.after).toBe(37);
|
||
});
|
||
|
||
test('flags >2× turn growth independently of tools', () => {
|
||
const c = makeComparison([
|
||
makeDelta('a', { Bash: 10 }, { Bash: 12 }, 5, 15), // turns 5→15 = 3×
|
||
]);
|
||
const regs = findBudgetRegressions(c);
|
||
expect(regs.length).toBe(1);
|
||
expect(regs[0]!.metric).toBe('turns');
|
||
});
|
||
|
||
test('skips tests with no prior tool data (new test)', () => {
|
||
const c = makeComparison([
|
||
makeDelta('new-test', {}, { Bash: 100 }), // no prior — should not flag
|
||
]);
|
||
expect(findBudgetRegressions(c)).toEqual([]);
|
||
});
|
||
|
||
test('skips when prior tool count is below the floor (noise floor)', () => {
|
||
// 1 → 4 tools is 4× ratio but meaningless on tiny numbers.
|
||
const c = makeComparison([
|
||
makeDelta('tiny', { Bash: 1 }, { Bash: 4 }),
|
||
]);
|
||
expect(findBudgetRegressions(c)).toEqual([]);
|
||
});
|
||
|
||
test('respects ratioCap override', () => {
|
||
const c = makeComparison([
|
||
makeDelta('a', { Bash: 10 }, { Bash: 16 }), // 1.6×
|
||
]);
|
||
expect(findBudgetRegressions(c, { ratioCap: 1.5 }).length).toBe(1);
|
||
expect(findBudgetRegressions(c, { ratioCap: 2.0 }).length).toBe(0);
|
||
});
|
||
|
||
test('respects GSTACK_BUDGET_RATIO env override', () => {
|
||
const c = makeComparison([
|
||
makeDelta('a', { Bash: 10 }, { Bash: 16 }), // 1.6×
|
||
]);
|
||
const prev = process.env.GSTACK_BUDGET_RATIO;
|
||
try {
|
||
process.env.GSTACK_BUDGET_RATIO = '1.5';
|
||
expect(findBudgetRegressions(c).length).toBe(1);
|
||
process.env.GSTACK_BUDGET_RATIO = '2.0';
|
||
expect(findBudgetRegressions(c).length).toBe(0);
|
||
} finally {
|
||
if (prev === undefined) delete process.env.GSTACK_BUDGET_RATIO;
|
||
else process.env.GSTACK_BUDGET_RATIO = prev;
|
||
}
|
||
});
|
||
|
||
test('handles missing tool_summary gracefully', () => {
|
||
const delta: TestDelta = {
|
||
name: 'sparse',
|
||
before: { passed: true, cost_usd: 0 },
|
||
after: { passed: true, cost_usd: 0 },
|
||
status_change: 'unchanged',
|
||
};
|
||
expect(findBudgetRegressions(makeComparison([delta]))).toEqual([]);
|
||
});
|
||
});
|
||
|
||
describe('assertNoBudgetRegression', () => {
|
||
test('does not throw on a clean comparison', () => {
|
||
const c = makeComparison([
|
||
makeDelta('a', { Bash: 10 }, { Bash: 11 }),
|
||
]);
|
||
expect(() => assertNoBudgetRegression(c)).not.toThrow();
|
||
});
|
||
|
||
test('throws with all violations and the cap value in the message', () => {
|
||
const c = makeComparison([
|
||
makeDelta('regressed-tools', { Bash: 10 }, { Bash: 30 }),
|
||
makeDelta('regressed-turns', { Bash: 5 }, { Bash: 6 }, 4, 13),
|
||
]);
|
||
let err: Error | null = null;
|
||
try {
|
||
assertNoBudgetRegression(c);
|
||
} catch (e) {
|
||
err = e as Error;
|
||
}
|
||
expect(err).not.toBeNull();
|
||
expect(err!.message).toContain('regressed-tools');
|
||
expect(err!.message).toContain('regressed-turns');
|
||
expect(err!.message).toContain('2.00×'); // default cap
|
||
expect(err!.message).toContain('GSTACK_BUDGET_RATIO');
|
||
});
|
||
});
|