mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-06 21:46:40 +02:00
merge: integrate origin/main (v0.4.0, v0.4.1) into team-supabase-store
Resolves conflicts in CHANGELOG.md (ordering), CONTRIBUTING.md (eval tools list merge), VERSION (take main's 0.4.1), qa/SKILL.md.tmpl (keep full methodology + baseline line), eval-store.test.ts (drop redundant comment). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
+539
-12
@@ -2,7 +2,7 @@ import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
|
||||
import { runSkillTest } from './helpers/session-runner';
|
||||
import type { SkillTestResult } from './helpers/session-runner';
|
||||
import { outcomeJudge } from './helpers/llm-judge';
|
||||
import { EvalCollector } from './helpers/eval-store';
|
||||
import { EvalCollector, judgePassed } from './helpers/eval-store';
|
||||
import type { EvalTestEntry } from './helpers/eval-store';
|
||||
import { startTestServer } from '../browse/test/test-server';
|
||||
import { spawnSync } from 'child_process';
|
||||
@@ -281,6 +281,124 @@ Report the exact output — either "READY: <path>" or "NEEDS_SETUP".`,
|
||||
// Clean up
|
||||
try { fs.rmSync(nonGitDir, { recursive: true, force: true }); } catch {}
|
||||
}, 60_000);
|
||||
|
||||
test('contributor mode files a report on gstack error', async () => {
|
||||
const contribDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-contrib-'));
|
||||
const logsDir = path.join(contribDir, 'contributor-logs');
|
||||
fs.mkdirSync(logsDir, { recursive: true });
|
||||
|
||||
// Extract contributor mode instructions from generated SKILL.md
|
||||
const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
|
||||
const contribStart = skillMd.indexOf('## Contributor Mode');
|
||||
const contribEnd = skillMd.indexOf('\n## ', contribStart + 1);
|
||||
const contribBlock = skillMd.slice(contribStart, contribEnd > 0 ? contribEnd : undefined);
|
||||
|
||||
const result = await runSkillTest({
|
||||
prompt: `You are in contributor mode (_CONTRIB=true).
|
||||
|
||||
${contribBlock}
|
||||
|
||||
OVERRIDE: Write contributor logs to ${logsDir}/ instead of ~/.gstack/contributor-logs/
|
||||
|
||||
Now try this browse command (it will fail — there is no binary at this path):
|
||||
/nonexistent/path/browse goto https://example.com
|
||||
|
||||
This is a gstack issue (the browse binary is missing/misconfigured).
|
||||
File a contributor report about this issue. Then tell me what you filed.`,
|
||||
workingDirectory: contribDir,
|
||||
maxTurns: 8,
|
||||
timeout: 60_000,
|
||||
testName: 'contributor-mode',
|
||||
runId,
|
||||
});
|
||||
|
||||
logCost('contributor mode', result);
|
||||
// Override passed: this test intentionally triggers a browse error (nonexistent binary)
|
||||
// so browseErrors will be non-empty — that's expected, not a failure
|
||||
recordE2E('contributor mode report', 'Skill E2E tests', result, {
|
||||
passed: result.exitReason === 'success',
|
||||
});
|
||||
|
||||
// Verify a contributor log was created with expected format
|
||||
const logFiles = fs.readdirSync(logsDir).filter(f => f.endsWith('.md'));
|
||||
expect(logFiles.length).toBeGreaterThan(0);
|
||||
|
||||
const logContent = fs.readFileSync(path.join(logsDir, logFiles[0]), 'utf-8');
|
||||
expect(logContent).toContain('Hey gstack team');
|
||||
expect(logContent).toContain('What I was trying to do');
|
||||
expect(logContent).toContain('What happened instead');
|
||||
|
||||
// Clean up
|
||||
try { fs.rmSync(contribDir, { recursive: true, force: true }); } catch {}
|
||||
}, 90_000);
|
||||
|
||||
test('session awareness adds ELI16 context when _SESSIONS >= 3', async () => {
|
||||
const sessionDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-session-'));
|
||||
|
||||
// Set up a git repo so there's project/branch context to reference
|
||||
const run = (cmd: string, args: string[]) =>
|
||||
spawnSync(cmd, args, { cwd: sessionDir, stdio: 'pipe', timeout: 5000 });
|
||||
run('git', ['init']);
|
||||
run('git', ['config', 'user.email', 'test@test.com']);
|
||||
run('git', ['config', 'user.name', 'Test']);
|
||||
fs.writeFileSync(path.join(sessionDir, 'app.rb'), '# my app\n');
|
||||
run('git', ['add', '.']);
|
||||
run('git', ['commit', '-m', 'init']);
|
||||
run('git', ['checkout', '-b', 'feature/add-payments']);
|
||||
// Add a remote so the agent can derive a project name
|
||||
run('git', ['remote', 'add', 'origin', 'https://github.com/acme/billing-app.git']);
|
||||
|
||||
// Extract AskUserQuestion format instructions from generated SKILL.md
|
||||
const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
|
||||
const aqStart = skillMd.indexOf('## AskUserQuestion Format');
|
||||
const aqEnd = skillMd.indexOf('\n## ', aqStart + 1);
|
||||
const aqBlock = skillMd.slice(aqStart, aqEnd > 0 ? aqEnd : undefined);
|
||||
|
||||
const outputPath = path.join(sessionDir, 'question-output.md');
|
||||
|
||||
const result = await runSkillTest({
|
||||
prompt: `You are running a gstack skill. The session preamble detected _SESSIONS=4 (the user has 4 gstack windows open).
|
||||
|
||||
${aqBlock}
|
||||
|
||||
You are on branch feature/add-payments in the billing-app project. You were reviewing a plan to add Stripe integration.
|
||||
|
||||
You've hit a decision point: the plan doesn't specify whether to use Stripe Checkout (hosted) or Stripe Elements (embedded). You need to ask the user which approach to use.
|
||||
|
||||
Since this is non-interactive, DO NOT actually call AskUserQuestion. Instead, write the EXACT text you would display to the user (the full AskUserQuestion content) to the file: ${outputPath}
|
||||
|
||||
Remember: _SESSIONS=4, so ELI16 mode is active. The user is juggling multiple windows and may not remember what this conversation is about. Re-ground them.`,
|
||||
workingDirectory: sessionDir,
|
||||
maxTurns: 8,
|
||||
timeout: 60_000,
|
||||
testName: 'session-awareness',
|
||||
runId,
|
||||
});
|
||||
|
||||
logCost('session awareness', result);
|
||||
recordE2E('session awareness ELI16', 'Skill E2E tests', result);
|
||||
|
||||
// Verify the output contains ELI16 re-grounding context
|
||||
if (fs.existsSync(outputPath)) {
|
||||
const output = fs.readFileSync(outputPath, 'utf-8');
|
||||
const lower = output.toLowerCase();
|
||||
// Must mention project name
|
||||
expect(lower.includes('billing') || lower.includes('acme')).toBe(true);
|
||||
// Must mention branch
|
||||
expect(lower.includes('payment') || lower.includes('feature')).toBe(true);
|
||||
// Must mention what we're working on
|
||||
expect(lower.includes('stripe') || lower.includes('checkout') || lower.includes('payment')).toBe(true);
|
||||
// Must have a RECOMMENDATION
|
||||
expect(output).toContain('RECOMMENDATION');
|
||||
} else {
|
||||
// Check agent output as fallback
|
||||
const output = result.output || '';
|
||||
expect(output).toContain('RECOMMENDATION');
|
||||
}
|
||||
|
||||
// Clean up
|
||||
try { fs.rmSync(sessionDir, { recursive: true, force: true }); } catch {}
|
||||
}, 90_000);
|
||||
});
|
||||
|
||||
// --- B4: QA skill E2E ---
|
||||
@@ -315,14 +433,16 @@ Run a Quick-depth QA test on ${testServer.url}/basic.html
|
||||
Do NOT use AskUserQuestion — run Quick tier directly.
|
||||
Write your report to ${qaDir}/qa-reports/qa-report.md`,
|
||||
workingDirectory: qaDir,
|
||||
maxTurns: 30,
|
||||
maxTurns: 35,
|
||||
timeout: 180_000,
|
||||
testName: 'qa-quick',
|
||||
runId,
|
||||
});
|
||||
|
||||
logCost('/qa quick', result);
|
||||
recordE2E('/qa quick', 'QA skill E2E', result);
|
||||
recordE2E('/qa quick', 'QA skill E2E', result, {
|
||||
passed: ['success', 'error_max_turns'].includes(result.exitReason),
|
||||
});
|
||||
// browseErrors can include false positives from hallucinated paths
|
||||
if (result.browseErrors.length > 0) {
|
||||
console.warn('/qa quick browse errors (non-fatal):', result.browseErrors);
|
||||
@@ -391,6 +511,78 @@ Write your review findings to ${reviewDir}/review-output.md`,
|
||||
}, 120_000);
|
||||
});
|
||||
|
||||
// --- Review: Enum completeness E2E ---
|
||||
|
||||
describeE2E('Review enum completeness E2E', () => {
|
||||
let enumDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
enumDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-enum-'));
|
||||
|
||||
const run = (cmd: string, args: string[]) =>
|
||||
spawnSync(cmd, args, { cwd: enumDir, stdio: 'pipe', timeout: 5000 });
|
||||
|
||||
run('git', ['init']);
|
||||
run('git', ['config', 'user.email', 'test@test.com']);
|
||||
run('git', ['config', 'user.name', 'Test']);
|
||||
|
||||
// Commit baseline on main — order model with 4 statuses
|
||||
const baseContent = fs.readFileSync(path.join(ROOT, 'test', 'fixtures', 'review-eval-enum.rb'), 'utf-8');
|
||||
fs.writeFileSync(path.join(enumDir, 'order.rb'), baseContent);
|
||||
run('git', ['add', 'order.rb']);
|
||||
run('git', ['commit', '-m', 'initial order model']);
|
||||
|
||||
// Feature branch adds "returned" status but misses handlers
|
||||
run('git', ['checkout', '-b', 'feature/add-returned-status']);
|
||||
const diffContent = fs.readFileSync(path.join(ROOT, 'test', 'fixtures', 'review-eval-enum-diff.rb'), 'utf-8');
|
||||
fs.writeFileSync(path.join(enumDir, 'order.rb'), diffContent);
|
||||
run('git', ['add', 'order.rb']);
|
||||
run('git', ['commit', '-m', 'add returned status']);
|
||||
|
||||
// Copy review skill files
|
||||
fs.copyFileSync(path.join(ROOT, 'review', 'SKILL.md'), path.join(enumDir, 'review-SKILL.md'));
|
||||
fs.copyFileSync(path.join(ROOT, 'review', 'checklist.md'), path.join(enumDir, 'review-checklist.md'));
|
||||
fs.copyFileSync(path.join(ROOT, 'review', 'greptile-triage.md'), path.join(enumDir, 'review-greptile-triage.md'));
|
||||
});
|
||||
|
||||
afterAll(() => {
|
||||
try { fs.rmSync(enumDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/review catches missing enum handlers for new status value', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `You are in a git repo on branch feature/add-returned-status with changes against main.
|
||||
Read review-SKILL.md for the review workflow instructions.
|
||||
Also read review-checklist.md and apply it — pay special attention to the Enum & Value Completeness section.
|
||||
Run /review on the current diff (git diff main...HEAD).
|
||||
Write your review findings to ${enumDir}/review-output.md
|
||||
|
||||
The diff adds a new "returned" status to the Order model. Your job is to check if all consumers handle it.`,
|
||||
workingDirectory: enumDir,
|
||||
maxTurns: 15,
|
||||
timeout: 90_000,
|
||||
testName: 'review-enum-completeness',
|
||||
runId,
|
||||
});
|
||||
|
||||
logCost('/review enum', result);
|
||||
recordE2E('/review enum completeness', 'Review enum completeness E2E', result);
|
||||
expect(result.exitReason).toBe('success');
|
||||
|
||||
// Verify the review caught the missing enum handlers
|
||||
const reviewPath = path.join(enumDir, 'review-output.md');
|
||||
if (fs.existsSync(reviewPath)) {
|
||||
const review = fs.readFileSync(reviewPath, 'utf-8');
|
||||
// Should mention the missing "returned" handling in at least one of the methods
|
||||
const mentionsReturned = review.toLowerCase().includes('returned');
|
||||
const mentionsEnum = review.toLowerCase().includes('enum') || review.toLowerCase().includes('status');
|
||||
const mentionsCritical = review.toLowerCase().includes('critical');
|
||||
expect(mentionsReturned).toBe(true);
|
||||
expect(mentionsEnum || mentionsCritical).toBe(true);
|
||||
}
|
||||
}, 120_000);
|
||||
});
|
||||
|
||||
// --- B6/B7/B8: Planted-bug outcome evals ---
|
||||
|
||||
// Outcome evals also need ANTHROPIC_API_KEY for the LLM judge
|
||||
@@ -451,11 +643,12 @@ Write every bug you found so far. Format each as:
|
||||
- Severity: high / medium / low
|
||||
- Evidence: what you observed
|
||||
|
||||
PHASE 3 — Interactive testing (systematic form + edge case testing):
|
||||
- For EVERY input field on the page: fill it, clear it, try invalid values
|
||||
- Specifically test: empty fields, invalid email formats, extra-long text, clearing numeric fields
|
||||
- Submit the form and immediately run $B console --errors
|
||||
- Click every link/button and check for broken behavior
|
||||
PHASE 3 — Interactive testing (targeted — max 15 commands):
|
||||
- Test email: type "user@" (no domain) and blur — does it validate?
|
||||
- Test quantity: clear the field entirely — check the total display
|
||||
- Test credit card: type a 25-character string — check for overflow
|
||||
- Submit the form with zip code empty — does it require zip?
|
||||
- Submit a valid form and run $B console --errors
|
||||
- After finding more bugs, UPDATE ${reportPath} with new findings
|
||||
|
||||
PHASE 4 — Finalize report:
|
||||
@@ -467,7 +660,7 @@ CRITICAL RULES:
|
||||
- Write the report file in PHASE 2 before doing interactive testing
|
||||
- The report MUST exist at ${reportPath} when you finish`,
|
||||
workingDirectory: testWorkDir,
|
||||
maxTurns: 40,
|
||||
maxTurns: 50,
|
||||
timeout: 300_000,
|
||||
testName: `qa-${label}`,
|
||||
runId,
|
||||
@@ -522,6 +715,7 @@ CRITICAL RULES:
|
||||
|
||||
// Record to eval collector with outcome judge results
|
||||
recordE2E(`/qa ${label}`, 'Planted-bug outcome evals', result, {
|
||||
passed: judgePassed(judgeResult, groundTruth),
|
||||
detection_rate: judgeResult.detection_rate,
|
||||
false_positives: judgeResult.false_positives,
|
||||
evidence_quality: judgeResult.evidence_quality,
|
||||
@@ -629,7 +823,9 @@ Focus on reviewing the plan content: architecture, error handling, security, and
|
||||
});
|
||||
|
||||
logCost('/plan-ceo-review', result);
|
||||
recordE2E('/plan-ceo-review', 'Plan CEO Review E2E', result);
|
||||
recordE2E('/plan-ceo-review', 'Plan CEO Review E2E', result, {
|
||||
passed: ['success', 'error_max_turns'].includes(result.exitReason),
|
||||
});
|
||||
// Accept error_max_turns — the CEO review is very thorough and may exceed turns
|
||||
expect(['success', 'error_max_turns']).toContain(result.exitReason);
|
||||
|
||||
@@ -722,7 +918,9 @@ Focus on architecture, code quality, tests, and performance sections.`,
|
||||
});
|
||||
|
||||
logCost('/plan-eng-review', result);
|
||||
recordE2E('/plan-eng-review', 'Plan Eng Review E2E', result);
|
||||
recordE2E('/plan-eng-review', 'Plan Eng Review E2E', result, {
|
||||
passed: ['success', 'error_max_turns'].includes(result.exitReason),
|
||||
});
|
||||
expect(['success', 'error_max_turns']).toContain(result.exitReason);
|
||||
|
||||
// Verify the review was written
|
||||
@@ -805,7 +1003,9 @@ Analyze the git history and produce the narrative report as described in the SKI
|
||||
});
|
||||
|
||||
logCost('/retro', result);
|
||||
recordE2E('/retro', 'Retro E2E', result);
|
||||
recordE2E('/retro', 'Retro E2E', result, {
|
||||
passed: ['success', 'error_max_turns'].includes(result.exitReason),
|
||||
});
|
||||
// Accept error_max_turns — retro does many git commands to analyze history
|
||||
expect(['success', 'error_max_turns']).toContain(result.exitReason);
|
||||
|
||||
@@ -818,6 +1018,333 @@ Analyze the git history and produce the narrative report as described in the SKI
|
||||
}, 420_000);
|
||||
});
|
||||
|
||||
// --- QA-Only E2E (report-only, no fixes) ---
|
||||
|
||||
describeE2E('QA-Only skill E2E', () => {
|
||||
let qaOnlyDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
testServer = testServer || startTestServer();
|
||||
qaOnlyDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-qa-only-'));
|
||||
setupBrowseShims(qaOnlyDir);
|
||||
|
||||
// Copy qa-only skill files
|
||||
copyDirSync(path.join(ROOT, 'qa-only'), path.join(qaOnlyDir, 'qa-only'));
|
||||
|
||||
// Copy qa templates (qa-only references qa/templates/qa-report-template.md)
|
||||
fs.mkdirSync(path.join(qaOnlyDir, 'qa', 'templates'), { recursive: true });
|
||||
fs.copyFileSync(
|
||||
path.join(ROOT, 'qa', 'templates', 'qa-report-template.md'),
|
||||
path.join(qaOnlyDir, 'qa', 'templates', 'qa-report-template.md'),
|
||||
);
|
||||
|
||||
// Init git repo (qa-only checks for feature branch in diff-aware mode)
|
||||
const { spawnSync } = require('child_process');
|
||||
const run = (cmd: string, args: string[]) =>
|
||||
spawnSync(cmd, args, { cwd: qaOnlyDir, stdio: 'pipe', timeout: 5000 });
|
||||
|
||||
run('git', ['init']);
|
||||
run('git', ['config', 'user.email', 'test@test.com']);
|
||||
run('git', ['config', 'user.name', 'Test']);
|
||||
fs.writeFileSync(path.join(qaOnlyDir, 'index.html'), '<h1>Test</h1>\n');
|
||||
run('git', ['add', '.']);
|
||||
run('git', ['commit', '-m', 'initial']);
|
||||
});
|
||||
|
||||
afterAll(() => {
|
||||
try { fs.rmSync(qaOnlyDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/qa-only produces report without using Edit tool', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `IMPORTANT: The browse binary is already assigned below as B. Do NOT search for it or run the SKILL.md setup block — just use $B directly.
|
||||
|
||||
B="${browseBin}"
|
||||
|
||||
Read the file qa-only/SKILL.md for the QA-only workflow instructions.
|
||||
|
||||
Run a Quick QA test on ${testServer.url}/qa-eval.html
|
||||
Do NOT use AskUserQuestion — run Quick tier directly.
|
||||
Write your report to ${qaOnlyDir}/qa-reports/qa-only-report.md`,
|
||||
workingDirectory: qaOnlyDir,
|
||||
maxTurns: 35,
|
||||
allowedTools: ['Bash', 'Read', 'Write', 'Glob'], // NO Edit — the critical guardrail
|
||||
timeout: 180_000,
|
||||
testName: 'qa-only-no-fix',
|
||||
runId,
|
||||
});
|
||||
|
||||
logCost('/qa-only', result);
|
||||
|
||||
// Verify Edit was not used — the critical guardrail for report-only mode.
|
||||
// Glob is read-only and may be used for file discovery (e.g. finding SKILL.md).
|
||||
const editCalls = result.toolCalls.filter(tc => tc.tool === 'Edit');
|
||||
if (editCalls.length > 0) {
|
||||
console.warn('qa-only used Edit tool:', editCalls.length, 'times');
|
||||
}
|
||||
|
||||
const exitOk = ['success', 'error_max_turns'].includes(result.exitReason);
|
||||
recordE2E('/qa-only no-fix', 'QA-Only skill E2E', result, {
|
||||
passed: exitOk && editCalls.length === 0,
|
||||
});
|
||||
|
||||
expect(editCalls).toHaveLength(0);
|
||||
|
||||
// Accept error_max_turns — the agent doing thorough QA is not a failure
|
||||
expect(['success', 'error_max_turns']).toContain(result.exitReason);
|
||||
|
||||
// Verify git working tree is still clean (no source modifications)
|
||||
const gitStatus = spawnSync('git', ['status', '--porcelain'], {
|
||||
cwd: qaOnlyDir, stdio: 'pipe',
|
||||
});
|
||||
const statusLines = gitStatus.stdout.toString().trim().split('\n').filter(
|
||||
(l: string) => l.trim() && !l.includes('.prompt-tmp') && !l.includes('.gstack/') && !l.includes('qa-reports/'),
|
||||
);
|
||||
expect(statusLines.filter((l: string) => l.startsWith(' M') || l.startsWith('M '))).toHaveLength(0);
|
||||
}, 240_000);
|
||||
});
|
||||
|
||||
// --- QA Fix Loop E2E ---
|
||||
|
||||
describeE2E('QA Fix Loop E2E', () => {
|
||||
let qaFixDir: string;
|
||||
let qaFixServer: ReturnType<typeof Bun.serve> | null = null;
|
||||
|
||||
beforeAll(() => {
|
||||
qaFixDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-qa-fix-'));
|
||||
setupBrowseShims(qaFixDir);
|
||||
|
||||
// Copy qa skill files
|
||||
copyDirSync(path.join(ROOT, 'qa'), path.join(qaFixDir, 'qa'));
|
||||
|
||||
// Create a simple HTML page with obvious fixable bugs
|
||||
fs.writeFileSync(path.join(qaFixDir, 'index.html'), `<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head><meta charset="utf-8"><title>Test App</title></head>
|
||||
<body>
|
||||
<h1>Welcome to Test App</h1>
|
||||
<nav>
|
||||
<a href="/about">About</a>
|
||||
<a href="/nonexistent-broken-page">Help</a> <!-- BUG: broken link -->
|
||||
</nav>
|
||||
<form id="contact">
|
||||
<input type="text" name="name" placeholder="Name">
|
||||
<input type="email" name="email" placeholder="Email">
|
||||
<button type="submit" disabled>Send</button> <!-- BUG: permanently disabled -->
|
||||
</form>
|
||||
<img src="/missing-logo.png"> <!-- BUG: missing alt text -->
|
||||
<script>console.error("TypeError: Cannot read property 'map' of undefined");</script> <!-- BUG: console error -->
|
||||
</body>
|
||||
</html>
|
||||
`);
|
||||
|
||||
// Init git repo with clean working tree
|
||||
const { spawnSync } = require('child_process');
|
||||
const run = (cmd: string, args: string[]) =>
|
||||
spawnSync(cmd, args, { cwd: qaFixDir, stdio: 'pipe', timeout: 5000 });
|
||||
|
||||
run('git', ['init']);
|
||||
run('git', ['config', 'user.email', 'test@test.com']);
|
||||
run('git', ['config', 'user.name', 'Test']);
|
||||
run('git', ['add', '.']);
|
||||
run('git', ['commit', '-m', 'initial commit']);
|
||||
|
||||
// Start a local server serving from the working directory so fixes are reflected on refresh
|
||||
qaFixServer = Bun.serve({
|
||||
port: 0,
|
||||
hostname: '127.0.0.1',
|
||||
fetch(req) {
|
||||
const url = new URL(req.url);
|
||||
let filePath = url.pathname === '/' ? '/index.html' : url.pathname;
|
||||
filePath = filePath.replace(/^\//, '');
|
||||
const fullPath = path.join(qaFixDir, filePath);
|
||||
if (!fs.existsSync(fullPath)) {
|
||||
return new Response('Not Found', { status: 404 });
|
||||
}
|
||||
const content = fs.readFileSync(fullPath, 'utf-8');
|
||||
return new Response(content, {
|
||||
headers: { 'Content-Type': 'text/html' },
|
||||
});
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
afterAll(() => {
|
||||
qaFixServer?.stop();
|
||||
try { fs.rmSync(qaFixDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/qa fix loop finds bugs and commits fixes', async () => {
|
||||
const qaFixUrl = `http://127.0.0.1:${qaFixServer!.port}`;
|
||||
|
||||
const result = await runSkillTest({
|
||||
prompt: `You have a browse binary at ${browseBin}. Assign it to B variable like: B="${browseBin}"
|
||||
|
||||
Read the file qa/SKILL.md for the QA workflow instructions.
|
||||
|
||||
Run a Quick-tier QA test on ${qaFixUrl}
|
||||
The source code for this page is at ${qaFixDir}/index.html — you can fix bugs there.
|
||||
Do NOT use AskUserQuestion — run Quick tier directly.
|
||||
Write your report to ${qaFixDir}/qa-reports/qa-report.md
|
||||
|
||||
This is a test+fix loop: find bugs, fix them in the source code, commit each fix, and re-verify.`,
|
||||
workingDirectory: qaFixDir,
|
||||
maxTurns: 40,
|
||||
allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Glob', 'Grep'],
|
||||
timeout: 300_000,
|
||||
testName: 'qa-fix-loop',
|
||||
runId,
|
||||
});
|
||||
|
||||
logCost('/qa fix loop', result);
|
||||
recordE2E('/qa fix loop', 'QA Fix Loop E2E', result, {
|
||||
passed: ['success', 'error_max_turns'].includes(result.exitReason),
|
||||
});
|
||||
|
||||
// Accept error_max_turns — fix loop may use many turns
|
||||
expect(['success', 'error_max_turns']).toContain(result.exitReason);
|
||||
|
||||
// Verify at least one fix commit was made beyond the initial commit
|
||||
const gitLog = spawnSync('git', ['log', '--oneline'], {
|
||||
cwd: qaFixDir, stdio: 'pipe',
|
||||
});
|
||||
const commits = gitLog.stdout.toString().trim().split('\n');
|
||||
console.log(`/qa fix loop: ${commits.length} commits total (1 initial + ${commits.length - 1} fixes)`);
|
||||
expect(commits.length).toBeGreaterThan(1);
|
||||
|
||||
// Verify Edit tool was used (agent actually modified source code)
|
||||
const editCalls = result.toolCalls.filter(tc => tc.tool === 'Edit');
|
||||
expect(editCalls.length).toBeGreaterThan(0);
|
||||
}, 360_000);
|
||||
});
|
||||
|
||||
// --- Plan-Eng-Review Test-Plan Artifact E2E ---
|
||||
|
||||
describeE2E('Plan-Eng-Review Test-Plan Artifact E2E', () => {
|
||||
let planDir: string;
|
||||
let projectDir: string;
|
||||
|
||||
beforeAll(() => {
|
||||
planDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-plan-artifact-'));
|
||||
const { spawnSync } = require('child_process');
|
||||
const run = (cmd: string, args: string[]) =>
|
||||
spawnSync(cmd, args, { cwd: planDir, stdio: 'pipe', timeout: 5000 });
|
||||
|
||||
run('git', ['init']);
|
||||
run('git', ['config', 'user.email', 'test@test.com']);
|
||||
run('git', ['config', 'user.name', 'Test']);
|
||||
|
||||
// Create base commit on main
|
||||
fs.writeFileSync(path.join(planDir, 'app.ts'), 'export function greet() { return "hello"; }\n');
|
||||
run('git', ['add', '.']);
|
||||
run('git', ['commit', '-m', 'initial']);
|
||||
|
||||
// Create feature branch with changes
|
||||
run('git', ['checkout', '-b', 'feature/add-dashboard']);
|
||||
fs.writeFileSync(path.join(planDir, 'dashboard.ts'), `export function Dashboard() {
|
||||
const data = fetchStats();
|
||||
return { users: data.users, revenue: data.revenue };
|
||||
}
|
||||
function fetchStats() {
|
||||
return fetch('/api/stats').then(r => r.json());
|
||||
}
|
||||
`);
|
||||
fs.writeFileSync(path.join(planDir, 'app.ts'), `import { Dashboard } from "./dashboard";
|
||||
export function greet() { return "hello"; }
|
||||
export function main() { return Dashboard(); }
|
||||
`);
|
||||
run('git', ['add', '.']);
|
||||
run('git', ['commit', '-m', 'feat: add dashboard']);
|
||||
|
||||
// Plan document
|
||||
fs.writeFileSync(path.join(planDir, 'plan.md'), `# Plan: Add Dashboard
|
||||
|
||||
## Changes
|
||||
1. New \`dashboard.ts\` with Dashboard component and fetchStats API call
|
||||
2. Updated \`app.ts\` to import and use Dashboard
|
||||
|
||||
## Architecture
|
||||
- Dashboard fetches from \`/api/stats\` endpoint
|
||||
- Returns user count and revenue metrics
|
||||
`);
|
||||
run('git', ['add', 'plan.md']);
|
||||
run('git', ['commit', '-m', 'add plan']);
|
||||
|
||||
// Copy plan-eng-review skill
|
||||
fs.mkdirSync(path.join(planDir, 'plan-eng-review'), { recursive: true });
|
||||
fs.copyFileSync(
|
||||
path.join(ROOT, 'plan-eng-review', 'SKILL.md'),
|
||||
path.join(planDir, 'plan-eng-review', 'SKILL.md'),
|
||||
);
|
||||
|
||||
// Set up remote-slug shim and browse shims (plan-eng-review uses remote-slug for artifact path)
|
||||
setupBrowseShims(planDir);
|
||||
|
||||
// Create project directory for artifacts
|
||||
projectDir = path.join(os.homedir(), '.gstack', 'projects', 'test-project');
|
||||
fs.mkdirSync(projectDir, { recursive: true });
|
||||
});
|
||||
|
||||
afterAll(() => {
|
||||
try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
|
||||
// Clean up test-plan artifacts (but not the project dir itself)
|
||||
try {
|
||||
const files = fs.readdirSync(projectDir);
|
||||
for (const f of files) {
|
||||
if (f.includes('test-plan')) {
|
||||
fs.unlinkSync(path.join(projectDir, f));
|
||||
}
|
||||
}
|
||||
} catch {}
|
||||
});
|
||||
|
||||
test('/plan-eng-review writes test-plan artifact to ~/.gstack/projects/', async () => {
|
||||
// Count existing test-plan files before
|
||||
const beforeFiles = fs.readdirSync(projectDir).filter(f => f.includes('test-plan'));
|
||||
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read plan-eng-review/SKILL.md for the review workflow.
|
||||
|
||||
Read plan.md — that's the plan to review. This is a standalone plan with source code in app.ts and dashboard.ts.
|
||||
|
||||
Choose SMALL CHANGE mode. Skip any AskUserQuestion calls — this is non-interactive.
|
||||
|
||||
IMPORTANT: After your review, you MUST write the test-plan artifact as described in the "Test Plan Artifact" section of SKILL.md. The remote-slug shim is at ${planDir}/browse/bin/remote-slug.
|
||||
|
||||
Write your review to ${planDir}/review-output.md`,
|
||||
workingDirectory: planDir,
|
||||
maxTurns: 20,
|
||||
allowedTools: ['Bash', 'Read', 'Write', 'Glob', 'Grep'],
|
||||
timeout: 360_000,
|
||||
testName: 'plan-eng-review-artifact',
|
||||
runId,
|
||||
});
|
||||
|
||||
logCost('/plan-eng-review artifact', result);
|
||||
recordE2E('/plan-eng-review test-plan artifact', 'Plan-Eng-Review Test-Plan Artifact E2E', result, {
|
||||
passed: ['success', 'error_max_turns'].includes(result.exitReason),
|
||||
});
|
||||
|
||||
expect(['success', 'error_max_turns']).toContain(result.exitReason);
|
||||
|
||||
// Verify test-plan artifact was written
|
||||
const afterFiles = fs.readdirSync(projectDir).filter(f => f.includes('test-plan'));
|
||||
const newFiles = afterFiles.filter(f => !beforeFiles.includes(f));
|
||||
console.log(`Test-plan artifacts: ${beforeFiles.length} before, ${afterFiles.length} after, ${newFiles.length} new`);
|
||||
|
||||
if (newFiles.length > 0) {
|
||||
const content = fs.readFileSync(path.join(projectDir, newFiles[0]), 'utf-8');
|
||||
console.log(`Test-plan artifact (${newFiles[0]}): ${content.length} chars`);
|
||||
expect(content.length).toBeGreaterThan(50);
|
||||
} else {
|
||||
console.warn('No test-plan artifact found — agent may not have followed artifact instructions');
|
||||
}
|
||||
|
||||
// Soft assertion: we expect an artifact but agent compliance is not guaranteed
|
||||
expect(newFiles.length).toBeGreaterThanOrEqual(1);
|
||||
}, 420_000);
|
||||
});
|
||||
|
||||
// --- Deferred skill E2E tests (destructive or require interactive UI) ---
|
||||
|
||||
describeE2E('Deferred skill E2E', () => {
|
||||
|
||||
Reference in New Issue
Block a user