mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-05 21:25:27 +02:00
Merge remote-tracking branch 'origin/main' into garrytan/design
# Conflicts: # CLAUDE.md
This commit is contained in:
+18
-3
@@ -13,6 +13,11 @@ import * as os from 'os';
|
||||
const ROOT = path.resolve(import.meta.dir, '..');
|
||||
|
||||
// Skip unless EVALS=1. Session runner strips CLAUDE* env vars to avoid nested session issues.
|
||||
//
|
||||
// BLAME PROTOCOL: When an eval fails, do NOT claim "pre-existing" or "not related
|
||||
// to our changes" without proof. Run the same eval on main to verify. These tests
|
||||
// have invisible couplings — preamble text, SKILL.md content, and timing all affect
|
||||
// agent behavior. See CLAUDE.md "E2E eval failure blame protocol" for details.
|
||||
const evalsEnabled = !!process.env.EVALS;
|
||||
const describeE2E = evalsEnabled ? describe : describe.skip;
|
||||
|
||||
@@ -322,10 +327,16 @@ File a contributor report about this issue. Then tell me what you filed.`,
|
||||
const logFiles = fs.readdirSync(logsDir).filter(f => f.endsWith('.md'));
|
||||
expect(logFiles.length).toBeGreaterThan(0);
|
||||
|
||||
// Verify new reflection-based format
|
||||
const logContent = fs.readFileSync(path.join(logsDir, logFiles[0]), 'utf-8');
|
||||
expect(logContent).toContain('Hey gstack team');
|
||||
expect(logContent).toContain('What I was trying to do');
|
||||
expect(logContent).toContain('What happened instead');
|
||||
expect(logContent).toMatch(/rating/i);
|
||||
// Verify report has repro steps (agent may use "Steps to reproduce", "Repro Steps", etc.)
|
||||
expect(logContent).toMatch(/repro|steps to reproduce|how to reproduce/i);
|
||||
// Verify report has date/version footer (agent may format differently)
|
||||
expect(logContent).toMatch(/date.*2026|2026.*date/i);
|
||||
|
||||
// Clean up
|
||||
try { fs.rmSync(contribDir, { recursive: true, force: true }); } catch {}
|
||||
@@ -424,16 +435,20 @@ describeE2E('QA skill E2E', () => {
|
||||
|
||||
test('/qa quick completes without browse errors', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `You have a browse binary at ${browseBin}. Assign it to B variable like: B="${browseBin}"
|
||||
prompt: `B="${browseBin}"
|
||||
|
||||
The test server is already running at: ${testServer.url}
|
||||
Target page: ${testServer.url}/basic.html
|
||||
|
||||
Read the file qa/SKILL.md for the QA workflow instructions.
|
||||
|
||||
Run a Quick-depth QA test on ${testServer.url}/basic.html
|
||||
Do NOT use AskUserQuestion — run Quick tier directly.
|
||||
Do NOT try to start a server or discover ports — the URL above is ready.
|
||||
Write your report to ${qaDir}/qa-reports/qa-report.md`,
|
||||
workingDirectory: qaDir,
|
||||
maxTurns: 35,
|
||||
timeout: 180_000,
|
||||
timeout: 240_000,
|
||||
testName: 'qa-quick',
|
||||
runId,
|
||||
});
|
||||
@@ -448,7 +463,7 @@ Write your report to ${qaDir}/qa-reports/qa-report.md`,
|
||||
}
|
||||
// Accept error_max_turns — the agent doing thorough QA work is not a failure
|
||||
expect(['success', 'error_max_turns']).toContain(result.exitReason);
|
||||
}, 240_000);
|
||||
}, 300_000);
|
||||
});
|
||||
|
||||
// --- B5: Review skill E2E ---
|
||||
|
||||
@@ -528,6 +528,44 @@ describe('v0.4.1 preamble features', () => {
|
||||
}
|
||||
});
|
||||
|
||||
// --- Contributor mode preamble structure validation ---
|
||||
|
||||
describe('Contributor mode preamble structure', () => {
|
||||
const skillsWithPreamble = [
|
||||
'SKILL.md', 'browse/SKILL.md', 'qa/SKILL.md',
|
||||
'qa-only/SKILL.md',
|
||||
'setup-browser-cookies/SKILL.md',
|
||||
'ship/SKILL.md', 'review/SKILL.md',
|
||||
'plan-ceo-review/SKILL.md', 'plan-eng-review/SKILL.md',
|
||||
'retro/SKILL.md',
|
||||
];
|
||||
|
||||
for (const skill of skillsWithPreamble) {
|
||||
test(`${skill} has 0-10 rating in contributor mode`, () => {
|
||||
const content = fs.readFileSync(path.join(ROOT, skill), 'utf-8');
|
||||
expect(content).toContain('0 to 10');
|
||||
expect(content).toContain('My rating');
|
||||
});
|
||||
|
||||
test(`${skill} has calibration example`, () => {
|
||||
const content = fs.readFileSync(path.join(ROOT, skill), 'utf-8');
|
||||
expect(content).toContain('Calibration');
|
||||
expect(content).toContain('the bar');
|
||||
});
|
||||
|
||||
test(`${skill} has "what would make this a 10" field`, () => {
|
||||
const content = fs.readFileSync(path.join(ROOT, skill), 'utf-8');
|
||||
expect(content).toContain('What would make this a 10');
|
||||
});
|
||||
|
||||
test(`${skill} uses periodic reflection (not per-command)`, () => {
|
||||
const content = fs.readFileSync(path.join(ROOT, skill), 'utf-8');
|
||||
expect(content).toContain('workflow step');
|
||||
expect(content).not.toContain('After you use gstack-provided CLIs');
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
describe('Enum & Value Completeness in review checklist', () => {
|
||||
const checklist = fs.readFileSync(path.join(ROOT, 'review', 'checklist.md'), 'utf-8');
|
||||
|
||||
|
||||
Reference in New Issue
Block a user