mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-01 19:25:10 +02:00
feat: await support in browse js/eval + contributor mode v2 (#104)
* feat: support await in $B js and eval commands Auto-wrap await expressions in async IIFE context so $B js "await fetch(...)" works without SyntaxError. - hasAwait() strips comments before detection - js: expression wrapping (async()=>(expr))() - eval: smart wrapping — single-line=expression, multi-line=block - 6 new unit tests covering async, false-positive, and return semantics * feat: redesign contributor mode — periodic reflection with 0-10 rating Replace passive "report when things break" with active reflection: - Rate gstack experience 0-10 at workflow step boundaries - Historical calibration example (await bug) anchors the reporting bar - "What would make this a 10" field focuses on actionable improvements - Removed category lists in favor of judgment-based assessment * test: add deterministic contributor mode preamble validation 40 new skill-validation tests (4 checks × 10 skills) verify: - 0-10 rating scale present - Calibration example present - "What would make this a 10" field present - Periodic reflection (not per-command) Update existing E2E contributor eval for new report format. * chore: bump version and changelog (v0.4.2) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix: improve contributor mode + qa-quick E2E reliability Contributor mode: - Add "do not truncate" directive to template — agent was stopping after "My rating" without completing Steps/Raw output/What would make this a 10 sections - Restore assertions for Steps to reproduce and Date footer QA quick: - Make test server URL prominent: top of prompt, explicit "already running" and "do NOT discover ports" instructions - Bump session timeout 180s→240s and test timeout 240s→300s - Set B= at top of prompt (was buried in prose) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix: use flexible assertions for contributor mode E2E Agent writes thorough reports with creative section names ("Repro Steps" vs "Steps to reproduce"). Match intent not formatting: - /repro|steps to reproduce/ for reproduction steps - /date.*2026/ for date footer presence Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * docs: add E2E eval failure blame protocol "Not related to our changes" is an extraordinary claim that requires extraordinary proof. When evals fail during /ship: 1. Run the same eval on main — prove it fails there too 2. If it passes on main, it IS your change — trace the blame 3. If you can't verify, say "unverified" not "pre-existing" Added to CLAUDE.md and as a comment in skill-e2e.test.ts. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * docs: update CONTRIBUTING.md and BROWSER.md for v0.4.2 CONTRIBUTING.md: update contributor mode description — now describes periodic 0-10 reflection loop instead of passive friction detection. BROWSER.md: add js/eval async documentation — await expressions are auto-wrapped in async context, single-line eval returns values directly. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix: restore v0.4.2 changelog entries lost during cherry-pick conflict The base branch detection entries from main were dropped when resolving the CHANGELOG conflict — should have merged both sets, not replaced. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
+18
-3
@@ -13,6 +13,11 @@ import * as os from 'os';
|
||||
const ROOT = path.resolve(import.meta.dir, '..');
|
||||
|
||||
// Skip unless EVALS=1. Session runner strips CLAUDE* env vars to avoid nested session issues.
|
||||
//
|
||||
// BLAME PROTOCOL: When an eval fails, do NOT claim "pre-existing" or "not related
|
||||
// to our changes" without proof. Run the same eval on main to verify. These tests
|
||||
// have invisible couplings — preamble text, SKILL.md content, and timing all affect
|
||||
// agent behavior. See CLAUDE.md "E2E eval failure blame protocol" for details.
|
||||
const evalsEnabled = !!process.env.EVALS;
|
||||
const describeE2E = evalsEnabled ? describe : describe.skip;
|
||||
|
||||
@@ -322,10 +327,16 @@ File a contributor report about this issue. Then tell me what you filed.`,
|
||||
const logFiles = fs.readdirSync(logsDir).filter(f => f.endsWith('.md'));
|
||||
expect(logFiles.length).toBeGreaterThan(0);
|
||||
|
||||
// Verify new reflection-based format
|
||||
const logContent = fs.readFileSync(path.join(logsDir, logFiles[0]), 'utf-8');
|
||||
expect(logContent).toContain('Hey gstack team');
|
||||
expect(logContent).toContain('What I was trying to do');
|
||||
expect(logContent).toContain('What happened instead');
|
||||
expect(logContent).toMatch(/rating/i);
|
||||
// Verify report has repro steps (agent may use "Steps to reproduce", "Repro Steps", etc.)
|
||||
expect(logContent).toMatch(/repro|steps to reproduce|how to reproduce/i);
|
||||
// Verify report has date/version footer (agent may format differently)
|
||||
expect(logContent).toMatch(/date.*2026|2026.*date/i);
|
||||
|
||||
// Clean up
|
||||
try { fs.rmSync(contribDir, { recursive: true, force: true }); } catch {}
|
||||
@@ -424,16 +435,20 @@ describeE2E('QA skill E2E', () => {
|
||||
|
||||
test('/qa quick completes without browse errors', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `You have a browse binary at ${browseBin}. Assign it to B variable like: B="${browseBin}"
|
||||
prompt: `B="${browseBin}"
|
||||
|
||||
The test server is already running at: ${testServer.url}
|
||||
Target page: ${testServer.url}/basic.html
|
||||
|
||||
Read the file qa/SKILL.md for the QA workflow instructions.
|
||||
|
||||
Run a Quick-depth QA test on ${testServer.url}/basic.html
|
||||
Do NOT use AskUserQuestion — run Quick tier directly.
|
||||
Do NOT try to start a server or discover ports — the URL above is ready.
|
||||
Write your report to ${qaDir}/qa-reports/qa-report.md`,
|
||||
workingDirectory: qaDir,
|
||||
maxTurns: 35,
|
||||
timeout: 180_000,
|
||||
timeout: 240_000,
|
||||
testName: 'qa-quick',
|
||||
runId,
|
||||
});
|
||||
@@ -448,7 +463,7 @@ Write your report to ${qaDir}/qa-reports/qa-report.md`,
|
||||
}
|
||||
// Accept error_max_turns — the agent doing thorough QA work is not a failure
|
||||
expect(['success', 'error_max_turns']).toContain(result.exitReason);
|
||||
}, 240_000);
|
||||
}, 300_000);
|
||||
});
|
||||
|
||||
// --- B5: Review skill E2E ---
|
||||
|
||||
@@ -496,6 +496,44 @@ describe('v0.4.1 preamble features', () => {
|
||||
}
|
||||
});
|
||||
|
||||
// --- Contributor mode preamble structure validation ---
|
||||
|
||||
describe('Contributor mode preamble structure', () => {
|
||||
const skillsWithPreamble = [
|
||||
'SKILL.md', 'browse/SKILL.md', 'qa/SKILL.md',
|
||||
'qa-only/SKILL.md',
|
||||
'setup-browser-cookies/SKILL.md',
|
||||
'ship/SKILL.md', 'review/SKILL.md',
|
||||
'plan-ceo-review/SKILL.md', 'plan-eng-review/SKILL.md',
|
||||
'retro/SKILL.md',
|
||||
];
|
||||
|
||||
for (const skill of skillsWithPreamble) {
|
||||
test(`${skill} has 0-10 rating in contributor mode`, () => {
|
||||
const content = fs.readFileSync(path.join(ROOT, skill), 'utf-8');
|
||||
expect(content).toContain('0 to 10');
|
||||
expect(content).toContain('My rating');
|
||||
});
|
||||
|
||||
test(`${skill} has calibration example`, () => {
|
||||
const content = fs.readFileSync(path.join(ROOT, skill), 'utf-8');
|
||||
expect(content).toContain('Calibration');
|
||||
expect(content).toContain('the bar');
|
||||
});
|
||||
|
||||
test(`${skill} has "what would make this a 10" field`, () => {
|
||||
const content = fs.readFileSync(path.join(ROOT, skill), 'utf-8');
|
||||
expect(content).toContain('What would make this a 10');
|
||||
});
|
||||
|
||||
test(`${skill} uses periodic reflection (not per-command)`, () => {
|
||||
const content = fs.readFileSync(path.join(ROOT, skill), 'utf-8');
|
||||
expect(content).toContain('workflow step');
|
||||
expect(content).not.toContain('After you use gstack-provided CLIs');
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
describe('Enum & Value Completeness in review checklist', () => {
|
||||
const checklist = fs.readFileSync(path.join(ROOT, 'review', 'checklist.md'), 'utf-8');
|
||||
|
||||
|
||||
Reference in New Issue
Block a user