Merge remote-tracking branch 'origin/main' into garrytan/design

# Conflicts: # CLAUDE.md
2026-05-05 21:25:27 +02:00 · 2026-03-16 11:31:38 -05:00
parent 7bbb1c82ee 78e519e3b7
commit 581ca11e48
19 changed files with 329 additions and 107 deletions
@@ -13,6 +13,11 @@ import * as os from 'os';
 const ROOT = path.resolve(import.meta.dir, '..');

 // Skip unless EVALS=1. Session runner strips CLAUDE* env vars to avoid nested session issues.
+//
+// BLAME PROTOCOL: When an eval fails, do NOT claim "pre-existing" or "not related
+// to our changes" without proof. Run the same eval on main to verify. These tests
+// have invisible couplings — preamble text, SKILL.md content, and timing all affect
+// agent behavior. See CLAUDE.md "E2E eval failure blame protocol" for details.
 const evalsEnabled = !!process.env.EVALS;
 const describeE2E = evalsEnabled ? describe : describe.skip;

@@ -322,10 +327,16 @@ File a contributor report about this issue. Then tell me what you filed.`,
    const logFiles = fs.readdirSync(logsDir).filter(f => f.endsWith('.md'));
    expect(logFiles.length).toBeGreaterThan(0);

+    // Verify new reflection-based format
    const logContent = fs.readFileSync(path.join(logsDir, logFiles[0]), 'utf-8');
    expect(logContent).toContain('Hey gstack team');
    expect(logContent).toContain('What I was trying to do');
    expect(logContent).toContain('What happened instead');
+    expect(logContent).toMatch(/rating/i);
+    // Verify report has repro steps (agent may use "Steps to reproduce", "Repro Steps", etc.)
+    expect(logContent).toMatch(/repro|steps to reproduce|how to reproduce/i);
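+    // Verify report has date/version footer (agent may format differently). Only "date" next to the year is matched, not the version. NOTE(review): the year 2026 is hard-coded in the pattern below — confirm this is intended.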
+    expect(logContent).toMatch(/date.*2026|2026.*date/i);

    // Clean up
    try { fs.rmSync(contribDir, { recursive: true, force: true }); } catch {}
@@ -424,16 +435,20 @@ describeE2E('QA skill E2E', () => {

  test('/qa quick completes without browse errors', async () => {
    const result = await runSkillTest({
-      prompt: `You have a browse binary at ${browseBin}. Assign it to B variable like: B="${browseBin}"
+      prompt: `B="${browseBin}"
+
+The test server is already running at: ${testServer.url}
+Target page: ${testServer.url}/basic.html

 Read the file qa/SKILL.md for the QA workflow instructions.

 Run a Quick-depth QA test on ${testServer.url}/basic.html
 Do NOT use AskUserQuestion — run Quick tier directly.
+Do NOT try to start a server or discover ports — the URL above is ready.
 Write your report to ${qaDir}/qa-reports/qa-report.md`,
      workingDirectory: qaDir,
      maxTurns: 35,
-      timeout: 180_000,
+      timeout: 240_000,
      testName: 'qa-quick',
      runId,
    });
@@ -448,7 +463,7 @@ Write your report to ${qaDir}/qa-reports/qa-report.md`,
    }
    // Accept error_max_turns — the agent doing thorough QA work is not a failure
    expect(['success', 'error_max_turns']).toContain(result.exitReason);
-  }, 240_000);
+  }, 300_000);
 });

 // --- B5: Review skill E2E ---
@@ -528,6 +528,44 @@ describe('v0.4.1 preamble features', () => {
  }
 });

+// --- Contributor mode preamble structure validation ---
+
+describe('Contributor mode preamble structure', () => { // Static text checks: each listed SKILL.md is read from disk and searched for fixed phrases.
+  const skillsWithPreamble = [ // Paths are joined onto ROOT below; four tests are registered per entry.
+    'SKILL.md', 'browse/SKILL.md', 'qa/SKILL.md',
+    'qa-only/SKILL.md',
+    'setup-browser-cookies/SKILL.md',
+    'ship/SKILL.md', 'review/SKILL.md',
+    'plan-ceo-review/SKILL.md', 'plan-eng-review/SKILL.md',
+    'retro/SKILL.md',
+  ];
+
+  for (const skill of skillsWithPreamble) {
+    test(`${skill} has 0-10 rating in contributor mode`, () => {
+      const content = fs.readFileSync(path.join(ROOT, skill), 'utf-8'); // Each test reads the file itself rather than sharing one read.
+      expect(content).toContain('0 to 10'); // Literal substring check; the wording must appear exactly.
+      expect(content).toContain('My rating');
+    });
+
+    test(`${skill} has calibration example`, () => {
+      const content = fs.readFileSync(path.join(ROOT, skill), 'utf-8');
+      expect(content).toContain('Calibration');
+      expect(content).toContain('the bar');
+    });
+
+    test(`${skill} has "what would make this a 10" field`, () => {
+      const content = fs.readFileSync(path.join(ROOT, skill), 'utf-8');
+      expect(content).toContain('What would make this a 10');
+    });
+
+    test(`${skill} uses periodic reflection (not per-command)`, () => {
+      const content = fs.readFileSync(path.join(ROOT, skill), 'utf-8');
+      expect(content).toContain('workflow step');
+      expect(content).not.toContain('After you use gstack-provided CLIs'); // Phrase that must be absent; presumably the older per-command wording — TODO confirm.
+    });
+  }
+});
+
 describe('Enum & Value Completeness in review checklist', () => {
  const checklist = fs.readFileSync(path.join(ROOT, 'review', 'checklist.md'), 'utf-8');