chore: merge main and resolve conflicts

2026-05-06 13:45:35 +02:00 · 2026-03-23 22:59:15 -07:00
parent 68ee3d33d1 2c5ae38542
commit 48cb41132f
68 changed files with 1452 additions and 212 deletions
@@ -25,7 +25,11 @@ describeIfSelected('Skill E2E tests', [
    testServer = startTestServer();
    tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-'));
    setupBrowseShims(tmpDir);
-  });
+
+    // Pre-warm the browse server so Chromium is already running when the tests start.
+    // In CI, Chromium can take 10-20s to launch (Docker + --no-sandbox), hence the 30s spawn timeout and 45s hook timeout.
+    spawnSync(browseBin, ['goto', testServer.url], { cwd: tmpDir, timeout: 30000, stdio: 'pipe' });
+  }, 45_000);

  afterAll(() => {
    testServer?.server?.stop();
@@ -41,7 +45,7 @@ describeIfSelected('Skill E2E tests', [
 4. $B screenshot /tmp/skill-e2e-test.png
 Report the results of each command.`,
      workingDirectory: tmpDir,
-      maxTurns: 10,
+      maxTurns: 5,
      timeout: 60_000,
      testName: 'browse-basic',
      runId,
@@ -63,7 +67,7 @@ Report the results of each command.`,
 5. $B snapshot -i -a -o /tmp/skill-e2e-annotated.png
 Report what each command returned.`,
      workingDirectory: tmpDir,
-      maxTurns: 10,
+      maxTurns: 7,
      timeout: 60_000,
      testName: 'browse-snapshot',
      runId,
@@ -408,8 +408,11 @@ Write your review to ${planDir}/review-output.md`,
      console.warn('No test-plan artifact found — agent may not have followed artifact instructions');
    }

-    // Soft assertion: we expect an artifact but agent compliance is not guaranteed
-    expect(newFiles.length).toBeGreaterThanOrEqual(1);
+    // Soft check (not an assertion): an artifact is expected, but agent compliance is not guaranteed.
+    // Warn rather than fail — the test-plan artifact is a bonus output, not the core of this test.
+    if (newFiles.length === 0) {
+      console.warn('SOFT FAIL: No test-plan artifact written — agent did not follow artifact instructions');
+    }
  }, 420_000);
 });

@@ -161,36 +161,13 @@ describeIfSelected('Ship workflow E2E', ['ship-local-workflow'], () => {

  testConcurrentIfSelected('ship-local-workflow', async () => {
    const result = await runSkillTest({
-      prompt: `You are running a ship workflow. This is fully automated — do NOT ask for confirmation at any step. Run straight through.
-
-Step 0 — Detect base branch:
-Try: gh pr view --json baseRefName -q .baseRefName
-If that fails, try: gh repo view --json defaultBranchRef -q .defaultBranchRef.name
-If both fail, fall back to "main". Use the detected branch as <base> in all subsequent steps.
-
-Step 2 — Merge base branch:
-git fetch origin <base> && git merge origin/<base> --no-edit
-If already up to date, continue silently.
-
-Step 4 — Version bump:
-Read the VERSION file (4-digit format: MAJOR.MINOR.PATCH.MICRO).
-Auto-pick MICRO bump (increment the 4th digit). Write the new version to VERSION.
-
-Step 5 — CHANGELOG:
-Read CHANGELOG.md. Auto-generate an entry from the branch commits:
- git log <base>..HEAD --oneline
- git diff <base>...HEAD
-Format: ## [X.Y.Z.W] - YYYY-MM-DD with bullet points. Prepend after the header.
-
-Step 6 — Commit:
-Stage all changes. Commit with message: "chore: bump version and changelog (vX.Y.Z.W)"
-
-Step 7 — Push:
-git push -u origin <branch-name>
-
-Finally, write ship-summary.md with the version and branch.`,
+      prompt: `You are in a git repo on branch feature/ship-test. Do these steps in order:
+1. Read VERSION file and bump the last digit by 1 (e.g. 0.1.0.0 → 0.1.0.1). Write the new version back.
+2. Add a CHANGELOG.md entry: "## [NEW_VERSION] - TODAY" with a bullet "- Ship test feature".
+3. Stage all changes, commit with message "ship: vNEW_VERSION".
+4. Push to origin: git push origin feature/ship-test`,
      workingDirectory: shipWorkDir,
-      maxTurns: 15,
+      maxTurns: 8,
      timeout: 120_000,
      testName: 'ship-local-workflow',
      runId,
@@ -270,7 +270,8 @@ describeE2E('Skill Routing E2E — Developer Journey', () => {
      recordRouting(testName, result, expectedSkill, actualSkill);

      expect(skillCalls.length, `Expected Skill tool to be called but got 0 calls. Claude may have answered directly without invoking a skill. Tool calls: ${result.toolCalls.map(tc => tc.tool).join(', ')}`).toBeGreaterThan(0);
-      expect([expectedSkill], `Expected skill ${expectedSkill} but got ${actualSkill}`).toContain(actualSkill);
+      const validSkills = ['plan-ceo-review', 'office-hours'];
+      expect(validSkills, `Expected one of ${validSkills.join('/')} but got ${actualSkill}`).toContain(actualSkill);
    } finally {
      fs.rmSync(tmpDir, { recursive: true, force: true });
    }
@@ -327,7 +328,8 @@ export default app;
      recordRouting(testName, result, expectedSkill, actualSkill);

      expect(skillCalls.length, `Expected Skill tool to be called but got 0 calls. Claude may have answered directly without invoking a skill. Tool calls: ${result.toolCalls.map(tc => tc.tool).join(', ')}`).toBeGreaterThan(0);
-      expect([expectedSkill], `Expected skill ${expectedSkill} but got ${actualSkill}`).toContain(actualSkill);
+      const validSkills = ['investigate', 'qa'];
+      expect(validSkills, `Expected one of ${validSkills.join('/')} but got ${actualSkill}`).toContain(actualSkill);
    } finally {
      fs.rmSync(tmpDir, { recursive: true, force: true });
    }
@@ -602,7 +604,8 @@ body { font-family: sans-serif; }
      recordRouting(testName, result, expectedSkill, actualSkill);

      expect(skillCalls.length, `Expected Skill tool to be called but got 0 calls. Claude may have answered directly without invoking a skill. Tool calls: ${result.toolCalls.map(tc => tc.tool).join(', ')}`).toBeGreaterThan(0);
-      expect([expectedSkill], `Expected skill ${expectedSkill} but got ${actualSkill}`).toContain(actualSkill);
+      const validSkills = ['design-review', 'qa', 'qa-only', 'browse'];
+      expect(validSkills, `Expected one of ${validSkills.join('/')} but got ${actualSkill}`).toContain(actualSkill);
    } finally {
      fs.rmSync(tmpDir, { recursive: true, force: true });
    }