Merge remote-tracking branch 'origin/main' into garrytan/codex-hang-fixes

This commit is contained in:
Garry Tan
2026-03-26 17:32:18 -06:00
80 changed files with 9113 additions and 287 deletions
+12 -4
View File
@@ -134,10 +134,12 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
'gstack-upgrade-happy-path': ['gstack-upgrade/**'],
// Deploy skills
'land-and-deploy-workflow': ['land-and-deploy/**', 'scripts/gen-skill-docs.ts'],
'canary-workflow': ['canary/**', 'browse/src/**'],
'benchmark-workflow': ['benchmark/**', 'browse/src/**'],
'setup-deploy-workflow': ['setup-deploy/**', 'scripts/gen-skill-docs.ts'],
'land-and-deploy-workflow': ['land-and-deploy/**', 'scripts/gen-skill-docs.ts'],
'land-and-deploy-first-run': ['land-and-deploy/**', 'scripts/gen-skill-docs.ts', 'bin/gstack-slug'],
'land-and-deploy-review-gate': ['land-and-deploy/**', 'bin/gstack-review-read'],
'canary-workflow': ['canary/**', 'browse/src/**'],
'benchmark-workflow': ['benchmark/**', 'browse/src/**'],
'setup-deploy-workflow': ['setup-deploy/**', 'scripts/gen-skill-docs.ts'],
// Autoplan
'autoplan-core': ['autoplan/**', 'plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**'],
@@ -188,6 +190,7 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
'review-design-lite': 'periodic', // 4/7 threshold is subjective
'review-coverage-audit': 'gate',
'review-plan-completion': 'gate',
'review-dashboard-via': 'gate',
// Office Hours
'office-hours-spec-review': 'gate',
@@ -253,6 +256,8 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
// Deploy skills
'land-and-deploy-workflow': 'gate',
'land-and-deploy-first-run': 'gate',
'land-and-deploy-review-gate': 'gate',
'canary-workflow': 'gate',
'benchmark-workflow': 'gate',
'setup-deploy-workflow': 'gate',
@@ -316,6 +321,9 @@ export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
'retro/SKILL.md instructions': ['retro/SKILL.md', 'retro/SKILL.md.tmpl'],
'qa-only/SKILL.md workflow': ['qa-only/SKILL.md', 'qa-only/SKILL.md.tmpl'],
'gstack-upgrade/SKILL.md upgrade flow': ['gstack-upgrade/SKILL.md', 'gstack-upgrade/SKILL.md.tmpl'],
// Voice directive
'voice directive tone': ['scripts/resolvers/preamble.ts', 'review/SKILL.md', 'review/SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
};
/**
+155
View File
@@ -85,6 +85,161 @@ Do NOT use AskUserQuestion. Do NOT run gh or fly commands.`,
}, 180_000);
});
// --- Land-and-Deploy First-Run E2E ---
// Exercises the first-run path of /land-and-deploy: with no land-deploy-confirmed
// file present, the agent is asked to detect FIRST_RUN, infer the Fly.io platform
// from fly.toml, and write a dry-run validation report under .gstack/deploy-reports.
describeIfSelected('Land-and-Deploy first-run E2E', ['land-and-deploy-first-run'], () => {
// Throwaway git repo used as the skill's working directory for this suite.
let firstRunDir: string;
beforeAll(() => {
firstRunDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-land-first-run-'));
// Run a command inside the temp repo; 5s timeout keeps repo setup from hanging.
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: firstRunDir, stdio: 'pipe', timeout: 5000 });
run('git', ['init', '-b', 'main']);
run('git', ['config', 'user.email', 'test@test.com']);
run('git', ['config', 'user.name', 'Test']);
fs.writeFileSync(path.join(firstRunDir, 'app.ts'), 'export function hello() { return "world"; }\n');
// fly.toml is the platform-detection signal the prompt asks the agent to find
// (app = "first-run-app" → https://first-run-app.fly.dev).
fs.writeFileSync(path.join(firstRunDir, 'fly.toml'), 'app = "first-run-app"\n\n[http_service]\n internal_port = 3000\n');
run('git', ['add', '.']);
run('git', ['commit', '-m', 'initial']);
// One feature commit on a branch, matching the "you are on feat/first-deploy" prompt.
run('git', ['checkout', '-b', 'feat/first-deploy']);
fs.writeFileSync(path.join(firstRunDir, 'app.ts'), 'export function hello() { return "first deploy"; }\n');
run('git', ['add', '.']);
run('git', ['commit', '-m', 'feat: first deploy']);
// Copy the skill docs into the sandbox so the agent can read SKILL.md locally.
copyDirSync(path.join(ROOT, 'land-and-deploy'), path.join(firstRunDir, 'land-and-deploy'));
});
afterAll(() => {
// Best-effort cleanup; a failed rm must not fail the suite.
try { fs.rmSync(firstRunDir, { recursive: true, force: true }); } catch {}
});
testConcurrentIfSelected('land-and-deploy-first-run', async () => {
const result = await runSkillTest({
prompt: `Read land-and-deploy/SKILL.md for the /land-and-deploy skill instructions.
You are on branch feat/first-deploy. This is the FIRST TIME running /land-and-deploy
for this project — there is NO land-deploy-confirmed file.
This repo has a fly.toml with app = "first-run-app", indicating a Fly.io deployment.
IMPORTANT: There is NO remote and NO GitHub PR — you cannot run gh commands.
Instead, simulate the Step 1.5 first-run dry-run validation:
1. Detect that this is a FIRST_RUN (no land-deploy-confirmed file)
2. Detect the deploy platform from fly.toml (Fly.io, app = first-run-app)
3. Infer the production URL (https://first-run-app.fly.dev)
4. Build the DEPLOY INFRASTRUCTURE VALIDATION table showing:
- Platform detected
- Command validation results (simulated as all passing)
- Staging detection results (none expected)
- What will happen steps
5. Write the dry-run report to .gstack/deploy-reports/dry-run-validation.md
Do NOT use AskUserQuestion. Do NOT run gh or fly commands.
Just demonstrate the first-run dry-run output.`,
workingDirectory: firstRunDir,
maxTurns: 20,
allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob'],
timeout: 120_000,
testName: 'land-and-deploy-first-run',
runId,
});
logCost('/land-and-deploy first-run', result);
recordE2E(evalCollector, '/land-and-deploy first-run', 'Land-and-Deploy first-run E2E', result);
expect(result.exitReason).toBe('success');
// Verify dry-run report was created
const reportDir = path.join(firstRunDir, '.gstack', 'deploy-reports');
expect(fs.existsSync(reportDir)).toBe(true);
// Check report content mentions platform detection
// NOTE(review): only the first report file is inspected — assumes the agent
// writes a single report; confirm if multiple files are possible.
const reportFiles = fs.readdirSync(reportDir);
expect(reportFiles.length).toBeGreaterThan(0);
const reportContent = fs.readFileSync(path.join(reportDir, reportFiles[0]), 'utf-8');
const hasPlatform = reportContent.toLowerCase().includes('fly') || reportContent.toLowerCase().includes('first-run-app');
expect(hasPlatform).toBe(true);
}, 180_000);
});
// --- Land-and-Deploy Review Gate E2E ---
// Exercises the readiness-gate path of /land-and-deploy (Step 3.5a / 3.5a-bis):
// a repo with several commits and zero review logs should surface Eng Review as
// "NOT RUN" and produce a readiness report that offers an inline review.
describeIfSelected('Land-and-Deploy review gate E2E', ['land-and-deploy-review-gate'], () => {
  // Throwaway git repo the skill operates on; removed again in afterAll.
  let reviewDir: string;

  beforeAll(() => {
    reviewDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-land-review-'));
    // Execute a command inside the scratch repo (5s cap so setup never hangs).
    function sh(cmd: string, args: string[]) {
      return spawnSync(cmd, args, { cwd: reviewDir, stdio: 'pipe', timeout: 5000 });
    }
    sh('git', ['init', '-b', 'main']);
    sh('git', ['config', 'user.email', 'test@test.com']);
    sh('git', ['config', 'user.name', 'Test']);
    fs.writeFileSync(path.join(reviewDir, 'app.ts'), 'export function hello() { return "world"; }\n');
    sh('git', ['add', '.']);
    sh('git', ['commit', '-m', 'initial']);
    // Six follow-up commits guarantee any prior review would count as stale.
    for (let n = 1; n <= 6; n++) {
      fs.writeFileSync(path.join(reviewDir, `file${n}.ts`), `export const x${n} = ${n};\n`);
      sh('git', ['add', '.']);
      sh('git', ['commit', '-m', `feat: add file${n}`]);
    }
    // Copy the skill docs into the sandbox so the agent can read SKILL.md locally.
    copyDirSync(path.join(ROOT, 'land-and-deploy'), path.join(reviewDir, 'land-and-deploy'));
  });

  afterAll(() => {
    // Best-effort cleanup; a failed rm must not fail the suite.
    try {
      fs.rmSync(reviewDir, { recursive: true, force: true });
    } catch {}
  });

  testConcurrentIfSelected('land-and-deploy-review-gate', async () => {
    const result = await runSkillTest({
      prompt: `Read land-and-deploy/SKILL.md for the /land-and-deploy skill instructions.
Focus on Step 3.5a and Step 3.5a-bis (the review staleness check and inline review offer).
This repo has 6 commits since the initial commit. There are NO review logs
(gstack-review-read would return NO_REVIEWS).
Simulate what the readiness gate would show:
1. Run gstack-review-read equivalent (simulate NO_REVIEWS output)
2. Determine review staleness: Eng Review should be "NOT RUN"
3. Note that Step 3.5a-bis would offer an inline review
4. Write a simulated readiness report to .gstack/deploy-reports/readiness-report.md
showing the review status as NOT RUN with the inline review offer text
Do NOT use AskUserQuestion. Do NOT run gh commands.
Show what the readiness gate output would look like.`,
      workingDirectory: reviewDir,
      maxTurns: 15,
      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob'],
      timeout: 120_000,
      testName: 'land-and-deploy-review-gate',
      runId,
    });
    logCost('/land-and-deploy review-gate', result);
    recordE2E(evalCollector, '/land-and-deploy review-gate', 'Land-and-Deploy review gate E2E', result);
    expect(result.exitReason).toBe('success');

    // The readiness report must exist under .gstack/deploy-reports.
    const reportDir = path.join(reviewDir, '.gstack', 'deploy-reports');
    expect(fs.existsSync(reportDir)).toBe(true);
    const reportFiles = fs.readdirSync(reportDir);
    expect(reportFiles.length).toBeGreaterThan(0);

    // And the first report should actually talk about review status.
    const lowered = fs.readFileSync(path.join(reportDir, reportFiles[0]), 'utf-8').toLowerCase();
    expect(lowered.includes('review') || lowered.includes('not run')).toBe(true);
  }, 180_000);
});
// --- Canary skill E2E ---
describeIfSelected('Canary skill E2E', ['canary-workflow'], () => {
+63
View File
@@ -778,6 +778,69 @@ describeIfSelected('Other skill evals', [
}, 30_000);
});
// Voice directive eval — tests that the voice section produces the right tone
// by extracting the "## Voice" section from a generated SKILL.md and having an
// LLM judge score it on five tone dimensions (1–5 each, pass threshold >= 4).
describeIfSelected('Voice directive eval', ['voice directive tone'], () => {
testIfSelected('voice directive tone', async () => {
const t0 = Date.now();
// Read a tier 2+ skill to get the full voice directive in context
const content = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
const voiceStart = content.indexOf('## Voice');
if (voiceStart === -1) {
throw new Error('Voice section not found in review/SKILL.md. Was preamble.ts regenerated?');
}
// Slice from "## Voice" up to the next H2 heading; if it is the last section,
// fall back to a fixed 3000-char window.
const voiceEnd = content.indexOf('\n## ', voiceStart + 1);
const voiceSection = content.slice(voiceStart, voiceEnd > 0 ? voiceEnd : voiceStart + 3000);
// Judge returns JSON scored 1-5 per dimension plus free-text reasoning.
const result = await callJudge<{
directness: number;
concreteness: number;
avoids_corporate: number;
avoids_ai_vocabulary: number;
connects_user_outcomes: number;
reasoning: string;
}>(`You are evaluating a voice directive for an AI coding assistant framework called GStack.
Score each dimension 1-5 where 5 is excellent:
1. directness: Does it instruct the agent to be direct, lead with the point, take positions?
2. concreteness: Does it instruct the agent to name specific files, commands, line numbers, real numbers?
3. avoids_corporate: Does it explicitly ban corporate/formal/academic tone and provide alternatives?
4. avoids_ai_vocabulary: Does it ban AI-tell words and phrases with specific lists?
5. connects_user_outcomes: Does it instruct the agent to connect technical work to real user experience?
Return JSON only:
{"directness": N, "concreteness": N, "avoids_corporate": N, "avoids_ai_vocabulary": N, "connects_user_outcomes": N, "reasoning": "..."}
THE VOICE DIRECTIVE:
${voiceSection}`);
console.log('Voice directive scores:', JSON.stringify(result, null, 2));
// Record in the eval dashboard; the aggregate pass requires every dimension >= 4,
// mirroring the individual expect()s below.
evalCollector?.addTest({
name: 'voice directive tone',
suite: 'Voice directive eval',
tier: 'llm-judge',
passed: result.directness >= 4 && result.concreteness >= 4 && result.avoids_corporate >= 4
&& result.avoids_ai_vocabulary >= 4 && result.connects_user_outcomes >= 4,
duration_ms: Date.now() - t0,
// Flat per-call judge cost estimate — presumably approximate; confirm against billing.
cost_usd: 0.02,
judge_scores: {
directness: result.directness,
concreteness: result.concreteness,
avoids_corporate: result.avoids_corporate,
avoids_ai_vocabulary: result.avoids_ai_vocabulary,
connects_user_outcomes: result.connects_user_outcomes,
},
judge_reasoning: result.reasoning,
});
expect(result.directness).toBeGreaterThanOrEqual(4);
expect(result.concreteness).toBeGreaterThanOrEqual(4);
expect(result.avoids_corporate).toBeGreaterThanOrEqual(4);
expect(result.avoids_ai_vocabulary).toBeGreaterThanOrEqual(4);
expect(result.connects_user_outcomes).toBeGreaterThanOrEqual(4);
}, 30_000);
});
// Module-level afterAll — finalize eval collector after all tests complete
afterAll(async () => {
if (evalCollector) {
-5
View File
@@ -1369,11 +1369,6 @@ describe('Codex skill', () => {
expect(content).toContain('Persist Eng Review result');
});
test('/ship gate suggests /review or /plan-eng-review when Eng Review is missing', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('Abort — run /review or /plan-eng-review first');
});
test('Review Readiness Dashboard includes Adversarial Review row', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('Adversarial');