Merge remote-tracking branch 'origin/main' into garrytan/recover-voice-fix

This commit is contained in:
Garry Tan
2026-03-26 12:16:58 -06:00
63 changed files with 8028 additions and 285 deletions
+9 -4
View File
@@ -134,10 +134,12 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
'gstack-upgrade-happy-path': ['gstack-upgrade/**'],
// Deploy skills
'land-and-deploy-workflow': ['land-and-deploy/**', 'scripts/gen-skill-docs.ts'],
'canary-workflow': ['canary/**', 'browse/src/**'],
'benchmark-workflow': ['benchmark/**', 'browse/src/**'],
'setup-deploy-workflow': ['setup-deploy/**', 'scripts/gen-skill-docs.ts'],
'land-and-deploy-workflow': ['land-and-deploy/**', 'scripts/gen-skill-docs.ts'],
'land-and-deploy-first-run': ['land-and-deploy/**', 'scripts/gen-skill-docs.ts', 'bin/gstack-slug'],
'land-and-deploy-review-gate': ['land-and-deploy/**', 'bin/gstack-review-read'],
'canary-workflow': ['canary/**', 'browse/src/**'],
'benchmark-workflow': ['benchmark/**', 'browse/src/**'],
'setup-deploy-workflow': ['setup-deploy/**', 'scripts/gen-skill-docs.ts'],
// Autoplan
'autoplan-core': ['autoplan/**', 'plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**'],
@@ -188,6 +190,7 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
'review-design-lite': 'periodic', // 4/7 threshold is subjective
'review-coverage-audit': 'gate',
'review-plan-completion': 'gate',
'review-dashboard-via': 'gate',
// Office Hours
'office-hours-spec-review': 'gate',
@@ -253,6 +256,8 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
// Deploy skills
'land-and-deploy-workflow': 'gate',
'land-and-deploy-first-run': 'gate',
'land-and-deploy-review-gate': 'gate',
'canary-workflow': 'gate',
'benchmark-workflow': 'gate',
'setup-deploy-workflow': 'gate',
+155
View File
@@ -85,6 +85,161 @@ Do NOT use AskUserQuestion. Do NOT run gh or fly commands.`,
}, 180_000);
});
// --- Land-and-Deploy First-Run E2E ---
/**
 * E2E: first-run dry-run validation for /land-and-deploy.
 *
 * Builds a throwaway git repo (main + feat/first-deploy) containing a fly.toml,
 * copies the land-and-deploy skill into it, asks the agent to produce the
 * first-run dry-run report, and then checks that a report mentioning the
 * Fly.io platform/app was written under .gstack/deploy-reports.
 */
describeIfSelected('Land-and-Deploy first-run E2E', ['land-and-deploy-first-run'], () => {
  let firstRunDir: string;

  beforeAll(() => {
    firstRunDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-land-first-run-'));

    // Run a fixture command inside the temp repo and fail fast if it does not
    // succeed. Ignoring the result would leave a half-built repo behind and
    // surface as a confusing agent failure much later in the test.
    const run = (cmd: string, args: string[]) => {
      const proc = spawnSync(cmd, args, { cwd: firstRunDir, stdio: 'pipe', timeout: 5000 });
      if (proc.error || proc.status !== 0) {
        const detail = proc.error ? proc.error.message : String(proc.stderr ?? '').trim();
        throw new Error(
          `fixture setup failed: ${cmd} ${args.join(' ')} (status ${proc.status}): ${detail}`,
        );
      }
      return proc;
    };

    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);

    fs.writeFileSync(path.join(firstRunDir, 'app.ts'), 'export function hello() { return "world"; }\n');
    fs.writeFileSync(path.join(firstRunDir, 'fly.toml'), 'app = "first-run-app"\n\n[http_service]\n internal_port = 3000\n');
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'initial']);

    // Feature branch with one change on top of main.
    run('git', ['checkout', '-b', 'feat/first-deploy']);
    fs.writeFileSync(path.join(firstRunDir, 'app.ts'), 'export function hello() { return "first deploy"; }\n');
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'feat: first deploy']);

    copyDirSync(path.join(ROOT, 'land-and-deploy'), path.join(firstRunDir, 'land-and-deploy'));
  });

  afterAll(() => {
    // Best-effort cleanup: a leftover temp dir must not fail the suite.
    try { fs.rmSync(firstRunDir, { recursive: true, force: true }); } catch {}
  });

  testConcurrentIfSelected('land-and-deploy-first-run', async () => {
    const result = await runSkillTest({
      prompt: `Read land-and-deploy/SKILL.md for the /land-and-deploy skill instructions.
You are on branch feat/first-deploy. This is the FIRST TIME running /land-and-deploy
for this project — there is NO land-deploy-confirmed file.
This repo has a fly.toml with app = "first-run-app", indicating a Fly.io deployment.
IMPORTANT: There is NO remote and NO GitHub PR — you cannot run gh commands.
Instead, simulate the Step 1.5 first-run dry-run validation:
1. Detect that this is a FIRST_RUN (no land-deploy-confirmed file)
2. Detect the deploy platform from fly.toml (Fly.io, app = first-run-app)
3. Infer the production URL (https://first-run-app.fly.dev)
4. Build the DEPLOY INFRASTRUCTURE VALIDATION table showing:
- Platform detected
- Command validation results (simulated as all passing)
- Staging detection results (none expected)
- What will happen steps
5. Write the dry-run report to .gstack/deploy-reports/dry-run-validation.md
Do NOT use AskUserQuestion. Do NOT run gh or fly commands.
Just demonstrate the first-run dry-run output.`,
      workingDirectory: firstRunDir,
      maxTurns: 20,
      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob'],
      timeout: 120_000,
      testName: 'land-and-deploy-first-run',
      runId,
    });

    logCost('/land-and-deploy first-run', result);
    recordE2E(evalCollector, '/land-and-deploy first-run', 'Land-and-Deploy first-run E2E', result);
    expect(result.exitReason).toBe('success');

    // Verify dry-run report was created
    const reportDir = path.join(firstRunDir, '.gstack', 'deploy-reports');
    expect(fs.existsSync(reportDir)).toBe(true);

    // Only regular files count as reports; a stray subdirectory would make
    // readFileSync throw EISDIR.
    const reportFiles = fs
      .readdirSync(reportDir, { withFileTypes: true })
      .filter((entry) => entry.isFile())
      .map((entry) => entry.name);
    expect(reportFiles.length).toBeGreaterThan(0);

    // Check report content mentions platform detection. Every report is
    // scanned because directory listing order is not guaranteed, so the
    // relevant file is not necessarily the first entry.
    const hasPlatform = reportFiles.some((name) => {
      const text = fs.readFileSync(path.join(reportDir, name), 'utf-8').toLowerCase();
      return text.includes('fly') || text.includes('first-run-app');
    });
    expect(hasPlatform).toBe(true);
  }, 180_000);
});
// --- Land-and-Deploy Review Gate E2E ---
/**
 * E2E: readiness/review gate for /land-and-deploy.
 *
 * Builds a throwaway git repo with an initial commit plus six follow-up
 * commits, copies the land-and-deploy skill into it, asks the agent to
 * simulate the readiness gate with no review logs, and then checks that a
 * report mentioning the review status was written under .gstack/deploy-reports.
 */
describeIfSelected('Land-and-Deploy review gate E2E', ['land-and-deploy-review-gate'], () => {
  let reviewDir: string;

  beforeAll(() => {
    reviewDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-land-review-'));

    // Run a fixture command inside the temp repo and fail fast if it does not
    // succeed. Ignoring the result would leave a half-built repo behind and
    // surface as a confusing agent failure much later in the test.
    const run = (cmd: string, args: string[]) => {
      const proc = spawnSync(cmd, args, { cwd: reviewDir, stdio: 'pipe', timeout: 5000 });
      if (proc.error || proc.status !== 0) {
        const detail = proc.error ? proc.error.message : String(proc.stderr ?? '').trim();
        throw new Error(
          `fixture setup failed: ${cmd} ${args.join(' ')} (status ${proc.status}): ${detail}`,
        );
      }
      return proc;
    };

    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);

    fs.writeFileSync(path.join(reviewDir, 'app.ts'), 'export function hello() { return "world"; }\n');
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'initial']);

    // Create 6 more commits to make any review stale
    for (let i = 1; i <= 6; i++) {
      fs.writeFileSync(path.join(reviewDir, `file${i}.ts`), `export const x${i} = ${i};\n`);
      run('git', ['add', '.']);
      run('git', ['commit', '-m', `feat: add file${i}`]);
    }

    copyDirSync(path.join(ROOT, 'land-and-deploy'), path.join(reviewDir, 'land-and-deploy'));
  });

  afterAll(() => {
    // Best-effort cleanup: a leftover temp dir must not fail the suite.
    try { fs.rmSync(reviewDir, { recursive: true, force: true }); } catch {}
  });

  testConcurrentIfSelected('land-and-deploy-review-gate', async () => {
    const result = await runSkillTest({
      prompt: `Read land-and-deploy/SKILL.md for the /land-and-deploy skill instructions.
Focus on Step 3.5a and Step 3.5a-bis (the review staleness check and inline review offer).
This repo has 6 commits since the initial commit. There are NO review logs
(gstack-review-read would return NO_REVIEWS).
Simulate what the readiness gate would show:
1. Run gstack-review-read equivalent (simulate NO_REVIEWS output)
2. Determine review staleness: Eng Review should be "NOT RUN"
3. Note that Step 3.5a-bis would offer an inline review
4. Write a simulated readiness report to .gstack/deploy-reports/readiness-report.md
showing the review status as NOT RUN with the inline review offer text
Do NOT use AskUserQuestion. Do NOT run gh commands.
Show what the readiness gate output would look like.`,
      workingDirectory: reviewDir,
      maxTurns: 15,
      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob'],
      timeout: 120_000,
      testName: 'land-and-deploy-review-gate',
      runId,
    });

    logCost('/land-and-deploy review-gate', result);
    recordE2E(evalCollector, '/land-and-deploy review-gate', 'Land-and-Deploy review gate E2E', result);
    expect(result.exitReason).toBe('success');

    // Verify readiness report was created
    const reportDir = path.join(reviewDir, '.gstack', 'deploy-reports');
    expect(fs.existsSync(reportDir)).toBe(true);

    // Only regular files count as reports; a stray subdirectory would make
    // readFileSync throw EISDIR.
    const reportFiles = fs
      .readdirSync(reportDir, { withFileTypes: true })
      .filter((entry) => entry.isFile())
      .map((entry) => entry.name);
    expect(reportFiles.length).toBeGreaterThan(0);

    // Should mention review status. Every report is scanned because directory
    // listing order is not guaranteed, so the relevant file is not necessarily
    // the first entry.
    // NOTE(review): matching on 'review' alone is very permissive — consider
    // requiring 'not run' if the agent output proves stable enough.
    const hasReviewMention = reportFiles.some((name) => {
      const text = fs.readFileSync(path.join(reportDir, name), 'utf-8').toLowerCase();
      return text.includes('review') || text.includes('not run');
    });
    expect(hasReviewMention).toBe(true);
  }, 180_000);
});
// --- Canary skill E2E ---
describeIfSelected('Canary skill E2E', ['canary-workflow'], () => {
-5
View File
@@ -1369,11 +1369,6 @@ describe('Codex skill', () => {
expect(content).toContain('Persist Eng Review result');
});
// The /ship skill doc must contain the abort option that points at /review or /plan-eng-review.
test('/ship gate suggests /review or /plan-eng-review when Eng Review is missing', () => {
  const shipSkillPath = path.join(ROOT, 'ship', 'SKILL.md');
  const shipSkill = fs.readFileSync(shipSkillPath, 'utf-8');
  expect(shipSkill).toContain('Abort — run /review or /plan-eng-review first');
});
test('Review Readiness Dashboard includes Adversarial Review row', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('Adversarial');