merge: resolve CHANGELOG conflict with main, bump to v0.12.8.0

Main claimed v0.12.7.0 for community PRs + security hardening.
Bumped our codex-cwd-bug entry to v0.12.8.0. Both entries preserved.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-03-26 23:28:18 -06:00
22 changed files with 335 additions and 84 deletions
+24
View File
@@ -3,6 +3,7 @@ import { COMMAND_DESCRIPTIONS } from '../browse/src/commands';
import { SNAPSHOT_FLAGS } from '../browse/src/snapshot';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
const ROOT = path.resolve(import.meta.dir, '..');
const MAX_SKILL_DESCRIPTION_LENGTH = 1024;
@@ -1599,6 +1600,29 @@ describe('setup script validation', () => {
});
});
describe('discover-skills hidden directory filtering', () => {
test('discoverTemplates skips dot-prefixed directories', () => {
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-discover-'));
try {
// Create a hidden dir with a template (should be excluded)
fs.mkdirSync(path.join(tmpDir, '.hidden'), { recursive: true });
fs.writeFileSync(path.join(tmpDir, '.hidden', 'SKILL.md.tmpl'), '---\nname: evil\n---\ntest');
// Create a visible dir with a template (should be included)
fs.mkdirSync(path.join(tmpDir, 'visible'), { recursive: true });
fs.writeFileSync(path.join(tmpDir, 'visible', 'SKILL.md.tmpl'), '---\nname: good\n---\ntest');
const { discoverTemplates } = require('../scripts/discover-skills');
const results = discoverTemplates(tmpDir);
const dirs = results.map((r: { tmpl: string }) => r.tmpl);
expect(dirs).toContain('visible/SKILL.md.tmpl');
expect(dirs).not.toContain('.hidden/SKILL.md.tmpl');
} finally {
fs.rmSync(tmpDir, { recursive: true, force: true });
}
});
});
describe('telemetry', () => {
test('generated SKILL.md contains telemetry start block', () => {
const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
-2
View File
@@ -151,7 +151,6 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
// Skill routing — journey-stage tests (depend on ALL skill descriptions)
'journey-ideation': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'journey-plan-eng': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'journey-think-bigger': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'journey-debug': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'journey-qa': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'journey-code-review': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
@@ -276,7 +275,6 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
// Skill routing — periodic (LLM routing is non-deterministic)
'journey-ideation': 'periodic',
'journey-plan-eng': 'periodic',
'journey-think-bigger': 'periodic',
'journey-debug': 'periodic',
'journey-qa': 'periodic',
'journey-code-review': 'periodic',
+77
View File
@@ -0,0 +1,77 @@
import { describe, test, expect, beforeEach, afterEach } from 'bun:test';
import { execSync, ExecSyncOptionsWithStringEncoding } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
const ROOT = path.resolve(import.meta.dir, '..');
const BIN = path.join(ROOT, 'bin');
let tmpDir: string;
let slugDir: string;
function run(input: string, opts: { expectFail?: boolean } = {}): { stdout: string; exitCode: number } {
const execOpts: ExecSyncOptionsWithStringEncoding = {
cwd: ROOT,
env: { ...process.env, GSTACK_HOME: tmpDir },
encoding: 'utf-8',
timeout: 10000,
};
try {
const stdout = execSync(`${BIN}/gstack-review-log '${input.replace(/'/g, "'\\''")}'`, execOpts).trim();
return { stdout, exitCode: 0 };
} catch (e: any) {
if (opts.expectFail) {
return { stdout: e.stderr?.toString() || '', exitCode: e.status || 1 };
}
throw e;
}
}
beforeEach(() => {
tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-revlog-'));
// gstack-review-log uses gstack-slug which needs a git repo — create the projects dir
// with a predictable slug by pre-creating the directory structure
slugDir = path.join(tmpDir, 'projects');
fs.mkdirSync(slugDir, { recursive: true });
});
afterEach(() => {
fs.rmSync(tmpDir, { recursive: true, force: true });
});
describe('gstack-review-log', () => {
test('appends valid JSON to review JSONL file', () => {
const input = '{"skill":"plan-eng-review","status":"clean"}';
const result = run(input);
expect(result.exitCode).toBe(0);
// Find the JSONL file that was written
const projectDirs = fs.readdirSync(slugDir);
expect(projectDirs.length).toBeGreaterThan(0);
const projectDir = path.join(slugDir, projectDirs[0]);
const jsonlFiles = fs.readdirSync(projectDir).filter(f => f.endsWith('.jsonl'));
expect(jsonlFiles.length).toBeGreaterThan(0);
const content = fs.readFileSync(path.join(projectDir, jsonlFiles[0]), 'utf-8').trim();
const parsed = JSON.parse(content);
expect(parsed.skill).toBe('plan-eng-review');
expect(parsed.status).toBe('clean');
});
test('rejects non-JSON input with non-zero exit code', () => {
const result = run('not json at all', { expectFail: true });
expect(result.exitCode).not.toBe(0);
// Verify nothing was written
const projectDirs = fs.readdirSync(slugDir);
if (projectDirs.length > 0) {
const projectDir = path.join(slugDir, projectDirs[0]);
const jsonlFiles = fs.readdirSync(projectDir).filter(f => f.endsWith('.jsonl'));
if (jsonlFiles.length > 0) {
const content = fs.readFileSync(path.join(projectDir, jsonlFiles[0]), 'utf-8').trim();
expect(content).toBe('');
}
}
});
});
+1 -1
View File
@@ -45,7 +45,7 @@ describeIfSelected('Skill E2E tests', [
4. $B screenshot /tmp/skill-e2e-test.png
Report the results of each command.`,
workingDirectory: tmpDir,
maxTurns: 5,
maxTurns: 7,
timeout: 60_000,
testName: 'browse-basic',
runId,
+20 -14
View File
@@ -340,21 +340,22 @@ Write your findings to ${dir}/review-output.md`,
run('git', ['add', 'app.ts'], dir);
run('git', ['commit', '-m', 'feat: update to v2'], dir);
// Copy ship skill
fs.copyFileSync(path.join(ROOT, 'ship', 'SKILL.md'), path.join(dir, 'ship-SKILL.md'));
// Extract only Step 0 (base branch detection) from ship/SKILL.md
// (copying the full 1900-line file causes agent context bloat and flaky timeouts)
const fullShipSkill = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
const step0Start = fullShipSkill.indexOf('## Step 0: Detect platform and base branch');
const step0End = fullShipSkill.indexOf('## Step 1: Pre-flight');
const shipSection = fullShipSkill.slice(step0Start, step0End > step0Start ? step0End : undefined);
fs.writeFileSync(path.join(dir, 'ship-SKILL.md'), shipSection);
const result = await runSkillTest({
prompt: `Read ship-SKILL.md for the ship workflow.
prompt: `Read ship-SKILL.md. It contains Step 0 (Detect base branch) from the ship workflow.
Skip the preamble bash block, lake intro, telemetry, and contributor mode sections — go straight to Step 0.
Run the base branch detection. Since there is no remote, gh commands will fail — fall back to main.
Run ONLY Step 0 (Detect base branch) and Step 1 (Pre-flight) from the ship workflow.
Since there is no remote, gh commands will fail — fall back to main.
Then run git diff and git log against the detected base branch.
After completing Step 0 and Step 1, STOP. Do NOT proceed to Step 2 or beyond.
Do NOT push, create PRs, or modify VERSION/CHANGELOG.
Write a summary of what you detected to ${dir}/ship-preflight.md including:
Write a summary to ${dir}/ship-preflight.md including:
- The detected base branch name
- The current branch name
- The diff stat against the base branch`,
@@ -580,8 +581,13 @@ describeIfSelected('Review Dashboard Via Attribution', ['review-dashboard-via'],
].join('\n'));
fs.chmodSync(path.join(mockBinDir, 'gstack-review-read'), 0o755);
// Copy ship skill
fs.copyFileSync(path.join(ROOT, 'ship', 'SKILL.md'), path.join(dashDir, 'ship-SKILL.md'));
// Extract only the Review Readiness Dashboard section from ship/SKILL.md
// (copying the full 1900-line file causes agent context bloat and timeouts)
const fullSkill = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
const dashStart = fullSkill.indexOf('## Review Readiness Dashboard');
const dashEnd = fullSkill.indexOf('\n---\n', dashStart);
const dashSection = fullSkill.slice(dashStart, dashEnd > dashStart ? dashEnd : undefined);
fs.writeFileSync(path.join(dashDir, 'ship-SKILL.md'), dashSection);
});
afterAll(() => {
@@ -605,7 +611,7 @@ Skip the preamble, lake intro, telemetry, and all other ship steps.
Write the dashboard output to ${dashDir}/dashboard-output.md`,
workingDirectory: dashDir,
maxTurns: 12,
timeout: 90_000,
timeout: 180_000,
testName: 'review-dashboard-via',
runId,
});
@@ -639,7 +645,7 @@ Write the dashboard output to ${dashDir}/dashboard-output.md`,
);
// Ship dashboard should not gate when eng review is clear
expect(gateQuestions).toHaveLength(0);
}, 120_000);
}, 240_000);
});
// Module-level afterAll — finalize eval collector after all tests complete
+4 -50
View File
@@ -250,56 +250,10 @@ describeE2E('Skill Routing E2E — Developer Journey', () => {
}
}, 150_000);
testIfSelected('journey-think-bigger', async () => {
const tmpDir = createRoutingWorkDir('think-bigger');
try {
fs.writeFileSync(path.join(tmpDir, 'plan.md'), `# Waitlist App Architecture
## Components
- REST API (Express.js)
- PostgreSQL database
- React frontend
- SMS integration (Twilio)
## Data Model
- restaurants (id, name, settings)
- parties (id, restaurant_id, name, size, phone, status, created_at)
- wait_estimates (id, restaurant_id, avg_wait_minutes)
## API Endpoints
- POST /api/parties - add party to waitlist
- GET /api/parties - list current waitlist
- PATCH /api/parties/:id/status - update party status
- GET /api/estimate - get current wait estimate
`);
spawnSync('git', ['add', '.'], { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
spawnSync('git', ['commit', '-m', 'initial'], { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
const testName = 'journey-think-bigger';
const expectedSkill = 'plan-ceo-review';
const result = await runSkillTest({
prompt: "Actually, looking at this plan again, I feel like we're thinking too small. We're just doing waitlists but what about the whole restaurant guest experience? Is there a bigger opportunity here we should go after?",
workingDirectory: tmpDir,
maxTurns: 5,
allowedTools: ['Skill', 'Read', 'Bash', 'Glob', 'Grep'],
timeout: 120_000,
testName,
runId,
});
const skillCalls = result.toolCalls.filter(tc => tc.tool === 'Skill');
const actualSkill = skillCalls.length > 0 ? skillCalls[0]?.input?.skill : undefined;
logCost(`journey: ${testName}`, result);
recordRouting(testName, result, expectedSkill, actualSkill);
expect(skillCalls.length, `Expected Skill tool to be called but got 0 calls. Claude may have answered directly without invoking a skill. Tool calls: ${result.toolCalls.map(tc => tc.tool).join(', ')}`).toBeGreaterThan(0);
const validSkills = ['plan-ceo-review', 'office-hours'];
expect(validSkills, `Expected one of ${validSkills.join('/')} but got ${actualSkill}`).toContain(actualSkill);
} finally {
fs.rmSync(tmpDir, { recursive: true, force: true });
}
}, 180_000);
// Removed: journey-think-bigger
// Tested ambiguous routing ("think bigger" → plan-ceo-review) but Claude
// legitimately answers directly instead of routing. Never passed reliably.
// The other 10 journey tests cover routing with clear signals.
testIfSelected('journey-debug', async () => {
const tmpDir = createRoutingWorkDir('debug');
+76
View File
@@ -125,6 +125,82 @@ describe('gstack-telemetry-log', () => {
expect(events[0]).toHaveProperty('_branch');
});
// ─── json_safe() injection prevention tests ────────────────
test('sanitizes skill name with quote injection attempt', () => {
setConfig('telemetry', 'anonymous');
run(`${BIN}/gstack-telemetry-log --skill 'review","injected":"true' --duration 10 --outcome success --session-id inj-1`);
const lines = readJsonl();
expect(lines).toHaveLength(1);
// Must be valid JSON (no injection — quotes stripped, so no field injection possible)
const event = JSON.parse(lines[0]);
// The key check: no injected top-level property was created
expect(event).not.toHaveProperty('injected');
// Skill field should have quotes stripped but content preserved
expect(event.skill).not.toContain('"');
});
test('truncates skill name exceeding 200 chars', () => {
setConfig('telemetry', 'anonymous');
const longSkill = 'a'.repeat(250);
run(`${BIN}/gstack-telemetry-log --skill '${longSkill}' --duration 10 --outcome success --session-id trunc-1`);
const events = parseJsonl();
expect(events[0].skill.length).toBeLessThanOrEqual(200);
});
test('sanitizes outcome with newline injection attempt', () => {
setConfig('telemetry', 'anonymous');
// Use printf to pass actual newline in the argument
run(`bash -c 'OUTCOME=$(printf "success\\nfake\\":\\"true"); ${BIN}/gstack-telemetry-log --skill qa --duration 10 --outcome "$OUTCOME" --session-id inj-2'`);
const lines = readJsonl();
expect(lines).toHaveLength(1);
const event = JSON.parse(lines[0]);
expect(event).not.toHaveProperty('fake');
});
test('sanitizes session_id with backslash-quote injection', () => {
setConfig('telemetry', 'anonymous');
run(`${BIN}/gstack-telemetry-log --skill qa --duration 10 --outcome success --session-id 'id\\\\"","x":"y'`);
const lines = readJsonl();
expect(lines).toHaveLength(1);
const event = JSON.parse(lines[0]);
expect(event).not.toHaveProperty('x');
});
test('sanitizes error_class with quote injection', () => {
setConfig('telemetry', 'anonymous');
run(`${BIN}/gstack-telemetry-log --skill qa --duration 10 --outcome error --error-class 'timeout","extra":"val' --session-id inj-3`);
const lines = readJsonl();
expect(lines).toHaveLength(1);
const event = JSON.parse(lines[0]);
expect(event).not.toHaveProperty('extra');
});
test('sanitizes failed_step with quote injection', () => {
setConfig('telemetry', 'anonymous');
run(`${BIN}/gstack-telemetry-log --skill qa --duration 10 --outcome error --failed-step 'step1","hacked":"yes' --session-id inj-4`);
const lines = readJsonl();
expect(lines).toHaveLength(1);
const event = JSON.parse(lines[0]);
expect(event).not.toHaveProperty('hacked');
});
test('escapes error_message quotes and preserves content', () => {
setConfig('telemetry', 'anonymous');
run(`${BIN}/gstack-telemetry-log --skill qa --duration 10 --outcome error --error-message 'Error: file "test.txt" not found' --session-id inj-5`);
const lines = readJsonl();
expect(lines).toHaveLength(1);
const event = JSON.parse(lines[0]);
expect(event.error_message).toContain('file');
expect(event.error_message).toContain('not found');
});
test('creates analytics directory if missing', () => {
// Remove analytics dir
const analyticsDir = path.join(tmpDir, 'analytics');