From 8745f89ad44fa13d5a64490104361e4095d385c8 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Mon, 27 Apr 2026 23:02:08 -0700 Subject: [PATCH] feat(windows): curated windows-free-tests CI job + test-free-shards curation Codex's v1.18.0.0 review flagged that a windows-latest matrix entry on the existing Linux-container evals.yml workflow can't work as a drop-in, and that the free test suite has POSIX-bound dependencies a sharded runner doesn't fix on its own. This commit takes McGluut's test-free-shards.ts (190 LOC), adds a Windows-fragility scan, and runs the curated subset on a separate non-container windows-latest job. scripts/test-free-shards.ts: - Enumeration + paid-eval filtering + stable-hash sharding (FNV-1a). Adapted from McGluut/gstack fork. - Upstream-original: --windows-only filter scans each test's content for POSIX-bound patterns: hardcoded /bin/sh, spawn('sh', ...), bash -c, raw /tmp/, chmod, xargs, which claude. Files matching are excluded with the reason logged. Currently filters 25 of 128 free tests; remaining 103 run on windows-latest. .github/workflows/windows-free-tests.yml: - Separate non-container job (NOT a matrix entry on evals.yml). Runs: bun run test:windows # curated subset bun test browse/test/claude-bin.test.ts # PATHEXT+overrides on Windows bun test test/gstack-paths.test.ts # state-root resolution package.json: new test:free + test:windows scripts. Honest about scope (codex-flagged): this does NOT make the full free suite Windows-safe. The 25 excluded tests need POSIX-only surfaces ported off shell primitives (test/ship-version-sync.test.ts:72 hardcodes /bin/bash, etc). Tracked as a P4 follow-up TODO. Full Windows parity is the next wave; this release ships the curated lane. Tests: test/test-free-shards.test.ts has 14 unit tests covering enumeration, paid-eval filtering, Windows-fragility detection (POSIX patterns + safe code), and stable sharding determinism. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/windows-free-tests.yml | 57 +++++ package.json | 4 +- scripts/test-free-shards.ts | 283 +++++++++++++++++++++++ test/test-free-shards.test.ts | 128 ++++++++++ 4 files changed, 471 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/windows-free-tests.yml create mode 100755 scripts/test-free-shards.ts create mode 100644 test/test-free-shards.test.ts diff --git a/.github/workflows/windows-free-tests.yml b/.github/workflows/windows-free-tests.yml new file mode 100644 index 00000000..e7bd9718 --- /dev/null +++ b/.github/workflows/windows-free-tests.yml @@ -0,0 +1,57 @@ +name: Windows Free Tests + +# Curated subset of the free test suite that runs on windows-latest. +# +# Codex's v1.18.0.0 review flagged that the existing evals.yml workflow uses +# a Linux container, so a windows-latest matrix entry there isn't a drop-in. +# This workflow is non-container, runs the curated Windows-safe subset, plus +# targeted resolver tests that exercise the Bun.which-based claude binary +# resolution + the GSTACK_CLAUDE_BIN override path on Windows. +# +# What this DOES NOT do (out of scope for v1.18.0.0): +# - Run the full free suite on Windows. The 25 tests that hardcode /bin/sh, +# spawn('sh',...), or raw /tmp/ paths are excluded by scripts/test-free-shards.ts +# --windows-only. They need POSIX-bound surfaces to be ported off shell +# primitives before they can run on Windows. Tracked as a follow-up TODO. +# - Run Playwright/browser-backed tests. Browse server bring-up on Windows is +# a separate concern (PR #1238 windows-pty-bun-pty-fix is in flight).
+ +on: + pull_request: + branches: [main] + workflow_dispatch: + +concurrency: + group: windows-free-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + windows-free-tests: + runs-on: windows-latest + timeout-minutes: 15 + + steps: + - uses: actions/checkout@v4 + + - uses: oven-sh/setup-bun@v1 + with: + bun-version: latest + + - name: Install dependencies + run: bun install --frozen-lockfile + + - name: Show curated subset (for build log audit trail) + run: bun run scripts/test-free-shards.ts --windows-only --list + shell: bash + + - name: Run curated Windows-safe subset + run: bun run test:windows + shell: bash + + - name: Targeted Claude resolver tests (real PATHEXT coverage on Windows) + run: bun test browse/test/claude-bin.test.ts + shell: bash + + - name: gstack-paths helper test (resolves $GSTACK_STATE_ROOT etc. on Windows) + run: bun test test/gstack-paths.test.ts + shell: bash diff --git a/package.json b/package.json index a2dd52d4..ba021125 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "gstack", - "version": "1.15.0.0", + "version": "1.18.0.0", "description": "Garry's Stack — Claude Code skills + fast headless browser.
One repo, one install, entire AI engineering workflow.", "license": "MIT", "type": "module", @@ -17,6 +17,8 @@ "dev": "bun run browse/src/cli.ts", "server": "bun run browse/src/server.ts", "test": "bun test browse/test/ test/ make-pdf/test/ --ignore 'test/skill-e2e-*.test.ts' --ignore test/skill-llm-eval.test.ts --ignore test/skill-routing-e2e.test.ts --ignore test/codex-e2e.test.ts --ignore test/gemini-e2e.test.ts && (bun run slop:diff 2>/dev/null || true)", + "test:free": "bun run scripts/test-free-shards.ts", + "test:windows": "bun run scripts/test-free-shards.ts --windows-only", "test:evals": "EVALS=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-llm-eval.test.ts test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts", "test:evals:all": "EVALS=1 EVALS_ALL=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-llm-eval.test.ts test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts", "test:e2e": "EVALS=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts", diff --git a/scripts/test-free-shards.ts b/scripts/test-free-shards.ts new file mode 100755 index 00000000..c816a2b6 --- /dev/null +++ b/scripts/test-free-shards.ts @@ -0,0 +1,283 @@ +#!/usr/bin/env bun +/** + * test-free-shards — enumerate, shard, and curate the free test suite. + * + * Three jobs: + * 1. Enumeration. Walk `browse/test/`, `test/`, `make-pdf/test/` and return + * every `*.test.{ts,tsx,js,jsx,mjs,cjs}` that isn't a paid-eval test. + * 2. Sharding. Stable-hash assign each test to one of N shards. Used by CI + * to parallelize the free suite when needed. + * 3. Curation (Windows-safe filter). Scan each test's content for POSIX-only + * patterns (`/bin/bash`, `sh -c`, raw `/tmp/`, `chmod`, `xargs`). 
Files + * that match are excluded from the Windows-safe subset — they would fail + * on `windows-latest` no matter how the runner shards them. + * + * Adapted from the McGluut/gstack fork's test-free-shards.ts (190 LOC). The + * Windows-safe filter is upstream-original — codex flagged that sharding alone + * doesn't fix POSIX-bound tests, so we curate the subset that actually runs + * on the windows-latest CI job. + * + * Usage: + * bun run scripts/test-free-shards.ts --list # show all + * bun run scripts/test-free-shards.ts --windows-only --list # show curated + * bun run scripts/test-free-shards.ts --windows-only # run curated + * bun run scripts/test-free-shards.ts --shards 4 --shard 1 # one shard + */ + +import * as fs from 'fs'; +import * as path from 'path'; +import { spawnSync } from 'child_process'; + +const ROOT = path.resolve(import.meta.dir, '..'); +const TEST_ROOTS = ['browse/test', 'test', 'make-pdf/test'] as const; +const TEST_FILE_REGEX = /\.test\.(?:[cm]?[jt]s|tsx|jsx)$/; + +// Tests that require API spend, external services, or e2e harnesses. +// These are filtered out before any sharding or curation. +const PAID_EVAL_TESTS = [ + /^browse\/test\/security-review-fullstack\.test\.ts$/, + /^test\/skill-e2e-.*\.test\.ts$/, + /^test\/skill-llm-eval\.test\.ts$/, + /^test\/skill-routing-e2e\.test\.ts$/, + /^test\/codex-e2e\.test\.ts$/, + /^test\/gemini-e2e\.test\.ts$/, +] as const; + +// POSIX-only patterns that indicate a test will fail on windows-latest no +// matter how the runner shards. Codex's v1.18.0.0 review flagged the first +// three as concrete examples in the existing free suite (test/ship-version-sync.test.ts:72, +// test/helpers/providers/claude.ts:22, package.json:12). We scan the test's +// own content here so the filter stays automatic as new tests land. 
+const WINDOWS_FRAGILE_PATTERNS: Array<{ pattern: RegExp; reason: string }> = [ + { pattern: /['"`]\/bin\/(?:ba)?sh/, reason: 'hardcoded /bin/sh or /bin/bash' }, + { pattern: /spawnSync\(['"]sh['"],|spawn\(['"]sh['"],|exec\(['"]sh /, reason: 'spawn("sh", ...)' }, + { pattern: /['"]bash -c['"]|['"]sh -c['"]/, reason: 'bash -c / sh -c' }, + { pattern: /['"`]\/tmp\//, reason: 'raw /tmp/ path (use os.tmpdir())' }, + { pattern: /['"]chmod\b/, reason: 'chmod shell command' }, + { pattern: /['"]xargs\b/, reason: 'xargs pipeline' }, + { pattern: /\bwhich claude\b/, reason: 'which claude (use Bun.which)' }, +]; + +export const DEFAULT_SHARD_COUNT = 20; +export const FREE_TEST_TIMEOUT_MS = 10_000; + +export function normalizeRelativePath(filePath: string): string { + return filePath.replace(/\\/g, '/'); +} + +export function isFreeTestFile(relativePath: string): boolean { + const normalized = normalizeRelativePath(relativePath); + if (!TEST_FILE_REGEX.test(normalized)) return false; + return !PAID_EVAL_TESTS.some(pattern => pattern.test(normalized)); +} + +/** + * Returns the first POSIX-only pattern hit in the file, or null if Windows-safe. 
+ */ +export function detectWindowsFragility(absolutePath: string): { reason: string } | null { + let content: string; + try { + content = fs.readFileSync(absolutePath, 'utf-8'); + } catch { + return null; + } + for (const { pattern, reason } of WINDOWS_FRAGILE_PATTERNS) { + if (pattern.test(content)) return { reason }; + } + return null; +} + +function walkTestFiles(dirPath: string): string[] { + const entries = fs.readdirSync(dirPath, { withFileTypes: true }); + const files: string[] = []; + for (const entry of entries) { + const fullPath = path.join(dirPath, entry.name); + if (entry.isDirectory()) { + files.push(...walkTestFiles(fullPath)); + continue; + } + if (TEST_FILE_REGEX.test(entry.name)) { + files.push(fullPath); + } + } + return files; +} + +export function collectFreeTestFiles(rootDir = ROOT): string[] { + const discovered = new Set<string>(); + for (const testRoot of TEST_ROOTS) { + const absoluteRoot = path.join(rootDir, testRoot); + if (!fs.existsSync(absoluteRoot)) continue; + for (const fullPath of walkTestFiles(absoluteRoot)) { + const relativePath = normalizeRelativePath(path.relative(rootDir, fullPath)); + if (isFreeTestFile(relativePath)) { + discovered.add(relativePath); + } + } + } + return [...discovered].sort(); +} + +export interface CurationResult { + safe: string[]; + excluded: Array<{ file: string; reason: string }>; +} + +export function curateWindowsSafe(files: string[], rootDir = ROOT): CurationResult { + const safe: string[] = []; + const excluded: Array<{ file: string; reason: string }> = []; + for (const relativePath of files) { + const absolute = path.join(rootDir, relativePath); + const fragility = detectWindowsFragility(absolute); + if (fragility) { + excluded.push({ file: relativePath, reason: fragility.reason }); + } else { + safe.push(relativePath); + } + } + return { safe, excluded }; +} + +export function stableHash(input: string): number { + let hash = 0x811c9dc5; + for (let index = 0; index < input.length; index += 1) { + hash
^= input.charCodeAt(index); + hash = Math.imul(hash, 0x01000193); + } + return hash >>> 0; +} + +export function assignFilesToShards(files: string[], shardCount: number): string[][] { + if (!Number.isInteger(shardCount) || shardCount <= 0) { + throw new Error(`Shard count must be a positive integer. Received: ${shardCount}`); + } + + const shards = Array.from({ length: shardCount }, () => [] as string[]); + for (const file of files) { + const shardIndex = stableHash(file) % shardCount; + shards[shardIndex].push(file); + } + + return shards + .map(filesInShard => filesInShard.sort()) + .filter(filesInShard => filesInShard.length > 0); +} + +export function buildShardArgs(files: string[]): string[] { + return ['test', ...files, '--max-concurrency=1', `--timeout=${FREE_TEST_TIMEOUT_MS}`]; +} + +type CliOptions = { + dryRun: boolean; + listOnly: boolean; + windowsOnly: boolean; + shardCount: number; + shardIndex: number | null; +}; + +function parseCliOptions(argv: string[]): CliOptions { + let dryRun = false; + let listOnly = false; + let windowsOnly = false; + let shardCount = DEFAULT_SHARD_COUNT; + let shardIndex: number | null = null; + + for (let index = 0; index < argv.length; index += 1) { + const arg = argv[index]; + if (arg === '--dry-run') { dryRun = true; continue; } + if (arg === '--list') { listOnly = true; continue; } + if (arg === '--windows-only') { windowsOnly = true; continue; } + if (arg === '--shards') { + const value = argv[index + 1]; + if (!value) throw new Error('Missing value for --shards'); + shardCount = Number.parseInt(value, 10); + index += 1; + continue; + } + if (arg === '--shard') { + const value = argv[index + 1]; + if (!value) throw new Error('Missing value for --shard'); + shardIndex = Number.parseInt(value, 10); + index += 1; + continue; + } + throw new Error(`Unknown argument: ${arg}`); + } + + return { dryRun, listOnly, windowsOnly, shardCount, shardIndex }; +} + +function formatShardSummary(shards: string[][]): string[] { + return 
shards.map((files, index) => { + const preview = files.slice(0, 3).join(', '); + const suffix = files.length > 3 ? ', ...' : ''; + return `Shard ${index + 1}/${shards.length}: ${files.length} files${preview ? ` -> ${preview}${suffix}` : ''}`; + }); +} + +function runShard(files: string[], shardNumber: number, totalShards: number): number { + const header = `[test:free] shard ${shardNumber}/${totalShards} (${files.length} files)`; + console.log(header); + const result = spawnSync(process.execPath, buildShardArgs(files), { + cwd: ROOT, + stdio: 'inherit', + env: process.env, + }); + if (result.status !== 0) { + console.error(`${header} failed with exit code ${result.status ?? 1}`); + } + return result.status ?? 1; +} + +function main(): number { + const options = parseCliOptions(process.argv.slice(2)); + const allFiles = collectFreeTestFiles(); + if (allFiles.length === 0) { + throw new Error('No free test files were discovered.'); + } + + let files = allFiles; + let curationReport: CurationResult | null = null; + if (options.windowsOnly) { + curationReport = curateWindowsSafe(allFiles); + files = curationReport.safe; + console.log(`[test:free] curated ${files.length} Windows-safe tests (${curationReport.excluded.length} excluded)`); + if (options.listOnly && curationReport.excluded.length > 0) { + console.log('\nExcluded (POSIX-fragile):'); + for (const { file, reason } of curationReport.excluded) { + console.log(` - ${file} [${reason}]`); + } + } + } + + if (options.listOnly) { + console.log(`\nDiscovered ${files.length} test files.`); + for (const file of files) console.log(` ${file}`); + return 0; + } + + const shards = assignFilesToShards(files, options.shardCount); + if (options.dryRun) { + console.log(`\nWould run ${files.length} files across ${shards.length} shards.`); + for (const line of formatShardSummary(shards)) console.log(line); + return 0; + } + + if (options.shardIndex !== null) { + if (!Number.isInteger(options.shardIndex) || options.shardIndex < 1 
|| options.shardIndex > shards.length) { + throw new Error(`--shard must be between 1 and ${shards.length}. Received: ${options.shardIndex}`); + } + return runShard(shards[options.shardIndex - 1], options.shardIndex, shards.length); + } + + for (let index = 0; index < shards.length; index += 1) { + const exitCode = runShard(shards[index], index + 1, shards.length); + if (exitCode !== 0) return exitCode; + } + + return 0; +} + +if (import.meta.main) { + process.exitCode = main(); +} diff --git a/test/test-free-shards.test.ts b/test/test-free-shards.test.ts new file mode 100644 index 00000000..5e1cbd6a --- /dev/null +++ b/test/test-free-shards.test.ts @@ -0,0 +1,128 @@ +import { describe, test, expect } from 'bun:test'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { + isFreeTestFile, + collectFreeTestFiles, + detectWindowsFragility, + curateWindowsSafe, + stableHash, + assignFilesToShards, + normalizeRelativePath, +} from '../scripts/test-free-shards'; + +const ROOT = path.resolve(import.meta.dir, '..'); + +describe('test-free-shards: enumeration', () => { + test('isFreeTestFile rejects non-test files', () => { + expect(isFreeTestFile('test/foo.ts')).toBe(false); + expect(isFreeTestFile('test/foo.test.ts')).toBe(true); + expect(isFreeTestFile('test/foo.test.tsx')).toBe(true); + expect(isFreeTestFile('test/foo.test.mjs')).toBe(true); + }); + + test('isFreeTestFile rejects paid eval tests', () => { + expect(isFreeTestFile('test/skill-e2e-foo.test.ts')).toBe(false); + expect(isFreeTestFile('test/skill-llm-eval.test.ts')).toBe(false); + expect(isFreeTestFile('test/codex-e2e.test.ts')).toBe(false); + expect(isFreeTestFile('test/gemini-e2e.test.ts')).toBe(false); + }); + + test('collectFreeTestFiles returns sorted, deduped, only-free list', () => { + const files = collectFreeTestFiles(ROOT); + expect(files.length).toBeGreaterThan(10); + expect(files).toEqual([...files].sort()); + expect(new 
Set(files).size).toBe(files.length); + for (const f of files) { + expect(isFreeTestFile(f)).toBe(true); + } + }); + + test('normalizeRelativePath converts Windows backslashes to forward slashes', () => { + expect(normalizeRelativePath('test\\foo\\bar.test.ts')).toBe('test/foo/bar.test.ts'); + expect(normalizeRelativePath('test/foo/bar.test.ts')).toBe('test/foo/bar.test.ts'); + }); +}); + +describe('test-free-shards: Windows curation', () => { + function withTempFile(content: string, fn: (filePath: string) => void): void { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'curation-test-')); + const file = path.join(dir, 'sample.test.ts'); + fs.writeFileSync(file, content); + try { + fn(file); + } finally { + fs.rmSync(dir, { recursive: true, force: true }); + } + } + + test('detects /bin/bash hardcode', () => { + withTempFile(`spawn('/bin/bash', ['-c', 'echo hi']);`, (f) => { + expect(detectWindowsFragility(f)?.reason).toBe('hardcoded /bin/sh or /bin/bash'); + }); + }); + + test('detects spawn("sh", ...)', () => { + withTempFile(`spawnSync('sh', ['-c', 'command -v claude']);`, (f) => { + expect(detectWindowsFragility(f)?.reason).toBe('spawn("sh", ...)'); + }); + }); + + test('detects raw /tmp/ paths', () => { + withTempFile(`const TMPERR = '/tmp/codex-err.txt';`, (f) => { + expect(detectWindowsFragility(f)?.reason).toBe('raw /tmp/ path (use os.tmpdir())'); + }); + }); + + test('detects which claude shell command', () => { + withTempFile(`execSync('which claude').trim();`, (f) => { + expect(detectWindowsFragility(f)?.reason).toBe('which claude (use Bun.which)'); + }); + }); + + test('Windows-safe code passes the filter', () => { + withTempFile(`import { spawn } from 'child_process'; spawn(claude.command, args);`, (f) => { + expect(detectWindowsFragility(f)).toBeNull(); + }); + }); + + test('curateWindowsSafe partitions files into safe + excluded', () => { + const files = collectFreeTestFiles(ROOT); + const result = curateWindowsSafe(files, ROOT); + 
expect(result.safe.length + result.excluded.length).toBe(files.length); + // Sanity: at least one excluded entry, since we know test/ship-version-sync.test.ts uses /bin/bash + expect(result.excluded.length).toBeGreaterThan(0); + // Every excluded entry has a non-empty reason + for (const { reason } of result.excluded) { + expect(reason.length).toBeGreaterThan(0); + } + }); +}); + +describe('test-free-shards: sharding', () => { + test('stableHash is deterministic', () => { + expect(stableHash('foo.test.ts')).toBe(stableHash('foo.test.ts')); + expect(stableHash('foo.test.ts')).not.toBe(stableHash('bar.test.ts')); + }); + + test('assignFilesToShards distributes files into N non-empty shards', () => { + const files = ['a.test.ts', 'b.test.ts', 'c.test.ts', 'd.test.ts', 'e.test.ts']; + const shards = assignFilesToShards(files, 3); + const flattened = shards.flat(); + expect(flattened.sort()).toEqual([...files].sort()); + expect(shards.every((s) => s.length > 0)).toBe(true); + }); + + test('assignFilesToShards rejects invalid shard counts', () => { + expect(() => assignFilesToShards(['a.test.ts'], 0)).toThrow(); + expect(() => assignFilesToShards(['a.test.ts'], -1)).toThrow(); + }); + + test('shards are stable across runs (same files always land in same shard)', () => { + const files = ['x.test.ts', 'y.test.ts', 'z.test.ts']; + const a = assignFilesToShards(files, 5); + const b = assignFilesToShards(files, 5); + expect(a).toEqual(b); + }); +});