mirror of
https://github.com/garrytan/gstack.git
synced 2026-06-19 08:10:08 +02:00
5867286c75
Two test files cover the voyage-code-3 default landed in the previous commits: test/gbrain-init-voyage-code-3.test.ts — free, deterministic, gate-tier. Mirrors gbrain-init-rollback.test.ts: runs the skill template's PGLite-init bash against a fake `gbrain` that logs argv to a sentinel file, asserts the right flags pass under VOYAGE_API_KEY set/unset/empty. Also includes belt-and-suspenders grep checks that the template literally contains the voyage gate at all 3 PGLite init sites. test/gbrain-sync-voyage-code-3-integration.test.ts — real, paid, skip-if-no-key. Inits a sandbox PGLite with voyage-code-3 in a tempdir, registers a 3-file fixture git repo as a source, runs `gbrain sync --strategy code --skip-failed`, asserts pages imported + embedded > 0. Also asserts `gbrain doctor` reports no dimension mismatch and the column width is 1024d. `gbrain code-def` smoke test confirms symbol extraction works against the embedded fixture. The integration test deliberately omits a `gbrain query` assertion: query produces correct output but `gbrain query` hangs ~2 min on a fresh PGLite before exiting. The smoking-gun assertion for "embeddings worked" is the "N pages embedded" line from sync output. Symbol-aware correctness is covered by the code-def assertion. Caught one real bug during test development: gbrain reads `.gbrain-source` from CWD and tries to sync that source too. The test sets cwd to the sandbox root to avoid the parent worktree's pin polluting the sandbox brain. Documented in the runGbrain() helper. Runtime: ~22s when VOYAGE_API_KEY is set, instant skip otherwise. Cost: ~$0.001 per run (3 tiny fixture files, ~500 tokens of Voyage embeddings). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
329 lines
12 KiB
TypeScript
329 lines
12 KiB
TypeScript
/**
 * Real integration: gbrain PGLite + voyage-code-3 end-to-end.
 *
 * Inits a sandboxed PGLite engine with voyage-code-3 embeddings, registers a
 * tiny code fixture as a source, syncs it (which triggers Voyage embedding
 * generation), and verifies the result through the sync output, the source
 * listing, and a `gbrain code-def` lookup. The whole point is to catch the
 * failure modes that hit us in real life:
 *
 *   - dimension mismatch between the configured embedding column and the
 *     model's actual output dim (the 1280-vs-1536 trap that gbrain doctor
 *     surfaces but `gbrain init` silently sets up)
 *   - voyage-code-3 unavailable via gbrain's openai-compat adapter
 *   - sync completes but embedding generation silently fails (0 chunks)
 *
 * We intentionally do NOT call `gbrain query` here — it produces correct
 * output but doesn't exit cleanly on a fresh PGLite (~2 min hang after
 * results print). The smoking-gun assertion for "embeddings worked" is the
 * "N pages embedded" line from sync output: if that's >= 1, voyage-code-3
 * returned 1024-dim vectors and gbrain persisted them. Symbol-aware
 * functionality is covered separately by the code-def test.
 *
 * Skips when:
 *   - `gbrain` is not on PATH (dev machine without it installed)
 *   - VOYAGE_API_KEY is unset (the test makes real Voyage API calls)
 *
 * Cost: ~$0.001 per run. The fixture is 3 tiny files, ~500 tokens total.
 * Not gated on EVALS=1 because it's not an LLM eval — it's a deterministic
 * integration test of the embedding pipeline. Always runs when the env
 * supports it.
 *
 * Runtime: ~30-60s (gbrain init schema migrations + sync + Voyage round-trip).
 * Long enough that `bun test` runs it serially with explicit per-test
 * timeouts (120s for the init + doctor test, 300s for the tests that sync).
 */
|
|
|
|
import { describe, test, expect } from "bun:test";
|
|
import {
|
|
mkdtempSync,
|
|
mkdirSync,
|
|
writeFileSync,
|
|
rmSync,
|
|
existsSync,
|
|
} from "fs";
|
|
import { tmpdir } from "os";
|
|
import { join } from "path";
|
|
import { spawnSync } from "child_process";
|
|
|
|
// Environment pre-checks. The integration suite only runs when the `gbrain`
// binary is on PATH and a non-blank VOYAGE_API_KEY is present.

// `stdout` is null when `which` itself cannot be spawned (for example on a
// platform that does not ship it), so fall back to "" instead of crashing at
// module load; a missing `which` then simply reads as "gbrain not on PATH".
const gbrainPath = (
  spawnSync("which", ["gbrain"], { encoding: "utf-8" }).stdout ?? ""
).trim();
const gbrainAvailable = gbrainPath.length > 0;
// A whitespace-only key is treated the same as an unset one.
const voyageKey = process.env.VOYAGE_API_KEY?.trim() ?? "";
const voyageKeyPresent = voyageKey.length > 0;

const shouldRun = gbrainAvailable && voyageKeyPresent;
// Names the first missing prerequisite; empty string when the suite will run.
const skipReason = !gbrainAvailable
  ? "gbrain not on PATH"
  : !voyageKeyPresent
    ? "VOYAGE_API_KEY not set (real Voyage API calls required)"
    : "";

if (!shouldRun) {
  console.log(`[gbrain-sync-voyage-code-3-integration] SKIP: ${skipReason}`);
}
|
|
|
|
/** Handles to one throwaway sandbox, as produced by `makeSandbox()`. */
interface SandboxEnv {
  /** Temp directory that contains everything belonging to the sandbox. */
  root: string;
  /** Value exported as GBRAIN_HOME for sandboxed gbrain invocations. */
  gbrainHome: string;
  /** Directory holding the fixture git repository. */
  fixtureDir: string;
  /** Tears the sandbox down; call it from a `finally` block. */
  cleanup: () => void;
}
|
|
|
|
/**
 * Creates a throwaway sandbox under the OS temp directory: a root directory
 * (also used as GBRAIN_HOME) containing a `fixture-repo` git repository with
 * three small committed files (`math.ts`, `queue.ts`, `README.md`).
 *
 * @returns Paths into the sandbox plus a `cleanup` callback that removes the
 *   whole root recursively.
 * @throws Error when any git setup command exits non-zero or cannot be
 *   spawned. The temp directory is removed before the error propagates.
 */
function makeSandbox(): SandboxEnv {
  const root = mkdtempSync(join(tmpdir(), "gbrain-voyage-int-"));
  const cleanup = (): void => rmSync(root, { recursive: true, force: true });

  try {
    // GBRAIN_HOME points at the PARENT of .gbrain (per gbrain's configDir());
    // setting GBRAIN_HOME=/x means gbrain looks at /x/.gbrain/.
    const gbrainHome = root;
    const fixtureDir = join(root, "fixture-repo");
    mkdirSync(fixtureDir, { recursive: true });

    // Tiny realistic fixture: three files exercising different file types so
    // gbrain's code stage has something to extract symbols + embeddings from.
    writeFileSync(
      join(fixtureDir, "math.ts"),
      `export function fibonacci(n: number): number {
  if (n <= 1) return n;
  return fibonacci(n - 1) + fibonacci(n - 2);
}

export function isPrime(n: number): boolean {
  if (n < 2) return false;
  for (let i = 2; i * i <= n; i++) {
    if (n % i === 0) return false;
  }
  return true;
}
`,
    );
    writeFileSync(
      join(fixtureDir, "queue.ts"),
      `export class JobQueue<T> {
  private items: T[] = [];
  enqueue(item: T): void { this.items.push(item); }
  dequeue(): T | undefined { return this.items.shift(); }
  size(): number { return this.items.length; }
}
`,
    );
    writeFileSync(
      join(fixtureDir, "README.md"),
      `# Fixture repo

Sample code for testing the voyage-code-3 embedding pipeline.
The math module exposes fibonacci and primality helpers.
The queue module is a simple FIFO job queue.
`,
    );

    // Runs one git command inside the fixture and fails loudly. A silently
    // failed `git commit` would otherwise leave an empty repo and only show
    // up later as a confusing sync assertion failure.
    const git = (args: string[]): void => {
      const res = spawnSync("git", args, { cwd: fixtureDir, encoding: "utf-8" });
      if (res.status !== 0) {
        const detail = res.stderr || res.error?.message || "unknown error";
        throw new Error(`git ${args[0]} failed: ${detail}`);
      }
    };

    // Make it a git repo because gbrain's code-sync strategy expects one.
    git(["init", "-q"]);
    git(["config", "user.email", "test@example.invalid"]);
    git(["config", "user.name", "test"]);
    git(["add", "."]);
    git(["commit", "-q", "-m", "fixture"]);

    return { root, gbrainHome, fixtureDir, cleanup };
  } catch (err: unknown) {
    // The caller never receives `cleanup` on failure, so remove the temp
    // directory here to avoid leaking it.
    cleanup();
    throw err;
  }
}
|
|
|
|
/**
 * Builds the environment for a sandboxed gbrain process: a copy of the
 * current process environment with GBRAIN_HOME redirected to the sandbox and
 * VOYAGE_API_KEY set to the trimmed key read at module load.
 *
 * @param s Sandbox whose `gbrainHome` becomes GBRAIN_HOME.
 * @returns A fresh environment object; `process.env` itself is not mutated.
 */
function gbrainEnv(s: SandboxEnv): NodeJS.ProcessEnv {
  return {
    ...process.env,
    GBRAIN_HOME: s.gbrainHome,
    VOYAGE_API_KEY: voyageKey,
  };
}
|
|
|
|
/**
 * Runs `gbrain` synchronously inside the sandbox and returns the raw
 * `spawnSync` result (utf-8 `stdout` / `stderr`, `status`).
 *
 * @param s Sandbox supplying the working directory and GBRAIN_HOME.
 * @param args Arguments passed to the `gbrain` binary.
 * @param opts Optional `timeout` in milliseconds; defaults to 120 000.
 * @returns The `spawnSync` result. Nothing is thrown on a non-zero exit;
 *   callers assert on `status` themselves.
 */
function runGbrain(s: SandboxEnv, args: string[], opts: { timeout?: number } = {}) {
  // cwd MUST be the sandbox root, not the test's parent CWD. If gbrain runs
  // from inside the gstack worktree, it picks up the worktree's
  // `.gbrain-source` pin and tries to sync that source too — which won't
  // exist in the sandbox PGLite, and the resulting "not found" exits 1.
  return spawnSync("gbrain", args, {
    encoding: "utf-8",
    env: gbrainEnv(s),
    cwd: s.root,
    timeout: opts.timeout ?? 120_000,
  });
}
|
|
|
|
// End-to-end suite. Every test builds its own sandbox and tears it down in a
// `finally`, so tests are independent of each other and of execution order.
describe.skipIf(!shouldRun)(
  "gbrain PGLite + voyage-code-3 end-to-end (real Voyage API)",
  () => {
    // argv shared by all three tests so the init configuration cannot drift
    // between them.
    const initArgs = [
      "init",
      "--pglite",
      "--json",
      "--embedding-model",
      "voyage:voyage-code-3",
      "--embedding-dimensions",
      "1024",
    ];
    const addSourceArgs = (fixtureDir: string): string[] => [
      "sources",
      "add",
      "fixture-code",
      "--path",
      fixtureDir,
    ];
    // --skip-failed so a single oversized file (which can happen in real
    // repos) doesn't block the assertions.
    const syncArgs = [
      "sync",
      "--source",
      "fixture-code",
      "--strategy",
      "code",
      "--skip-failed",
    ];

    // Dumps the full output of a failed command so the exit-code assertion
    // that follows is debuggable from the test log.
    const dumpOnFailure = (label: string, res: ReturnType<typeof runGbrain>): void => {
      if (res.status !== 0) {
        console.error(`[${label} FAILED exit=${res.status}]`);
        console.error(`STDOUT:\n${res.stdout}`);
        console.error(`STDERR:\n${res.stderr}`);
      }
    };

    test(
      "init with voyage-code-3 produces a 1024-dim-aligned PGLite config",
      () => {
        const s = makeSandbox();
        try {
          const init = runGbrain(s, initArgs);
          dumpOnFailure("init", init);
          expect(init.status).toBe(0);
          // init prints JSON status line at the end; just sniff for success.
          const out = (init.stdout || "") + (init.stderr || "");
          expect(out).toContain('"status":"success"');
          expect(out).toContain('"engine":"pglite"');

          // doctor must agree the column width matches the live probe dim.
          const doctor = runGbrain(s, ["doctor"]);
          const dout = (doctor.stdout || "") + (doctor.stderr || "");
          // Doctor exits non-zero on error rows; warnings are OK. The
          // critical assertion is no dimension mismatch.
          expect(dout).not.toContain("DB dimension mismatch");
          // Should explicitly mention voyage-code-3 as the live provider.
          expect(dout).toMatch(/voyage-code-3/);
          // Width consistency check should be green for 1024d.
          expect(dout).toMatch(/Schema width \(1024d\)/);
        } finally {
          s.cleanup();
        }
      },
      120_000,
    );

    test(
      "sync --strategy code generates Voyage embeddings and registers pages + chunks",
      () => {
        const s = makeSandbox();
        try {
          // 1. init voyage-code-3 PGLite
          const init = runGbrain(s, initArgs);
          dumpOnFailure("init", init);
          expect(init.status).toBe(0);

          // 2. register the fixture as a code source
          const add = runGbrain(s, addSourceArgs(s.fixtureDir));
          dumpOnFailure("sources add", add);
          expect(add.status).toBe(0);

          // 3. sync with code strategy — this is where Voyage embeddings get
          //    generated.
          const sync = runGbrain(s, syncArgs, { timeout: 180_000 });
          dumpOnFailure("sync", sync);
          expect(sync.status).toBe(0);
          const sout = (sync.stdout || "") + (sync.stderr || "");
          // The fixture has 3 files; gbrain should import at least the 2 .ts
          // files (README.md may or may not be picked up by --strategy code
          // depending on gbrain's file-type heuristics).
          expect(sout).toMatch(/imported=[1-9]/);
          // The "pages embedded" line is the smoking gun: if it's 0,
          // embedding generation silently failed (voyage adapter broken,
          // dimension mismatch, etc). Anything > 0 means voyage-code-3
          // returned 1024-dim vectors and gbrain wrote them.
          expect(sout).toMatch(/[1-9]\d* pages embedded/);

          // 4. verify the source has pages and chunks
          const list = runGbrain(s, ["sources", "list", "--json"]);
          dumpOnFailure("sources list", list);
          expect(list.status).toBe(0);
          const sources = JSON.parse(list.stdout) as {
            sources: Array<{ id: string; page_count: number }>;
          };
          const fixture = sources.sources.find((x) => x.id === "fixture-code");
          expect(fixture).toBeDefined();
          expect(fixture?.page_count ?? 0).toBeGreaterThanOrEqual(2);
        } finally {
          s.cleanup();
        }
      },
      300_000,
    );

    test(
      "code-def finds symbols defined in the embedded fixture",
      () => {
        const s = makeSandbox();
        try {
          // Assert every setup step: a failed init/add/sync would otherwise
          // only surface below as an opaque JSON.parse error.
          const init = runGbrain(s, initArgs);
          dumpOnFailure("init", init);
          expect(init.status).toBe(0);

          const add = runGbrain(s, addSourceArgs(s.fixtureDir));
          dumpOnFailure("sources add", add);
          expect(add.status).toBe(0);

          const sync = runGbrain(s, syncArgs, { timeout: 180_000 });
          dumpOnFailure("sync", sync);
          expect(sync.status).toBe(0);

          // code-def is the symbol-aware path. It doesn't strictly need
          // embeddings (symbols are extracted by tree-sitter), but the JSON
          // shape it returns is the contract gstack's CLAUDE.md guidance
          // points the agent at. Verify it works against our PGLite + Voyage
          // setup.
          const result = runGbrain(s, ["code-def", "fibonacci"]);
          dumpOnFailure("code-def", result);
          expect(result.status).toBe(0);
          const parsed = JSON.parse(result.stdout) as {
            symbol: string;
            count: number;
            results: Array<{ file: string; symbol_type: string }>;
          };
          expect(parsed.symbol).toBe("fibonacci");
          expect(parsed.count).toBeGreaterThanOrEqual(1);
          expect(parsed.results[0]?.file).toContain("math.ts");
        } finally {
          s.cleanup();
        }
      },
      300_000,
    );
  },
);
|
|
|
|
// Lightweight always-on guard: runs even when the integration suite is
// skipped and checks that the gate inputs and the logged skip reason agree.
// It inspects `shouldRun` / `skipReason` only; it cannot observe the argument
// actually passed to `describe.skipIf`, so an inverted call site still needs
// review to catch.
test("integration test gate uses the correct skip predicate", () => {
  // shouldRun must be the boolean AND of the two pre-checks. If a refactor
  // makes it true when either piece is missing, the suite would attempt
  // real API calls without a key — undefined behavior.
  expect(shouldRun).toBe(gbrainAvailable && voyageKeyPresent);

  if (shouldRun) {
    // Nothing is missing, so no skip reason may be reported.
    expect(skipReason).toBe("");
  } else {
    // When skipping, we logged a reason — it must be non-empty and name the
    // first prerequisite that is actually missing.
    expect(skipReason.length).toBeGreaterThan(0);
    expect(skipReason).toContain(gbrainAvailable ? "VOYAGE_API_KEY" : "gbrain");
  }
});
|