test(browser-skills): gate-tier E2E for /scrape + /skillify (D4)

Five scenarios cover the productivity loop and the contracts locked
during the v1.19.0.0 plan review:

  scrape-match-path           — intent matching bundled hackernews-frontpage
                                routes via $B skill run, no prototype phase
  scrape-prototype-path       — no matching skill, drives $B against a local
                                file:// fixture, returns JSON, suggests
                                /skillify
  skillify-happy-path         — /scrape then /skillify; skill written to
                                ~/.gstack/browser-skills/<name>/ with the
                                full file tree; SKILL.md prose body must
                                not contain conversation fragments (D2)
  skillify-provenance-refusal — cold /skillify with no prior /scrape refuses
                                with the D1 message; nothing on disk (D1)
  skillify-approval-reject    — /scrape then /skillify but reject in the
                                approval gate; temp dir is removed, nothing
                                at the final tier path (D3)

All five gate-tier (~$0.50-$1.50 each, ~$5 total per CI run). Set EVALS=1
to enable. Uses local file:// fixtures so prototype + skillify scenarios
run deterministically without network.

Touchfiles registers all 5 entries with proper deps on scrape/**,
skillify/**, browse/src/browser-skill-write.ts, and the Phase 1 runtime
modules. The match-path test depends on the bundled hackernews-frontpage
skill so its touchfile includes browser-skills/hackernews-frontpage/**.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-04-27 18:34:07 -07:00
parent e0b454fe58
commit b5904dc11f
2 changed files with 482 additions and 0 deletions
+30
View File
@@ -242,6 +242,29 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
// Multi-provider benchmark adapters — live API smoke against real claude/codex/gemini CLIs
'benchmark-providers-live': ['bin/gstack-model-benchmark', 'test/helpers/providers/**', 'test/helpers/benchmark-runner.ts', 'test/helpers/pricing.ts'],
// Browser-skills Phase 2a — /scrape + /skillify (v1.19.0.0). Gate-tier
// E2E covers the D1 (provenance guard), D3 (atomic write) contracts plus
// the basic loop. Shared deps: both skill templates, the D3 helper, the
// Phase 1 runtime, and the bundled hackernews-frontpage reference (the
// match-path test relies on it).
'scrape-match-path': [
'scrape/**', 'browse/src/browser-skills.ts', 'browse/src/browser-skill-commands.ts',
'browser-skills/hackernews-frontpage/**',
],
'scrape-prototype-path': [
'scrape/**', 'browse/src/browser-skills.ts', 'browse/src/browser-skill-commands.ts',
],
'skillify-happy-path': [
'skillify/**', 'scrape/**', 'browse/src/browser-skill-write.ts',
'browse/src/browser-skills.ts', 'browse/src/browser-skill-commands.ts',
],
'skillify-provenance-refusal': [
'skillify/**', 'browse/src/browser-skill-write.ts',
],
'skillify-approval-reject': [
'skillify/**', 'scrape/**', 'browse/src/browser-skill-write.ts',
],
// Skill routing — journey-stage tests (depend on ALL skill descriptions)
'journey-ideation': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'journey-plan-eng': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
@@ -478,6 +501,13 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
// Multi-provider benchmark — periodic (requires external CLIs + auth, paid)
'benchmark-providers-live': 'periodic',
// Browser-skills Phase 2a — gate (D1/D3 contracts must not silently break)
'scrape-match-path': 'gate',
'scrape-prototype-path': 'gate',
'skillify-happy-path': 'gate',
'skillify-provenance-refusal': 'gate',
'skillify-approval-reject': 'gate',
// Skill routing — periodic (LLM routing is non-deterministic)
'journey-ideation': 'periodic',
'journey-plan-eng': 'periodic',
+452
View File
@@ -0,0 +1,452 @@
/**
* Browser-skills Phase 2a — gate-tier E2E for /scrape and /skillify.
*
* Five scenarios cover the productivity loop and the contracts locked
* during the v1.19.0.0 plan review:
*
* D1 — /skillify provenance guard (scenario 4)
* D2 — synthesis input slice (covered indirectly by scenario 3 — the
* committed SKILL.md must not contain conversation prose)
* D3 — atomic write discipline (scenarios 3 and 5)
*
* 1. scrape-match-path — /scrape with intent matching bundled
* hackernews-frontpage routes via $B skill run, no prototype.
* 2. scrape-prototype-path — /scrape against a local file:// fixture
* (no matching skill) drives $B primitives, returns JSON, suggests
* /skillify.
* 3. skillify-happy-path — /scrape then /skillify in one session.
* Skill written to ~/.gstack/browser-skills/<name>/ with full
* file tree, $B skill test passes.
* 4. skillify-provenance-refusal — cold /skillify with no prior
* /scrape refuses with the D1 message; nothing on disk.
* 5. skillify-approval-reject — /scrape then /skillify but reject in
* the approval gate; temp dir is removed, nothing at final path.
*
* All five run gate-tier (~$0.50$1.50 each, ~$5 total per CI).
* Set EVALS=1 to enable. Set EVALS_MODEL to override (default sonnet-4-6).
*/
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
import { runSkillTest } from './helpers/session-runner';
import {
ROOT, browseBin, runId,
describeIfSelected, testConcurrentIfSelected,
setupBrowseShims, copyDirSync, logCost, recordE2E,
createEvalCollector, finalizeEvalCollector,
} from './helpers/e2e-helpers';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
const evalCollector = createEvalCollector('e2e-skillify');
// ─── Shared workdir setup ───────────────────────────────────────
interface Workdir {
workDir: string;
gstackHome: string;
skillsDir: string;
}
/**
* Build a working directory that has:
* - The /scrape and /skillify skills installed under .claude/skills/
* - The browse binary symlinked + find-browse shim (via setupBrowseShims)
* - bin/ scripts referenced by the preamble
* - A scoped GSTACK_HOME under the workdir so on-disk artifacts are
* contained and assertable
* - A CLAUDE.md routing block instructing Skill-tool invocation
*
* `installSkills` lets each test pick the minimum surface (e.g., the
* provenance-refusal scenario doesn't need /scrape).
*/
function setupSkillifyWorkdir(suffix: string, installSkills: string[] = ['scrape', 'skillify']): Workdir {
const workDir = fs.mkdtempSync(path.join(os.tmpdir(), `skill-e2e-skillify-${suffix}-`));
const gstackHome = path.join(workDir, '.gstack-home');
fs.mkdirSync(gstackHome, { recursive: true });
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: workDir, stdio: 'pipe', timeout: 5000 });
run('git', ['init', '-b', 'main']);
run('git', ['config', 'user.email', 'test@test.com']);
run('git', ['config', 'user.name', 'Test']);
fs.writeFileSync(path.join(workDir, 'README.md'), '# test\n');
run('git', ['add', '.']);
run('git', ['commit', '-m', 'initial']);
setupBrowseShims(workDir);
// Install requested skills.
const skillsDir = path.join(workDir, '.claude', 'skills');
for (const skill of installSkills) {
const destDir = path.join(skillsDir, skill);
fs.mkdirSync(destDir, { recursive: true });
fs.copyFileSync(path.join(ROOT, skill, 'SKILL.md'), path.join(destDir, 'SKILL.md'));
}
// bin/ scripts — preamble references several of these.
const binDir = path.join(workDir, 'bin');
fs.mkdirSync(binDir, { recursive: true });
for (const script of [
'gstack-timeline-log', 'gstack-slug', 'gstack-config',
'gstack-update-check', 'gstack-repo-mode',
'gstack-learnings-log', 'gstack-learnings-search',
]) {
const src = path.join(ROOT, 'bin', script);
if (fs.existsSync(src)) {
fs.copyFileSync(src, path.join(binDir, script));
fs.chmodSync(path.join(binDir, script), 0o755);
}
}
fs.writeFileSync(path.join(workDir, 'CLAUDE.md'), `# Project Instructions
## Skill routing
When the user's request matches an available skill, ALWAYS invoke it via
the Skill tool as your FIRST action.
Key routing rules:
- /scrape, "scrape", "get data from", "extract from" → invoke scrape
- /skillify, "skillify", "codify this scrape" → invoke skillify
Environment:
- GSTACK_HOME="${gstackHome}" for all gstack bin scripts.
- bin scripts are at ./bin/ relative to this directory.
- Browse binary is at ${browseBin} — assign to $B (e.g., \`B=${browseBin}\`).
`);
return { workDir, gstackHome, skillsDir };
}
/**
* Install the bundled hackernews-frontpage browser-skill into the workdir's
* project-tier (so $B skill list finds it for match-path tests). The skill
* has to live under <workdir>/.gstack/browser-skills/ for the project-tier
* lookup to find it (gstack's bundled tier resolves from the install dir,
* which the test workdir doesn't have).
*/
function installBundledHackernewsSkill(workDir: string) {
const src = path.join(ROOT, 'browser-skills', 'hackernews-frontpage');
const dst = path.join(workDir, '.gstack', 'browser-skills', 'hackernews-frontpage');
copyDirSync(src, dst);
}
/** Helper: every Bash invocation's command string from the agent. */
function bashCommands(result: { toolCalls: Array<{ tool: string; input: any }> }): string[] {
return result.toolCalls
.filter((tc) => tc.tool === 'Bash')
.map((tc) => String(tc.input?.command ?? ''))
.filter(Boolean);
}
/** Helper: the union of agent text + every tool input/output for matching. */
function fullSurface(result: any): string {
const parts: string[] = [];
if (result.output) parts.push(String(result.output));
for (const tc of result.toolCalls || []) {
parts.push(JSON.stringify(tc.input || {}));
if (tc.output) parts.push(String(tc.output));
}
for (const entry of result.transcript || []) {
try { parts.push(JSON.stringify(entry)); } catch { /* skip */ }
}
return parts.join('\n');
}
// ─── Test fixtures ──────────────────────────────────────────────
/**
* Tiny HTML fixture for the prototype-path test. Stable structure with three
* "items" the agent should be able to extract via $B html + parse.
*/
const PROTOTYPE_FIXTURE_HTML = `<!doctype html>
<html><body>
<h1>Test Items</h1>
<ul id="items">
<li class="item"><a href="/a">First Title</a><span class="score">42</span></li>
<li class="item"><a href="/b">Second Title</a><span class="score">17</span></li>
<li class="item"><a href="/c">Third Title</a><span class="score">8</span></li>
</ul>
</body></html>
`;
// ─── Live-fire suite ────────────────────────────────────────────
describeIfSelected('Browser-skills Phase 2a E2E (/scrape + /skillify)', [
'scrape-match-path',
'scrape-prototype-path',
'skillify-happy-path',
'skillify-provenance-refusal',
'skillify-approval-reject',
], () => {
afterAll(() => { finalizeEvalCollector(evalCollector); });
// ── 1. /scrape match path: bundled hackernews-frontpage matches ──────
testConcurrentIfSelected('scrape-match-path', async () => {
const { workDir, gstackHome } = setupSkillifyWorkdir('match', ['scrape']);
installBundledHackernewsSkill(workDir);
const result = await runSkillTest({
prompt: `Run /scrape latest hacker news stories. Invoke /scrape via the Skill tool.
You MUST follow the skill's match-phase logic:
1. Run \`$B skill list\` to see what browser-skills are available
2. Recognize that "latest hacker news stories" matches the bundled
hackernews-frontpage skill's triggers
3. Run \`$B skill run hackernews-frontpage\` and emit the JSON
Do NOT enter the prototype phase. Do NOT use AskUserQuestion.`,
workingDirectory: workDir,
env: { GSTACK_HOME: gstackHome },
maxTurns: 12,
allowedTools: ['Skill', 'Bash', 'Read'],
timeout: 120_000,
testName: 'scrape-match-path',
runId,
});
logCost('scrape-match-path', result);
const cmds = bashCommands(result);
const listedSkills = cmds.some(c => /\bskill\s+list\b/.test(c));
const ranBundledSkill = cmds.some(c => /\bskill\s+run\s+hackernews-frontpage\b/.test(c));
const exitOk = ['success', 'error_max_turns'].includes(result.exitReason);
recordE2E(evalCollector, 'scrape match-path routes to bundled skill', 'Phase 2a E2E', result, {
passed: exitOk && listedSkills && ranBundledSkill,
});
expect(exitOk).toBe(true);
expect(listedSkills).toBe(true);
expect(ranBundledSkill).toBe(true);
try { fs.rmSync(workDir, { recursive: true, force: true }); } catch {}
}, 180_000);
// ── 2. /scrape prototype path: drive $B primitives against fixture ────
testConcurrentIfSelected('scrape-prototype-path', async () => {
const { workDir, gstackHome } = setupSkillifyWorkdir('prototype', ['scrape']);
// Stage a local HTML fixture the agent can goto via file://
const fixturePath = path.join(workDir, 'fixture.html');
fs.writeFileSync(fixturePath, PROTOTYPE_FIXTURE_HTML);
const fileUrl = `file://${fixturePath}`;
const result = await runSkillTest({
prompt: `Run /scrape titles and scores from ${fileUrl}.
Invoke /scrape via the Skill tool. Follow the skill's prototype-phase logic:
1. \`$B skill list\` finds NO matching skill
2. Drive: \`$B goto ${fileUrl}\` then \`$B html\` (or \`$B text\`)
3. Parse the items (each has a title and a score)
4. Emit JSON of the form {"items": [{"title": "...", "score": N}, ...], "count": N}
5. Suggest /skillify in one line
Do NOT use AskUserQuestion.`,
workingDirectory: workDir,
env: { GSTACK_HOME: gstackHome },
maxTurns: 18,
allowedTools: ['Skill', 'Bash', 'Read'],
timeout: 180_000,
testName: 'scrape-prototype-path',
runId,
});
logCost('scrape-prototype-path', result);
const cmds = bashCommands(result);
const wentToFixture = cmds.some(c => c.includes(fileUrl));
const fetchedHtml = cmds.some(c => /\bgoto\b|\bhtml\b|\btext\b/.test(c));
const surface = fullSurface(result);
const mentionsSkillify = /skillify/i.test(surface);
const hasJsonItems = /"items"\s*:\s*\[/.test(surface) || /'items'\s*:/.test(surface);
const exitOk = ['success', 'error_max_turns'].includes(result.exitReason);
recordE2E(evalCollector, 'scrape prototype-path drives $B + emits JSON + nudges skillify', 'Phase 2a E2E', result, {
passed: exitOk && wentToFixture && fetchedHtml && hasJsonItems && mentionsSkillify,
});
expect(exitOk).toBe(true);
expect(wentToFixture).toBe(true);
expect(fetchedHtml).toBe(true);
expect(hasJsonItems).toBe(true);
expect(mentionsSkillify).toBe(true);
try { fs.rmSync(workDir, { recursive: true, force: true }); } catch {}
}, 240_000);
// ── 3. /skillify happy path: scrape then skillify in one session ─────
testConcurrentIfSelected('skillify-happy-path', async () => {
const { workDir, gstackHome } = setupSkillifyWorkdir('happy', ['scrape', 'skillify']);
const fixturePath = path.join(workDir, 'fixture.html');
fs.writeFileSync(fixturePath, PROTOTYPE_FIXTURE_HTML);
const fileUrl = `file://${fixturePath}`;
const result = await runSkillTest({
prompt: `Two steps in this session:
1. Run /scrape titles and scores from ${fileUrl} via the Skill tool.
Drive the prototype path; return JSON with items[].
2. Run /skillify via the Skill tool. Follow ALL 11 steps including:
- D1 provenance guard (you have a recent /scrape, proceed)
- D2 synthesis: include ONLY the final-attempt $B calls (goto + html)
- D3 atomic write: stage to temp dir, run test, then commit on approval
- When AskUserQuestion fires, choose the recommended option (A)
for both the name/tier question AND the approval gate.
Use HOME=${workDir} so all skill writes land under the test workdir
(translates to ~/.gstack/browser-skills/<name>/ via $HOME).
Do NOT halt for clarification.`,
workingDirectory: workDir,
env: {
GSTACK_HOME: gstackHome,
HOME: workDir, // /skillify writes to $HOME/.gstack/browser-skills/
},
maxTurns: 40,
allowedTools: ['Skill', 'Bash', 'Read', 'Write'],
timeout: 360_000,
testName: 'skillify-happy-path',
runId,
});
logCost('skillify-happy-path', result);
// The skill should land in $HOME/.gstack/browser-skills/<name>/
const skillsRoot = path.join(workDir, '.gstack', 'browser-skills');
const writtenSkills = fs.existsSync(skillsRoot)
? fs.readdirSync(skillsRoot).filter(d => !d.startsWith('.') && d !== 'hackernews-frontpage')
: [];
const skillName = writtenSkills[0];
const skillDir = skillName ? path.join(skillsRoot, skillName) : '';
const hasAllFiles = !!skillDir
&& fs.existsSync(path.join(skillDir, 'SKILL.md'))
&& fs.existsSync(path.join(skillDir, 'script.ts'))
&& fs.existsSync(path.join(skillDir, 'script.test.ts'))
&& fs.existsSync(path.join(skillDir, '_lib', 'browse-client.ts'))
&& fs.existsSync(path.join(skillDir, 'fixtures'));
// D2 enforcement: the SKILL.md prose body MUST NOT contain conversation
// fragments. Cheap heuristic: it shouldn't have "I" or "Let me" or other
// first-person/agent-narration markers.
let prosesClean = false;
if (hasAllFiles) {
const skillMd = fs.readFileSync(path.join(skillDir, 'SKILL.md'), 'utf-8');
const body = skillMd.split(/\n---\n/)[1] || '';
prosesClean = !/^I /m.test(body)
&& !/Let me /i.test(body)
&& !/^I'll /m.test(body);
}
const exitOk = ['success', 'error_max_turns'].includes(result.exitReason);
recordE2E(evalCollector, 'skillify happy path writes well-formed skill on disk', 'Phase 2a E2E', result, {
passed: exitOk && hasAllFiles && prosesClean,
});
expect(exitOk).toBe(true);
expect(writtenSkills.length).toBeGreaterThan(0);
expect(hasAllFiles).toBe(true);
expect(prosesClean).toBe(true);
try { fs.rmSync(workDir, { recursive: true, force: true }); } catch {}
}, 420_000);
// ── 4. /skillify provenance refusal: D1 contract ─────────────────────
testConcurrentIfSelected('skillify-provenance-refusal', async () => {
const { workDir, gstackHome } = setupSkillifyWorkdir('refusal', ['skillify']);
const result = await runSkillTest({
prompt: `Run /skillify via the Skill tool. There has been NO prior /scrape
in this conversation. Follow the skill's Step 1 (D1 provenance guard) literally:
walk back through agent turns, find no /scrape result, refuse with the exact
message the skill specifies, and stop. Do NOT synthesize anything. Do NOT
write any files.`,
workingDirectory: workDir,
env: {
GSTACK_HOME: gstackHome,
HOME: workDir,
},
maxTurns: 8,
allowedTools: ['Skill', 'Bash', 'Read'],
timeout: 90_000,
testName: 'skillify-provenance-refusal',
runId,
});
logCost('skillify-provenance-refusal', result);
const surface = fullSurface(result);
const refusalText = /no recent \/?scrape result|run \/scrape.*first|no prior \/?scrape/i.test(surface);
// Critical: nothing on disk. No staged dir, no committed skill.
const skillsRoot = path.join(workDir, '.gstack', 'browser-skills');
const stagingRoot = path.join(workDir, '.gstack', '.tmp');
const noSkillsWritten = !fs.existsSync(skillsRoot)
|| fs.readdirSync(skillsRoot).filter(d => !d.startsWith('.')).length === 0;
const noStaging = !fs.existsSync(stagingRoot)
|| fs.readdirSync(stagingRoot).filter(d => d.startsWith('skillify-')).length === 0;
const exitOk = ['success', 'error_max_turns'].includes(result.exitReason);
recordE2E(evalCollector, 'skillify D1 refusal — no on-disk write', 'Phase 2a E2E', result, {
passed: exitOk && refusalText && noSkillsWritten && noStaging,
});
expect(exitOk).toBe(true);
expect(refusalText).toBe(true);
expect(noSkillsWritten).toBe(true);
expect(noStaging).toBe(true);
try { fs.rmSync(workDir, { recursive: true, force: true }); } catch {}
}, 120_000);
// ── 5. /skillify approval-gate reject: D3 cleanup ────────────────────
testConcurrentIfSelected('skillify-approval-reject', async () => {
const { workDir, gstackHome } = setupSkillifyWorkdir('reject', ['scrape', 'skillify']);
const fixturePath = path.join(workDir, 'fixture.html');
fs.writeFileSync(fixturePath, PROTOTYPE_FIXTURE_HTML);
const fileUrl = `file://${fixturePath}`;
const result = await runSkillTest({
prompt: `Two steps:
1. Run /scrape titles and scores from ${fileUrl} via the Skill tool.
2. Run /skillify via the Skill tool. Follow steps 1-9. When the approval
gate AskUserQuestion fires (Step 9), choose option C (Discard) instead
of A (Commit). The D3 contract says the temp dir must be removed and
nothing should land at the final tier path.
Use HOME=${workDir}. Do NOT commit the skill.`,
workingDirectory: workDir,
env: {
GSTACK_HOME: gstackHome,
HOME: workDir,
},
maxTurns: 35,
allowedTools: ['Skill', 'Bash', 'Read', 'Write'],
timeout: 360_000,
testName: 'skillify-approval-reject',
runId,
});
logCost('skillify-approval-reject', result);
// D3 contract: nothing at the final tier path; staging dir is gone.
const skillsRoot = path.join(workDir, '.gstack', 'browser-skills');
const writtenSkills = fs.existsSync(skillsRoot)
? fs.readdirSync(skillsRoot).filter(d => !d.startsWith('.'))
: [];
const stagingRoot = path.join(workDir, '.gstack', '.tmp');
const stagingLeftovers = fs.existsSync(stagingRoot)
? fs.readdirSync(stagingRoot).filter(d => d.startsWith('skillify-'))
: [];
const exitOk = ['success', 'error_max_turns'].includes(result.exitReason);
recordE2E(evalCollector, 'skillify approval-reject leaves no on-disk artifact', 'Phase 2a E2E', result, {
passed: exitOk && writtenSkills.length === 0 && stagingLeftovers.length === 0,
});
expect(exitOk).toBe(true);
expect(writtenSkills.length).toBe(0);
expect(stagingLeftovers.length).toBe(0);
try { fs.rmSync(workDir, { recursive: true, force: true }); } catch {}
}, 420_000);
});