mirror of
https://github.com/garrytan/gstack.git
synced 2026-06-20 00:30:10 +02:00
test: migrate section-loading E2E to lossless SDK tool-stream detection
The /ship and /plan-ceo-review section-loading tests drove a real PTY and scraped the ANSI screen buffer for sections/<file>.md paths. That silently saw nothing in a Conductor PTY (cursor-positioned tool renders and an unanswered Step 0 question loop both defeat the regex), so both reported read: [] even when the agent did the work. They now run the skill through claude -p (the same SDK path the AUQ matrix uses) and detect section reads from the tool-use stream — Read calls whose file_path contains sections/<file>.md — with no rendering layer to mangle. The run is also hermetic: the freshly-generated worktree skeleton + sections are copied into a throwaway fixture with the absolute path pinned, so the test validates this branch's carve without mutating the user's ~/.claude install. Validated EVALS_TIER=periodic: both pass (plan-ceo Reads review-sections.md; ship Reads review-army.md + changelog.md), ~6.5 min for both vs ~23 min combined on the old PTY path where both were failing. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -15,7 +15,7 @@ import * as fs from 'node:fs';
|
||||
import * as os from 'node:os';
|
||||
import * as path from 'node:path';
|
||||
import { spawnSync } from 'node:child_process';
|
||||
import { runSkillTest } from './session-runner';
|
||||
import { runSkillTest, type SkillTestResult } from './session-runner';
|
||||
|
||||
const ROOT = path.resolve(__dirname, '..', '..');
|
||||
|
||||
@@ -201,6 +201,76 @@ This is a capture test, not an interactive session. Skip any system-audit / envi
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Drive ANY carved skill through a real `claude -p` run and detect, LOSSLESSLY,
 * which `sections/<file>.md` files the agent actually Read — from the tool-use
 * stream, not the ANSI screen buffer. This is the reliable replacement for the
 * real-PTY `visibleSince()` screen-scraping the section-loading tests used to do
 * (which silently saw nothing in a Conductor PTY: cursor-positioned renders and
 * an unanswered Step 0 question loop both defeat the regex).
 *
 * The skill under test is the planted copy in `planDir`; its absolute path is
 * pinned in the prompt so the agent cannot wander to the global install.
 * AskUserQuestion is declared unavailable so the agent auto-picks the
 * recommended option and proceeds far enough to hit the post-Step-0 STOP-Read
 * directives. Only Read/Grep/Glob/Write are allowed (no Bash, so the agent
 * cannot `find /` its way out, nor run git/gh mutations).
 *
 * @param opts.planDir      Fixture directory holding `<skillName>/SKILL.md`; also
 *                          the run's working directory. Resolved to an absolute
 *                          path before it is quoted in the prompt.
 * @param opts.skillName    Sub-directory of `planDir` containing the skill.
 * @param opts.scenario     Scenario text embedded verbatim in the prompt.
 * @param opts.reportFile   Filename (relative to `planDir`) the agent is told to
 *                          write its final output to. Defaults to `REPORT.md`.
 * @param opts.reportMarker Pattern proving a real report was produced. When
 *                          omitted, any non-whitespace output counts.
 * @param opts.testName     Forwarded to `runSkillTest`.
 * @param opts.runId        Forwarded to `runSkillTest`.
 * @param opts.model        Model override (default `claude-opus-4-7`).
 * @param opts.maxTurns     Turn cap (default 25).
 * @param opts.timeout      Timeout forwarded to `runSkillTest` (default 300_000).
 * @returns `readSections` — basenames of every `sections/<file>.md` seen in a
 *          Read tool call; `reportProduced` — whether the output satisfied the
 *          marker / non-empty check; `toolCalls` — the raw tool-call list from
 *          the run; `output` — the report file's contents, or the runner's
 *          `output` when the report file could not be read.
 */
export async function captureSectionReads(opts: {
  planDir: string;
  skillName: string;
  scenario: string;
  /** Relative filename the agent writes its final output to (terminal signal). */
  reportFile?: string;
  /** Marker proving a real report/plan was produced (default: any non-empty text). */
  reportMarker?: RegExp;
  testName: string;
  runId?: string;
  model?: string;
  maxTurns?: number;
  timeout?: number;
}): Promise<{ readSections: Set<string>; reportProduced: boolean; toolCalls: SkillTestResult['toolCalls']; output: string }> {
  // The prompt promises the agent an ABSOLUTE path, so resolve first: a relative
  // planDir would otherwise leak a relative path into the prompt.
  const planDir = path.resolve(opts.planDir);
  const outFile = path.join(planDir, opts.reportFile ?? 'REPORT.md');
  const skillPath = path.join(planDir, opts.skillName, 'SKILL.md');
  const prompt = `You are running an automated skill-execution test. No human is present, so AskUserQuestion is unavailable. The ONLY skill file you may read is this absolute path: ${skillPath}. Do NOT Glob/find/search for any other SKILL.md anywhere — especially nothing under ~/.claude or /Users.

Read ${skillPath} and EXECUTE its workflow for this scenario:

${opts.scenario}

Rules for this run:
- Skip system-audit, environment-setup, telemetry, and codebase-exploration steps.
- At any decision point that would call AskUserQuestion, silently pick the skill's recommended option and continue. Do NOT stop to ask.
- This skill's body has been carved into on-demand sections/. When the skill gives a STOP-Read directive (for example "Read \`.../sections/<file>\` and execute it in full"), you MUST actually Read that sections/ file with the Read tool BEFORE doing the work it covers. Do not work from memory.
- Do NOT run git, gh, commit, push, or any mutating command.
- When the workflow is complete, write the skill's final output (the full review report / ship plan, including any required report table) to ${outFile}.`;

  const result = await runSkillTest({
    prompt,
    workingDirectory: planDir,
    allowedTools: ['Read', 'Grep', 'Glob', 'Write'],
    maxTurns: opts.maxTurns ?? 25,
    timeout: opts.timeout ?? 300_000,
    testName: opts.testName,
    runId: opts.runId,
    model: opts.model ?? 'claude-opus-4-7',
  });

  // Accept both separators so Windows-style `sections\file.md` paths are not
  // silently dropped; the capture group is the section's basename.
  const sectionPath = /sections[\\\/]([A-Za-z0-9._-]+\.md)/;
  const readSections = new Set<string>();
  for (const c of result.toolCalls) {
    // Only genuine Read calls count; Grep/Glob hits do not prove a section load.
    if (c.tool !== 'Read') continue;
    const fp = String(c.input?.file_path ?? '');
    const m = fp.match(sectionPath);
    if (m) readSections.add(m[1]);
  }

  // Prefer the report file the agent was told to write; fall back to the
  // runner's captured output when that file is missing or unreadable.
  let output = '';
  try {
    output = fs.readFileSync(outFile, 'utf-8');
  } catch {
    output = result.output ?? '';
  }

  let reportProduced: boolean;
  if (opts.reportMarker) {
    // A /g or /y marker remembers lastIndex between test() calls; rewind it so a
    // marker shared across runs always scans the report from the start.
    opts.reportMarker.lastIndex = 0;
    reportProduced = opts.reportMarker.test(output);
  } else {
    reportProduced = output.trim().length > 0;
  }

  return { readSections, reportProduced, toolCalls: result.toolCalls, output };
}
|
||||
|
||||
/** Read the carved (current worktree) plan-ceo SKILL.md + its sections dir. */
|
||||
export function carvedSkill(): { skillMd: string; sectionsFrom: string | null } {
|
||||
const sec = path.join(ROOT, 'plan-ceo-review', 'sections');
|
||||
|
||||
@@ -121,8 +121,8 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
'plan-design-with-ui-scope': ['plan-design-review/**', 'test/fixtures/plans/ui-heavy-feature.md', 'test/helpers/claude-pty-runner.ts'],
|
||||
'budget-regression-pty': ['test/helpers/eval-store.ts', 'test/skill-budget-regression.test.ts'],
|
||||
'ship-idempotency-pty': ['ship/**', 'bin/gstack-next-version', 'bin/gstack-version-bump', 'scripts/resolvers/sections.ts', 'lib/worktree.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
'ship-section-loading': ['ship/**', 'scripts/resolvers/sections.ts', 'scripts/gen-skill-docs.ts', 'test/helpers/required-reads.ts', 'test/helpers/transcript-section-logger.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
'plan-ceo-section-loading': ['plan-ceo-review/**', 'scripts/resolvers/sections.ts', 'scripts/gen-skill-docs.ts', 'test/helpers/required-reads.ts', 'test/helpers/transcript-section-logger.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
'ship-section-loading': ['ship/**', 'scripts/resolvers/sections.ts', 'scripts/gen-skill-docs.ts', 'test/helpers/auq-sdk-capture.ts', 'test/helpers/session-runner.ts'],
|
||||
'plan-ceo-section-loading': ['plan-ceo-review/**', 'scripts/resolvers/sections.ts', 'scripts/gen-skill-docs.ts', 'test/helpers/auq-sdk-capture.ts', 'test/helpers/session-runner.ts'],
|
||||
'autoplan-chain-pty': ['autoplan/**', 'plan-ceo-review/**', 'plan-design-review/**', 'plan-eng-review/**', 'plan-devex-review/**', 'test/fixtures/plans/ui-heavy-feature.md', 'test/helpers/claude-pty-runner.ts'],
|
||||
'e2e-harness-audit': ['plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'test/helpers/agent-sdk-runner.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
|
||||
|
||||
Reference in New Issue
Block a user