test(auq): default GSTACK_HEADLESS=1 in eval/E2E runners

Eval/E2E harness runs now classify as headless: they BLOCK on an AskUserQuestion
(AUQ) failure rather than emit a prose question no one reads. The SDK runner sets
the variable by mutating process.env, not via the Options.env object, to avoid
breaking the SDK auth pipeline. Interactive-path suites opt out by overriding the
variable per run.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-06-07 17:51:22 -07:00
parent c475d73b34
commit a4c39380bc
3 changed files with 16 additions and 1 deletions
+8
View File
@@ -300,6 +300,14 @@ export async function runAgentSdkTest(
const queryImpl: QueryProvider = opts.queryProvider ?? query;
const model = opts.model ?? 'claude-opus-4-7';
// Default GSTACK_HEADLESS=1 so SDK-driven eval/E2E runs classify as headless: an
// AskUserQuestion failure BLOCKs instead of emitting a prose question no human can
// answer. Set ambiently (the SDK child inherits process.env) rather than via
// sdkOpts.env — passing an env object to the SDK breaks its auth pipeline (see
// CLAUDE.md). A suite testing the interactive prose-fallback path sets
// process.env.GSTACK_HEADLESS='' before calling.
if (process.env.GSTACK_HEADLESS === undefined) process.env.GSTACK_HEADLESS = '1';
let attempt = 0;
let lastErr: unknown = null;
+3
View File
@@ -52,6 +52,9 @@ export class ClaudeAdapter implements ProviderAdapter {
timeout: opts.timeoutMs,
encoding: 'utf-8',
maxBuffer: 32 * 1024 * 1024,
// Force GSTACK_HEADLESS=1 so a benchmark run classifies as headless (an
// AskUserQuestion failure BLOCKs rather than emitting unanswerable prose).
// The key is set after spreading process.env, so it overrides any ambient value.
env: { ...process.env, GSTACK_HEADLESS: '1' },
});
const parsed = this.parseOutput(out);
return {
+5 -1
View File
@@ -176,7 +176,11 @@ export async function runSkillTest(options: {
const proc = Bun.spawn(['sh', '-c', `cat "${promptFile}" | claude ${args.map(a => `"${a}"`).join(' ')}`], {
cwd: workingDirectory,
env: extraEnv ? { ...process.env, ...extraEnv } : undefined,
// Default GSTACK_HEADLESS=1 so eval/E2E runs classify as headless (BLOCK on an
// AskUserQuestion failure rather than emit a prose question no human reads). A
// suite exercising the INTERACTIVE prose-fallback path opts out by passing
// `env: { GSTACK_HEADLESS: '' }` — extraEnv wins because it spreads last.
env: { ...process.env, GSTACK_HEADLESS: '1', ...extraEnv },
stdout: 'pipe',
stderr: 'pipe',
});