mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-02 03:35:09 +02:00
test(security): assert block stops further tool calls, allow lets them through
Gap caught by user: the review-flow tests verified the decision path (POST, file write, agent_error emission) but not the actual security property — that Block stops subsequent tool calls and Allow lets them continue. The mock-claude tool_result_injection scenario now emits a second tool_use ~8s after the injected tool_result, targeting post-block-followup.example.com. If block really blocks, that event never reaches the chat feed (SIGTERM killed the subprocess before it emitted). If allow really allows, it does. Allow test asserts the followup tool_use DOES appear → session lives. Block test asserts the followup tool_use does NOT appear after 12s → kill actually stopped further work. Both tests previously proved the control plane (decision file → agent poll → agent_error); they now prove the data plane too. Test timeout bumped 60s → 90s to accommodate the 12s quiet window. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+22
-5
@@ -133,11 +133,28 @@ process.on('SIGTERM', () => {
|
||||
],
|
||||
},
|
||||
});
|
||||
// Stay alive up to 120s waiting for the user's review decision to
|
||||
// propagate back through the kill file. On Allow, we just exit cleanly
|
||||
// after a short delay. On Block, SIGTERM handler exits 143.
|
||||
await sleep(120000);
|
||||
emit({ type: 'result', result: 'mock-claude: review decision timeout' });
|
||||
// Wait long enough for the review decision to propagate (BLOCK path
|
||||
// SIGTERMs us here — see handler at top). On ALLOW the review loop
|
||||
// unblocks and we continue with a second tool_use to a sensitive
|
||||
// domain. If block actually blocks, this second event never reaches
|
||||
// the chat feed (test asserts on that). If allow actually allows, it
|
||||
// does.
|
||||
await sleep(8000);
|
||||
emit({
|
||||
type: 'assistant',
|
||||
message: {
|
||||
content: [
|
||||
{
|
||||
type: 'tool_use',
|
||||
id: 'toolu_02_followup',
|
||||
name: 'Bash',
|
||||
input: { command: '$B goto https://post-block-followup.example.com/' },
|
||||
},
|
||||
],
|
||||
},
|
||||
});
|
||||
await sleep(500);
|
||||
emit({ type: 'result', result: 'mock-claude: post-review followup complete' });
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
|
||||
@@ -272,12 +272,36 @@ describe('review-flow full-stack E2E', () => {
|
||||
expect(overrodeLog.payloadHash).toBe(blockLog.payloadHash);
|
||||
// Privacy contract: neither record includes the raw payload.
|
||||
expect(JSON.stringify(overrodeLog)).not.toContain('IGNORE ALL PREVIOUS');
|
||||
|
||||
// Liveness: session must actually KEEP RUNNING after Allow. Mock-claude
|
||||
// emits a second tool_use to post-block-followup.example.com ~8s
|
||||
// after the tool_result. That event must reach the chat feed, proving
|
||||
// the sidebar-agent resumed the stream-handler relay instead of
|
||||
// silently wedging.
|
||||
const followupDeadline = Date.now() + 20_000;
|
||||
let followup: any = null;
|
||||
while (Date.now() < followupDeadline && !followup) {
|
||||
const chatResp = await apiFetch('/sidebar-chat');
|
||||
const chatData: any = await chatResp.json();
|
||||
for (const entry of chatData.entries ?? []) {
|
||||
const input = String((entry as any).input ?? '');
|
||||
if (
|
||||
entry.type === 'tool_use' &&
|
||||
input.includes('post-block-followup.example.com')
|
||||
) {
|
||||
followup = entry;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!followup) await new Promise((r) => setTimeout(r, 300));
|
||||
}
|
||||
expect(followup).not.toBeNull();
|
||||
} finally {
|
||||
await stopStack();
|
||||
try { fs.rmSync(attemptsDir, { recursive: true, force: true }); } catch {}
|
||||
}
|
||||
},
|
||||
60_000,
|
||||
90_000,
|
||||
);
|
||||
|
||||
test.skipIf(!CLASSIFIER_READY)(
|
||||
@@ -337,12 +361,34 @@ describe('review-flow full-stack E2E', () => {
|
||||
const attempts = await readAttempts();
|
||||
const overrodeLog = attempts.find((a) => a.verdict === 'user_overrode');
|
||||
expect(overrodeLog).toBeFalsy();
|
||||
|
||||
// The real security property: after Block, NO FURTHER tool calls
|
||||
// reach the chat feed. Mock-claude would have emitted a tool_use
|
||||
// to post-block-followup.example.com ~8s after the tool_result if
|
||||
// the session had kept running. Wait long enough for that window
|
||||
// to close (12s total), then assert the followup event never
|
||||
// appeared. This is what makes "block" actually stop the page —
|
||||
// the subprocess is SIGTERM'd before it can emit the next event.
|
||||
await new Promise((r) => setTimeout(r, 12_000));
|
||||
const finalChatResp = await apiFetch('/sidebar-chat');
|
||||
const finalChatData: any = await finalChatResp.json();
|
||||
const followupAttempted = (finalChatData.entries ?? []).some(
|
||||
(entry: any) =>
|
||||
entry.type === 'tool_use' &&
|
||||
String(entry.input ?? '').includes('post-block-followup.example.com'),
|
||||
);
|
||||
expect(followupAttempted).toBe(false);
|
||||
|
||||
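// Sanity check: the server's /sidebar-chat channel must still respond after
|
||||
// the block. NOTE(review): the intent was to confirm mock-claude actually died (SIGTERM + SIGKILL pair), but this only checks the endpoint; the absent followup above is the real evidence.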
|
||||
const mockAlive = (await apiFetch('/sidebar-chat')).ok; // channel still open
|
||||
expect(mockAlive).toBe(true);
|
||||
} finally {
|
||||
await stopStack();
|
||||
try { fs.rmSync(attemptsDir, { recursive: true, force: true }); } catch {}
|
||||
}
|
||||
},
|
||||
60_000,
|
||||
90_000,
|
||||
);
|
||||
|
||||
test.skipIf(!CLASSIFIER_READY)(
|
||||
|
||||
Reference in New Issue
Block a user