refactor(worker): rename claude-executor to pi-executor

2026-06-30 18:45:34 +02:00 · 2026-06-15 16:05:31 +05:30
parent 56241625a4
commit 263b18e98a
13 changed files with 32 additions and 32 deletions
@@ -122,7 +122,7 @@ Infra (Temporal) runs via `docker-compose.yml`. Workers are ephemeral `docker ru
 - `apps/worker/src/paths.ts` — Centralized path constants (`PROMPTS_DIR`, `CONFIGS_DIR`, `WORKSPACES_DIR`)
 - `apps/worker/src/session-manager.ts` — Agent definitions (`AGENTS` record). Agent types in `apps/worker/src/types/agents.ts`
 - `apps/worker/src/config-parser.ts` — YAML config parsing with JSON Schema validation
- `apps/worker/src/ai/claude-executor.ts` — Claude Agent SDK integration with retry logic
+- `apps/worker/src/ai/pi-executor.ts` — pi harness integration (retry disabled; Temporal owns retry)
 - `apps/worker/src/services/` — Business logic layer (Temporal-agnostic). Activities delegate here. Key: `agent-execution.ts`, `error-handling.ts`, `container.ts`
 - `apps/worker/src/types/` — Consolidated types: `Result<T,E>`, `ErrorCode`, `AgentName`, `ActivityLogger`, etc.
 - `apps/worker/src/utils/` — Shared utilities (file I/O, formatting, concurrency)
@@ -145,9 +145,9 @@ Durable workflow orchestration with crash recovery, queryable progress, intellig
 5. **Reporting** (`report`) — Executive-level security report

 ### Supporting Systems
- **Configuration** — YAML configs in `apps/worker/configs/` with JSON Schema validation (`config-schema.json`). Supports auth settings (MFA/TOTP), URL/code rule scoping (`rules.avoid`/`rules.focus`), run-scope steering (`vuln_classes`, `exploit`), free-form `rules_of_engagement`, and post-hoc `report` filters (`min_severity`, `min_confidence`, `guidance`). `code_path` avoid rules are enforced via the `@gotgenes/pi-permission-system` extension: `apps/worker/src/temporal/activities.ts:syncCodePathDenyRules` writes a global `path` deny config once per workflow (`apps/worker/src/ai/settings-writer.ts:writeCodePathPermissionConfig`), and the executor loads the extension when that config is present (`apps/worker/src/ai/claude-executor.ts`), so denies fire across every tool and child `task` session. `vuln_classes`/`exploit` scope is locked into `session.json` on first run; resumes with a different scope fail fast (`persistOrValidateRunScope`). Credential resolution — local mode: env vars → `./.env`; npx mode: env vars → `~/.shannon/config.toml` (via `shn setup`)
+- **Configuration** — YAML configs in `apps/worker/configs/` with JSON Schema validation (`config-schema.json`). Supports auth settings (MFA/TOTP), URL/code rule scoping (`rules.avoid`/`rules.focus`), run-scope steering (`vuln_classes`, `exploit`), free-form `rules_of_engagement`, and post-hoc `report` filters (`min_severity`, `min_confidence`, `guidance`). `code_path` avoid rules are enforced via the `@gotgenes/pi-permission-system` extension: `apps/worker/src/temporal/activities.ts:syncCodePathDenyRules` writes a global `path` deny config once per workflow (`apps/worker/src/ai/settings-writer.ts:writeCodePathPermissionConfig`), and the executor loads the extension when that config is present (`apps/worker/src/ai/pi-executor.ts`), so denies fire across every tool and child `task` session. `vuln_classes`/`exploit` scope is locked into `session.json` on first run; resumes with a different scope fail fast (`persistOrValidateRunScope`). Credential resolution — local mode: env vars → `./.env`; npx mode: env vars → `~/.shannon/config.toml` (via `shn setup`)
 - **Prompts** — Per-phase templates in `apps/worker/prompts/` with variable substitution (`{{TARGET_URL}}`, `{{CONFIG_CONTEXT}}`). Shared partials in `apps/worker/prompts/shared/` via `apps/worker/src/services/prompt-manager.ts`, including `_code-path-rules.txt` (focus/avoid `[FILE]`/`[GLOB]` routing) and `_rules-of-engagement.txt` (free-text engagement rules). When `exploit: false`, `apps/worker/src/services/findings-renderer.ts` deterministically converts each `*_exploitation_queue.json` into a `*_findings.md` for report assembly — no LLM in the loop
- **Agent Harness (pi)** — Uses the **pi harness** (`@earendil-works/pi-coding-agent`, requires Node ≥ 22.19) via `apps/worker/src/ai/claude-executor.ts` (`runClaudePrompt` → `createAgentSession`, retry disabled so Temporal owns retry). Models resolve through pi-ai in `apps/worker/src/ai/models.ts` (Anthropic / Bedrock / custom base URL via `ModelRegistry`+`AuthStorage`). pi ships no JSON-schema output or `Task`/`TodoWrite` built-ins, so structured queues are captured via a `submit_exploitation_queue` custom tool (`apps/worker/src/ai/queue-schemas.ts`), and `task` (read-only child sessions) + `todo_write` are provided as custom tools (`apps/worker/src/ai/tools.ts`); the per-phase MCP collectors are pi custom tools (TypeBox `defineTool` in `apps/worker/src/mcp-server/`). Thinking level defaults to `medium`; disable per-scan via `CLAUDE_ADAPTIVE_THINKING=false` (→ `off`) or set `CLAUDE_THINKING_LEVEL` (env) / `core.adaptive_thinking = false` (npx TOML). Browser automation via `playwright-cli` with session isolation (`-s=<session>`). TOTP generation via `generate-totp` CLI tool. Login flow template at `apps/worker/prompts/shared/login-instructions.txt` supports form, SSO, API, and basic auth. On authenticated whitebox scans, the `validate-authentication` preflight performs the single real login and saves the browser session to `auth-state.json` in the per-session audit directory (path from `authStateFile()` in `apps/worker/src/audit/utils.ts`, derived from `generateAuditPath()`). The validation activity (`apps/worker/src/services/validate-authentication.ts`) removes any stale file from a prior run before the agent runs and verifies the file parses and contains cookies or storage before the preflight is marked complete; `logWorkflowComplete` deletes it when the workflow ends so authenticated cookies don't sit on disk between scans. 
Agent prompts opt in to session reuse by `@include(shared/_shared-session.txt)` before their `<login_instructions>` block — the partial restores the session and falls through to the full login flow if verification fails. `vuln-auth`/`exploit-auth` omit the include and own their own login
+- **Agent Harness (pi)** — Uses the **pi harness** (`@earendil-works/pi-coding-agent`, requires Node ≥ 22.19) via `apps/worker/src/ai/pi-executor.ts` (`runPiPrompt` → `createAgentSession`, retry disabled so Temporal owns retry). Models resolve through pi-ai in `apps/worker/src/ai/models.ts` (Anthropic / Bedrock / custom base URL via `ModelRegistry`+`AuthStorage`). pi ships no JSON-schema output or `Task`/`TodoWrite` built-ins, so structured queues are captured via a `submit_exploitation_queue` custom tool (`apps/worker/src/ai/queue-schemas.ts`), and `task` (read-only child sessions) + `todo_write` are provided as custom tools (`apps/worker/src/ai/tools.ts`); the per-phase MCP collectors are pi custom tools (TypeBox `defineTool` in `apps/worker/src/mcp-server/`). Thinking level defaults to `medium`; disable per-scan via `CLAUDE_ADAPTIVE_THINKING=false` (→ `off`) or set `CLAUDE_THINKING_LEVEL` (env) / `core.adaptive_thinking = false` (npx TOML). Browser automation via `playwright-cli` with session isolation (`-s=<session>`). TOTP generation via `generate-totp` CLI tool. Login flow template at `apps/worker/prompts/shared/login-instructions.txt` supports form, SSO, API, and basic auth. On authenticated whitebox scans, the `validate-authentication` preflight performs the single real login and saves the browser session to `auth-state.json` in the per-session audit directory (path from `authStateFile()` in `apps/worker/src/audit/utils.ts`, derived from `generateAuditPath()`). The validation activity (`apps/worker/src/services/validate-authentication.ts`) removes any stale file from a prior run before the agent runs and verifies the file parses and contains cookies or storage before the preflight is marked complete; `logWorkflowComplete` deletes it when the workflow ends so authenticated cookies don't sit on disk between scans. 
Agent prompts opt in to session reuse by `@include(shared/_shared-session.txt)` before their `<login_instructions>` block — the partial restores the session and falls through to the full login flow if verification fails. `vuln-auth`/`exploit-auth` omit the include and own their own login
 - **Audit System** — Crash-safe append-only logging in `workspaces/{hostname}_{sessionId}/`. Tracks session metrics, per-agent logs, prompts, and deliverables. WorkflowLogger (`apps/worker/src/audit/workflow-logger.ts`) provides unified human-readable per-workflow logs, backed by LogStream (`apps/worker/src/audit/log-stream.ts`) shared stream primitive
 - **Deliverables** — Saved to `deliverables/` in the target repo via the `save-deliverable` CLI script (`apps/worker/src/scripts/save-deliverable.ts`)
 - **Workspaces & Resume** — Named workspaces via `-w <name>` or auto-named from URL+timestamp. Resume detects completed agents via `session.json`. `loadResumeState()` in `apps/worker/src/temporal/activities.ts` validates deliverable existence, restores git checkpoints, and cleans up incomplete deliverables. Workspace listing via `apps/worker/src/temporal/workspaces.ts`
@@ -228,7 +228,7 @@ Comments must be **timeless** — no references to this conversation, refactorin

 **Entry Points:** `apps/worker/src/temporal/workflows.ts`, `apps/worker/src/temporal/activities.ts`, `apps/worker/src/temporal/worker.ts`

-**Core Logic:** `apps/worker/src/session-manager.ts`, `apps/worker/src/ai/claude-executor.ts`, `apps/worker/src/ai/settings-writer.ts` (writes `code_path` deny rules to the `@gotgenes/pi-permission-system` global config), `apps/worker/src/config-parser.ts`, `apps/worker/src/services/` (incl. `preflight.ts`, `findings-renderer.ts`, `reporting.ts`), `apps/worker/src/audit/`
+**Core Logic:** `apps/worker/src/session-manager.ts`, `apps/worker/src/ai/pi-executor.ts`, `apps/worker/src/ai/settings-writer.ts` (writes `code_path` deny rules to the `@gotgenes/pi-permission-system` global config), `apps/worker/src/config-parser.ts`, `apps/worker/src/services/` (incl. `preflight.ts`, `findings-renderer.ts`, `reporting.ts`), `apps/worker/src/audit/`

 **Config:** `docker-compose.yml`, `apps/cli/infra/compose.yml`, `apps/worker/configs/`, `apps/worker/prompts/`, `tsconfig.base.json` (shared compiler options), `turbo.json`, `biome.json`

@@ -106,7 +106,7 @@ export function resolveModelId(tier: ModelTier = 'medium', providerConfig?: Prov
 /**
 * Resolve the thinking level for a run.
 *
- * The Claude Agent SDK enabled "adaptive" thinking only on capable models; pi uses
+ * No per-model gate for "adaptive" thinking is needed here; pi uses
 * explicit levels and clamps to model capability internally. We default to 'medium'
 * and honour the existing CLAUDE_ADAPTIVE_THINKING=false kill switch (→ 'off'). An
 * explicit CLAUDE_THINKING_LEVEL wins when set.
@@ -326,7 +326,7 @@ export function formatErrorOutput(
  } else if (context.useCleanOutput) {
    lines.push(`${context.agentType} failed (${formatDuration(duration)})`);
  } else {
-    lines.push(`  Claude Code failed: ${description} (${formatDuration(duration)})`);
+    lines.push(`  pi agent failed: ${description} (${formatDuration(duration)})`);
  }

  lines.push(`    Error Type: ${error.constructor.name}`);
@@ -360,7 +360,7 @@ export function formatCompletionMessage(
    return `${context.agentType.charAt(0).toUpperCase() + context.agentType.slice(1)} complete! (${turnCount} turns, ${formatDuration(duration)})`;
  }

-  return `  Claude Code completed: ${description} (${turnCount} turns) in ${formatDuration(duration)}`;
+  return `  pi agent completed: ${description} (${turnCount} turns) in ${formatDuration(duration)}`;
 }

 export function formatToolUseOutput(toolName: string, input: Record<string, unknown> | undefined): string[] {
@@ -78,7 +78,7 @@ async function buildPermissionResourceLoader(cwd: string, logger: ActivityLogger
  return loader;
 }

-export interface ClaudePromptResult {
+export interface PiPromptResult {
  result?: string | null | undefined;
  success: boolean;
  duration: number;
@@ -109,7 +109,7 @@ async function writeErrorLog(
  try {
    const errorLog = {
      timestamp: formatTimestamp(),
-      agent: 'claude-executor',
+      agent: 'pi-executor',
      error: { name: err.constructor.name, message: err.message, code: err.code, status: err.status, stack: err.stack },
      context: { sourceDir, prompt: `${fullPrompt.slice(0, 200)}...`, retryable: isRetryableError(err) },
      duration,
@@ -122,7 +122,7 @@ async function writeErrorLog(
 }

 export async function validateAgentOutput(
-  result: ClaudePromptResult,
+  result: PiPromptResult,
  agentName: string | null,
  sourceDir: string,
  logger: ActivityLogger,
@@ -187,11 +187,11 @@ function classifyErrorText(content: string): PentestError | null {

 // Low-level pi execution. Drives one agent session to completion with progress and
 // audit logging. Exported for Temporal activities to call single-attempt execution.
-export async function runClaudePrompt(
+export async function runPiPrompt(
  prompt: string,
  sourceDir: string,
  context: string = '',
-  description: string = 'Claude analysis',
+  description: string = 'Agent analysis',
  _agentName: string | null = null,
  auditSession: AuditSession | null = null,
  logger: ActivityLogger,
@@ -200,7 +200,7 @@ export async function runClaudePrompt(
  apiKey?: string,
  deliverablesSubdir?: string,
  providerConfig?: import('../types/config.js').ProviderConfig,
-): Promise<ClaudePromptResult> {
+): Promise<PiPromptResult> {
  // 1. Initialize timing and prompt
  const timer = new Timer(`agent-${description.toLowerCase().replace(/\s+/g, '-')}`);
  const fullPrompt = context ? `${context}\n\n${prompt}` : prompt;
@@ -6,7 +6,7 @@

 /**
 * Writes the @gotgenes/pi-permission-system global config from `code_path` avoid
- * patterns. The executor loads the extension (see claude-executor) and pi enforces
+ * patterns. The executor loads the extension (see pi-executor) and pi enforces
 * these path denies at the tool layer for every agent. Written to the global config
 * dir under `agentDir` — the project-scoped path is gated behind project trust,
 * which our headless runs do not grant; the global path is not.
@@ -7,7 +7,7 @@
 /**
 * Universal custom tools registered for every agent: `task` and `todo_write`.
 *
- * These replace the Claude Agent SDK built-ins that pi does not ship. `task`
+ * These supply the Task and TodoWrite built-ins that pi does not ship. `task`
 * delegates a focused analysis to an in-process read-only child session (the
 * Task sub-agent replacement); `todo_write` is a full-state-replace planning
 * scratchpad mirrored to the workflow log.
@@ -116,7 +116,7 @@ function renderTodos(todos: readonly TodoItem[]): string {
 /**
 * The `todo_write` tool — a full-state-replace planning scratchpad.
 *
- * Mirrors Claude Code's TodoWrite: each call carries the entire list and replaces
+ * Follows TodoWrite semantics: each call carries the entire list and replaces
 * stored state (no append/merge). No deliverable impact; every call is echoed to
 * the workflow log so `shannon logs` shows the agent's live plan. State is per
 * tool instance (one per agent execution).
@@ -12,7 +12,7 @@
 * - Load prompt template using AGENTS[agentName].promptTemplate
 * - Create git checkpoint
 * - Start audit logging
- * - Invoke Claude SDK via runClaudePrompt
+ * - Invoke the pi agent via runPiPrompt
 * - Spending cap check using isSpendingCapBehavior
 * - Handle failure (rollback, audit)
 * - Validate output using AGENTS[agentName].deliverableFilename
@@ -22,7 +22,7 @@
 */

 import { fs, path } from 'zx';
-import { type ClaudePromptResult, runClaudePrompt, validateAgentOutput } from '../ai/claude-executor.js';
+import { type PiPromptResult, runPiPrompt, validateAgentOutput } from '../ai/pi-executor.js';
 import { createQueueSubmitTool, getQueueFilename } from '../ai/queue-schemas.js';
 import type { AuditSession } from '../audit/index.js';
 import { authStateFile } from '../audit/utils.js';
@@ -59,7 +59,7 @@ export interface AgentExecutionInput {

 interface FailAgentOpts {
  attemptNumber: number;
-  result: ClaudePromptResult;
+  result: PiPromptResult;
  rollbackReason: string;
  errorMessage: string;
  errorCode: ErrorCode;
@@ -167,7 +167,7 @@ export class AgentExecutionService {
    //    exploitation queue (pi has no JSON-schema output format).
    const submitTool = createQueueSubmitTool(agentName, distributedConfig?.exploit ?? true);
    const callerTools = [...(customTools ?? []), ...(submitTool ? [submitTool.tool] : [])];
-    const result: ClaudePromptResult = await runClaudePrompt(
+    const result: PiPromptResult = await runPiPrompt(
      prompt,
      repoPath,
      '', // context
@@ -309,7 +309,7 @@ export class AgentExecutionService {
  /**
   * Convert AgentEndResult to AgentMetrics for workflow state.
   */
-  static toMetrics(endResult: AgentEndResult, result: ClaudePromptResult): AgentMetrics {
+  static toMetrics(endResult: AgentEndResult, result: PiPromptResult): AgentMetrics {
    return {
      durationMs: endResult.duration_ms,
      inputTokens: null, // Not currently exposed by SDK wrapper
@@ -62,7 +62,7 @@ const RETRYABLE_PATTERNS = [
  'internal server error',
  'service unavailable',
  'bad gateway',
-  // Claude API errors
+  // Provider API errors
  'model unavailable',
  'service temporarily unavailable',
  'api error',
@@ -11,8 +11,8 @@
 * Services are pure domain logic with no Temporal dependencies.
 */

-export type { ClaudePromptResult } from '../ai/claude-executor.js';
-export { runClaudePrompt } from '../ai/claude-executor.js';
+export type { PiPromptResult } from '../ai/pi-executor.js';
+export { runPiPrompt } from '../ai/pi-executor.js';
 export type { AgentExecutionInput } from './agent-execution.js';
 export { AgentExecutionService } from './agent-execution.js';
 export { ConfigLoaderService } from './config-loader.js';
@@ -350,7 +350,7 @@ async function probeCredentialsWithPi(authType: string, baseUrl?: string): Promi
  return ok(undefined);
 }

-/** Validate credentials via a minimal Claude Agent SDK query. */
+/** Validate credentials via a minimal pi session. */
 async function validateCredentials(
  logger: ActivityLogger,
  apiKey?: string,
@@ -15,7 +15,7 @@
 import { readFile, rm } from 'node:fs/promises';
 import { defineTool, type ToolDefinition } from '@earendil-works/pi-coding-agent';
 import { Type } from 'typebox';
-import { runClaudePrompt } from '../ai/claude-executor.js';
+import { runPiPrompt } from '../ai/pi-executor.js';
 import type { AuditSession } from '../audit/index.js';
 import { authStateFile } from '../audit/utils.js';
 import type { ActivityLogger } from '../types/activity-logger.js';
@@ -123,7 +123,7 @@ export async function validateAuthentication(input: ValidateAuthInput): Promise<
  const startTime = Date.now();

  const submit = createAuthSubmitTool();
-  const result = await runClaudePrompt(
+  const result = await runPiPrompt(
    prompt,
    repoPath,
    '',
@@ -219,7 +219,7 @@ function countStorageEntries(parsed: unknown, key: 'cookies' | 'origins'): numbe
 }

 function classifyResult(
-  result: import('../ai/claude-executor.js').ClaudePromptResult,
+  result: import('../ai/pi-executor.js').PiPromptResult,
  authentication: NonNullable<DistributedConfig['authentication']>,
 ): Result<void, PentestError> {
  if (!result.success) {
@@ -663,7 +663,7 @@ export async function syncPlaywrightStealthConfig(input: ActivityInput): Promise
 /**
 * Sync code_path avoid rules into the @gotgenes/pi-permission-system global config
 * so pi enforces them at the tool layer for every agent in this run. The executor
- * loads the extension when this config is present (see claude-executor).
+ * loads the extension when this config is present (see pi-executor).
 *
 * Runs once per workflow before any analysis agent fires. Config is fixed for the
 * lifetime of the workflow, so writing once avoids a parallel-agent race on the
@@ -9,7 +9,7 @@
 *
 * Anthropic's spending cap behavior is inconsistent:
 * - Sometimes a proper SDK error (billing_error)
- * - Sometimes Claude responds with text about the cap
+ * - Sometimes the agent responds with text about the cap
 * - Sometimes partial billing before cutoff
 *
 * This module provides defense-in-depth detection with shared pattern lists
@@ -17,7 +17,7 @@
 */

 /**
- * Text patterns for SDK output sniffing (what Claude says).
+ * Text patterns for SDK output sniffing (what the agent says).
 * Used by message-handlers.ts and the behavioral heuristic.
 */
 export const BILLING_TEXT_PATTERNS = [
@@ -67,7 +67,7 @@ export function matchesBillingApiPattern(message: string): boolean {
 /**
 * Behavioral heuristic for detecting spending cap.
 *
- * When Claude hits a spending cap, it often returns a short message
+ * When the agent hits a spending cap, it often returns a short message
 * with $0 cost. Legitimate agent work NEVER costs $0 with only 1-2 turns.
 *
 * This combines three signals: