feat: add PostHog telemetry with persistent installation tracking

- Add telemetry module with PostHog integration and opt-out support - Track workflow/agent lifecycle events (start, complete, fail, retry) - Persist anonymous installation ID to ~/.shannon/telemetry-id - Include hashed target hostname for unique target counting - Mount host ~/.shannon in container for ID persistence across rebuilds
2026-05-26 10:27:53 +02:00 · 2026-01-13 17:51:51 -08:00
parent 3b391ec54c
commit eb8ab3be86
14 changed files with 733 additions and 5 deletions
@@ -70,6 +70,7 @@ import {
 import { assembleFinalReport } from '../phases/reporting.js';
 import { getPromptNameForAgent } from '../types/agents.js';
 import { AuditSession } from '../audit/index.js';
+import { telemetry, TelemetryEvent, hashTargetUrl } from '../telemetry/index.js';
 import type { AgentName } from '../types/agents.js';
 import type { AgentMetrics } from './shared.js';
 import type { DistributedConfig } from '../types/config.js';
@@ -88,6 +89,14 @@ export interface ActivityInput {
  outputPath?: string;
  pipelineTestingMode?: boolean;
  workflowId: string;
+  workflowStartTime?: number; // Epoch ms, used for total workflow duration in telemetry
+  installationId?: string; // Persistent anonymous ID for counting unique installations
+  // Workflow stats for telemetry (only passed to report agent)
+  workflowStats?: {
+    totalAgents: number;
+    agentsSucceeded: number;
+    agentsFailed: number;
+  };
 }

 /**
@@ -115,6 +124,7 @@ async function runAgentActivity(
    outputPath,
    pipelineTestingMode = false,
    workflowId,
+    installationId,
  } = input;

  const startTime = Date.now();
@@ -122,6 +132,18 @@ async function runAgentActivity(
  // Get attempt number from Temporal context (tracks retries automatically)
  const attemptNumber = Context.current().info.attempt;

+  // Set installationId as distinct ID for unique user tracking
+  if (installationId) {
+    telemetry.setDistinctId(installationId);
+  }
+
+  // Track agent start
+  telemetry.track(TelemetryEvent.AGENT_START, {
+    agent_name: agentName,
+    attempt_number: attemptNumber,
+    workflow_id: workflowId,
+  });
+
  // Heartbeat loop - signals worker is alive to Temporal server
  const heartbeatInterval = setInterval(() => {
    const elapsed = Math.floor((Date.now() - startTime) / 1000);
@@ -226,6 +248,15 @@ async function runAgentActivity(
    });
    await commitGitSuccess(repoPath, agentName);

+    // Track agent completion
+    telemetry.track(TelemetryEvent.AGENT_COMPLETE, {
+      agent_name: agentName,
+      attempt_number: attemptNumber,
+      duration_ms: Date.now() - startTime,
+      cost_usd: result.cost ?? undefined,
+      workflow_id: workflowId,
+    });
+
    // 10. Return metrics
    return {
      durationMs: Date.now() - startTime,
@@ -246,6 +277,17 @@ async function runAgentActivity(
    // If error is already an ApplicationFailure (e.g., from our retry limit logic),
    // re-throw it directly without re-classifying
    if (error instanceof ApplicationFailure) {
+      // Track retry or failure based on retryability
+      telemetry.track(
+        error.nonRetryable ? TelemetryEvent.AGENT_FAILED : TelemetryEvent.AGENT_RETRY,
+        {
+          agent_name: agentName,
+          attempt_number: attemptNumber,
+          duration_ms: Date.now() - startTime,
+          error_type: error.type || 'UnknownError',
+          workflow_id: workflowId,
+        }
+      );
      throw error;
    }

@@ -255,6 +297,18 @@ async function runAgentActivity(
    const rawMessage = error instanceof Error ? error.message : String(error);
    const message = truncateErrorMessage(rawMessage);

+    // Track retry or failure based on classification
+    telemetry.track(
+      classified.retryable ? TelemetryEvent.AGENT_RETRY : TelemetryEvent.AGENT_FAILED,
+      {
+        agent_name: agentName,
+        attempt_number: attemptNumber,
+        duration_ms: Date.now() - startTime,
+        error_type: classified.type,
+        workflow_id: workflowId,
+      }
+    );
+
    if (classified.retryable) {
      // Temporal will retry with configured backoff
      const failure = ApplicationFailure.create({
@@ -329,7 +383,42 @@ export async function runAuthzExploitAgent(input: ActivityInput): Promise<AgentM
 }

 export async function runReportAgent(input: ActivityInput): Promise<AgentMetrics> {
-  return runAgentActivity('report', input);
+  // Use workflow start time for total duration if available, otherwise fall back to now
+  const workflowStartTime = input.workflowStartTime ?? Date.now();
+  const stats = input.workflowStats;
+  const targetHash = hashTargetUrl(input.webUrl);
+  const workflowId = input.workflowId;
+  try {
+    const metrics = await runAgentActivity('report', input);
+    // Report agent success = workflow complete
+    telemetry.track(TelemetryEvent.WORKFLOW_COMPLETE, {
+      total_duration_ms: Date.now() - workflowStartTime,
+      total_cost_usd: metrics.costUsd ?? undefined,
+      total_agents: stats?.totalAgents,
+      agents_succeeded: stats?.agentsSucceeded,
+      agents_failed: stats?.agentsFailed,
+      target_hash: targetHash,
+      workflow_id: workflowId,
+    });
+    return metrics;
+  } catch (error) {
+    // Report agent failure = workflow failed
+    const errorType =
+      error instanceof ApplicationFailure
+        ? error.type || 'UnknownError'
+        : classifyErrorForTemporal(error).type;
+    telemetry.track(TelemetryEvent.WORKFLOW_FAILED, {
+      total_duration_ms: Date.now() - workflowStartTime,
+      error_type: errorType,
+      last_agent: 'report',
+      total_agents: stats?.totalAgents,
+      agents_succeeded: stats?.agentsSucceeded,
+      agents_failed: stats?.agentsFailed,
+      target_hash: targetHash,
+      workflow_id: workflowId,
+    });
+    throw error;
+  }
 }

 /**
@@ -31,6 +31,7 @@ import dotenv from 'dotenv';
 import chalk from 'chalk';
 import { displaySplashScreen } from '../splash-screen.js';
 import { sanitizeHostname } from '../audit/utils.js';
+import { telemetry, TelemetryEvent, hashTargetUrl, getInstallationId } from '../telemetry/index.js';
 // Import types only - these don't pull in workflow runtime code
 import type { PipelineInput, PipelineState, PipelineProgress } from './shared.js';

@@ -130,12 +131,20 @@ async function startPipeline(): Promise<void> {
    const hostname = sanitizeHostname(webUrl);
    const workflowId = customWorkflowId || `${hostname}_shannon-${Date.now()}`;

+    // Get persistent installation ID for unique installation counting
+    const installationId = await getInstallationId();
+
+    // Initialize telemetry with installation ID as distinct ID (for unique user tracking)
+    telemetry.initialize(pipelineTestingMode);
+    telemetry.setDistinctId(installationId);
+
    const input: PipelineInput = {
      webUrl,
      repoPath,
      ...(configPath && { configPath }),
      ...(outputPath && { outputPath }),
      ...(pipelineTestingMode && { pipelineTestingMode }),
+      installationId,
    };

    console.log(chalk.green.bold(`✓ Workflow started: ${workflowId}`));
@@ -160,6 +169,14 @@ async function startPipeline(): Promise<void> {
      }
    );

+    // Track workflow start
+    telemetry.track(TelemetryEvent.WORKFLOW_START, {
+      has_config: !!configPath,
+      pipeline_testing_mode: pipelineTestingMode,
+      target_hash: hashTargetUrl(webUrl),
+      workflow_id: workflowId,
+    });
+
    if (!waitForCompletion) {
      console.log(chalk.bold('Monitor progress:'));
      console.log(chalk.white('  Web UI:  ') + chalk.blue(`http://localhost:8233/namespaces/default/workflows/${workflowId}`));
@@ -202,6 +219,7 @@ async function startPipeline(): Promise<void> {
      process.exit(1);
    }
  } finally {
+    await telemetry.shutdown();
    await connection.close();
  }
 }
@@ -9,6 +9,7 @@ export interface PipelineInput {
  outputPath?: string;
  pipelineTestingMode?: boolean;
  workflowId?: string; // Added by client, used for audit correlation
+  installationId?: string; // Persistent anonymous ID for counting unique installations
 }

 export interface AgentMetrics {
@@ -26,6 +26,7 @@ import path from 'node:path';
 import dotenv from 'dotenv';
 import chalk from 'chalk';
 import * as activities from './activities.js';
+import { telemetry } from '../telemetry/index.js';

 dotenv.config();

@@ -37,6 +38,10 @@ async function runWorker(): Promise<void> {

  const connection = await NativeConnection.connect({ address });

+  // Initialize telemetry for activity execution
+  // Worker doesn't know pipelineTestingMode until activity runs, so default to false
+  telemetry.initialize();
+
  // Bundle workflows for Temporal's V8 isolate
  console.log(chalk.gray('Bundling workflows...'));
  const workflowBundle = await bundleWorkflowCode({
@@ -68,6 +73,7 @@ async function runWorker(): Promise<void> {
  try {
    await worker.run();
  } finally {
+    await telemetry.shutdown();
    await connection.close();
    console.log(chalk.gray('Worker stopped'));
  }
@@ -136,6 +136,9 @@ export async function pentestPipelineWorkflow(
    ...(input.pipelineTestingMode !== undefined && {
      pipelineTestingMode: input.pipelineTestingMode,
    }),
+    ...(input.installationId !== undefined && {
+      installationId: input.installationId,
+    }),
  };

  try {
@@ -267,7 +270,17 @@ export async function pentestPipelineWorkflow(
    await a.assembleReportActivity(activityInput);

    // Then run the report agent to add executive summary and clean up
-    state.agentMetrics['report'] = await a.runReportAgent(activityInput);
+    // Pass workflow start time and stats for accurate telemetry
+    const reportInput = {
+      ...activityInput,
+      workflowStartTime: state.startTime,
+      workflowStats: {
+        totalAgents: 13, // pre-recon, recon, 5 vuln, 5 exploit, report
+        agentsSucceeded: state.completedAgents.length,
+        agentsFailed: failedPipelines.length,
+      },
+    };
+    state.agentMetrics['report'] = await a.runReportAgent(reportInput);
    state.completedAgents.push('report');

    // === Complete ===