feat: add PostHog telemetry with persistent installation tracking

- Add telemetry module with PostHog integration and opt-out support
- Track workflow/agent lifecycle events (start, complete, fail, retry)
- Persist anonymous installation ID to ~/.shannon/telemetry-id
- Include hashed target hostname for unique target counting
- Mount host ~/.shannon in container for ID persistence across rebuilds
This commit is contained in:
ajmallesh
2026-01-13 17:51:51 -08:00
parent 3b391ec54c
commit eb8ab3be86
14 changed files with 733 additions and 5 deletions
+90 -1
View File
@@ -70,6 +70,7 @@ import {
import { assembleFinalReport } from '../phases/reporting.js';
import { getPromptNameForAgent } from '../types/agents.js';
import { AuditSession } from '../audit/index.js';
import { telemetry, TelemetryEvent, hashTargetUrl } from '../telemetry/index.js';
import type { AgentName } from '../types/agents.js';
import type { AgentMetrics } from './shared.js';
import type { DistributedConfig } from '../types/config.js';
@@ -88,6 +89,14 @@ export interface ActivityInput {
outputPath?: string;
pipelineTestingMode?: boolean;
workflowId: string;
workflowStartTime?: number; // Epoch ms, used for total workflow duration in telemetry
installationId?: string; // Persistent anonymous ID for counting unique installations
// Workflow stats for telemetry (only passed to report agent)
workflowStats?: {
totalAgents: number;
agentsSucceeded: number;
agentsFailed: number;
};
}
/**
@@ -115,6 +124,7 @@ async function runAgentActivity(
outputPath,
pipelineTestingMode = false,
workflowId,
installationId,
} = input;
const startTime = Date.now();
@@ -122,6 +132,18 @@ async function runAgentActivity(
// Get attempt number from Temporal context (tracks retries automatically)
const attemptNumber = Context.current().info.attempt;
// Set installationId as distinct ID for unique user tracking
if (installationId) {
telemetry.setDistinctId(installationId);
}
// Track agent start
telemetry.track(TelemetryEvent.AGENT_START, {
agent_name: agentName,
attempt_number: attemptNumber,
workflow_id: workflowId,
});
// Heartbeat loop - signals worker is alive to Temporal server
const heartbeatInterval = setInterval(() => {
const elapsed = Math.floor((Date.now() - startTime) / 1000);
@@ -226,6 +248,15 @@ async function runAgentActivity(
});
await commitGitSuccess(repoPath, agentName);
// Track agent completion
telemetry.track(TelemetryEvent.AGENT_COMPLETE, {
agent_name: agentName,
attempt_number: attemptNumber,
duration_ms: Date.now() - startTime,
cost_usd: result.cost ?? undefined,
workflow_id: workflowId,
});
// 10. Return metrics
return {
durationMs: Date.now() - startTime,
@@ -246,6 +277,17 @@ async function runAgentActivity(
// If error is already an ApplicationFailure (e.g., from our retry limit logic),
// re-throw it directly without re-classifying
if (error instanceof ApplicationFailure) {
// Track retry or failure based on retryability
telemetry.track(
error.nonRetryable ? TelemetryEvent.AGENT_FAILED : TelemetryEvent.AGENT_RETRY,
{
agent_name: agentName,
attempt_number: attemptNumber,
duration_ms: Date.now() - startTime,
error_type: error.type || 'UnknownError',
workflow_id: workflowId,
}
);
throw error;
}
@@ -255,6 +297,18 @@ async function runAgentActivity(
const rawMessage = error instanceof Error ? error.message : String(error);
const message = truncateErrorMessage(rawMessage);
// Track retry or failure based on classification
telemetry.track(
classified.retryable ? TelemetryEvent.AGENT_RETRY : TelemetryEvent.AGENT_FAILED,
{
agent_name: agentName,
attempt_number: attemptNumber,
duration_ms: Date.now() - startTime,
error_type: classified.type,
workflow_id: workflowId,
}
);
if (classified.retryable) {
// Temporal will retry with configured backoff
const failure = ApplicationFailure.create({
@@ -329,7 +383,42 @@ export async function runAuthzExploitAgent(input: ActivityInput): Promise<AgentM
}
/**
 * Runs the `report` agent and emits workflow-level telemetry around it.
 *
 * The report agent is treated as the workflow's terminal step: when it
 * resolves, a WORKFLOW_COMPLETE event is tracked; when it throws, a
 * WORKFLOW_FAILED event is tracked and the original error is re-thrown
 * unchanged so the caller's error handling is not affected.
 *
 * @param input - Activity input. `workflowStartTime` and `workflowStats` are
 *   optional; when absent the duration is measured from the start of this
 *   call and the stat properties are sent as `undefined`.
 * @returns The metrics returned by the underlying report agent run.
 * @throws Whatever the underlying report agent run throws.
 */
export async function runReportAgent(input: ActivityInput): Promise<AgentMetrics> {
  // Use workflow start time for total duration if available, otherwise fall back to now
  const workflowStartTime = input.workflowStartTime ?? Date.now();
  const stats = input.workflowStats;
  const targetHash = hashTargetUrl(input.webUrl);
  const workflowId = input.workflowId;

  try {
    // The agent must actually be awaited here (rather than returned directly
    // before this block) so that the telemetry below is reachable.
    const metrics = await runAgentActivity('report', input);

    // Report agent success = workflow complete
    telemetry.track(TelemetryEvent.WORKFLOW_COMPLETE, {
      total_duration_ms: Date.now() - workflowStartTime,
      total_cost_usd: metrics.costUsd ?? undefined,
      total_agents: stats?.totalAgents,
      agents_succeeded: stats?.agentsSucceeded,
      agents_failed: stats?.agentsFailed,
      target_hash: targetHash,
      workflow_id: workflowId,
    });

    return metrics;
  } catch (error) {
    // Report agent failure = workflow failed
    // NOTE(review): this branch runs for every thrown error, including ones
    // marked retryable, so WORKFLOW_FAILED may be emitted for an attempt that
    // is subsequently retried — confirm whether that is intended.
    const errorType =
      error instanceof ApplicationFailure
        ? error.type || 'UnknownError'
        : classifyErrorForTemporal(error).type;

    telemetry.track(TelemetryEvent.WORKFLOW_FAILED, {
      total_duration_ms: Date.now() - workflowStartTime,
      error_type: errorType,
      last_agent: 'report',
      total_agents: stats?.totalAgents,
      agents_succeeded: stats?.agentsSucceeded,
      agents_failed: stats?.agentsFailed,
      target_hash: targetHash,
      workflow_id: workflowId,
    });

    throw error;
  }
}
/**
+18
View File
@@ -31,6 +31,7 @@ import dotenv from 'dotenv';
import chalk from 'chalk';
import { displaySplashScreen } from '../splash-screen.js';
import { sanitizeHostname } from '../audit/utils.js';
import { telemetry, TelemetryEvent, hashTargetUrl, getInstallationId } from '../telemetry/index.js';
// Import types only - these don't pull in workflow runtime code
import type { PipelineInput, PipelineState, PipelineProgress } from './shared.js';
@@ -130,12 +131,20 @@ async function startPipeline(): Promise<void> {
const hostname = sanitizeHostname(webUrl);
const workflowId = customWorkflowId || `${hostname}_shannon-${Date.now()}`;
// Get persistent installation ID for unique installation counting
const installationId = await getInstallationId();
// Initialize telemetry with installation ID as distinct ID (for unique user tracking)
telemetry.initialize(pipelineTestingMode);
telemetry.setDistinctId(installationId);
const input: PipelineInput = {
webUrl,
repoPath,
...(configPath && { configPath }),
...(outputPath && { outputPath }),
...(pipelineTestingMode && { pipelineTestingMode }),
installationId,
};
console.log(chalk.green.bold(`✓ Workflow started: ${workflowId}`));
@@ -160,6 +169,14 @@ async function startPipeline(): Promise<void> {
}
);
// Track workflow start
telemetry.track(TelemetryEvent.WORKFLOW_START, {
has_config: !!configPath,
pipeline_testing_mode: pipelineTestingMode,
target_hash: hashTargetUrl(webUrl),
workflow_id: workflowId,
});
if (!waitForCompletion) {
console.log(chalk.bold('Monitor progress:'));
console.log(chalk.white(' Web UI: ') + chalk.blue(`http://localhost:8233/namespaces/default/workflows/${workflowId}`));
@@ -202,6 +219,7 @@ async function startPipeline(): Promise<void> {
process.exit(1);
}
} finally {
await telemetry.shutdown();
await connection.close();
}
}
+1
View File
@@ -9,6 +9,7 @@ export interface PipelineInput {
outputPath?: string;
pipelineTestingMode?: boolean;
workflowId?: string; // Added by client, used for audit correlation
installationId?: string; // Persistent anonymous ID for counting unique installations
}
export interface AgentMetrics {
+6
View File
@@ -26,6 +26,7 @@ import path from 'node:path';
import dotenv from 'dotenv';
import chalk from 'chalk';
import * as activities from './activities.js';
import { telemetry } from '../telemetry/index.js';
dotenv.config();
@@ -37,6 +38,10 @@ async function runWorker(): Promise<void> {
const connection = await NativeConnection.connect({ address });
// Initialize telemetry for activity execution
// Worker doesn't know pipelineTestingMode until activity runs, so default to false
telemetry.initialize();
// Bundle workflows for Temporal's V8 isolate
console.log(chalk.gray('Bundling workflows...'));
const workflowBundle = await bundleWorkflowCode({
@@ -68,6 +73,7 @@ async function runWorker(): Promise<void> {
try {
await worker.run();
} finally {
await telemetry.shutdown();
await connection.close();
console.log(chalk.gray('Worker stopped'));
}
+14 -1
View File
@@ -136,6 +136,9 @@ export async function pentestPipelineWorkflow(
...(input.pipelineTestingMode !== undefined && {
pipelineTestingMode: input.pipelineTestingMode,
}),
...(input.installationId !== undefined && {
installationId: input.installationId,
}),
};
try {
@@ -267,7 +270,17 @@ export async function pentestPipelineWorkflow(
await a.assembleReportActivity(activityInput);
// Then run the report agent to add executive summary and clean up
state.agentMetrics['report'] = await a.runReportAgent(activityInput);
// Pass workflow start time and stats for accurate telemetry
const reportInput = {
...activityInput,
workflowStartTime: state.startTime,
workflowStats: {
totalAgents: 13, // pre-recon, recon, 5 vuln, 5 exploit, report
agentsSucceeded: state.completedAgents.length,
agentsFailed: failedPipelines.length,
},
};
state.agentMetrics['report'] = await a.runReportAgent(reportInput);
state.completedAgents.push('report');
// === Complete ===