mirror of
https://github.com/KeygraphHQ/shannon.git
synced 2026-05-26 10:27:53 +02:00
feat: add PostHog telemetry with persistent installation tracking
- Add telemetry module with PostHog integration and opt-out support - Track workflow/agent lifecycle events (start, complete, fail, retry) - Persist anonymous installation ID to ~/.shannon/telemetry-id - Include hashed target hostname for unique target counting - Mount host ~/.shannon in container for ID persistence across rebuilds
This commit is contained in:
@@ -70,6 +70,7 @@ import {
|
||||
import { assembleFinalReport } from '../phases/reporting.js';
|
||||
import { getPromptNameForAgent } from '../types/agents.js';
|
||||
import { AuditSession } from '../audit/index.js';
|
||||
import { telemetry, TelemetryEvent, hashTargetUrl } from '../telemetry/index.js';
|
||||
import type { AgentName } from '../types/agents.js';
|
||||
import type { AgentMetrics } from './shared.js';
|
||||
import type { DistributedConfig } from '../types/config.js';
|
||||
@@ -88,6 +89,14 @@ export interface ActivityInput {
|
||||
outputPath?: string;
|
||||
pipelineTestingMode?: boolean;
|
||||
workflowId: string;
|
||||
workflowStartTime?: number; // Epoch ms, used for total workflow duration in telemetry
|
||||
installationId?: string; // Persistent anonymous ID for counting unique installations
|
||||
// Workflow stats for telemetry (only passed to report agent)
|
||||
workflowStats?: {
|
||||
totalAgents: number;
|
||||
agentsSucceeded: number;
|
||||
agentsFailed: number;
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -115,6 +124,7 @@ async function runAgentActivity(
|
||||
outputPath,
|
||||
pipelineTestingMode = false,
|
||||
workflowId,
|
||||
installationId,
|
||||
} = input;
|
||||
|
||||
const startTime = Date.now();
|
||||
@@ -122,6 +132,18 @@ async function runAgentActivity(
|
||||
// Get attempt number from Temporal context (tracks retries automatically)
|
||||
const attemptNumber = Context.current().info.attempt;
|
||||
|
||||
// Set installationId as distinct ID for unique user tracking
|
||||
if (installationId) {
|
||||
telemetry.setDistinctId(installationId);
|
||||
}
|
||||
|
||||
// Track agent start
|
||||
telemetry.track(TelemetryEvent.AGENT_START, {
|
||||
agent_name: agentName,
|
||||
attempt_number: attemptNumber,
|
||||
workflow_id: workflowId,
|
||||
});
|
||||
|
||||
// Heartbeat loop - signals worker is alive to Temporal server
|
||||
const heartbeatInterval = setInterval(() => {
|
||||
const elapsed = Math.floor((Date.now() - startTime) / 1000);
|
||||
@@ -226,6 +248,15 @@ async function runAgentActivity(
|
||||
});
|
||||
await commitGitSuccess(repoPath, agentName);
|
||||
|
||||
// Track agent completion
|
||||
telemetry.track(TelemetryEvent.AGENT_COMPLETE, {
|
||||
agent_name: agentName,
|
||||
attempt_number: attemptNumber,
|
||||
duration_ms: Date.now() - startTime,
|
||||
cost_usd: result.cost ?? undefined,
|
||||
workflow_id: workflowId,
|
||||
});
|
||||
|
||||
// 10. Return metrics
|
||||
return {
|
||||
durationMs: Date.now() - startTime,
|
||||
@@ -246,6 +277,17 @@ async function runAgentActivity(
|
||||
// If error is already an ApplicationFailure (e.g., from our retry limit logic),
|
||||
// re-throw it directly without re-classifying
|
||||
if (error instanceof ApplicationFailure) {
|
||||
// Track retry or failure based on retryability
|
||||
telemetry.track(
|
||||
error.nonRetryable ? TelemetryEvent.AGENT_FAILED : TelemetryEvent.AGENT_RETRY,
|
||||
{
|
||||
agent_name: agentName,
|
||||
attempt_number: attemptNumber,
|
||||
duration_ms: Date.now() - startTime,
|
||||
error_type: error.type || 'UnknownError',
|
||||
workflow_id: workflowId,
|
||||
}
|
||||
);
|
||||
throw error;
|
||||
}
|
||||
|
||||
@@ -255,6 +297,18 @@ async function runAgentActivity(
|
||||
const rawMessage = error instanceof Error ? error.message : String(error);
|
||||
const message = truncateErrorMessage(rawMessage);
|
||||
|
||||
// Track retry or failure based on classification
|
||||
telemetry.track(
|
||||
classified.retryable ? TelemetryEvent.AGENT_RETRY : TelemetryEvent.AGENT_FAILED,
|
||||
{
|
||||
agent_name: agentName,
|
||||
attempt_number: attemptNumber,
|
||||
duration_ms: Date.now() - startTime,
|
||||
error_type: classified.type,
|
||||
workflow_id: workflowId,
|
||||
}
|
||||
);
|
||||
|
||||
if (classified.retryable) {
|
||||
// Temporal will retry with configured backoff
|
||||
const failure = ApplicationFailure.create({
|
||||
@@ -329,7 +383,42 @@ export async function runAuthzExploitAgent(input: ActivityInput): Promise<AgentM
|
||||
}
|
||||
|
||||
export async function runReportAgent(input: ActivityInput): Promise<AgentMetrics> {
|
||||
return runAgentActivity('report', input);
|
||||
// Use workflow start time for total duration if available, otherwise fall back to now
|
||||
const workflowStartTime = input.workflowStartTime ?? Date.now();
|
||||
const stats = input.workflowStats;
|
||||
const targetHash = hashTargetUrl(input.webUrl);
|
||||
const workflowId = input.workflowId;
|
||||
try {
|
||||
const metrics = await runAgentActivity('report', input);
|
||||
// Report agent success = workflow complete
|
||||
telemetry.track(TelemetryEvent.WORKFLOW_COMPLETE, {
|
||||
total_duration_ms: Date.now() - workflowStartTime,
|
||||
total_cost_usd: metrics.costUsd ?? undefined,
|
||||
total_agents: stats?.totalAgents,
|
||||
agents_succeeded: stats?.agentsSucceeded,
|
||||
agents_failed: stats?.agentsFailed,
|
||||
target_hash: targetHash,
|
||||
workflow_id: workflowId,
|
||||
});
|
||||
return metrics;
|
||||
} catch (error) {
|
||||
// Report agent failure = workflow failed
|
||||
const errorType =
|
||||
error instanceof ApplicationFailure
|
||||
? error.type || 'UnknownError'
|
||||
: classifyErrorForTemporal(error).type;
|
||||
telemetry.track(TelemetryEvent.WORKFLOW_FAILED, {
|
||||
total_duration_ms: Date.now() - workflowStartTime,
|
||||
error_type: errorType,
|
||||
last_agent: 'report',
|
||||
total_agents: stats?.totalAgents,
|
||||
agents_succeeded: stats?.agentsSucceeded,
|
||||
agents_failed: stats?.agentsFailed,
|
||||
target_hash: targetHash,
|
||||
workflow_id: workflowId,
|
||||
});
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -31,6 +31,7 @@ import dotenv from 'dotenv';
|
||||
import chalk from 'chalk';
|
||||
import { displaySplashScreen } from '../splash-screen.js';
|
||||
import { sanitizeHostname } from '../audit/utils.js';
|
||||
import { telemetry, TelemetryEvent, hashTargetUrl, getInstallationId } from '../telemetry/index.js';
|
||||
// Import types only - these don't pull in workflow runtime code
|
||||
import type { PipelineInput, PipelineState, PipelineProgress } from './shared.js';
|
||||
|
||||
@@ -130,12 +131,20 @@ async function startPipeline(): Promise<void> {
|
||||
const hostname = sanitizeHostname(webUrl);
|
||||
const workflowId = customWorkflowId || `${hostname}_shannon-${Date.now()}`;
|
||||
|
||||
// Get persistent installation ID for unique installation counting
|
||||
const installationId = await getInstallationId();
|
||||
|
||||
// Initialize telemetry with installation ID as distinct ID (for unique user tracking)
|
||||
telemetry.initialize(pipelineTestingMode);
|
||||
telemetry.setDistinctId(installationId);
|
||||
|
||||
const input: PipelineInput = {
|
||||
webUrl,
|
||||
repoPath,
|
||||
...(configPath && { configPath }),
|
||||
...(outputPath && { outputPath }),
|
||||
...(pipelineTestingMode && { pipelineTestingMode }),
|
||||
installationId,
|
||||
};
|
||||
|
||||
console.log(chalk.green.bold(`✓ Workflow started: ${workflowId}`));
|
||||
@@ -160,6 +169,14 @@ async function startPipeline(): Promise<void> {
|
||||
}
|
||||
);
|
||||
|
||||
// Track workflow start
|
||||
telemetry.track(TelemetryEvent.WORKFLOW_START, {
|
||||
has_config: !!configPath,
|
||||
pipeline_testing_mode: pipelineTestingMode,
|
||||
target_hash: hashTargetUrl(webUrl),
|
||||
workflow_id: workflowId,
|
||||
});
|
||||
|
||||
if (!waitForCompletion) {
|
||||
console.log(chalk.bold('Monitor progress:'));
|
||||
console.log(chalk.white(' Web UI: ') + chalk.blue(`http://localhost:8233/namespaces/default/workflows/${workflowId}`));
|
||||
@@ -202,6 +219,7 @@ async function startPipeline(): Promise<void> {
|
||||
process.exit(1);
|
||||
}
|
||||
} finally {
|
||||
await telemetry.shutdown();
|
||||
await connection.close();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,6 +9,7 @@ export interface PipelineInput {
|
||||
outputPath?: string;
|
||||
pipelineTestingMode?: boolean;
|
||||
workflowId?: string; // Added by client, used for audit correlation
|
||||
installationId?: string; // Persistent anonymous ID for counting unique installations
|
||||
}
|
||||
|
||||
export interface AgentMetrics {
|
||||
|
||||
@@ -26,6 +26,7 @@ import path from 'node:path';
|
||||
import dotenv from 'dotenv';
|
||||
import chalk from 'chalk';
|
||||
import * as activities from './activities.js';
|
||||
import { telemetry } from '../telemetry/index.js';
|
||||
|
||||
dotenv.config();
|
||||
|
||||
@@ -37,6 +38,10 @@ async function runWorker(): Promise<void> {
|
||||
|
||||
const connection = await NativeConnection.connect({ address });
|
||||
|
||||
// Initialize telemetry for activity execution
|
||||
// Worker doesn't know pipelineTestingMode until activity runs, so default to false
|
||||
telemetry.initialize();
|
||||
|
||||
// Bundle workflows for Temporal's V8 isolate
|
||||
console.log(chalk.gray('Bundling workflows...'));
|
||||
const workflowBundle = await bundleWorkflowCode({
|
||||
@@ -68,6 +73,7 @@ async function runWorker(): Promise<void> {
|
||||
try {
|
||||
await worker.run();
|
||||
} finally {
|
||||
await telemetry.shutdown();
|
||||
await connection.close();
|
||||
console.log(chalk.gray('Worker stopped'));
|
||||
}
|
||||
|
||||
@@ -136,6 +136,9 @@ export async function pentestPipelineWorkflow(
|
||||
...(input.pipelineTestingMode !== undefined && {
|
||||
pipelineTestingMode: input.pipelineTestingMode,
|
||||
}),
|
||||
...(input.installationId !== undefined && {
|
||||
installationId: input.installationId,
|
||||
}),
|
||||
};
|
||||
|
||||
try {
|
||||
@@ -267,7 +270,17 @@ export async function pentestPipelineWorkflow(
|
||||
await a.assembleReportActivity(activityInput);
|
||||
|
||||
// Then run the report agent to add executive summary and clean up
|
||||
state.agentMetrics['report'] = await a.runReportAgent(activityInput);
|
||||
// Pass workflow start time and stats for accurate telemetry
|
||||
const reportInput = {
|
||||
...activityInput,
|
||||
workflowStartTime: state.startTime,
|
||||
workflowStats: {
|
||||
totalAgents: 13, // pre-recon, recon, 5 vuln, 5 exploit, report
|
||||
agentsSucceeded: state.completedAgents.length,
|
||||
agentsFailed: failedPipelines.length,
|
||||
},
|
||||
};
|
||||
state.agentMetrics['report'] = await a.runReportAgent(reportInput);
|
||||
state.completedAgents.push('report');
|
||||
|
||||
// === Complete ===
|
||||
|
||||
Reference in New Issue
Block a user