feat: add PostHog telemetry with persistent installation tracking

- Add telemetry module with PostHog integration and opt-out support
- Track workflow/agent lifecycle events (start, complete, fail, retry)
- Persist anonymous installation ID to ~/.shannon/telemetry-id
- Include hashed target hostname for unique target counting
- Mount host ~/.shannon in container for ID persistence across rebuilds
This commit is contained in:
ajmallesh
2026-01-13 17:51:51 -08:00
parent 3b391ec54c
commit eb8ab3be86
14 changed files with 733 additions and 5 deletions
+26
View File
@@ -0,0 +1,26 @@
// Copyright (C) 2025 Keygraph, Inc.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License version 3
// as published by the Free Software Foundation.
/**
* Telemetry Module - Public API
*
* Usage:
* import { telemetry, TelemetryEvent } from '../telemetry/index.js';
*
* telemetry.initialize();
* telemetry.track(TelemetryEvent.WORKFLOW_START, { has_config: true });
* await telemetry.shutdown();
*/
export { telemetry, hashTargetUrl } from './telemetry-manager.js';
export { TelemetryEvent } from './telemetry-events.js';
export { getInstallationId } from './installation-id.js';
export type {
BaseTelemetryProperties,
AgentEventProperties,
WorkflowEventProperties,
} from './telemetry-events.js';
export { loadTelemetryConfig } from './telemetry-config.js';
+78
View File
@@ -0,0 +1,78 @@
// Copyright (C) 2025 Keygraph, Inc.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License version 3
// as published by the Free Software Foundation.
/**
* Installation ID - Persistent anonymous identifier for telemetry.
*
* Generates a UUID and persists it to ~/.shannon/telemetry-id
* On subsequent runs, reads the existing ID from the file.
* Handles errors gracefully by returning a random UUID.
*/
import { randomUUID } from 'crypto';
import { readFile, writeFile, mkdir } from 'fs/promises';
import { join } from 'path';
import { homedir } from 'os';
const SHANNON_DIR = '.shannon';
const TELEMETRY_ID_FILE = 'telemetry-id';
/**
 * Resolve the location of the persisted installation ID file.
 *
 * @returns `<home>/.shannon/telemetry-id`, where `<home>` is `os.homedir()`.
 */
function getTelemetryIdPath(): string {
  const segments = [homedir(), SHANNON_DIR, TELEMETRY_ID_FILE];
  return join(...segments);
}
/**
 * Resolve the Shannon per-user directory that holds the telemetry ID file.
 *
 * @returns `<home>/.shannon`, where `<home>` is `os.homedir()`.
 */
function getShannonDir(): string {
  const home = homedir();
  return join(home, SHANNON_DIR);
}
/**
 * Get or create the persistent anonymous installation ID.
 *
 * - If ~/.shannon/telemetry-id holds a well-formed UUID, that value is returned.
 * - If the file is missing, unreadable, or its content is not a UUID, a new
 *   UUID is generated, written to the file (best effort), and returned.
 * - If persisting fails, the freshly generated UUID is still returned; it
 *   simply will not survive to the next run.
 *
 * Never rejects: all filesystem errors are swallowed on purpose so telemetry
 * can never break the caller.
 *
 * @returns The installation ID in canonical UUID format.
 */
export async function getInstallationId(): Promise<string> {
  // Canonical 8-4-4-4-12 hex layout - the format randomUUID() produces.
  // A plain length check would accept arbitrary (e.g. corrupted) file content
  // and send it to the analytics backend as the distinct ID.
  const uuidPattern =
    /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
  const filePath = getTelemetryIdPath();

  try {
    const storedId = (await readFile(filePath, 'utf-8')).trim();
    if (uuidPattern.test(storedId)) {
      return storedId;
    }
    // Malformed content: fall through and overwrite it with a fresh ID.
  } catch {
    // File doesn't exist or can't be read - will create a new ID.
  }

  const newId = randomUUID();

  try {
    // Ensure ~/.shannon exists before writing into it.
    await mkdir(getShannonDir(), { recursive: true });
    await writeFile(filePath, newId, 'utf-8');
  } catch {
    // Failed to persist - return the ID anyway (won't be persistent).
  }

  return newId;
}
+68
View File
@@ -0,0 +1,68 @@
// Copyright (C) 2025 Keygraph, Inc.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License version 3
// as published by the Free Software Foundation.
/**
* Telemetry configuration with opt-out support.
*
* Telemetry is enabled by default. Users can disable via:
* - DO_NOT_TRACK=1|true (standard convention: https://consoledonottrack.com/)
* - SHANNON_TELEMETRY=off|false|0
*/
/**
* Resolved telemetry settings, as produced by loadTelemetryConfig().
*/
export interface TelemetryConfig {
// Whether events may be sent; false when an opt-out environment variable is set.
enabled: boolean;
// PostHog project API key handed to the PostHog client constructor.
apiKey: string;
// PostHog host URL handed to the PostHog client as its `host` option.
host: string;
}
// PostHog project configuration
// This is a write-only key - safe to publish, users cannot read analytics
const POSTHOG_API_KEY = 'phc_9EF2G6mm83rfLef5WmVLiNSyGQ4x0p8NzTRKiEAgvD4';
const POSTHOG_HOST = 'https://us.i.posthog.com';
/**
 * Decide from the environment whether telemetry may run.
 *
 * Telemetry is on unless one of the opt-out switches is set:
 * - `DO_NOT_TRACK` equal to `1`, or to `true` in any letter case
 * - `SHANNON_TELEMETRY` equal to `off`, `false` or `0` in any letter case
 *
 * @returns `false` when the user opted out, `true` otherwise.
 */
function isTelemetryEnabled(): boolean {
  const { DO_NOT_TRACK: dnt, SHANNON_TELEMETRY: shannonFlag } = process.env;

  // Standard cross-tool opt-out takes precedence.
  const dntRequested = dnt === '1' || dnt?.toLowerCase() === 'true';
  if (dntRequested) {
    return false;
  }

  // Shannon-specific opt-out.
  const optOutValues = ['off', 'false', '0'];
  const normalized = shannonFlag?.toLowerCase();
  const shannonOptOut =
    normalized !== undefined && optOutValues.includes(normalized);

  return !shannonOptOut;
}
/**
 * Build the telemetry configuration from the environment.
 *
 * Never throws: if evaluating the opt-out switches fails for any reason the
 * returned config is disabled. The API key and host are always the built-in
 * PostHog project values.
 *
 * @returns A fresh config object on every call.
 */
export function loadTelemetryConfig(): TelemetryConfig {
  // Start from "disabled" so a failure below leaves telemetry off.
  let enabled = false;
  try {
    enabled = isTelemetryEnabled();
  } catch {
    // Config loading should never fail - stay disabled.
  }

  return {
    enabled,
    apiKey: POSTHOG_API_KEY,
    host: POSTHOG_HOST,
  };
}
+60
View File
@@ -0,0 +1,60 @@
// Copyright (C) 2025 Keygraph, Inc.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License version 3
// as published by the Free Software Foundation.
/**
* Telemetry event definitions for Shannon.
*
* All PostHog event names are defined here for consistency and type safety.
* These events are anonymous - no PII or sensitive data is ever sent.
*/
/**
* Telemetry event names.
* Using an enum ensures consistency across the codebase.
* The string value of each member is the event name handed to the PostHog
* client when the event is captured.
*/
export enum TelemetryEvent {
// Workflow lifecycle (emitted from client.ts)
WORKFLOW_START = 'workflow_start',
// Agent lifecycle (emitted from activities.ts)
AGENT_START = 'agent_start',
AGENT_COMPLETE = 'agent_complete',
// Emitted when an agent attempt fails with a non-retryable error
AGENT_FAILED = 'agent_failed',
// Emitted when an agent attempt fails with a retryable error
AGENT_RETRY = 'agent_retry',
// Pipeline completion (emitted from report agent in activities.ts)
WORKFLOW_COMPLETE = 'workflow_complete',
WORKFLOW_FAILED = 'workflow_failed',
}
/**
* Base properties included with every telemetry event.
* The telemetry manager populates these from the running process.
*/
export interface BaseTelemetryProperties {
// Value of process.platform
os_platform: string;
// Value of process.version
node_version: string;
}
/**
* Properties for agent-level events
* (agent_start, agent_complete, agent_failed, agent_retry).
*/
export interface AgentEventProperties {
// Name of the agent the activity is running
agent_name: string;
// Attempt counter taken from the Temporal activity context
attempt_number: number;
// Elapsed time since the activity started; not sent on agent_start
duration_ms?: number;
// Cost reported by the agent run, when available
cost_usd?: number;
error_type?: string; // Only error classification, never the actual message
}
/**
* Properties for workflow-level events
* (workflow_start, workflow_complete, workflow_failed).
*/
export interface WorkflowEventProperties {
// Whether a config file path was supplied; sent with workflow_start
has_config?: boolean;
// Elapsed time since the workflow start time, sent on completion/failure
total_duration_ms?: number;
// Cost reported in the report agent's metrics, sent on completion
total_cost_usd?: number;
error_type?: string; // Only error classification, never the actual message
}
+246
View File
@@ -0,0 +1,246 @@
// Copyright (C) 2025 Keygraph, Inc.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License version 3
// as published by the Free Software Foundation.
/**
* Telemetry Manager - PostHog integration with safety guarantees.
*
* CRITICAL: All public methods are wrapped in try-catch to ensure
* telemetry NEVER interferes with workflow execution. Failures are
* silently swallowed - telemetry is optional, not critical.
*
* Features:
* - Safe initialization (never throws)
* - Auto-redaction of sensitive data before sending
* - Fire-and-forget tracking (non-blocking)
* - Graceful shutdown with timeout (never blocks)
*/
import { PostHog } from 'posthog-node';
import crypto from 'crypto';
import { loadTelemetryConfig, type TelemetryConfig } from './telemetry-config.js';
import { TelemetryEvent, type BaseTelemetryProperties } from './telemetry-events.js';
// Shutdown timeout - don't block workflow completion
const SHUTDOWN_TIMEOUT_MS = 2000;
// Sensitive key fragments to redact from properties.
// Matching is case-insensitive and by substring, so e.g. 'key' also removes
// 'apiKey' and 'path' also removes 'repoPath'.
const SENSITIVE_KEYS = [
  'weburl',
  'repopath',
  'configpath',
  'outputpath',
  'targeturl',
  'url',
  'path',
  'error',
  'message',
  'stack',
  'findings',
  'vulnerabilities',
  'credentials',
  'password',
  'secret',
  'token',
  'apikey',
  'key',
];

// Keys that are safe to send even though they contain a sensitive fragment.
// `error_type` carries only an error classification (never the message), but
// it contains "error" and would otherwise be stripped from every failure
// event by the substring match above.
const SAFE_KEY_ALLOWLIST: ReadonlySet<string> = new Set(['error_type']);

// Internal marker meaning "this value must be dropped".
const REDACTED = Symbol('redacted');

/**
 * Generate anonymous distinct ID as a UUID.
 */
function generateDistinctId(): string {
  return crypto.randomUUID();
}

/**
 * Hash a URL's hostname using SHA-256.
 *
 * Only the hostname is hashed; scheme, port, path and query are ignored, so
 * every URL on the same host maps to the same value.
 *
 * NOTE(review): the hash is unsalted, so a guessable hostname can be confirmed
 * by hashing candidates - treat the result as pseudonymous, not anonymous.
 *
 * @param url - Absolute URL of the target.
 * @returns Hex-encoded SHA-256 of the hostname, or undefined if `url` cannot
 *          be parsed.
 */
export function hashTargetUrl(url: string): string | undefined {
  try {
    const hostname = new URL(url).hostname;
    return crypto.createHash('sha256').update(hostname).digest('hex');
  } catch {
    return undefined;
  }
}

/**
 * Check if a key name contains sensitive information.
 *
 * @param key - Property name as supplied by the caller (any letter case).
 * @returns true when the lower-cased key contains any SENSITIVE_KEYS fragment
 *          and is not explicitly allowlisted.
 */
function isSensitiveKey(key: string): boolean {
  const keyLower = key.toLowerCase();
  if (SAFE_KEY_ALLOWLIST.has(keyLower)) {
    return false;
  }
  return SENSITIVE_KEYS.some((sensitive) => keyLower.includes(sensitive));
}

/**
 * Heuristic for string values that look like filesystem paths or URLs.
 */
function looksLikePathOrUrl(value: string): boolean {
  return (
    value.startsWith('/') ||
    value.startsWith('http') ||
    value.includes('://')
  );
}

/**
 * Redact a single value; returns REDACTED when the value must be dropped.
 */
function redactValue(value: unknown): unknown {
  if (typeof value === 'string') {
    return looksLikePathOrUrl(value) ? REDACTED : value;
  }
  if (Array.isArray(value)) {
    // Arrays used to pass through untouched, leaking any paths, URLs or
    // sensitive keys nested inside them. Redact each element instead.
    return value.map(redactValue).filter((item) => item !== REDACTED);
  }
  if (value && typeof value === 'object') {
    return redactSensitiveData(value as Record<string, unknown>);
  }
  return value;
}

/**
 * Redact sensitive values from properties object.
 *
 * - Entries whose key is sensitive (see isSensitiveKey) are removed.
 * - String values that look like paths or URLs are removed.
 * - Nested objects and arrays are redacted recursively.
 * - Everything else (numbers, booleans, null, undefined) is kept as is.
 *
 * @param properties - Caller-supplied event properties; not mutated.
 * @returns A new object containing only the values considered safe to send.
 */
function redactSensitiveData(
  properties: Record<string, unknown>
): Record<string, unknown> {
  const redacted: Record<string, unknown> = {};

  for (const [key, value] of Object.entries(properties)) {
    // Skip sensitive keys entirely
    if (isSensitiveKey(key)) {
      continue;
    }

    const safeValue = redactValue(value);
    if (safeValue !== REDACTED) {
      redacted[key] = safeValue;
    }
  }

  return redacted;
}
/**
 * Wrapper around the PostHog client that keeps telemetry strictly optional:
 * initialization, tracking and shutdown swallow every error so a telemetry
 * problem can never fail the surrounding workflow.
 *
 * NOTE(review): the distinct ID is instance-wide state and the module exports
 * a single shared instance. Activities call setDistinctId() before tracking,
 * so if one worker process runs activities for different installations
 * concurrently, events may be attributed to the most recently set ID -
 * confirm whether that can happen in practice.
 */
class TelemetryManager {
  private client: PostHog | null = null;
  private readonly config: TelemetryConfig;
  private distinctId: string;
  private initialized = false;
  private pipelineTestingMode = false;

  constructor() {
    this.config = loadTelemetryConfig();
    // Random per-process ID, used until setDistinctId() supplies another one.
    this.distinctId = generateDistinctId();
  }

  /**
   * Set the distinct ID for all subsequent events.
   * The client and the activities pass the persistent installation ID here so
   * events from both processes share one ID.
   *
   * @param id - Identifier to attach to every event tracked from now on.
   */
  setDistinctId(id: string): void {
    this.distinctId = id;
  }

  /**
   * Initialize PostHog client.
   * Safe: never throws; on failure telemetry is silently disabled.
   * Only the first call has an effect - later calls return immediately, so
   * `pipelineTestingMode` cannot be changed afterwards.
   *
   * @param pipelineTestingMode - Whether running in testing mode
   */
  initialize(pipelineTestingMode = false): void {
    try {
      if (this.initialized) {
        return;
      }

      this.pipelineTestingMode = pipelineTestingMode;
      this.initialized = true;

      if (!this.config.enabled) {
        return;
      }

      // Don't initialize if API key isn't configured
      if (this.config.apiKey.includes('REPLACE_WITH')) {
        this.config.enabled = false;
        return;
      }

      this.client = new PostHog(this.config.apiKey, {
        host: this.config.host,
        disableGeoip: true,
        flushAt: 10,
        flushInterval: 5000,
      });
    } catch {
      // Initialization failure is silent - telemetry is optional
      this.initialized = true;
      this.config.enabled = false;
    }
  }

  /**
   * Track an event with properties.
   * Safe: never throws, silently fails on error. Does nothing when telemetry
   * is disabled or initialize() has not created a client.
   *
   * Caller properties are redacted and then merged over the base properties,
   * so a caller-supplied key with the same name overrides the base value.
   *
   * NOTE(review): callers pass `workflow_id`, which the client builds from the
   * sanitized target hostname; it passes redaction unchanged - confirm this
   * is intended given the target is otherwise only sent as a hash.
   *
   * @param event - Event name from TelemetryEvent enum
   * @param properties - Event properties (sensitive data auto-redacted)
   */
  track(event: TelemetryEvent, properties: Record<string, unknown> = {}): void {
    try {
      if (!this.config.enabled || !this.client) {
        return;
      }

      // Build base properties
      const baseProps: BaseTelemetryProperties & Record<string, unknown> = {
        pipeline_testing_mode: this.pipelineTestingMode,
        os_platform: process.platform,
        node_version: process.version,
        $lib: 'shannon',
      };

      // Redact sensitive data and merge with base props
      const safeProps = {
        ...baseProps,
        ...redactSensitiveData(properties),
      };

      // Fire and forget - don't await
      this.client.capture({
        distinctId: this.distinctId,
        event,
        properties: safeProps,
      });
    } catch {
      // Tracking failure is silent - never interfere with workflow
    }
  }

  /**
   * Shutdown PostHog client gracefully.
   * Safe: never throws, uses timeout to prevent blocking. The client is
   * dropped afterwards, so later track() calls are no-ops.
   *
   * @returns Promise that resolves when shutdown completes (or times out)
   */
  async shutdown(): Promise<void> {
    let timeoutHandle: ReturnType<typeof setTimeout> | undefined;

    try {
      if (!this.client) {
        return;
      }

      const timeout = new Promise<void>((resolve) => {
        timeoutHandle = setTimeout(resolve, SHUTDOWN_TIMEOUT_MS);
      });

      // Race shutdown against timeout to never block workflow
      await Promise.race([this.client.shutdown(), timeout]);
    } catch {
      // Shutdown failure is silent
    } finally {
      // Clear the timer: a pending timeout would otherwise keep the event
      // loop alive for the rest of SHUTDOWN_TIMEOUT_MS after a fast shutdown.
      if (timeoutHandle !== undefined) {
        clearTimeout(timeoutHandle);
      }
      this.client = null;
    }
  }

  /**
   * Check if telemetry is enabled.
   *
   * @returns true only when the config allows telemetry and a client exists.
   */
  isEnabled(): boolean {
    return this.config.enabled && this.client !== null;
  }
}
// Singleton instance - import this in other modules.
// Constructed at module load: the constructor reads the telemetry config and
// generates a random distinct ID; no PostHog client exists until initialize().
export const telemetry = new TelemetryManager();
+90 -1
View File
@@ -70,6 +70,7 @@ import {
import { assembleFinalReport } from '../phases/reporting.js';
import { getPromptNameForAgent } from '../types/agents.js';
import { AuditSession } from '../audit/index.js';
import { telemetry, TelemetryEvent, hashTargetUrl } from '../telemetry/index.js';
import type { AgentName } from '../types/agents.js';
import type { AgentMetrics } from './shared.js';
import type { DistributedConfig } from '../types/config.js';
@@ -88,6 +89,14 @@ export interface ActivityInput {
outputPath?: string;
pipelineTestingMode?: boolean;
workflowId: string;
workflowStartTime?: number; // Epoch ms, used for total workflow duration in telemetry
installationId?: string; // Persistent anonymous ID for counting unique installations
// Workflow stats for telemetry (only passed to report agent)
workflowStats?: {
totalAgents: number;
agentsSucceeded: number;
agentsFailed: number;
};
}
/**
@@ -115,6 +124,7 @@ async function runAgentActivity(
outputPath,
pipelineTestingMode = false,
workflowId,
installationId,
} = input;
const startTime = Date.now();
@@ -122,6 +132,18 @@ async function runAgentActivity(
// Get attempt number from Temporal context (tracks retries automatically)
const attemptNumber = Context.current().info.attempt;
// Set installationId as distinct ID for unique user tracking
if (installationId) {
telemetry.setDistinctId(installationId);
}
// Track agent start
telemetry.track(TelemetryEvent.AGENT_START, {
agent_name: agentName,
attempt_number: attemptNumber,
workflow_id: workflowId,
});
// Heartbeat loop - signals worker is alive to Temporal server
const heartbeatInterval = setInterval(() => {
const elapsed = Math.floor((Date.now() - startTime) / 1000);
@@ -226,6 +248,15 @@ async function runAgentActivity(
});
await commitGitSuccess(repoPath, agentName);
// Track agent completion
telemetry.track(TelemetryEvent.AGENT_COMPLETE, {
agent_name: agentName,
attempt_number: attemptNumber,
duration_ms: Date.now() - startTime,
cost_usd: result.cost ?? undefined,
workflow_id: workflowId,
});
// 10. Return metrics
return {
durationMs: Date.now() - startTime,
@@ -246,6 +277,17 @@ async function runAgentActivity(
// If error is already an ApplicationFailure (e.g., from our retry limit logic),
// re-throw it directly without re-classifying
if (error instanceof ApplicationFailure) {
// Track retry or failure based on retryability
telemetry.track(
error.nonRetryable ? TelemetryEvent.AGENT_FAILED : TelemetryEvent.AGENT_RETRY,
{
agent_name: agentName,
attempt_number: attemptNumber,
duration_ms: Date.now() - startTime,
error_type: error.type || 'UnknownError',
workflow_id: workflowId,
}
);
throw error;
}
@@ -255,6 +297,18 @@ async function runAgentActivity(
const rawMessage = error instanceof Error ? error.message : String(error);
const message = truncateErrorMessage(rawMessage);
// Track retry or failure based on classification
telemetry.track(
classified.retryable ? TelemetryEvent.AGENT_RETRY : TelemetryEvent.AGENT_FAILED,
{
agent_name: agentName,
attempt_number: attemptNumber,
duration_ms: Date.now() - startTime,
error_type: classified.type,
workflow_id: workflowId,
}
);
if (classified.retryable) {
// Temporal will retry with configured backoff
const failure = ApplicationFailure.create({
@@ -329,7 +383,42 @@ export async function runAuthzExploitAgent(input: ActivityInput): Promise<AgentM
}
export async function runReportAgent(input: ActivityInput): Promise<AgentMetrics> {
return runAgentActivity('report', input);
// Use workflow start time for total duration if available, otherwise fall back to now
const workflowStartTime = input.workflowStartTime ?? Date.now();
const stats = input.workflowStats;
const targetHash = hashTargetUrl(input.webUrl);
const workflowId = input.workflowId;
try {
const metrics = await runAgentActivity('report', input);
// Report agent success = workflow complete
telemetry.track(TelemetryEvent.WORKFLOW_COMPLETE, {
total_duration_ms: Date.now() - workflowStartTime,
total_cost_usd: metrics.costUsd ?? undefined,
total_agents: stats?.totalAgents,
agents_succeeded: stats?.agentsSucceeded,
agents_failed: stats?.agentsFailed,
target_hash: targetHash,
workflow_id: workflowId,
});
return metrics;
} catch (error) {
// Report agent failure = workflow failed
const errorType =
error instanceof ApplicationFailure
? error.type || 'UnknownError'
: classifyErrorForTemporal(error).type;
telemetry.track(TelemetryEvent.WORKFLOW_FAILED, {
total_duration_ms: Date.now() - workflowStartTime,
error_type: errorType,
last_agent: 'report',
total_agents: stats?.totalAgents,
agents_succeeded: stats?.agentsSucceeded,
agents_failed: stats?.agentsFailed,
target_hash: targetHash,
workflow_id: workflowId,
});
throw error;
}
}
/**
+18
View File
@@ -31,6 +31,7 @@ import dotenv from 'dotenv';
import chalk from 'chalk';
import { displaySplashScreen } from '../splash-screen.js';
import { sanitizeHostname } from '../audit/utils.js';
import { telemetry, TelemetryEvent, hashTargetUrl, getInstallationId } from '../telemetry/index.js';
// Import types only - these don't pull in workflow runtime code
import type { PipelineInput, PipelineState, PipelineProgress } from './shared.js';
@@ -130,12 +131,20 @@ async function startPipeline(): Promise<void> {
const hostname = sanitizeHostname(webUrl);
const workflowId = customWorkflowId || `${hostname}_shannon-${Date.now()}`;
// Get persistent installation ID for unique installation counting
const installationId = await getInstallationId();
// Initialize telemetry with installation ID as distinct ID (for unique user tracking)
telemetry.initialize(pipelineTestingMode);
telemetry.setDistinctId(installationId);
const input: PipelineInput = {
webUrl,
repoPath,
...(configPath && { configPath }),
...(outputPath && { outputPath }),
...(pipelineTestingMode && { pipelineTestingMode }),
installationId,
};
console.log(chalk.green.bold(`✓ Workflow started: ${workflowId}`));
@@ -160,6 +169,14 @@ async function startPipeline(): Promise<void> {
}
);
// Track workflow start
telemetry.track(TelemetryEvent.WORKFLOW_START, {
has_config: !!configPath,
pipeline_testing_mode: pipelineTestingMode,
target_hash: hashTargetUrl(webUrl),
workflow_id: workflowId,
});
if (!waitForCompletion) {
console.log(chalk.bold('Monitor progress:'));
console.log(chalk.white(' Web UI: ') + chalk.blue(`http://localhost:8233/namespaces/default/workflows/${workflowId}`));
@@ -202,6 +219,7 @@ async function startPipeline(): Promise<void> {
process.exit(1);
}
} finally {
await telemetry.shutdown();
await connection.close();
}
}
+1
View File
@@ -9,6 +9,7 @@ export interface PipelineInput {
outputPath?: string;
pipelineTestingMode?: boolean;
workflowId?: string; // Added by client, used for audit correlation
installationId?: string; // Persistent anonymous ID for counting unique installations
}
export interface AgentMetrics {
+6
View File
@@ -26,6 +26,7 @@ import path from 'node:path';
import dotenv from 'dotenv';
import chalk from 'chalk';
import * as activities from './activities.js';
import { telemetry } from '../telemetry/index.js';
dotenv.config();
@@ -37,6 +38,10 @@ async function runWorker(): Promise<void> {
const connection = await NativeConnection.connect({ address });
// Initialize telemetry for activity execution
// Worker doesn't know pipelineTestingMode until activity runs, so default to false
telemetry.initialize();
// Bundle workflows for Temporal's V8 isolate
console.log(chalk.gray('Bundling workflows...'));
const workflowBundle = await bundleWorkflowCode({
@@ -68,6 +73,7 @@ async function runWorker(): Promise<void> {
try {
await worker.run();
} finally {
await telemetry.shutdown();
await connection.close();
console.log(chalk.gray('Worker stopped'));
}
+14 -1
View File
@@ -136,6 +136,9 @@ export async function pentestPipelineWorkflow(
...(input.pipelineTestingMode !== undefined && {
pipelineTestingMode: input.pipelineTestingMode,
}),
...(input.installationId !== undefined && {
installationId: input.installationId,
}),
};
try {
@@ -267,7 +270,17 @@ export async function pentestPipelineWorkflow(
await a.assembleReportActivity(activityInput);
// Then run the report agent to add executive summary and clean up
state.agentMetrics['report'] = await a.runReportAgent(activityInput);
// Pass workflow start time and stats for accurate telemetry
const reportInput = {
...activityInput,
workflowStartTime: state.startTime,
workflowStats: {
totalAgents: 13, // pre-recon, recon, 5 vuln, 5 exploit, report
agentsSucceeded: state.completedAgents.length,
agentsFailed: failedPipelines.length,
},
};
state.agentMetrics['report'] = await a.runReportAgent(reportInput);
state.completedAgents.push('report');
// === Complete ===