/** * redact-patterns — the canonical redaction taxonomy. * * Single source of truth shared by `lib/redact-engine.ts`, `bin/gstack-redact`, * `bin/gstack-redact-prepush`, and (via `scripts/resolvers/redact-doc.ts`) the * generated SKILL.md docs for /spec, /ship, /cso, /document-release, and * /document-generate. * * Design notes (locked in /plan-eng-review + two Codex passes): * * - Three tiers. HIGH = genuinely-secret credentials (block). MEDIUM = PII, * legal/damaging, internal-leak, plus credential-shaped patterns that have * high false-positive rates (confirm via AskUserQuestion). LOW = surface only. * - NO wholesale MEDIUM->HIGH promotion on public repos (TENSION-2-followup). * Public repos get sterner per-finding confirmation, not auto-block. The * engine never mutates a finding's tier based on visibility. * - Tier-1 calibration: a gate that cries wolf gets ignored. Stripe * publishable keys, Google AIza keys, JWTs, and env-style KV are MEDIUM, not * HIGH (they are context-variable / high-FP). Only genuinely-secret * credentials block. * - ReDoS safety: every pattern here MUST be linear-time (no nested unbounded * quantifiers). `test/redact-pattern-lint.test.ts` fails CI on a catastrophic * form. The engine also enforces a hard input-size cap that fails CLOSED. * - Placeholder suppression is per-matched-span, not per-line. * * Pattern matching contract: every `regex` is used with the global+multiline * flags the engine applies (`g`, `m`). Capture group 1, when present, is the * "secret span" the engine masks and (for proximity rules) anchors on; when * absent, match[0] is the span. */ export type Tier = "HIGH" | "MEDIUM" | "LOW"; export type Category = | "secret" | "pii" | "legal" | "internal" | "hygiene"; export interface RedactPattern { /** Stable dotted id, e.g. "aws.access_key". Used in findings + tests. */ id: string; tier: Tier; category: Category; /** Human-readable one-liner for the findings table + docs. */ description: string; /** * The detection regex. Linter-enforced linear-time. The engine adds the * `gm` flags; do not bake `g`/`m` into the source here (keeps `.source` * clean for the docs table and avoids double-global bugs). */ regex: RegExp; /** * Patterns whose redaction is unambiguous enough to offer one-keystroke * auto-redact at MEDIUM tier (email / phone / ssn / cc). The engine wires * the `` replacement token from `redactToken`. */ autoRedactable?: boolean; /** Replacement token for auto-redact, e.g. "". */ redactToken?: string; /** * Extra validators run AFTER the regex matches, ALL must pass for the match * to count. Used for Luhn (credit cards), entropy (env-KV), checksum * (crypto wallets), RFC1918-exclusion (public IPs), etc. Receives the * matched secret span (group 1 or match[0]) and the full match array. */ validate?: (span: string, match: RegExpExecArray) => boolean; /** * Proximity requirement: the pattern only counts if `nearRegex` also matches * within `nearWindow` chars of the match. Used for AWS secret keys (need * `aws_secret_access_key` nearby) and Twilio auth tokens (need an SID nearby). */ nearRegex?: RegExp; nearWindow?: number; } // ── Validators ────────────────────────────────────────────────────────────── /** Luhn checksum — credit-card validity. Strips spaces/dashes first. */ export function luhnValid(span: string): boolean { const digits = span.replace(/[ \-]/g, ""); if (!/^\d{13,19}$/.test(digits)) return false; let sum = 0; let alt = false; for (let i = digits.length - 1; i >= 0; i--) { let d = digits.charCodeAt(i) - 48; if (alt) { d *= 2; if (d > 9) d -= 9; } sum += d; alt = !alt; } return sum % 10 === 0; } /** Shannon entropy in bits/char. Used to gate env-style KV (skip placeholders). */ export function shannonEntropy(s: string): number { if (!s.length) return 0; const freq: Record = {}; for (const ch of s) freq[ch] = (freq[ch] || 0) + 1; let h = 0; for (const ch in freq) { const p = freq[ch] / s.length; h -= p * Math.log2(p); } return h; } /** True when an IPv4 string is a public address (not RFC1918/loopback/etc). */ export function isPublicIPv4(ip: string): boolean { const m = ip.match(/^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/); if (!m) return false; const o = m.slice(1, 5).map(Number); if (o.some((n) => n > 255)) return false; const [a, b] = o; if (a === 10) return false; // 10.0.0.0/8 if (a === 127) return false; // loopback if (a === 0) return false; // this-network if (a === 192 && b === 168) return false; // 192.168.0.0/16 if (a === 169 && b === 254) return false; // link-local if (a === 172 && b >= 16 && b <= 31) return false; // 172.16.0.0/12 if (a === 100 && b >= 64 && b <= 127) return false; // CGNAT 100.64.0.0/10 if (a >= 224) return false; // multicast / reserved return true; } // EIP-55 checksum is out of scope (heavy); we require a length+charset match and // reject all-same-char vanity strings to cut the worst FPs. function looksLikeWallet(span: string): boolean { if (/^0x[a-fA-F0-9]{40}$/.test(span)) { // reject 0x000...0 / 0xfff...f style const body = span.slice(2).toLowerCase(); return !/^(.)\1{39}$/.test(body); } // bech32 / base58 — length sanity only return span.length >= 26 && span.length <= 62; } // ── Placeholder suppression (per-matched-span, NOT per-line) ───────────────── /** * A finding is suppressed only if the MATCHED SPAN itself is a placeholder * form — not merely co-located on a line with the word EXAMPLE. This is the * tightened rule from the Codex review (line-based suppression was dangerous). */ // Structural placeholder forms — apply to ANY span (including URLs). const PLACEHOLDER_STRUCTURAL = [ /^your[_-]/i, /^<[^>]*>$/, // , /^\*+$/, // all-asterisks mask /^x{6,}$/i, // xxxxxx mask ]; // Substring placeholder words (example/test/dummy/...). These are NOT applied to // compound spans containing `://` or `@`, because a legit URL/host can contain // "example" (e.g. db.example.com) without being a placeholder secret. AWS docs // keys like AKIAIOSFODNN7EXAMPLE are bare tokens, so the guard still catches them. const PLACEHOLDER_SUBSTRING = [ /example/i, // AKIAIOSFODNN7EXAMPLE etc — AWS docs convention /^changeme$/i, /^redacted/i, /^placeholder/i, /^dummy/i, /^fake/i, /test[_-]?(key|token|secret)/i, ]; export function isPlaceholderSpan(span: string): boolean { if (PLACEHOLDER_STRUCTURAL.some((re) => re.test(span))) return true; const isCompound = span.includes("://") || span.includes("@"); if (!isCompound && PLACEHOLDER_SUBSTRING.some((re) => re.test(span))) return true; return false; } // ── The taxonomy ───────────────────────────────────────────────────────────── export const PATTERNS: RedactPattern[] = [ // ===== HIGH — genuinely-secret credentials (block) ===== { id: "aws.access_key", tier: "HIGH", category: "secret", description: "AWS access key ID (AKIA…)", regex: /\b(AKIA[0-9A-Z]{16})\b/, }, { id: "aws.secret_key", tier: "HIGH", category: "secret", description: "AWS secret access key (with aws_secret_access_key nearby)", regex: /\b([A-Za-z0-9/+=]{40})\b/, nearRegex: /aws.{0,3}secret.{0,3}access.{0,3}key/i, nearWindow: 100, }, { id: "github.pat", tier: "HIGH", category: "secret", description: "GitHub personal access token (classic)", regex: /\b(ghp_[A-Za-z0-9]{36})\b/, }, { id: "github.oauth", tier: "HIGH", category: "secret", description: "GitHub OAuth token", regex: /\b(gho_[A-Za-z0-9]{36})\b/, }, { id: "github.server", tier: "HIGH", category: "secret", description: "GitHub server-to-server token", regex: /\b(ghs_[A-Za-z0-9]{36})\b/, }, { id: "github.fine_grained", tier: "HIGH", category: "secret", description: "GitHub fine-grained PAT", regex: /\b(github_pat_[A-Za-z0-9_]{82})\b/, }, { id: "anthropic.key", tier: "HIGH", category: "secret", description: "Anthropic API key", regex: /\b(sk-ant-[A-Za-z0-9_\-]{20,})\b/, }, { id: "openai.key", tier: "HIGH", category: "secret", description: "OpenAI API key (incl. sk-proj-)", regex: /\b(sk-(?:proj-)?[A-Za-z0-9]{32,})\b/, }, { id: "sendgrid.key", tier: "HIGH", category: "secret", description: "SendGrid API key", regex: /\b(SG\.[A-Za-z0-9_\-]{22}\.[A-Za-z0-9_\-]{43})\b/, }, { id: "stripe.secret", tier: "HIGH", category: "secret", description: "Stripe live SECRET key", regex: /\b(sk_live_[A-Za-z0-9]{24,})\b/, }, { id: "slack.token", tier: "HIGH", category: "secret", description: "Slack token (bot/user/app)", regex: /\b(xox[baprs]-[A-Za-z0-9-]{10,})\b/, }, { id: "slack.webhook", tier: "HIGH", category: "secret", description: "Slack incoming webhook URL", regex: /(https:\/\/hooks\.slack\.com\/services\/T[A-Z0-9]+\/B[A-Z0-9]+\/[A-Za-z0-9]{24})/, }, { id: "discord.webhook", tier: "HIGH", category: "secret", description: "Discord webhook URL", regex: /(https:\/\/(?:canary\.|ptb\.)?discord(?:app)?\.com\/api\/webhooks\/[0-9]{17,20}\/[A-Za-z0-9_\-]{60,})/, }, { id: "twilio.auth_token", tier: "HIGH", category: "secret", description: "Twilio auth token (32 hex, with an Account SID nearby)", regex: /\b([a-f0-9]{32})\b/, nearRegex: /\bAC[a-f0-9]{32}\b/, nearWindow: 200, }, { id: "pem.private_key", tier: "HIGH", category: "secret", description: "PEM private key block", regex: /(-----BEGIN (?:RSA |EC |DSA |OPENSSH |PGP |ENCRYPTED )?PRIVATE KEY-----)/, }, { id: "db.url_with_password", tier: "HIGH", category: "secret", description: "Database URL with embedded password", regex: /\b((?:postgres(?:ql)?|mysql|mongodb(?:\+srv)?|redis|amqp):\/\/[^:\s/@]+:[^@\s/]+@[^\s/]+)/, // Skip when the password segment is itself a placeholder. validate: (span) => { const m = span.match(/:\/\/[^:]+:([^@]+)@/); const pw = m?.[1] ?? ""; return !isPlaceholderSpan(pw) && pw !== "" && !/^\$\{?[A-Z_]+\}?$/.test(pw); }, }, { id: "creds.basic_auth_url", tier: "HIGH", category: "secret", description: "HTTP(S) URL with embedded basic-auth credentials", regex: /(https?:\/\/[^:\s/@]+:[^@\s/]+@[^\s/]+)/, validate: (span) => { const m = span.match(/:\/\/[^:]+:([^@]+)@/); const pw = m?.[1] ?? ""; return !isPlaceholderSpan(pw) && pw !== "" && !/^\$\{?[A-Z_]+\}?$/.test(pw); }, }, // ===== MEDIUM — demoted credential-shaped (high-FP / context-variable) ===== { id: "stripe.publishable", tier: "MEDIUM", category: "secret", description: "Stripe live publishable key (often intentionally public)", regex: /\b(pk_live_[A-Za-z0-9]{24,})\b/, }, { id: "google.api_key", tier: "MEDIUM", category: "secret", description: "Google API key (AIza…; sometimes a public client key)", regex: /\b(AIza[0-9A-Za-z\-_]{35})\b/, }, { id: "jwt", tier: "MEDIUM", category: "secret", description: "JSON Web Token (3-segment base64url)", regex: /\b(eyJ[A-Za-z0-9_\-]{8,}\.eyJ[A-Za-z0-9_\-]{8,}\.[A-Za-z0-9_\-]{8,})\b/, }, { id: "env.kv", tier: "MEDIUM", category: "secret", description: "Env-style SECRET assignment with high-entropy value", regex: /^[ \t]*(?:export[ \t]+)?[A-Z][A-Z0-9_]*(?:KEY|TOKEN|SECRET|PASSWORD|PASSWD|CREDENTIALS?|DSN|AUTH|COOKIE|SESSION|PRIVATE)[ \t]*=[ \t]*['"]?([^\s'"]{8,})['"]?/, // Only fire on high-entropy values — kills `FOO_KEY=changeme` FPs. validate: (span) => !isPlaceholderSpan(span) && !/^\$\{?[A-Za-z_]/.test(span) && shannonEntropy(span) >= 3.0, }, // ===== MEDIUM — PII (auto-redactable subset) ===== { id: "pii.email", tier: "MEDIUM", category: "pii", description: "Email address", regex: /\b([A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,})\b/, autoRedactable: true, redactToken: "", // Engine layers the email allowlist (example.com, noreply@, user's own, // repo-public authors) on top of this — see redact-engine.ts. }, { id: "pii.phone.e164", tier: "MEDIUM", category: "pii", description: "Phone number (E.164 / common national formats; US/EU-biased)", regex: /(?", validate: (span) => span.replace(/\D/g, "").length >= 10, }, { id: "pii.ssn", tier: "MEDIUM", category: "pii", description: "US Social Security Number", regex: /\b(\d{3}-\d{2}-\d{4})\b/, autoRedactable: true, redactToken: "", // Reject the all-zero-octet placeholders SSNs never use. validate: (span) => { const [a, b, c] = span.split("-"); return a !== "000" && b !== "00" && c !== "0000" && a !== "666" && a[0] !== "9"; }, }, { id: "pii.cc", tier: "MEDIUM", category: "pii", description: "Credit-card number (Luhn-valid)", regex: /\b((?:\d[ \-]?){13,19})\b/, autoRedactable: true, redactToken: "", validate: (span) => luhnValid(span), }, { id: "pii.ip_public", tier: "MEDIUM", category: "pii", description: "Public IPv4 address", regex: /\b(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\b/, validate: (span) => isPublicIPv4(span), }, { id: "pii.wallet", tier: "MEDIUM", category: "pii", description: "Crypto wallet address (ETH/BTC)", regex: /\b(0x[a-fA-F0-9]{40}|bc1[a-z0-9]{25,39}|[13][a-km-zA-HJ-NP-Z1-9]{25,34})\b/, validate: (span) => looksLikeWallet(span), }, // ===== MEDIUM — internal-leak ===== { id: "internal.hostname", tier: "MEDIUM", category: "internal", description: "Internal hostname (*.internal/.corp/.local/.prod/.staging)", regex: /\b([a-z0-9][a-z0-9\-]*\.(?:internal|corp|local|lan|prod|staging))\b/i, }, { id: "internal.url_private", tier: "MEDIUM", category: "internal", description: "localhost URL with a non-trivial path", regex: /(https?:\/\/(?:localhost|127\.0\.0\.1):\d{2,5}\/[^\s)]+)/, }, // ===== MEDIUM — legal / damaging ===== { id: "legal.nda_marker", tier: "MEDIUM", category: "legal", description: "Confidentiality / NDA marker", regex: /\b(CONFIDENTIAL|UNDER NDA|ATTORNEY[- ]CLIENT|PRIVILEGED|DO NOT DISTRIBUTE|EYES ONLY)\b/, }, { id: "legal.named_criticism", tier: "MEDIUM", category: "legal", description: "Negative judgment near a capitalized full name (semantic pass is primary)", regex: /\b(incompetent|negligent|fraudulent|fraud|fired|terminated|harassed|underperforming)\b/i, // Require a Capitalized Two-Word name within the window. nearRegex: /\b[A-Z][a-z]+ [A-Z][a-z]+\b/, nearWindow: 80, }, // ===== LOW — surface only ===== { id: "internal.user_path", tier: "LOW", category: "internal", description: "Absolute path under a user home dir", regex: /(\/(?:Users|home)\/[a-z][a-z0-9_\-]+\/[^\s)]*)/, }, { id: "hygiene.todo", tier: "LOW", category: "hygiene", description: "TODO(owner) marker carried into the artifact", regex: /\b(TODO\([^)]+\))/, }, ]; /** Lookup by id. */ export const PATTERNS_BY_ID: Record = Object.fromEntries( PATTERNS.map((p) => [p.id, p]), );