feat(redact): shared redaction engine + taxonomy (pure lib, no behavior change)

Add the foundation for cross-skill PII/secret/legal redaction: - lib/redact-patterns.ts — canonical 3-tier taxonomy (HIGH genuinely-secret credentials, MEDIUM PII/legal/internal + high-FP credential-shaped, LOW surface-only). Tier-1 calibration: Stripe-publishable, Google AIza, JWT, and env-KV are MEDIUM not HIGH (context-variable / high-FP). Validators: Luhn, Shannon-entropy gate, RFC1918 exclusion, wallet sanity. Per-span placeholder suppression (not line-based). - lib/redact-engine.ts — pure scan() + applyRedactions(). Normalization pass (NFKC + zero-width strip + entity decode) with offset map back to original. Oversize input fails CLOSED. No visibility-based tier promotion (records repoVisibility for sterner wording only). Tool-attributed-fence WARN-degrade for obvious doc-examples. Safe preview masking (≤4 leading chars). - 100 unit tests: per-pattern positives, FP filters, validators, email allowlist, no-promotion semantics, tool-fence degrade, normalization, oversize-fail-closed, ReDoS pattern-lint + runtime budget, auto-redact (idempotent, right-to-left, structural-corruption guard). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-08-03 12:58:40 +02:00 · 2026-05-29 07:05:17 -07:00
parent a6fb31726c
commit de59a5cc3e
5 changed files with 1358 additions and 0 deletions
@@ -0,0 +1,479 @@
+/**
+ * redact-engine — pure scanning + auto-redaction over the shared taxonomy.
+ *
+ * No I/O. Deterministic. The CLI shim (`bin/gstack-redact`), the pre-push hook
+ * (`bin/gstack-redact-prepush`), and tests all import from here.
+ *
+ * Key behaviors (locked in /plan-eng-review + two Codex passes):
+ *   - Normalization BEFORE matching (NFKC + strip zero-width + decode a small
+ *     set of HTML entities) so Unicode-confusable / zero-width evasion fails.
+ *     Findings map back to ORIGINAL offsets via an index map.
+ *   - ReDoS safety: a hard input-size cap that fails CLOSED (oversize input
+ *     returns a single synthetic HIGH "input too large to scan safely" finding,
+ *     so callers block rather than skip). Patterns are linear-time (lint-tested).
+ *   - NO visibility-based tier mutation. `repoVisibility` is recorded on each
+ *     finding (drives sterner AUQ wording in the skill) but never promotes a
+ *     MEDIUM to HIGH. (TENSION-2-followup.)
+ *   - Placeholder suppression is per-matched-span.
+ *   - Tool-attributed fences (a fenced block whose opening line is e.g. ```codex-review or ```greptile)
+ *     degrade credential findings to a non-blocking WARN — UNLESS the span is a
+ *     live-format credential the doc-example heuristic can't excuse. No nonce,
+ *     no trust exemption (the marker scheme was dropped as theater).
+ */
+
+import {
+  PATTERNS,
+  PATTERNS_BY_ID,
+  isPlaceholderSpan,
+  type RedactPattern,
+  type Tier,
+  type Category,
+} from "./redact-patterns";
+
+/** Visibility of the repository being scanned; `scan` falls back to "unknown" when the caller supplies none. */
+export type RepoVisibility = "public" | "private" | "unknown";
+
+/** A WARN is a finding that does not block but is surfaced (tool-fence degrade). */
+export type Severity = Tier | "WARN";
+
+/** One reported match, positioned against the ORIGINAL (un-normalized) input. */
+export interface Finding {
+  /** Id of the pattern that matched, or "engine.input_too_large" for the synthetic oversize finding. */
+  id: string;
+  /** Tier declared by the matching pattern; `scan` copies it through unchanged. */
+  tier: Tier;
+  /** Effective severity after tool-fence degrade. HIGH/MEDIUM/LOW or WARN. */
+  severity: Severity;
+  /** Category copied from the matching pattern. */
+  category: Category;
+  /** Human-readable description copied from the matching pattern. */
+  description: string;
+  /** 1-based line in the ORIGINAL (un-normalized) text. */
+  line: number;
+  /** 1-based column in the ORIGINAL text. */
+  col: number;
+  /** Safe-masked preview (never more than 4 leading chars of the secret). */
+  preview: string;
+  /** Whether this finding offers one-keystroke auto-redact (PII subset). */
+  autoRedactable: boolean;
+  /** Repo visibility at scan time — drives sterner AUQ wording, not the tier. */
+  repoVisibility: RepoVisibility;
+  /** True when degraded to WARN because it sat in a tool-attributed fence. */
+  toolFenceDegraded?: boolean;
+}
+
+/** Caller-supplied knobs for `scan` / `applyRedactions`. Every field is optional. */
+export interface ScanOptions {
+  /** Recorded on every finding and on the result; defaults to "unknown". */
+  repoVisibility?: RepoVisibility;
+  /** Extra allowlist entries (exact strings) that suppress a matched span. */
+  allowlist?: string[];
+  /** The invoking user's own email (from `git config user.email`) — allowlisted. */
+  selfEmail?: string;
+  /**
+   * Emails already public in the repo (git log authors, package.json, CODEOWNERS).
+   * Suppressed for `pii.email` since they're not a new leak.
+   */
+  repoPublicEmails?: string[];
+  /** Hard byte cap (measured as UTF-8 bytes). Oversize input fails CLOSED. Default 1 MiB. */
+  maxBytes?: number;
+}
+
+/** Outcome of one `scan` call. */
+export interface ScanResult {
+  /** Findings sorted by line, then column, then pattern id. */
+  findings: Finding[];
+  /** Number of findings per effective severity (not per declared tier). */
+  counts: { HIGH: number; MEDIUM: number; LOW: number; WARN: number };
+  /** The visibility the scan ran with ("unknown" when none was supplied). */
+  repoVisibility: RepoVisibility;
+  /** True when the input-size cap tripped (caller should BLOCK). */
+  oversize: boolean;
+}
+
+// Default for `ScanOptions.maxBytes`; `scan` compares it against the UTF-8 byte length.
+const DEFAULT_MAX_BYTES = 1024 * 1024; // 1 MiB
+
+// Addresses that never count as a `pii.email` leak. The regexes are tested
+// against the whole matched address. Note the second domain pattern already
+// covers everything the first one matches (any `example.<tld>`).
+const EMAIL_ALLOW_DOMAINS = [/@example\.(com|org|net)$/i, /@example\.[a-z]{2,}$/i];
+// Automated-sender local parts, matched at the start of the address.
+const EMAIL_ALLOW_LOCALPARTS = [/^noreply@/i, /^no-reply@/i, /^donotreply@/i];
+
+// ── Normalization ─────────────────────────────────────────────────────────────
+
+const ZERO_WIDTH = /[‌‍⁠]/g;
+// The only entities expanded during normalization. Any other named or numeric
+// reference is left as literal text. No key is a prefix of another key.
+const HTML_ENTITIES: Record<string, string> = {
+  "&amp;": "&",
+  "&lt;": "<",
+  "&gt;": ">",
+  "&quot;": '"',
+  "&#39;": "'",
+  "&apos;": "'",
+};
+
+/**
+ * Normalize text for matching while producing an index map back to the original.
+ *
+ * Walks the original one Unicode code point at a time and, at each position:
+ *   1. expands an entity from `HTML_ENTITIES` if the input starts with one here;
+ *   2. otherwise drops the character when `ZERO_WIDTH` matches it;
+ *   3. otherwise appends the character's NFKC form.
+ *
+ * Iterating by code point rather than by UTF-16 code unit matters: astral
+ * compatibility characters (for example U+1D42C MATHEMATICAL BOLD SMALL S)
+ * are stored as surrogate pairs, and NFKC leaves a lone surrogate untouched,
+ * so a unit-by-unit walk would let such look-alikes through un-normalized.
+ *
+ * @param input Original, un-normalized text.
+ * @returns `normalized` is the text to match against. `map[k]` is the offset
+ *   in `input` of the character that produced UTF-16 unit `k` of `normalized`.
+ *   `map` carries one extra trailing entry equal to `input.length`, so an
+ *   offset equal to `normalized.length` also maps.
+ */
+export function normalizeWithMap(input: string): {
+  normalized: string;
+  map: number[];
+} {
+  const out: string[] = [];
+  const map: number[] = [];
+
+  // Emit one UTF-16 unit per entry so `map` is indexable by regex offsets
+  // into the normalized string, even if `text` contains a surrogate pair.
+  const emit = (text: string, origin: number): void => {
+    for (let k = 0; k < text.length; k++) {
+      out.push(text[k]);
+      map.push(origin);
+    }
+  };
+
+  let i = 0;
+  while (i < input.length) {
+    // HTML entity expansion (fixed small set; keys never prefix each other,
+    // so the lookup order does not matter).
+    let entity: string | undefined;
+    for (const candidate of Object.keys(HTML_ENTITIES)) {
+      if (input.startsWith(candidate, i)) {
+        entity = candidate;
+        break;
+      }
+    }
+    if (entity !== undefined) {
+      emit(HTML_ENTITIES[entity], i);
+      i += entity.length;
+      continue;
+    }
+
+    // A code point above U+FFFF occupies two UTF-16 units.
+    const width = (input.codePointAt(i) ?? 0) > 0xffff ? 2 : 1;
+    const ch = input.slice(i, i + width);
+
+    // ZERO_WIDTH is a global regex, so `test` is stateful: reset around it.
+    ZERO_WIDTH.lastIndex = 0;
+    const isZeroWidth = ZERO_WIDTH.test(ch);
+    ZERO_WIDTH.lastIndex = 0;
+
+    if (!isZeroWidth) emit(ch.normalize("NFKC"), i);
+    i += width;
+  }
+  // Sentinel so an offset == length maps to the original length.
+  map.push(input.length);
+  return { normalized: out.join(""), map };
+}
+
+// ── Offset → line/col on the ORIGINAL text ────────────────────────────────────
+
+/**
+ * Translate a 0-based offset into a 1-based line/column pair.
+ * Offsets past the end of `original` are clamped to its length; columns are
+ * counted in UTF-16 units since the last "\n" before the offset.
+ */
+function lineColAt(original: string, offset: number): { line: number; col: number } {
+  const stop = Math.max(0, Math.min(offset, original.length));
+  let line = 1;
+  let lineStart = 0;
+  // Hop from newline to newline instead of visiting every character.
+  for (
+    let nl = original.indexOf("\n");
+    nl !== -1 && nl < stop;
+    nl = original.indexOf("\n", nl + 1)
+  ) {
+    line += 1;
+    lineStart = nl + 1;
+  }
+  return { line, col: stop - lineStart + 1 };
+}
+
+// ── Safe preview masking ──────────────────────────────────────────────────────
+
+/**
+ * Build a display-safe preview of a matched span: the first four characters,
+ * then up to eight `*`, then an ellipsis when the span is longer than twelve
+ * characters. Spans of four characters or fewer are returned as-is.
+ */
+export function maskPreview(span: string): string {
+  const shown = 4;
+  const maxStars = 8;
+  const head = span.slice(0, shown);
+  const hiddenCount = Math.max(0, span.length - shown);
+  const stars = "*".repeat(Math.min(hiddenCount, maxStars));
+  const tail = span.length > shown + maxStars ? "…" : "";
+  return head + stars + tail;
+}
+
+// ── Tool-attributed fence detection ───────────────────────────────────────────
+
+// Opening-fence lines whose info string names a tool.
+const TOOL_FENCE_INFO = /^```(codex-review|greptile|eval|codex|tool-output)\b/;
+
+/**
+ * Collect the [start, end) offset ranges (in normalized text) covered by the
+ * bodies of tool-attributed fenced code blocks, in document order.
+ *
+ * A range opens on the line after a fence line matching `TOOL_FENCE_INFO` and
+ * closes at the start of the next line beginning with three backticks. A tool
+ * fence that is never closed runs to the end of the text. Fence lines that do
+ * not name a tool are ignored while no tool fence is open.
+ */
+function toolFenceRanges(normalized: string): Array<[number, number]> {
+  const spans: Array<[number, number]> = [];
+  let bodyStart = -1; // -1 while no tool fence is open
+  let cursor = 0; // offset of the current line's first character
+  for (const text of normalized.split("\n")) {
+    const nextCursor = cursor + text.length + 1; // +1 for the \n
+    if (text.startsWith("```")) {
+      if (bodyStart >= 0) {
+        spans.push([bodyStart, cursor]); // up to start of closing fence
+        bodyStart = -1;
+      } else if (TOOL_FENCE_INFO.test(text)) {
+        bodyStart = nextCursor; // content starts after this line
+      }
+    }
+    cursor = nextCursor;
+  }
+  // unterminated → still degrade its own body
+  if (bodyStart >= 0) spans.push([bodyStart, normalized.length]);
+  return spans;
+}
+
+/** True when `offset` lies inside at least one half-open [start, end) range. */
+function inRanges(offset: number, ranges: Array<[number, number]>): boolean {
+  return ranges.some(([lo, hi]) => lo <= offset && offset < hi);
+}
+
+/**
+ * Doc-example heuristic: a credential span inside a tool fence still BLOCKS if
+ * it looks like a LIVE credential (not an obvious placeholder/example). We only
+ * downgrade-to-WARN spans that are clearly illustrative.
+ *
+ * Currently this is exactly `isPlaceholderSpan(span)`; it exists as a separate
+ * name so the heuristic can diverge from general placeholder suppression.
+ */
+function isObviousDocExample(span: string): boolean {
+  return isPlaceholderSpan(span);
+}
+
+// ── Proximity check ───────────────────────────────────────────────────────────
+
+/**
+ * Proximity test: does `nearRegex` match anywhere in the text from `window`
+ * characters before `matchStart` to `window` characters after `matchEnd`
+ * (clamped to the string bounds)? The slice includes the match itself.
+ *
+ * The regex is re-created without its `g` flag, so the caller's `lastIndex`
+ * is neither consulted nor modified.
+ */
+function hasNear(
+  normalized: string,
+  matchStart: number,
+  matchEnd: number,
+  nearRegex: RegExp,
+  window: number,
+): boolean {
+  const neighbourhood = normalized.slice(
+    Math.max(0, matchStart - window),
+    Math.min(normalized.length, matchEnd + window),
+  );
+  const nonGlobalFlags = nearRegex.flags.split("g").join("");
+  return new RegExp(nearRegex.source, nonGlobalFlags).test(neighbourhood);
+}
+
+// ── Email allowlist ───────────────────────────────────────────────────────────
+
+/**
+ * Decide whether a matched email address is exempt from `pii.email`.
+ * Exempt: the caller's own address, any address listed as already public in
+ * the repo (both compared case-insensitively), and addresses matching the
+ * module's allow-listed domains or automated-sender local parts.
+ */
+function emailAllowed(email: string, opts: ScanOptions): boolean {
+  const needle = email.toLowerCase();
+
+  const own = opts.selfEmail;
+  if (own && own.toLowerCase() === needle) return true;
+
+  for (const known of opts.repoPublicEmails ?? []) {
+    if (known.toLowerCase() === needle) return true;
+  }
+
+  return [...EMAIL_ALLOW_DOMAINS, ...EMAIL_ALLOW_LOCALPARTS].some((rule) => rule.test(email));
+}
+
+// ── The scan ──────────────────────────────────────────────────────────────────
+
+/**
+ * Scan `input` against every pattern in `PATTERNS`.
+ *
+ * Steps, in order:
+ *   1. Oversize guard — if the UTF-8 byte length exceeds `maxBytes`, return a
+ *      single synthetic HIGH finding with `oversize: true` and do no matching.
+ *   2. Normalize the text and compute tool-fence ranges on the normalized form.
+ *   3. For each pattern, iterate all matches and apply, per matched span:
+ *      placeholder handling, the exact-string allowlist, the pattern's
+ *      `validate`, its proximity rule, and (for `pii.email`) the email allowlist.
+ *   4. Map the span back to an original offset, de-duplicate on
+ *      (pattern id, original offset), and record the finding.
+ *
+ * Placeholder handling: a placeholder span is normally dropped. The exception
+ * is a `secret`-category span inside a tool-attributed fence that the
+ * doc-example heuristic accepts — that one is kept and reported with severity
+ * WARN (`toolFenceDegraded: true`). Previously the placeholder drop ran first
+ * and used the same predicate as the heuristic, so the WARN path could never
+ * be reached.
+ *
+ * @param input Text to scan (original, un-normalized).
+ * @param opts  See `ScanOptions`.
+ * @returns Findings sorted by line, column, id, plus per-severity counts.
+ */
+export function scan(input: string, opts: ScanOptions = {}): ScanResult {
+  const repoVisibility: RepoVisibility = opts.repoVisibility ?? "unknown";
+  const maxBytes = opts.maxBytes ?? DEFAULT_MAX_BYTES;
+
+  // Fail CLOSED on oversize input. Check byte length BEFORE heavy work.
+  const byteLen = Buffer.byteLength(input, "utf8");
+  if (byteLen > maxBytes) {
+    const finding: Finding = {
+      id: "engine.input_too_large",
+      tier: "HIGH",
+      severity: "HIGH",
+      category: "secret",
+      description: `Input too large to scan safely (${byteLen} > ${maxBytes} bytes) — blocking fail-closed`,
+      line: 1,
+      col: 1,
+      preview: "",
+      autoRedactable: false,
+      repoVisibility,
+    };
+    return {
+      findings: [finding],
+      counts: { HIGH: 1, MEDIUM: 0, LOW: 0, WARN: 0 },
+      repoVisibility,
+      oversize: true,
+    };
+  }
+
+  const { normalized, map } = normalizeWithMap(input);
+  const fenceRanges = toolFenceRanges(normalized);
+  const allow = new Set(opts.allowlist ?? []);
+
+  const findings: Finding[] = [];
+  // Dedup by (id, original-offset) so overlapping global matches don't double-count.
+  const seen = new Set<string>();
+
+  for (const pat of PATTERNS) {
+    const re = new RegExp(pat.regex.source, withFlags(pat.regex.flags));
+    let m: RegExpExecArray | null;
+    while ((m = re.exec(normalized)) !== null) {
+      // Guard against zero-width matches looping forever.
+      if (m.index === re.lastIndex) re.lastIndex++;
+
+      // Group 1, when present, is the secret span; locate it inside the match.
+      const span = m[1] ?? m[0];
+      const spanStartInMatch = m[1] !== undefined ? m[0].indexOf(m[1]) : 0;
+      const normOffset = m.index + Math.max(0, spanStartInMatch);
+
+      // Tool-fence degrade: only credential-category, only obvious doc examples.
+      // Decided BEFORE placeholder suppression so the degrade is reachable.
+      const toolFenceDegraded =
+        pat.category === "secret" &&
+        inRanges(normOffset, fenceRanges) &&
+        isObviousDocExample(span);
+
+      // Per-span placeholder suppression (everything that is not degraded).
+      if (!toolFenceDegraded && isPlaceholderSpan(span)) continue;
+      if (allow.has(span)) continue;
+
+      // Pattern-specific validators (Luhn, entropy, RFC1918, etc).
+      if (pat.validate && !pat.validate(span, m)) continue;
+
+      // Proximity requirement, measured around the full match.
+      if (
+        pat.nearRegex &&
+        !hasNear(normalized, m.index, m.index + m[0].length, pat.nearRegex, pat.nearWindow ?? 100)
+      ) {
+        continue;
+      }
+
+      // Email allowlist (layered on top of the pattern).
+      if (pat.id === "pii.email" && emailAllowed(span, opts)) continue;
+
+      const origOffset = map[Math.min(normOffset, map.length - 1)] ?? 0;
+      const key = `${pat.id}:${origOffset}`;
+      if (seen.has(key)) continue;
+      seen.add(key);
+
+      const { line, col } = lineColAt(input, origOffset);
+      const severity: Severity = toolFenceDegraded ? "WARN" : pat.tier;
+
+      findings.push({
+        id: pat.id,
+        tier: pat.tier,
+        severity,
+        category: pat.category,
+        description: pat.description,
+        line,
+        col,
+        preview: maskPreview(span),
+        autoRedactable: !!pat.autoRedactable,
+        repoVisibility,
+        ...(toolFenceDegraded ? { toolFenceDegraded } : {}),
+      });
+    }
+  }
+
+  // Stable order: by line, then col, then id.
+  findings.sort((a, b) => a.line - b.line || a.col - b.col || a.id.localeCompare(b.id));
+
+  const counts = { HIGH: 0, MEDIUM: 0, LOW: 0, WARN: 0 };
+  for (const f of findings) counts[f.severity] += 1;
+
+  return { findings, counts, repoVisibility, oversize: false };
+}
+
+/** Return `flags` with `g` and `m` appended (in that order) when missing. */
+function withFlags(flags: string): string {
+  return ["g", "m"].reduce(
+    (acc, flag) => (acc.includes(flag) ? acc : acc + flag),
+    flags,
+  );
+}
+
+// ── Auto-redaction ────────────────────────────────────────────────────────────
+
+export interface RedactResult {
+  /** The input with every applied redaction substituted. */
+  body: string;
+  /**
+   * ASCII diff preview of the substitutions: one `- before` / `+ after` line
+   * pair per applied redaction, ordered top-to-bottom.
+   */
+  diff: string;
+  /**
+   * Requested findings that were NOT redacted: the span sits inside a
+   * structural token, could not be located in the raw text, or partially
+   * overlaps another redaction.
+   */
+  skipped: Finding[];
+}
+
+/**
+ * Substitute redact tokens for the given finding ids, right-to-left so offsets
+ * stay valid. Only findings that are `autoRedactable` are considered.
+ *
+ * A finding is reported in `skipped` rather than applied when:
+ *   - its span sits inside a structural token (markdown link target, JSON
+ *     string value), so the skill can drop the user to manual edit rather than
+ *     silently mangling output;
+ *   - its span cannot be located in the raw text (`locateSpan` returns -1);
+ *   - its span partially overlaps a redaction that was already applied.
+ * A span fully covered by an already-applied redaction starting at the same
+ * offset is dropped quietly: the text is already gone.
+ *
+ * @param input      Original text.
+ * @param findingIds Pattern ids to redact.
+ * @param opts       Passed through to `scan`.
+ */
+export function applyRedactions(
+  input: string,
+  findingIds: string[],
+  opts: ScanOptions = {},
+): RedactResult {
+  const ids = new Set(findingIds);
+  const { findings } = scan(input, opts);
+
+  const skipped: Finding[] = [];
+  const targets: Array<{ f: Finding; start: number; end: number }> = [];
+  for (const f of findings) {
+    if (!ids.has(f.id) || !f.autoRedactable) continue;
+    const located = locateSpan(input, f);
+    if (located.start < 0) {
+      // Report it instead of silently pretending nothing was requested.
+      skipped.push(f);
+      continue;
+    }
+    targets.push({ f, ...located });
+  }
+
+  // Right-to-left so earlier offsets remain valid after splicing. On equal
+  // starts the widest span goes first, so narrower duplicates fall inside it.
+  targets.sort((a, b) => b.start - a.start || b.end - a.end);
+
+  const diffLines: string[] = [];
+  let body = input;
+  // Bounds (in ORIGINAL offsets) of the most recently applied redaction.
+  // Everything left of `appliedStart` in `body` is still untouched input.
+  let appliedStart = Number.POSITIVE_INFINITY;
+  let appliedEnd = Number.POSITIVE_INFINITY;
+
+  for (const t of targets) {
+    if (t.end > appliedStart) {
+      // Overlaps text that has already been replaced; splicing again would
+      // cut into the inserted token.
+      const alreadyCovered = t.start === appliedStart && t.end <= appliedEnd;
+      if (!alreadyCovered) skipped.push(t.f);
+      continue;
+    }
+
+    const pat = PATTERNS_BY_ID[t.f.id];
+    const token = pat?.redactToken ?? "<REDACTED>";
+    if (inStructuralToken(body, t.start, t.end)) {
+      skipped.push(t.f);
+      continue;
+    }
+    const before = lineContaining(body, t.start);
+    body = body.slice(0, t.start) + token + body.slice(t.end);
+    const after = lineContaining(body, t.start);
+    // Prepend the pair: targets run bottom-to-top, the preview reads
+    // top-to-bottom, and `-` must stay ahead of its `+`.
+    diffLines.unshift(`- ${before}`, `+ ${after}`);
+
+    appliedStart = t.start;
+    appliedEnd = t.end;
+  }
+
+  return { body, diff: diffLines.join("\n"), skipped };
+}
+
+/**
+ * Find the raw-text span a finding refers to.
+ *
+ * The finding only carries line/col, so the offset is re-derived from those
+ * and the pattern is re-run on the ORIGINAL text starting just before it.
+ * The match is accepted only if its span actually covers that offset. When
+ * the raw text does not match at the reported position (for example the
+ * finding only exists after normalization), the next match further along is
+ * a different span and must not be redacted in its place.
+ *
+ * @returns `{ start, end }` offsets into `input`, or `{ start: -1, end: -1 }`
+ *   when the pattern id is unknown or no match covers the reported position.
+ */
+function locateSpan(input: string, f: Finding): { start: number; end: number } {
+  const notFound = { start: -1, end: -1 };
+
+  // Re-derive the offset from line/col on the original text.
+  let offset = 0;
+  let line = 1;
+  while (line < f.line && offset < input.length) {
+    if (input[offset] === "\n") line++;
+    offset++;
+  }
+  offset += f.col - 1;
+
+  const pat = PATTERNS_BY_ID[f.id];
+  if (!pat) return notFound;
+
+  const re = new RegExp(pat.regex.source, withFlags(pat.regex.flags));
+  re.lastIndex = Math.max(0, offset - 2);
+  const m = re.exec(input);
+  if (!m) return notFound;
+
+  const span = m[1] ?? m[0];
+  const start = m.index + (m[1] !== undefined ? m[0].indexOf(m[1]) : 0);
+  const end = start + span.length;
+
+  // Reject a match that does not cover the finding's own position.
+  if (start > offset || offset >= end) return notFound;
+  return { start, end };
+}
+
+/**
+ * Structural-corruption guard: is body[start, end) inside a token whose
+ * syntax a blind substitution could break?
+ *
+ * Two shapes are recognised:
+ *   - Markdown link target `[text](...span...)`: scanning left from the span
+ *     reaches `](` before any `)` or whitespace, and scanning right from the
+ *     span reaches `)` before any whitespace.
+ *   - JSON string value `"key": "span"`: the text just before the span ends
+ *     with `:`, optional whitespace and `"`, and the span is immediately
+ *     followed by `"`.
+ */
+function inStructuralToken(body: string, start: number, end: number): boolean {
+  const isBlank = (c: string): boolean => c === " " || c === "\t" || c === "\n";
+
+  // Look left for the `](` that opens a link target.
+  let opensLinkTarget = false;
+  let i = start - 1;
+  while (i >= 0) {
+    const ch = body[i];
+    if (ch === ")" || isBlank(ch)) break;
+    if (ch === "(" && i > 0 && body[i - 1] === "]") {
+      opensLinkTarget = true;
+      break;
+    }
+    i--;
+  }
+
+  // Look right for the closing `)` of that target.
+  if (opensLinkTarget) {
+    for (let j = end; j < body.length && !isBlank(body[j]); j++) {
+      if (body[j] === ")") return true;
+    }
+  }
+
+  // JSON string value: span is the whole quoted value.
+  const lead = body.slice(Math.max(0, start - 80), start);
+  return /:\s*"$/.test(lead) && body.startsWith('"', end);
+}
+
+/** Return the full line of `body` (without its newline) that contains `offset`. */
+function lineContaining(body: string, offset: number): string {
+  const lineStart = body.lastIndexOf("\n", offset - 1) + 1;
+  const newlineAt = body.indexOf("\n", offset);
+  return newlineAt === -1 ? body.slice(lineStart) : body.slice(lineStart, newlineAt);
+}
+
+// ── Exit-code helper for the CLI shim ─────────────────────────────────────────
+
+/**
+ * Map a scan result to the CLI exit code: 3 when any HIGH finding is present,
+ * otherwise 2 when any MEDIUM finding is present, otherwise 0. LOW and WARN
+ * counts do not affect the code.
+ */
+export function exitCodeFor(result: ScanResult): 0 | 2 | 3 {
+  const { HIGH, MEDIUM } = result.counts;
+  return HIGH > 0 ? 3 : MEDIUM > 0 ? 2 : 0;
+}
@@ -0,0 +1,469 @@
+/**
+ * redact-patterns — the canonical redaction taxonomy.
+ *
+ * Single source of truth shared by `lib/redact-engine.ts`, `bin/gstack-redact`,
+ * `bin/gstack-redact-prepush`, and (via `scripts/resolvers/redact-doc.ts`) the
+ * generated SKILL.md docs for /spec, /ship, /cso, /document-release, and
+ * /document-generate.
+ *
+ * Design notes (locked in /plan-eng-review + two Codex passes):
+ *
+ *   - Three tiers. HIGH = genuinely-secret credentials (block). MEDIUM = PII,
+ *     legal/damaging, internal-leak, plus credential-shaped patterns that have
+ *     high false-positive rates (confirm via AskUserQuestion). LOW = surface only.
+ *   - NO wholesale MEDIUM->HIGH promotion on public repos (TENSION-2-followup).
+ *     Public repos get sterner per-finding confirmation, not auto-block. The
+ *     engine never mutates a finding's tier based on visibility.
+ *   - Tier-1 calibration: a gate that cries wolf gets ignored. Stripe
+ *     publishable keys, Google AIza keys, JWTs, and env-style KV are MEDIUM, not
+ *     HIGH (they are context-variable / high-FP). Only genuinely-secret
+ *     credentials block.
+ *   - ReDoS safety: every pattern here MUST be linear-time (no nested unbounded
+ *     quantifiers). `test/redact-pattern-lint.test.ts` fails CI on a catastrophic
+ *     form. The engine also enforces a hard input-size cap that fails CLOSED.
+ *   - Placeholder suppression is per-matched-span, not per-line.
+ *
+ * Pattern matching contract: every `regex` is used with the global+multiline
+ * flags the engine applies (`g`, `m`). Capture group 1, when present, is the
+ * "secret span" the engine masks and (for proximity rules) anchors on; when
+ * absent, match[0] is the span.
+ */
+
+/** Declared strength of a pattern: HIGH blocks, MEDIUM needs confirmation, LOW is surfaced only. */
+export type Tier = "HIGH" | "MEDIUM" | "LOW";
+
+/** What kind of content a pattern detects. */
+export type Category =
+  | "secret"
+  | "pii"
+  | "legal"
+  | "internal"
+  | "hygiene";
+
+/** One entry of the taxonomy: what to detect, how serious it is, and how to filter false positives. */
+export interface RedactPattern {
+  /**
+   * Stable dotted id, e.g. "aws.access_key". Used in findings + tests.
+   * Must be unique: `PATTERNS_BY_ID` is keyed on it.
+   */
+  id: string;
+  tier: Tier;
+  category: Category;
+  /** Human-readable one-liner for the findings table + docs. */
+  description: string;
+  /**
+   * The detection regex. Linter-enforced linear-time. The engine adds the
+   * `gm` flags; do not bake `g`/`m` into the source here (keeps `.source`
+   * clean for the docs table and avoids double-global bugs).
+   */
+  regex: RegExp;
+  /**
+   * Patterns whose redaction is unambiguous enough to offer one-keystroke
+   * auto-redact at MEDIUM tier (email / phone / ssn / cc). The engine wires
+   * the `<REDACTED-*>` replacement token from `redactToken`.
+   */
+  autoRedactable?: boolean;
+  /** Replacement token for auto-redact, e.g. "<REDACTED-EMAIL>". */
+  redactToken?: string;
+  /**
+   * Extra validators run AFTER the regex matches, ALL must pass for the match
+   * to count. Used for Luhn (credit cards), entropy (env-KV), checksum
+   * (crypto wallets), RFC1918-exclusion (public IPs), etc. Receives the
+   * matched secret span (group 1 or match[0]) and the full match array.
+   */
+  validate?: (span: string, match: RegExpExecArray) => boolean;
+  /**
+   * Proximity requirement: the pattern only counts if `nearRegex` also matches
+   * within `nearWindow` chars of the match. Used for AWS secret keys (need
+   * `aws_secret_access_key` nearby) and Twilio auth tokens (need an SID nearby).
+   */
+  nearRegex?: RegExp;
+  /** Window size in characters for `nearRegex`. */
+  nearWindow?: number;
+}
+
+// ── Validators ──────────────────────────────────────────────────────────────
+
+/**
+ * Luhn checksum — credit-card validity.
+ * Spaces and dashes are removed first; the remainder must be 13–19 ASCII
+ * digits, otherwise the result is false.
+ */
+export function luhnValid(span: string): boolean {
+  const digits = span.replace(/[ \-]/g, "");
+  if (!/^\d{13,19}$/.test(digits)) return false;
+
+  // Walk from the rightmost digit; every second digit is doubled, and a
+  // doubled value above 9 has 9 subtracted (same as summing its two digits).
+  const total = digits
+    .split("")
+    .reverse()
+    .reduce((acc, ch, idx) => {
+      const value = Number(ch);
+      if (idx % 2 === 0) return acc + value;
+      const doubled = value * 2;
+      return acc + (doubled > 9 ? doubled - 9 : doubled);
+    }, 0);
+
+  return total % 10 === 0;
+}
+
+/**
+ * Shannon entropy in bits/char. Used to gate env-style KV (skip placeholders).
+ * Returns 0 for the empty string.
+ */
+export function shannonEntropy(s: string): number {
+  if (s.length === 0) return 0;
+
+  const tally: Record<string, number> = {};
+  for (const symbol of s) tally[symbol] = (tally[symbol] ?? 0) + 1;
+
+  return Object.values(tally).reduce((bits, count) => {
+    const p = count / s.length;
+    return bits - p * Math.log2(p);
+  }, 0);
+}
+
+/**
+ * True when an IPv4 string is a public address.
+ * False for malformed input, any octet above 255, and the ranges
+ * listed in the body.
+ */
+export function isPublicIPv4(ip: string): boolean {
+  const parts = /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/.exec(ip);
+  if (parts === null) return false;
+
+  const octets = parts.slice(1, 5).map(Number);
+  if (!octets.every((n) => n <= 255)) return false;
+
+  const [a, b] = octets;
+  const nonPublic =
+    a === 10 || // 10.0.0.0/8
+    a === 127 || // loopback
+    a === 0 || // this-network
+    (a === 192 && b === 168) || // 192.168.0.0/16
+    (a === 169 && b === 254) || // link-local
+    (a === 172 && b >= 16 && b <= 31) || // 172.16.0.0/12
+    (a === 100 && b >= 64 && b <= 127) || // CGNAT 100.64.0.0/10
+    a >= 224; // multicast / reserved
+  return !nonPublic;
+}
+
+// Cheap wallet sanity check. EIP-55 checksum validation is out of scope
+// (heavy). For 0x-prefixed 40-hex addresses we only reject bodies made of a
+// single repeated character (0x000...0 / 0xfff...f); every other span just
+// needs a plausible bech32 / base58 length of 26 to 62 characters.
+function looksLikeWallet(span: string): boolean {
+  const isEthShaped = /^0x[a-fA-F0-9]{40}$/.test(span);
+  if (!isEthShaped) {
+    return span.length >= 26 && span.length <= 62;
+  }
+  const hex = span.slice(2).toLowerCase();
+  const allSameChar = /^(.)\1{39}$/.test(hex);
+  return !allSameChar;
+}
+
+// ── Placeholder suppression (per-matched-span, NOT per-line) ─────────────────
+
+// Structural placeholder forms — apply to ANY span (including URLs).
+const PLACEHOLDER_STRUCTURAL = [
+  /^your[_-]/i,
+  /^<[^>]*>$/, // <REDACTED-FOO>, <your-key>
+  /^\*+$/, // all-asterisks mask
+  /^x{6,}$/i, // xxxxxx mask
+];
+
+// Placeholder words (example/test/dummy/...). These are NOT applied to
+// compound spans containing `://` or `@`, because a legit URL/host can contain
+// "example" (e.g. db.example.com) without being a placeholder secret. AWS docs
+// keys like AKIAIOSFODNN7EXAMPLE are bare tokens, so the guard still catches them.
+const PLACEHOLDER_SUBSTRING = [
+  /example/i, // AKIAIOSFODNN7EXAMPLE etc — AWS docs convention
+  /^changeme$/i,
+  /^redacted/i,
+  /^placeholder/i,
+  /^dummy/i,
+  /^fake/i,
+  /test[_-]?(key|token|secret)/i,
+];
+
+/**
+ * A finding is suppressed only if the MATCHED SPAN itself is a placeholder
+ * form — not merely co-located on a line with the word EXAMPLE. This is the
+ * tightened rule from the Codex review (line-based suppression was dangerous).
+ *
+ * Structural forms are checked for every span; the word list is skipped for
+ * compound spans (those containing `://` or `@`).
+ */
+export function isPlaceholderSpan(span: string): boolean {
+  for (const form of PLACEHOLDER_STRUCTURAL) {
+    if (form.test(span)) return true;
+  }
+  if (span.includes("://") || span.includes("@")) return false;
+  return PLACEHOLDER_SUBSTRING.some((word) => word.test(span));
+}
+
+// ── The taxonomy ─────────────────────────────────────────────────────────────
+
+export const PATTERNS: RedactPattern[] = [
+  // ===== HIGH — genuinely-secret credentials (block) =====
+  {
+    id: "aws.access_key",
+    tier: "HIGH",
+    category: "secret",
+    description: "AWS access key ID (AKIA…)",
+    regex: /\b(AKIA[0-9A-Z]{16})\b/,
+  },
+  {
+    id: "aws.secret_key",
+    tier: "HIGH",
+    category: "secret",
+    description: "AWS secret access key (with aws_secret_access_key nearby)",
+    regex: /\b([A-Za-z0-9/+=]{40})\b/,
+    nearRegex: /aws.{0,3}secret.{0,3}access.{0,3}key/i,
+    nearWindow: 100,
+  },
+  {
+    id: "github.pat",
+    tier: "HIGH",
+    category: "secret",
+    description: "GitHub personal access token (classic)",
+    regex: /\b(ghp_[A-Za-z0-9]{36})\b/,
+  },
+  {
+    id: "github.oauth",
+    tier: "HIGH",
+    category: "secret",
+    description: "GitHub OAuth token",
+    regex: /\b(gho_[A-Za-z0-9]{36})\b/,
+  },
+  {
+    id: "github.server",
+    tier: "HIGH",
+    category: "secret",
+    description: "GitHub server-to-server token",
+    regex: /\b(ghs_[A-Za-z0-9]{36})\b/,
+  },
+  {
+    id: "github.fine_grained",
+    tier: "HIGH",
+    category: "secret",
+    description: "GitHub fine-grained PAT",
+    regex: /\b(github_pat_[A-Za-z0-9_]{82})\b/,
+  },
+  {
+    id: "anthropic.key",
+    tier: "HIGH",
+    category: "secret",
+    description: "Anthropic API key",
+    regex: /\b(sk-ant-[A-Za-z0-9_\-]{20,})\b/,
+  },
+  {
+    id: "openai.key",
+    tier: "HIGH",
+    category: "secret",
+    description: "OpenAI API key (incl. sk-proj-)",
+    regex: /\b(sk-(?:proj-)?[A-Za-z0-9]{32,})\b/,
+  },
+  {
+    id: "sendgrid.key",
+    tier: "HIGH",
+    category: "secret",
+    description: "SendGrid API key",
+    regex: /\b(SG\.[A-Za-z0-9_\-]{22}\.[A-Za-z0-9_\-]{43})\b/,
+  },
+  {
+    id: "stripe.secret",
+    tier: "HIGH",
+    category: "secret",
+    description: "Stripe live SECRET key",
+    regex: /\b(sk_live_[A-Za-z0-9]{24,})\b/,
+  },
+  {
+    id: "slack.token",
+    tier: "HIGH",
+    category: "secret",
+    description: "Slack token (bot/user/app)",
+    regex: /\b(xox[baprs]-[A-Za-z0-9-]{10,})\b/,
+  },
+  {
+    id: "slack.webhook",
+    tier: "HIGH",
+    category: "secret",
+    description: "Slack incoming webhook URL",
+    regex: /(https:\/\/hooks\.slack\.com\/services\/T[A-Z0-9]+\/B[A-Z0-9]+\/[A-Za-z0-9]{24})/,
+  },
+  {
+    id: "discord.webhook",
+    tier: "HIGH",
+    category: "secret",
+    description: "Discord webhook URL",
+    regex: /(https:\/\/(?:canary\.|ptb\.)?discord(?:app)?\.com\/api\/webhooks\/[0-9]{17,20}\/[A-Za-z0-9_\-]{60,})/,
+  },
+  {
+    id: "twilio.auth_token",
+    tier: "HIGH",
+    category: "secret",
+    description: "Twilio auth token (32 hex, with an Account SID nearby)",
+    regex: /\b([a-f0-9]{32})\b/,
+    nearRegex: /\bAC[a-f0-9]{32}\b/,
+    nearWindow: 200,
+  },
+  {
+    id: "pem.private_key",
+    tier: "HIGH",
+    category: "secret",
+    description: "PEM private key block",
+    regex: /(-----BEGIN (?:RSA |EC |DSA |OPENSSH |PGP |ENCRYPTED )?PRIVATE KEY-----)/,
+  },
+  {
+    id: "db.url_with_password",
+    tier: "HIGH",
+    category: "secret",
+    description: "Database URL with embedded password",
+    regex: /\b((?:postgres(?:ql)?|mysql|mongodb(?:\+srv)?|redis|amqp):\/\/[^:\s/@]+:[^@\s/]+@[^\s/]+)/,
+    // Skip when the password segment (between the first `:` after the scheme
+    // and the `@`) is empty, is itself a placeholder, or is an upper-case
+    // variable reference such as `$DB_PASS` / `${DB_PASS}`.
+    validate: (span) => {
+      const m = span.match(/:\/\/[^:]+:([^@]+)@/);
+      const pw = m?.[1] ?? "";
+      return !isPlaceholderSpan(pw) && pw !== "" && !/^\$\{?[A-Z_]+\}?$/.test(pw);
+    },
+  },
+  {
+    id: "creds.basic_auth_url",
+    tier: "HIGH",
+    category: "secret",
+    description: "HTTP(S) URL with embedded basic-auth credentials",
+    regex: /(https?:\/\/[^:\s/@]+:[^@\s/]+@[^\s/]+)/,
+    // Same password-segment filter as db.url_with_password: skip when it is
+    // empty, a placeholder, or an upper-case `$VAR` / `${VAR}` reference.
+    validate: (span) => {
+      const m = span.match(/:\/\/[^:]+:([^@]+)@/);
+      const pw = m?.[1] ?? "";
+      return !isPlaceholderSpan(pw) && pw !== "" && !/^\$\{?[A-Z_]+\}?$/.test(pw);
+    },
+  },
+
+  // ===== MEDIUM — demoted credential-shaped (high-FP / context-variable) =====
+  {
+    id: "stripe.publishable",
+    tier: "MEDIUM",
+    category: "secret",
+    description: "Stripe live publishable key (often intentionally public)",
+    regex: /\b(pk_live_[A-Za-z0-9]{24,})\b/,
+  },
+  {
+    id: "google.api_key",
+    tier: "MEDIUM",
+    category: "secret",
+    description: "Google API key (AIza…; sometimes a public client key)",
+    regex: /\b(AIza[0-9A-Za-z\-_]{35})\b/,
+  },
+  {
+    id: "jwt",
+    tier: "MEDIUM",
+    category: "secret",
+    description: "JSON Web Token (3-segment base64url)",
+    regex: /\b(eyJ[A-Za-z0-9_\-]{8,}\.eyJ[A-Za-z0-9_\-]{8,}\.[A-Za-z0-9_\-]{8,})\b/,
+  },
+  {
+    id: "env.kv",
+    tier: "MEDIUM",
+    category: "secret",
+    description: "Env-style SECRET assignment with high-entropy value",
+    regex: /^[ \t]*(?:export[ \t]+)?[A-Z][A-Z0-9_]*(?:KEY|TOKEN|SECRET|PASSWORD|PASSWD|CREDENTIALS?|DSN|AUTH|COOKIE|SESSION|PRIVATE)[ \t]*=[ \t]*['"]?([^\s'"]{8,})['"]?/,
+    // Only fire on high-entropy values — kills `FOO_KEY=changeme` FPs.
+    // The value (group 1) must not be a placeholder, must not start like a
+    // `$VAR` / `${VAR}` reference, and must reach 3.0 bits/char of entropy.
+    validate: (span) =>
+      !isPlaceholderSpan(span) &&
+      !/^\$\{?[A-Za-z_]/.test(span) &&
+      shannonEntropy(span) >= 3.0,
+  },
+
+  // ===== MEDIUM — PII (auto-redactable subset) =====
+  {
+    id: "pii.email",
+    tier: "MEDIUM",
+    category: "pii",
+    description: "Email address",
+    regex: /\b([A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,})\b/,
+    autoRedactable: true,
+    redactToken: "<REDACTED-EMAIL>",
+    // Engine layers the email allowlist (example.com, noreply@, user's own,
+    // repo-public authors) on top of this — see redact-engine.ts.
+  },
+  {
+    id: "pii.phone.e164",
+    tier: "MEDIUM",
+    category: "pii",
+    description: "Phone number (E.164 / common national formats; US/EU-biased)",
+    regex: /(?<![\w.])(\+?[1-9]\d{0,2}[ \-.]?\(?\d{2,4}\)?[ \-.]?\d{3,4}[ \-.]?\d{3,4})(?![\w.])/,
+    autoRedactable: true,
+    redactToken: "<REDACTED-PHONE>",
+    validate: (span) => span.replace(/\D/g, "").length >= 10,
+  },
+  {
+    id: "pii.ssn",
+    tier: "MEDIUM",
+    category: "pii",
+    description: "US Social Security Number",
+    regex: /\b(\d{3}-\d{2}-\d{4})\b/,
+    autoRedactable: true,
+    redactToken: "<REDACTED-SSN>",
+    // Reject values with an area of 000, 666 or 9xx, a group of 00, or a
+    // serial of 0000.
+    validate: (span) => {
+      const [a, b, c] = span.split("-");
+      return a !== "000" && b !== "00" && c !== "0000" && a !== "666" && a[0] !== "9";
+    },
+  },
+  {
+    id: "pii.cc",
+    tier: "MEDIUM",
+    category: "pii",
+    description: "Credit-card number (Luhn-valid)",
+    regex: /\b((?:\d[ \-]?){13,19})\b/,
+    autoRedactable: true,
+    redactToken: "<REDACTED-CC>",
+    validate: (span) => luhnValid(span),
+  },
+  {
+    id: "pii.ip_public",
+    tier: "MEDIUM",
+    category: "pii",
+    description: "Public IPv4 address",
+    regex: /\b(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\b/,
+    validate: (span) => isPublicIPv4(span),
+  },
+  {
+    id: "pii.wallet",
+    tier: "MEDIUM",
+    category: "pii",
+    description: "Crypto wallet address (ETH/BTC)",
+    regex: /\b(0x[a-fA-F0-9]{40}|bc1[a-z0-9]{25,39}|[13][a-km-zA-HJ-NP-Z1-9]{25,34})\b/,
+    validate: (span) => looksLikeWallet(span),
+  },
+
+  // ===== MEDIUM — internal-leak =====
+  {
+    id: "internal.hostname",
+    tier: "MEDIUM",
+    category: "internal",
+    description: "Internal hostname (*.internal/.corp/.local/.prod/.staging)",
+    regex: /\b([a-z0-9][a-z0-9\-]*\.(?:internal|corp|local|lan|prod|staging))\b/i,
+  },
+  {
+    id: "internal.url_private",
+    tier: "MEDIUM",
+    category: "internal",
+    description: "localhost URL with a non-trivial path",
+    regex: /(https?:\/\/(?:localhost|127\.0\.0\.1):\d{2,5}\/[^\s)]+)/,
+  },
+
+  // ===== MEDIUM — legal / damaging =====
+  {
+    id: "legal.nda_marker",
+    tier: "MEDIUM",
+    category: "legal",
+    description: "Confidentiality / NDA marker",
+    regex: /\b(CONFIDENTIAL|UNDER NDA|ATTORNEY[- ]CLIENT|PRIVILEGED|DO NOT DISTRIBUTE|EYES ONLY)\b/,
+  },
+  {
+    id: "legal.named_criticism",
+    tier: "MEDIUM",
+    category: "legal",
+    description: "Negative judgment near a capitalized full name (semantic pass is primary)",
+    regex: /\b(incompetent|negligent|fraudulent|fraud|fired|terminated|harassed|underperforming)\b/i,
+    // Require a Capitalized Two-Word name within the window.
+    nearRegex: /\b[A-Z][a-z]+ [A-Z][a-z]+\b/,
+    nearWindow: 80,
+  },
+
+  // ===== LOW — surface only =====
+  {
+    id: "internal.user_path",
+    tier: "LOW",
+    category: "internal",
+    description: "Absolute path under a user home dir",
+    regex: /(\/(?:Users|home)\/[a-z][a-z0-9_\-]+\/[^\s)]*)/,
+  },
+  {
+    id: "hygiene.todo",
+    tier: "LOW",
+    category: "hygiene",
+    description: "TODO(owner) marker carried into the artifact",
+    regex: /\b(TODO\([^)]+\))/,
+  },
+];
+
+/**
+ * Lookup by id, built once from `PATTERNS` at module load. If two patterns
+ * ever shared an id, the later entry would silently replace the earlier one.
+ */
+export const PATTERNS_BY_ID: Record<string, RedactPattern> = Object.fromEntries(
+  PATTERNS.map((p) => [p.id, p]),
+);