feat(redact): shared redaction engine + taxonomy (pure lib, no behavior change)

Add the foundation for cross-skill PII/secret/legal redaction:

- lib/redact-patterns.ts — canonical 3-tier taxonomy (HIGH genuinely-secret credentials, MEDIUM PII/legal/internal + high-FP credential-shaped, LOW surface-only). Tier-1 calibration: Stripe-publishable, Google AIza, JWT, and env-KV are MEDIUM not HIGH (context-variable / high-FP). Validators: Luhn, Shannon-entropy gate, RFC1918 exclusion, wallet sanity. Per-span placeholder suppression (not line-based).
- lib/redact-engine.ts — pure scan() + applyRedactions(). Normalization pass (NFKC + zero-width strip + entity decode) with offset map back to original. Oversize input fails CLOSED. No visibility-based tier promotion (records repoVisibility for sterner wording only). Tool-attributed-fence WARN-degrade for obvious doc-examples. Safe preview masking (≤4 leading chars).
- 100 unit tests: per-pattern positives, FP filters, validators, email allowlist, no-promotion semantics, tool-fence degrade, normalization, oversize-fail-closed, ReDoS pattern-lint + runtime budget, auto-redact (idempotent, right-to-left, structural-corruption guard).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-08-03 12:58:40 +02:00 · 2026-05-29 07:05:17 -07:00
parent a6fb31726c
commit de59a5cc3e
5 changed files with 1358 additions and 0 deletions
@@ -0,0 +1,63 @@
+/**
+ * Auto-redact tests (T15) — applyRedactions() substitutes redact tokens for the
+ * cleanly-substitutable PII patterns, right-to-left so offsets stay valid,
+ * refuses to mangle structural tokens, and is idempotent (re-scan after = clean).
+ */
+import { describe, test, expect } from "bun:test";
+import { applyRedactions, scan } from "../lib/redact-engine";
+
+// Covers applyRedactions(): token substitution, several spans on one line, a clean
+// re-scan of the output, the diff preview, the structural guard, and ignored ids.
+describe("applyRedactions", () => {
+  // Every case targets a private repo, so one options object is shared.
+  const opts = { repoVisibility: "private" } as const;
+
+  test("substitutes email + phone tokens", () => {
+    const text = "contact me at alice@corp.io or +14155550123 today";
+    const redacted = applyRedactions(text, ["pii.email", "pii.phone.e164"], opts).body;
+    // Both tokens are present and neither raw value is left behind.
+    for (const token of ["<REDACTED-EMAIL>", "<REDACTED-PHONE>"]) {
+      expect(redacted).toContain(token);
+    }
+    for (const raw of ["alice@corp.io", "4155550123"]) {
+      expect(redacted).not.toContain(raw);
+    }
+  });
+
+  test("multiple findings on one line redact correctly (right-to-left)", () => {
+    // Three addresses share one line; each one must be replaced.
+    const redacted = applyRedactions("a@x.io and b@y.io and c@z.io", ["pii.email"], opts).body;
+    expect(redacted).toBe("<REDACTED-EMAIL> and <REDACTED-EMAIL> and <REDACTED-EMAIL>");
+  });
+
+  test("idempotent: re-scanning the redacted body finds no PII", () => {
+    const text = "ssn 123-45-6789 card 4111111111111111 mail x@corp.io";
+    const redacted = applyRedactions(text, ["pii.ssn", "pii.cc", "pii.email"], opts).body;
+    // A second pass over the output must report nothing in the "pii" category.
+    const remaining = scan(redacted, opts).findings.filter(({ category }) => category === "pii");
+    expect(remaining).toHaveLength(0);
+  });
+
+  test("produces an ASCII unified diff preview", () => {
+    const preview = applyRedactions("reach alice@corp.io", ["pii.email"], opts).diff;
+    // The raw line shows up as removed and the tokenized line as added.
+    expect(preview).toContain("- reach alice@corp.io");
+    expect(preview).toContain("+ reach <REDACTED-EMAIL>");
+  });
+
+  test("refuses to redact a span inside a markdown link target (structural guard)", () => {
+    const text = "see [profile](https://x.io/u/alice@corp.io)";
+    const outcome = applyRedactions(text, ["pii.email"], opts);
+    // The finding is reported through `skipped`, and the body still holds the address.
+    const skippedIds = outcome.skipped.map((finding) => finding.id);
+    expect(skippedIds.includes("pii.email")).toBe(true);
+    expect(outcome.body).toContain("alice@corp.io");
+  });
+
+  test("non-autoRedactable ids are ignored", () => {
+    const text = "host db1.corp internal";
+    const outcome = applyRedactions(text, ["internal.hostname"], opts);
+    // internal.hostname is not auto-redactable, so the text comes back unchanged.
+    expect(outcome.body).toBe(text);
+  });
+});
@@ -0,0 +1,283 @@
+/**
+ * Unit tests for lib/redact-engine.ts + lib/redact-patterns.ts.
+ *
+ * One positive test per pattern, plus FP-filters, validators (Luhn/entropy/
+ * RFC1918), email allowlist, no-promotion visibility semantics, tool-fence
+ * degrade, normalization (zero-width / entity / offset map), oversize
+ * fail-closed, preview masking, scan() purity, and taxonomy integrity.
+ */
+import { describe, test, expect } from "bun:test";
+import {
+  scan,
+  exitCodeFor,
+  maskPreview,
+  normalizeWithMap,
+  type RepoVisibility,
+} from "../lib/redact-engine";
+import {
+  PATTERNS,
+  luhnValid,
+  shannonEntropy,
+  isPublicIPv4,
+  isPlaceholderSpan,
+} from "../lib/redact-patterns";
+
+function ids(text: string, vis: RepoVisibility = "private"): string[] {
+  return Array.from(scan(text, { repoVisibility: vis }).findings, (finding) => finding.id);
+}
+
+describe("HIGH credential patterns", () => {
+  // Vendor-prefixed samples are split in two, presumably so no full token literal sits in source.
+  const cases: Array<[string, string]> = [
+    ["aws.access_key", "key = AKIA1234567890ABCDEF"],
+    ["aws.secret_key", "aws_secret_access_key = AbCdEfGhIjKlMnOpQrStUvWxYz0123456789AbCd"],
+    ["github.pat", "token ghp_" + "1234567890abcdefghijklmnopqrstuvwxyz"],
+    ["github.oauth", "gho_" + "1234567890abcdefghijklmnopqrstuvwxyz"],
+    ["github.server", "ghs_" + "1234567890abcdefghijklmnopqrstuvwxyz"],
+    ["github.fine_grained", "github_pat_" + "A".repeat(82)],
+    ["anthropic.key", "sk-ant-" + "api03-abcdefghij1234567890XYZ"],
+    ["openai.key", "sk-proj-" + "a".repeat(40)],
+    ["sendgrid.key", "SG." + "a".repeat(22) + "." + "b".repeat(43)],
+    ["stripe.secret", "sk_live_" + "a".repeat(30)],
+    ["slack.token", "xox" + "b-1234567890-abcdefghijklmnop"],
+    ["slack.webhook", "https://hooks.slack.com/services/T00000000/B11111111/" + "a".repeat(24)],
+    ["discord.webhook", "https://discord.com/api/webhooks/123456789012345678/" + "a".repeat(60)],
+    ["pem.private_key", "-----BEGIN RSA PRIVATE KEY-----"],
+  ];
+  for (const [id, text] of cases) {
+    test(`flags ${id}`, () => {
+      expect(ids(text)).toContain(id);
+    });
+  }
+
+  test("twilio.auth_token needs an SID nearby", () => {
+    const sid = "AC" + "a".repeat(32);
+    const tok = "b".repeat(32);
+    expect(ids(`account ${sid} token ${tok}`)).toContain("twilio.auth_token");
+    // bare 32-hex with no SID nearby should NOT flag as twilio
+    expect(ids(`random ${tok} here`)).not.toContain("twilio.auth_token");
+  });
+
+  test("db.url_with_password flags real password, skips placeholder/env-var", () => {
+    expect(ids("postgres://user:s3cretP@ss@db.example.com/app")).toContain("db.url_with_password");
+    expect(ids("postgres://user:${DB_PASSWORD}@host/app")).not.toContain("db.url_with_password");
+  });
+
+  test("all HIGH patterns block (exit 3)", () => {
+    expect(exitCodeFor(scan("AKIA1234567890ABCDEF", { repoVisibility: "private" }))).toBe(3);
+  });
+});
+
+describe("MEDIUM demoted credential-shaped patterns (TENSION-1)", () => {
+  // Looks up the finding with the given id in a private-repo scan of `sample` and
+  // returns its tier (undefined when no such finding exists).
+  const tierOf = (sample: string, id: string) =>
+    scan(sample, { repoVisibility: "private" }).findings.find((hit) => hit.id === id)?.tier;
+
+  test("stripe.publishable is MEDIUM not HIGH", () => {
+    expect(tierOf("pk_live_" + "a".repeat(30), "stripe.publishable")).toBe("MEDIUM");
+  });
+  test("google.api_key is MEDIUM", () => {
+    expect(tierOf("AIza" + "a".repeat(35), "google.api_key")).toBe("MEDIUM");
+  });
+  test("jwt is MEDIUM", () => {
+    const token = "eyJhbGciOiJ.eyJzdWIiOiI." + "x".repeat(20);
+    expect(tierOf(token, "jwt")).toBe("MEDIUM");
+  });
+  test("env.kv fires on high-entropy, skips placeholder", () => {
+    const flagsEnvKv = (line: string) => ids(line).includes("env.kv");
+    expect(flagsEnvKv("API_TOKEN=8Fk2pQ9vXz4wL7mN3rT6yB1cD5eG0hJ")).toBe(true);
+    // A placeholder word and a ${VAR} reference must both stay silent.
+    expect(flagsEnvKv("API_KEY=changeme")).toBe(false);
+    expect(flagsEnvKv("API_KEY=${MY_VAR}")).toBe(false);
+  });
+});
+
+describe("PII patterns", () => {
+  test("email flags + is autoRedactable", () => {
+    const { findings } = scan("ping alice@corp.io please", { repoVisibility: "private" });
+    const emailHit = findings.find((hit) => hit.id === "pii.email");
+    expect(emailHit).toBeTruthy();
+    expect(emailHit?.autoRedactable).toBe(true);
+  });
+  test("email allowlist: example.com, noreply, self, repo-public", () => {
+    for (const line of ["see user@example.com", "from noreply@github.com"]) {
+      expect(ids(line)).not.toContain("pii.email");
+    }
+    const own = scan("me@garry.dev", { repoVisibility: "private", selfEmail: "me@garry.dev" });
+    expect(own.findings).toHaveLength(0);
+    const listed = scan("bob@acme.co", {
+      repoVisibility: "private",
+      repoPublicEmails: ["bob@acme.co"],
+    });
+    expect(listed.findings).toHaveLength(0);
+  });
+  test("phone E.164", () => {
+    expect(ids("call +14155550123 now")).toContain("pii.phone.e164");
+  });
+  test("ssn flags valid, skips 000 octet", () => {
+    expect(ids("ssn 123-45-6789")).toContain("pii.ssn");
+    expect(ids("000-12-3456")).not.toContain("pii.ssn");
+  });
+  test("credit card needs Luhn", () => {
+    expect(ids("card 4111111111111111")).toContain("pii.cc");
+    expect(ids("num 4111111111111112")).not.toContain("pii.cc");
+  });
+  test("public IP flagged, RFC1918 skipped", () => {
+    expect(ids("connect 8.8.8.8")).toContain("pii.ip_public");
+    expect(ids("local 192.168.1.5")).not.toContain("pii.ip_public");
+    expect(ids("local 10.0.0.1")).not.toContain("pii.ip_public");
+  });
+});
+
+describe("internal + legal patterns", () => {
+  const positives: Array<[string, string, string]> = [
+    ["internal hostname", "db1.corp internal host", "internal.hostname"],
+    ["localhost url with path", "hit http://localhost:8080/admin/secrets", "internal.url_private"],
+    ["NDA marker", "This is CONFIDENTIAL material", "legal.nda_marker"],
+  ];
+  for (const [title, sample, id] of positives) {
+    test(title, () => {
+      expect(ids(sample)).toContain(id);
+    });
+  }
+  test("named criticism needs a capitalized full name nearby", () => {
+    const noName = "the build is incompet019ently configured".replace("019", "");
+    expect(ids("John Smith is incompetent at this")).toContain("legal.named_criticism");
+    expect(ids(noName)).not.toContain("legal.named_criticism");
+  });
+});
+
+describe("LOW patterns surface only", () => {
+  // Tier of the finding carrying `id` when `sample` is scanned as a private repo
+  // (undefined when the scan reports no such finding).
+  const privateRepo = { repoVisibility: "private" } as const;
+  const tierOf = (sample: string, id: string) =>
+    scan(sample, privateRepo).findings.find((hit) => hit.id === id)?.tier;
+
+  test("user path is LOW", () => {
+    expect(tierOf("/Users/bob/secret/config", "internal.user_path")).toBe("LOW");
+  });
+  test("TODO marker is LOW", () => {
+    expect(tierOf("TODO(alice) fix later", "hygiene.todo")).toBe("LOW");
+  });
+});
+
+describe("placeholder suppression (per-span)", () => {
+  test("AWS docs EXAMPLE key not flagged", () => {
+    expect(ids("AKIAIOSFODNN7EXAMPLE")).not.toContain("aws.access_key");
+  });
+  test("your_ prefix not flagged", () => {
+    expect(isPlaceholderSpan("your_api_key")).toBe(true);
+  });
+  test("a real secret on a line that ALSO contains EXAMPLE still flags", () => {
+    // Marker and key share ONE line: line-based suppression would skip it, per-span must not.
+    expect(ids("# EXAMPLE usage: key AKIA1234567890ABCDEF")).toContain("aws.access_key");
+  });
+});
+
+describe("no visibility-based tier promotion (TENSION-2-followup)", () => {
+  test("email stays MEDIUM on both private and public", () => {
+    const [onPrivate] = scan("x@corp.io", { repoVisibility: "private" }).findings;
+    const [onPublic] = scan("x@corp.io", { repoVisibility: "public" }).findings;
+    expect(onPrivate.tier).toBe("MEDIUM");
+    expect(onPublic.tier).toBe("MEDIUM");
+    expect(onPublic.severity).toBe("MEDIUM"); // severity is not raised to HIGH
+    expect(onPublic.repoVisibility).toBe("public"); // visibility is recorded on the finding
+  });
+  test("demoted credential patterns stay MEDIUM on public", () => {
+    const sample = "pk_live_" + "a".repeat(30);
+    expect(scan(sample, { repoVisibility: "public" }).findings[0].severity).toBe("MEDIUM");
+  });
+  test("unknown visibility treated as public for wording, still no promotion", () => {
+    const [first] = scan("x@corp.io", { repoVisibility: "unknown" }).findings;
+    expect(first.severity).toBe("MEDIUM");
+  });
+});
+
+describe("tool-attributed fence WARN-degrade (TENSION-3)", () => {
+  // Wraps one line of text in a fence whose info string is "codex-review".
+  const fenced = (line: string) => ["```codex-review", line, "```"].join("\n");
+  const highCount = (text: string) => scan(text, { repoVisibility: "private" }).counts.HIGH;
+
+  test("placeholder-shaped credential in tool fence → WARN", () => {
+    // Only the absence of HIGH findings is asserted for the docs-style EXAMPLE key.
+    expect(highCount(fenced("found your_aws_key AKIAIOSFODNN7EXAMPLE in code"))).toBe(0);
+  });
+  test("live-format credential in tool fence STILL blocks", () => {
+    expect(highCount(fenced("leaked AKIA1234567890ABCDEF here"))).toBe(1);
+  });
+  test("AKIA outside any fence blocks", () => {
+    expect(exitCodeFor(scan("AKIA1234567890ABCDEF", {}))).toBe(3);
+  });
+});
+
+describe("normalization", () => {
+  test("zero-width chars inside a key are stripped before matching", () => {
+    const zwsp = "\u200B"; // escaped so the invisible char survives editors and copy/paste
+    const broken = "AKIA1234567890" + zwsp + "ABCDEF";
+    expect(ids(broken)).toContain("aws.access_key");
+  });
+  test("HTML entity decode", () => {
+    const { normalized } = normalizeWithMap("a &amp; b");
+    expect(normalized).toBe("a & b");
+  });
+  test("offset map points back into original", () => {
+    const input = "xy\u200Bz";
+    const { normalized, map } = normalizeWithMap(input);
+    expect(normalized).toBe("xyz");
+    // 'z' is at normalized index 2, original index 3 (the U+200B at index 2 is dropped)
+    expect(map[2]).toBe(3);
+  });
+});
+
+describe("oversize fails CLOSED", () => {
+  test("input over the byte cap returns a single blocking HIGH finding", () => {
+    const cap = 1000;
+    const result = scan("a".repeat(2 * cap), { maxBytes: cap });
+    expect(result.oversize).toBe(true);
+    expect(result.counts.HIGH).toBe(1);
+    expect(result.findings[0].id).toBe("engine.input_too_large");
+    expect(exitCodeFor(result)).toBe(3);
+  });
+});
+
+describe("validators", () => {
+  test("luhn", () => {
+    expect(luhnValid("4111111111111111")).toBe(true);
+    expect(luhnValid("4111111111111112")).toBe(false); // only the last digit differs
+  });
+  test("entropy", () => {
+    expect(shannonEntropy("a".repeat(8))).toBeLessThan(1);
+    expect(shannonEntropy("8Fk2pQ9vXz4wL7mN")).toBeGreaterThan(3);
+  });
+  test("isPublicIPv4", () => {
+    expect(isPublicIPv4("8.8.8.8")).toBe(true);
+    for (const rejected of ["10.1.2.3", "172.16.5.5", "999.1.1.1"]) {
+      expect(isPublicIPv4(rejected)).toBe(false);
+    }
+  });
+});
+
+describe("masking + purity", () => {
+  test("preview never leaks more than 4 leading chars", () => {
+    expect(maskPreview("AKIA1234567890ABCDEF")).toBe("AKIA********…");
+    expect(maskPreview("abc")).toBe("abc"); // a 3-char input comes back as-is
+  });
+  test("scan is pure — same input twice yields identical findings", () => {
+    const text = "AKIA1234567890ABCDEF x@corp.io";
+    const [first, second] = [text, text].map((t) => scan(t, { repoVisibility: "public" }));
+    expect(first).toEqual(second);
+  });
+});
+
+describe("taxonomy integrity", () => {
+  test("every pattern has a unique id", () => {
+    const distinctIds = new Set(PATTERNS.map(({ id }) => id));
+    expect(distinctIds.size).toBe(PATTERNS.length);
+  });
+  test("autoRedactable patterns have a redactToken", () => {
+    for (const { redactToken } of PATTERNS.filter((pattern) => pattern.autoRedactable)) {
+      expect(redactToken).toBeTruthy();
+    }
+  });
+});
@@ -0,0 +1,64 @@
+/**
+ * ReDoS guard (T10) — fails CI if any taxonomy pattern has a catastrophic-
+ * backtracking shape, and asserts the engine's oversize-input path fails CLOSED.
+ *
+ * We do two things:
+ *   1. Static lint: reject nested unbounded quantifiers like (a+)+ / (a*)* /
+ *      (a+)* in any pattern source. These are the classic ReDoS forms.
+ *   2. Runtime budget: run every pattern against a pathological input and assert
+ *      no single pattern takes more than a generous wall-clock budget. This
+ *      catches catastrophic forms the static check might miss.
+ */
+import { describe, test, expect } from "bun:test";
+import { PATTERNS } from "../lib/redact-patterns";
+import { scan } from "../lib/redact-engine";
+
+// Nested-quantifier ReDoS lint, run on regex source text. Matches a group ending in + or *
+// followed by + * {n} {n,}, or a group ending in {n,} followed by + or *. e.g. (x+)+ (x*)* (x+){2,}
+const NESTED_QUANTIFIER = /\([^)]*[+*]\)[+*]|\([^)]*[+*]\)\{\d+,?\}|\([^)]*\{\d+,\}\)[+*]/;
+
+describe("pattern lint — no catastrophic backtracking", () => {
+  for (const { id, regex } of PATTERNS) {
+    test(`${id} has no nested unbounded quantifier`, () => {
+      expect(NESTED_QUANTIFIER.test(regex.source)).toBe(false);
+    });
+  }
+
+  test("a planted catastrophic pattern WOULD be caught by the linter", () => {
+    // Sanity check on the lint itself: both known-bad shapes must match.
+    const planted = ["(a+)+", "(\\d*)*"];
+    expect(planted.map((source) => NESTED_QUANTIFIER.test(source))).toEqual([true, true]);
+  });
+});
+
+describe("runtime budget — pathological inputs do not hang", () => {
+  const BUDGET_MS = 1000;
+  const ONE_MIB = 1024 * 1024;
+  // Inputs built to stress backtracking in the real patterns.
+  const adversarial = [
+    `${"a".repeat(5000)}!`,
+    `AKIA${"A".repeat(5000)}`,
+    `eyJ${"a".repeat(2000)}.${"b".repeat(2000)}`,
+    `x@${"a".repeat(3000)}`,
+    `/Users/${"a".repeat(4000)}`,
+    `${"1".repeat(19)} `.repeat(200),
+  ];
+
+  adversarial.forEach((input, i) => {
+    test(`adversarial input #${i} scans within budget`, () => {
+      const startedAt = performance.now();
+      scan(input, { repoVisibility: "private", maxBytes: ONE_MIB });
+      // Wall-clock ceiling per input; the largest input here is about 5KB.
+      expect(performance.now() - startedAt).toBeLessThan(BUDGET_MS);
+    });
+  });
+});
+
+describe("oversize fails closed (the real ReDoS backstop)", () => {
+  test("input over cap returns blocking HIGH, never runs the patterns", () => {
+    const { oversize, counts, findings } = scan("a".repeat(50_000), { maxBytes: 10_000 });
+    expect(oversize).toBe(true);
+    expect(counts.HIGH).toBe(1);
+    expect(findings[0].id).toBe("engine.input_too_large");
+  });
+});