mirror of
https://github.com/garrytan/gstack.git
synced 2026-06-17 15:20:11 +02:00
feat(redact): shared redaction engine + taxonomy (pure lib, no behavior change)
Add the foundation for cross-skill PII/secret/legal redaction: - lib/redact-patterns.ts — canonical 3-tier taxonomy (HIGH genuinely-secret credentials, MEDIUM PII/legal/internal + high-FP credential-shaped, LOW surface-only). Tier-1 calibration: Stripe-publishable, Google AIza, JWT, and env-KV are MEDIUM not HIGH (context-variable / high-FP). Validators: Luhn, Shannon-entropy gate, RFC1918 exclusion, wallet sanity. Per-span placeholder suppression (not line-based). - lib/redact-engine.ts — pure scan() + applyRedactions(). Normalization pass (NFKC + zero-width strip + entity decode) with offset map back to original. Oversize input fails CLOSED. No visibility-based tier promotion (records repoVisibility for sterner wording only). Tool-attributed-fence WARN-degrade for obvious doc-examples. Safe preview masking (≤4 leading chars). - 100 unit tests: per-pattern positives, FP filters, validators, email allowlist, no-promotion semantics, tool-fence degrade, normalization, oversize-fail-closed, ReDoS pattern-lint + runtime budget, auto-redact (idempotent, right-to-left, structural-corruption guard). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,479 @@
|
||||
/**
 * redact-engine — pure scanning + auto-redaction over the shared taxonomy.
 *
 * No I/O. Deterministic. The CLI shim (`bin/gstack-redact`), the pre-push hook
 * (`bin/gstack-redact-prepush`), and tests all import from here.
 *
 * Key behaviors (locked in /plan-eng-review + two Codex passes):
 * - Normalization BEFORE matching (NFKC + strip zero-width + decode a small
 *   set of HTML entities) so Unicode-confusable / zero-width evasion fails.
 *   Findings map back to ORIGINAL offsets via an index map.
 * - ReDoS safety: a hard input-size cap that fails CLOSED (oversize input
 *   returns a single synthetic HIGH "input too large to scan safely" finding,
 *   so callers block rather than skip). Patterns are linear-time (lint-tested).
 * - NO visibility-based tier mutation. `repoVisibility` is recorded on each
 *   finding (drives sterner AUQ wording in the skill) but never promotes a
 *   MEDIUM to HIGH. (TENSION-2-followup.)
 * - Placeholder suppression is per-matched-span.
 * - Tool-attributed fences (code fences opened with an info string such as
 *   `codex-review` or `greptile`) degrade credential findings to a
 *   non-blocking WARN — UNLESS the span is a live-format credential the
 *   doc-example heuristic can't excuse. No nonce, no trust exemption (the
 *   marker scheme was dropped as theater).
 */
|
||||
|
||||
import {
|
||||
PATTERNS,
|
||||
PATTERNS_BY_ID,
|
||||
isPlaceholderSpan,
|
||||
type RedactPattern,
|
||||
type Tier,
|
||||
type Category,
|
||||
} from "./redact-patterns";
|
||||
|
||||
/** Visibility of the scanned repository; recorded on findings and results, never used to change a tier. */
export type RepoVisibility = "public" | "private" | "unknown";
|
||||
|
||||
/**
 * Effective severity of a finding: its pattern tier, or "WARN" for a finding
 * that is surfaced but does not block (tool-fence degrade).
 */
export type Severity = Tier | "WARN";
|
||||
|
||||
/** One reported match of a taxonomy pattern (or the synthetic oversize finding). */
export interface Finding {
  /** Id of the pattern that matched, e.g. "pii.email". */
  id: string;
  /** Tier of the pattern; never altered by repo visibility. */
  tier: Tier;
  /** Effective severity after tool-fence degrade. HIGH/MEDIUM/LOW or WARN. */
  severity: Severity;
  category: Category;
  description: string;
  /** 1-based line in the ORIGINAL (un-normalized) text. */
  line: number;
  /** 1-based column in the ORIGINAL text. */
  col: number;
  /** Safe-masked preview (never more than 4 leading chars of the secret). */
  preview: string;
  /** Whether this finding offers one-keystroke auto-redact (PII subset). */
  autoRedactable: boolean;
  /** Repo visibility at scan time — drives sterner AUQ wording, not the tier. */
  repoVisibility: RepoVisibility;
  /** True when degraded to WARN because it sat in a tool-attributed fence. */
  toolFenceDegraded?: boolean;
}
|
||||
|
||||
/** Optional knobs for `scan` / `applyRedactions`; every field may be omitted. */
export interface ScanOptions {
  /** Recorded on every finding and on the result; defaults to "unknown". */
  repoVisibility?: RepoVisibility;
  /** Extra allowlist entries (exact strings) that suppress a matched span. */
  allowlist?: string[];
  /** The invoking user's own email (from `git config user.email`) — allowlisted. */
  selfEmail?: string;
  /**
   * Emails already public in the repo (git log authors, package.json, CODEOWNERS).
   * Suppressed for `pii.email` since they're not a new leak.
   */
  repoPublicEmails?: string[];
  /** Hard byte cap. Oversize input fails CLOSED. Default 1 MiB. */
  maxBytes?: number;
}
|
||||
|
||||
/** Outcome of one `scan` call. */
export interface ScanResult {
  /** Findings ordered by line, then column, then pattern id. */
  findings: Finding[];
  /** Number of findings per effective severity. */
  counts: { HIGH: number; MEDIUM: number; LOW: number; WARN: number };
  repoVisibility: RepoVisibility;
  /** True when the input-size cap tripped (caller should BLOCK). */
  oversize: boolean;
}
|
||||
|
||||
/** Default for `ScanOptions.maxBytes`: input whose UTF-8 size exceeds this is not scanned and fails closed. */
const DEFAULT_MAX_BYTES = 1024 * 1024; // 1 MiB
|
||||
|
||||
/**
 * Domains whose addresses are suppressed for `pii.email`: anything directly at
 * `example.<tld>`. The second pattern (any alphabetic TLD of two or more
 * letters) already covers everything the first one matches; both are kept so
 * the allowlist behaves exactly as before. Subdomains such as
 * `mail.example.com` are NOT covered.
 */
const EMAIL_ALLOW_DOMAINS = [/@example\.(com|org|net)$/i, /@example\.[a-z]{2,}$/i];

/** Automated-sender local parts whose addresses are suppressed for `pii.email`. */
const EMAIL_ALLOW_LOCALPARTS = [/^noreply@/i, /^no-reply@/i, /^donotreply@/i];
|
||||
|
||||
// ── Normalization ─────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Invisible format characters dropped before matching: ZERO WIDTH SPACE,
 * ZERO WIDTH NON-JOINER, ZERO WIDTH JOINER, WORD JOINER and ZERO WIDTH
 * NO-BREAK SPACE (BOM). Written with \u escapes so the class cannot be emptied
 * by tooling that strips invisible characters (an empty class never matches).
 * Not global on purpose: it is only used with `.test()` on one character, and
 * a global regex would carry `lastIndex` state between calls.
 */
const ZERO_WIDTH = /[\u200B-\u200D\u2060\uFEFF]/;

/**
 * Entity names (without the surrounding `&` and `;`) and the character each
 * one decodes to. The table is keyed by bare name and the full entity text is
 * assembled below, so an HTML-decoding pass over this file cannot collapse an
 * entry into an identity mapping.
 */
const HTML_ENTITY_NAMES: ReadonlyArray<readonly [string, string]> = [
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["#39", "'"],
  ["apos", "'"],
];

/** Full entity text → decoded character. No entry is a prefix of another. */
const HTML_ENTITIES: Record<string, string> = Object.fromEntries(
  HTML_ENTITY_NAMES.map(([name, ch]): [string, string] => [`&${name};`, ch]),
);

/**
 * Normalize text for matching while producing an index map back to the original.
 *
 * Strategy: walk the original one code point at a time, expanding a small
 * fixed set of HTML entities, dropping zero-width characters, and applying
 * NFKC to each remaining code point. Normalization is per code point, so
 * sequences that only compose across several code points are left as they are.
 *
 * @param input Original (un-normalized) text.
 * @returns `normalized`, the text to match against, and `map`, where
 *   `map[k]` is the original UTF-16 offset that produced `normalized[k]`.
 *   `map` has exactly `normalized.length + 1` entries; the final sentinel is
 *   `input.length`, so an end offset equal to `normalized.length` maps to the
 *   end of the original.
 */
export function normalizeWithMap(input: string): {
  normalized: string;
  map: number[];
} {
  const out: string[] = [];
  const map: number[] = [];
  // Record `text` as output produced by the original offset `from`. One map
  // entry per UTF-16 unit keeps `map` index-aligned with the joined string
  // even when a replacement is longer than one unit.
  const emit = (text: string, from: number): void => {
    out.push(text);
    for (let k = 0; k < text.length; k++) map.push(from);
  };

  let i = 0;
  while (i < input.length) {
    // HTML entity expansion. Every entity starts with "&", so the table is
    // only consulted at an ampersand.
    if (input[i] === "&") {
      let consumed = 0;
      for (const [entity, replacement] of Object.entries(HTML_ENTITIES)) {
        if (input.startsWith(entity, i)) {
          emit(replacement, i);
          consumed = entity.length;
          break;
        }
      }
      if (consumed > 0) {
        i += consumed;
        continue;
      }
    }

    // Take a whole code point: NFKC on half of a surrogate pair is a no-op,
    // which would let astral compatibility characters through unfolded.
    const codePoint = input.codePointAt(i) ?? 0;
    const width = codePoint > 0xffff ? 2 : 1;
    const ch = input.slice(i, i + width);

    if (!ZERO_WIDTH.test(ch)) emit(ch.normalize("NFKC"), i);
    i += width;
  }

  // Sentinel so an offset == length maps to the original length.
  map.push(input.length);
  return { normalized: out.join(""), map };
}
|
||||
|
||||
// ── Offset → line/col on the ORIGINAL text ────────────────────────────────────
|
||||
|
||||
/**
 * Convert an offset into the ORIGINAL text to a 1-based line / column pair.
 * Columns count UTF-16 code units. Offsets past the end are treated as the
 * end of the text; negative offsets as 0.
 *
 * @param original The un-normalized text.
 * @param offset UTF-16 offset into `original`.
 * @returns The 1-based `line` and `col` of that offset.
 */
function lineColAt(original: string, offset: number): { line: number; col: number } {
  const end = Math.max(0, Math.min(offset, original.length));
  let line = 1;
  let lineStart = 0;
  // Hop from newline to newline instead of visiting every character; only
  // newlines strictly before `end` start a new line.
  for (
    let newline = original.indexOf("\n");
    newline !== -1 && newline < end;
    newline = original.indexOf("\n", newline + 1)
  ) {
    line += 1;
    lineStart = newline + 1;
  }
  return { line, col: end - lineStart + 1 };
}
|
||||
|
||||
// ── Safe preview masking ──────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Build the masked preview shown for a finding: the first 4 characters of the
 * span, then one `*` per remaining character (at most 8), then `…` when the
 * span is longer than 12 characters.
 *
 * NOTE(review): a span of 4 characters or fewer is returned unmasked.
 *
 * @param span The matched text.
 * @returns The masked preview string.
 */
export function maskPreview(span: string): string {
  const head = span.slice(0, 4);
  const hiddenCount = Math.max(0, span.length - 4);
  const stars = "*".repeat(Math.min(hiddenCount, 8));
  const ellipsis = span.length > 12 ? "…" : "";
  return head + stars + ellipsis;
}
|
||||
|
||||
// ── Tool-attributed fence detection ───────────────────────────────────────────
|
||||
|
||||
/** Opening fence line whose info string attributes the block to a tool. */
const TOOL_FENCE_INFO = /^```(codex-review|greptile|eval|codex|tool-output)\b/;

/**
 * Find the bodies of tool-attributed fenced code blocks.
 *
 * A block opens on a line matching `TOOL_FENCE_INFO` and closes on the next
 * line that starts with three backticks, whatever its info string. Fences
 * that are not tool-attributed are ignored entirely. An unterminated tool
 * fence extends to the end of the text.
 *
 * @param normalized The normalized text (offsets are in this text).
 * @returns `[start, end)` ranges in document order; `start` is the first
 *   character after the opening fence line, `end` is the start of the closing
 *   fence line.
 */
function toolFenceRanges(normalized: string): Array<[number, number]> {
  const ranges: Array<[number, number]> = [];
  let bodyStart = -1; // -1 while outside a tool fence
  let lineStart = 0;
  for (const text of normalized.split("\n")) {
    const nextLineStart = lineStart + text.length + 1; // +1 for the \n
    if (text.startsWith("```")) {
      if (bodyStart >= 0) {
        ranges.push([bodyStart, lineStart]); // up to start of closing fence
        bodyStart = -1;
      } else if (TOOL_FENCE_INFO.test(text)) {
        bodyStart = nextLineStart; // content starts after this line
      }
    }
    lineStart = nextLineStart;
  }
  // Unterminated → still cover its own body.
  if (bodyStart >= 0) ranges.push([bodyStart, normalized.length]);
  return ranges;
}
|
||||
|
||||
/**
 * Report whether `offset` falls inside any half-open `[start, end)` range.
 *
 * @param offset Offset to test.
 * @param ranges Ranges to test against; order does not matter.
 */
function inRanges(offset: number, ranges: Array<[number, number]>): boolean {
  return ranges.some(([start, end]) => start <= offset && offset < end);
}
|
||||
|
||||
/**
 * Doc-example heuristic: a credential span inside a tool fence still BLOCKS if
 * it looks like a LIVE credential (not an obvious placeholder/example). Only
 * spans that are clearly illustrative may be downgraded to WARN.
 *
 * Currently "clearly illustrative" means exactly "is a placeholder span".
 *
 * @param span The matched text.
 * @returns True when the span is an obvious placeholder/example.
 */
function isObviousDocExample(span: string): boolean {
  const illustrative = isPlaceholderSpan(span);
  return illustrative;
}
|
||||
|
||||
// ── Proximity check ───────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Proximity check: does `nearRegex` match anywhere within `window` characters
 * before the match start or after the match end?
 *
 * The probe is rebuilt without the `g` and `y` flags so the check neither
 * depends on nor mutates the caller's `lastIndex`, and a sticky pattern is not
 * pinned to the first character of the window.
 *
 * @param normalized The normalized text being scanned.
 * @param matchStart Start offset of the match in `normalized`.
 * @param matchEnd End offset (exclusive) of the match in `normalized`.
 * @param nearRegex Pattern that must appear nearby.
 * @param window Number of characters to look at on each side of the match.
 * @returns True when `nearRegex` matches inside the window (the window
 *   includes the match itself).
 */
function hasNear(
  normalized: string,
  matchStart: number,
  matchEnd: number,
  nearRegex: RegExp,
  window: number,
): boolean {
  const from = Math.max(0, matchStart - window);
  const to = Math.min(normalized.length, matchEnd + window);
  const probe = new RegExp(nearRegex.source, nearRegex.flags.replace(/[gy]/g, ""));
  return probe.test(normalized.slice(from, to));
}
|
||||
|
||||
// ── Email allowlist ───────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Decide whether an email address is exempt from `pii.email` reporting.
 *
 * An address is exempt when it is the invoking user's own address, is listed
 * in `repoPublicEmails` (both compared case-insensitively), is on an allowed
 * example domain, or has an allowed automated-sender local part.
 *
 * @param email The matched address.
 * @param opts Scan options carrying `selfEmail` and `repoPublicEmails`.
 * @returns True when the address must not be reported.
 */
function emailAllowed(email: string, opts: ScanOptions): boolean {
  const lower = email.toLowerCase();
  const isSelf = opts.selfEmail !== undefined && opts.selfEmail !== "" &&
    opts.selfEmail.toLowerCase() === lower;
  if (isSelf) return true;

  const alreadyPublic = (opts.repoPublicEmails ?? []).some(
    (known) => known.toLowerCase() === lower,
  );
  if (alreadyPublic) return true;

  return (
    EMAIL_ALLOW_DOMAINS.some((re) => re.test(email)) ||
    EMAIL_ALLOW_LOCALPARTS.some((re) => re.test(email))
  );
}
|
||||
|
||||
// ── The scan ──────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Scan text against every pattern in the taxonomy.
 *
 * Matching runs on the normalized text; each finding's line/column is mapped
 * back to the ORIGINAL text. A match is dropped when its span is a
 * placeholder, is in `opts.allowlist`, fails the pattern's validator, lacks
 * the pattern's required nearby context, or (for `pii.email`) is an allowed
 * address. Findings are de-duplicated per (pattern id, original offset).
 *
 * @param input Text to scan.
 * @param opts Optional scan settings; see `ScanOptions`.
 * @returns Findings sorted by line, column, then id, with per-severity
 *   counts. When the UTF-8 size of `input` exceeds the byte cap, nothing is
 *   scanned and the result holds a single synthetic HIGH finding with
 *   `oversize: true`.
 */
export function scan(input: string, opts: ScanOptions = {}): ScanResult {
  const repoVisibility: RepoVisibility = opts.repoVisibility ?? "unknown";
  const maxBytes = opts.maxBytes ?? DEFAULT_MAX_BYTES;

  // Fail CLOSED on oversize input. Check byte length BEFORE heavy work.
  const byteLen = Buffer.byteLength(input, "utf8");
  if (byteLen > maxBytes) {
    const tooLarge: Finding = {
      id: "engine.input_too_large",
      tier: "HIGH",
      severity: "HIGH",
      category: "secret",
      description: `Input too large to scan safely (${byteLen} > ${maxBytes} bytes) — blocking fail-closed`,
      line: 1,
      col: 1,
      preview: "",
      autoRedactable: false,
      repoVisibility,
    };
    return {
      findings: [tooLarge],
      counts: { HIGH: 1, MEDIUM: 0, LOW: 0, WARN: 0 },
      repoVisibility,
      oversize: true,
    };
  }

  const { normalized, map } = normalizeWithMap(input);
  const fenceRanges = toolFenceRanges(normalized);
  const allow = new Set(opts.allowlist ?? []);

  const findings: Finding[] = [];
  // Dedup by (id, original-offset) so overlapping global matches don't double-count.
  const seen = new Set<string>();

  for (const pat of PATTERNS) {
    // Fresh regex per pattern: the engine adds `g`/`m` and owns `lastIndex`.
    const re = new RegExp(pat.regex.source, withFlags(pat.regex.flags));
    let m: RegExpExecArray | null;
    while ((m = re.exec(normalized)) !== null) {
      // Guard against zero-width matches looping forever.
      if (m.index === re.lastIndex) re.lastIndex++;

      // Capture group 1, when present, is the secret span; otherwise the
      // whole match is.
      const span = m[1] ?? m[0];
      const spanStartInMatch = m[1] !== undefined ? m[0].indexOf(m[1]) : 0;
      const normOffset = m.index + Math.max(0, spanStartInMatch);

      // Per-span placeholder suppression, then the caller's exact-string allowlist.
      if (isPlaceholderSpan(span) || allow.has(span)) continue;

      // Pattern-specific validators (Luhn, entropy, RFC1918, etc).
      if (pat.validate && !pat.validate(span, m)) continue;

      // Proximity requirement, measured around the whole match.
      const matchEnd = m.index + m[0].length;
      if (
        pat.nearRegex &&
        !hasNear(normalized, m.index, matchEnd, pat.nearRegex, pat.nearWindow ?? 100)
      ) {
        continue;
      }

      // Email allowlist (layered on top of the pattern).
      if (pat.id === "pii.email" && emailAllowed(span, opts)) continue;

      const origOffset = map[Math.min(normOffset, map.length - 1)] ?? 0;
      const key = `${pat.id}:${origOffset}`;
      if (seen.has(key)) continue;
      seen.add(key);

      const { line, col } = lineColAt(input, origOffset);

      // Tool-fence degrade: only credential-category, only obvious doc examples.
      // NOTE(review): isObviousDocExample delegates to isPlaceholderSpan, and
      // placeholder spans were already skipped above, so as written this
      // condition cannot be true and no finding is ever degraded to WARN.
      // Confirm the intended ordering before relying on WARN findings.
      const degrade =
        pat.category === "secret" &&
        inRanges(normOffset, fenceRanges) &&
        isObviousDocExample(span);
      const severity: Severity = degrade ? "WARN" : pat.tier;

      findings.push({
        id: pat.id,
        tier: pat.tier,
        severity,
        category: pat.category,
        description: pat.description,
        line,
        col,
        preview: maskPreview(span),
        autoRedactable: !!pat.autoRedactable,
        repoVisibility,
        ...(degrade ? { toolFenceDegraded: true } : {}),
      });
    }
  }

  // Stable order: by line, then col, then id.
  findings.sort((a, b) => a.line - b.line || a.col - b.col || a.id.localeCompare(b.id));

  const counts = { HIGH: 0, MEDIUM: 0, LOW: 0, WARN: 0 };
  for (const f of findings) counts[f.severity] += 1;

  return { findings, counts, repoVisibility, oversize: false };
}
|
||||
|
||||
/**
 * Return `flags` with `g` and `m` appended when they are not already present.
 * Existing flags keep their order; `g` is appended before `m`.
 *
 * @param flags A RegExp flags string.
 * @returns The flags string including both `g` and `m`.
 */
function withFlags(flags: string): string {
  const needsGlobal = flags.includes("g") ? "" : "g";
  const needsMultiline = flags.includes("m") ? "" : "m";
  return flags + needsGlobal + needsMultiline;
}
|
||||
|
||||
// ── Auto-redaction ────────────────────────────────────────────────────────────
|
||||
|
||||
/** Outcome of one `applyRedactions` call. */
export interface RedactResult {
  /** The input with every applied substitution in place. */
  body: string;
  /** ASCII unified-diff preview of the substitutions. */
  diff: string;
  /** Findings that could NOT be auto-redacted (structural-corruption guard). */
  skipped: Finding[];
}
|
||||
|
||||
/** A span located in the ORIGINAL text, plus the masked preview of its normalized form. */
type LocatedSpan = { start: number; end: number; preview: string };

/** Result shape of `normalizeWithMap`, passed around to avoid re-normalizing. */
type NormalizedText = { normalized: string; map: number[] };

/**
 * Substitute redact tokens for the given finding ids, right-to-left so offsets
 * stay valid.
 *
 * A finding ends up in `skipped` (so the skill can drop the user to manual
 * edit rather than silently mangling or silently missing output) when:
 * - its span sits inside a structural token (markdown link target, JSON
 *   string value);
 * - its span cannot be re-located in the text; or
 * - its span overlaps a different span that was already replaced.
 * A second finding for exactly the same span as one already replaced is
 * ignored, since that text is gone.
 *
 * @param input Text to redact.
 * @param findingIds Pattern ids to redact; every auto-redactable finding with
 *   one of these ids is targeted.
 * @param opts Scan options, forwarded to `scan`.
 * @returns The redacted body, a `- old line` / `+ new line` preview in
 *   top-to-bottom order, and the findings that were not redacted.
 */
export function applyRedactions(
  input: string,
  findingIds: string[],
  opts: ScanOptions = {},
): RedactResult {
  const ids = new Set(findingIds);
  const { findings } = scan(input, opts);
  const candidates = findings.filter((f) => ids.has(f.id) && f.autoRedactable);

  const skipped: Finding[] = [];
  if (candidates.length === 0) return { body: input, diff: "", skipped };

  // Locate spans on the same normalized text scan() matched against, once.
  const norm = normalizeWithMap(input);
  const spanCache = new Map<string, Map<number, LocatedSpan>>();
  const targets: Array<{ f: Finding; start: number; end: number }> = [];
  for (const f of candidates) {
    const { start, end } = locateSpan(input, f, norm, spanCache);
    if (start < 0) {
      skipped.push(f); // never drop an unredacted finding silently
    } else {
      targets.push({ f, start, end });
    }
  }

  // Right-to-left so earlier offsets remain valid after splicing.
  targets.sort((a, b) => b.start - a.start);

  const diffPairs: string[][] = [];
  let body = input;
  let applied: { start: number; end: number } | null = null;

  for (const t of targets) {
    // Everything at or after `applied.start` has already been rewritten, so
    // offsets reaching into it no longer describe the original text.
    if (applied !== null && t.end > applied.start) {
      const sameSpan = t.start === applied.start && t.end === applied.end;
      if (!sameSpan) skipped.push(t.f);
      continue;
    }
    if (inStructuralToken(body, t.start, t.end)) {
      skipped.push(t.f);
      continue;
    }
    const token = PATTERNS_BY_ID[t.f.id]?.redactToken ?? "<REDACTED>";
    const before = lineContaining(body, t.start);
    body = body.slice(0, t.start) + token + body.slice(t.end);
    const after = lineContaining(body, t.start);
    diffPairs.push([`- ${before}`, `+ ${after}`]);
    applied = { start: t.start, end: t.end };
  }

  // Pairs were collected bottom-up; flip the pairs (not the lines) so the
  // preview reads top-down with each "-" line before its "+" line.
  const diff = diffPairs.reverse().flat().join("\n");
  return { body, diff, skipped };
}

/**
 * Re-locate a finding's span in the ORIGINAL text.
 *
 * The finding's 1-based line/column is converted back to an original offset,
 * then looked up among the pattern's matches on the NORMALIZED text (the same
 * text `scan` matched), mapped back to original offsets. The hit is accepted
 * only when its masked preview equals the finding's preview.
 *
 * @param input The original text.
 * @param f The finding to locate.
 * @param norm Normalized form of `input`; computed when omitted.
 * @param cache Per-pattern span index reused across calls; optional.
 * @returns `[start, end)` offsets in `input`, or `{ start: -1, end: -1 }`
 *   when the span cannot be located.
 */
function locateSpan(
  input: string,
  f: Finding,
  norm: NormalizedText = normalizeWithMap(input),
  cache: Map<string, Map<number, LocatedSpan>> = new Map(),
): { start: number; end: number } {
  const notFound = { start: -1, end: -1 };
  const pat = PATTERNS_BY_ID[f.id];
  if (!pat) return notFound;

  // Re-derive the offset from line/col on the original text.
  let offset = 0;
  for (let line = 1; line < f.line; line++) {
    const newline = input.indexOf("\n", offset);
    if (newline === -1) return notFound; // finding points past the last line
    offset = newline + 1;
  }
  offset += f.col - 1;

  let spans = cache.get(f.id);
  if (spans === undefined) {
    spans = indexPatternSpans(pat.regex, norm);
    cache.set(f.id, spans);
  }
  const hit = spans.get(offset);
  if (hit === undefined || hit.preview !== f.preview) return notFound;
  return { start: hit.start, end: hit.end };
}

/**
 * Run one pattern over the normalized text exactly the way `scan` does and
 * index every match's span by its ORIGINAL start offset (first match wins).
 */
function indexPatternSpans(regex: RegExp, norm: NormalizedText): Map<number, LocatedSpan> {
  const { normalized, map } = norm;
  const lastIndex = map.length - 1;
  const spans = new Map<number, LocatedSpan>();
  const re = new RegExp(regex.source, withFlags(regex.flags));
  let m: RegExpExecArray | null;
  while ((m = re.exec(normalized)) !== null) {
    // Guard against zero-width matches looping forever.
    if (m.index === re.lastIndex) re.lastIndex++;
    const span = m[1] ?? m[0];
    const normStart = m.index + (m[1] !== undefined ? Math.max(0, m[0].indexOf(m[1])) : 0);
    const start = map[Math.min(normStart, lastIndex)] ?? 0;
    // Mapping the END through the index map makes the original range cover
    // any zero-width characters or entity text inside the span.
    const end = map[Math.min(normStart + span.length, lastIndex)] ?? start;
    if (end > start && !spans.has(start)) {
      spans.set(start, { start, end, preview: maskPreview(span) });
    }
  }
  return spans;
}
|
||||
|
||||
/**
 * Structural-corruption guard: report whether `[start, end)` sits somewhere a
 * substitution should not be made automatically.
 *
 * Two shapes are recognised:
 * - Markdown link target `[text](...span...)`: scanning left from the span
 *   reaches `](` before any `)` or whitespace, and scanning right reaches `)`
 *   before any whitespace.
 * - JSON string value: the span is immediately preceded by `:`, optional
 *   whitespace and an opening `"`, and immediately followed by `"` — i.e. the
 *   span is the entire quoted value.
 *
 * @param body Text the offsets refer to.
 * @param start Start offset of the span.
 * @param end End offset (exclusive) of the span.
 */
function inStructuralToken(body: string, start: number, end: number): boolean {
  const isBlank = (c: string | undefined): boolean => c === " " || c === "\t" || c === "\n";

  // Markdown link target: look left for the "](" that opens it.
  let i = start - 1;
  while (i >= 0) {
    const ch = body[i];
    if (ch === ")" || isBlank(ch)) break;
    if (ch === "(" && i > 0 && body[i - 1] === "]") {
      // Opened: the target must close before the next whitespace.
      for (let j = end; j < body.length; j++) {
        const c = body[j];
        if (c === ")") return true;
        if (isBlank(c)) break;
      }
      break;
    }
    i--;
  }

  // JSON string value: "key": "<span>".
  const before = body.slice(Math.max(0, start - 80), start);
  const after = body.slice(end, Math.min(body.length, end + 4));
  return /:\s*"$/.test(before) && after.startsWith('"');
}
|
||||
|
||||
/**
 * Return the line of `body` that contains `offset`, without its newline.
 *
 * @param body Text to read from.
 * @param offset Offset inside the wanted line.
 */
function lineContaining(body: string, offset: number): string {
  const previousNewline = body.lastIndexOf("\n", offset - 1);
  const nextNewline = body.indexOf("\n", offset);
  const lineEnd = nextNewline === -1 ? body.length : nextNewline;
  return body.slice(previousNewline + 1, lineEnd);
}
|
||||
|
||||
// ── Exit-code helper for the CLI shim ─────────────────────────────────────────
|
||||
|
||||
/**
 * Map a scan result to the CLI exit code.
 *
 * @param result The scan result to grade.
 * @returns 3 when any HIGH finding is present, otherwise 2 when any MEDIUM
 *   finding is present, otherwise 0. LOW and WARN counts do not gate.
 */
export function exitCodeFor(result: ScanResult): 0 | 2 | 3 {
  const { HIGH, MEDIUM } = result.counts;
  if (HIGH > 0) return 3;
  return MEDIUM > 0 ? 2 : 0;
}
|
||||
@@ -0,0 +1,469 @@
|
||||
/**
 * redact-patterns — the canonical redaction taxonomy.
 *
 * Single source of truth shared by `lib/redact-engine.ts`, `bin/gstack-redact`,
 * `bin/gstack-redact-prepush`, and (via `scripts/resolvers/redact-doc.ts`) the
 * generated SKILL.md docs for /spec, /ship, /cso, /document-release, and
 * /document-generate.
 *
 * Design notes (locked in /plan-eng-review + two Codex passes):
 *
 * - Three tiers. HIGH = genuinely-secret credentials (block). MEDIUM = PII,
 *   legal/damaging, internal-leak, plus credential-shaped patterns that have
 *   high false-positive rates (confirm via AskUserQuestion). LOW = surface only.
 * - NO wholesale MEDIUM->HIGH promotion on public repos (TENSION-2-followup).
 *   Public repos get sterner per-finding confirmation, not auto-block. The
 *   engine never mutates a finding's tier based on visibility.
 * - Tier-1 calibration: a gate that cries wolf gets ignored. Stripe
 *   publishable keys, Google AIza keys, JWTs, and env-style KV are MEDIUM, not
 *   HIGH (they are context-variable / high-FP). Only genuinely-secret
 *   credentials block.
 * - ReDoS safety: every pattern here MUST be linear-time (no nested unbounded
 *   quantifiers). `test/redact-pattern-lint.test.ts` fails CI on a catastrophic
 *   form. The engine also enforces a hard input-size cap that fails CLOSED.
 * - Placeholder suppression is per-matched-span, not per-line.
 *
 * Pattern matching contract: every `regex` is used with the global+multiline
 * flags the engine applies (`g`, `m`). Capture group 1, when present, is the
 * "secret span" the engine masks and (for proximity rules) anchors on; when
 * absent, match[0] is the span.
 */
|
||||
|
||||
/** Pattern tier: HIGH blocks, MEDIUM is confirmed with the user, LOW is surfaced only. */
export type Tier = "HIGH" | "MEDIUM" | "LOW";
|
||||
|
||||
/** Kind of information a pattern detects. */
export type Category =
  | "secret"
  | "pii"
  | "legal"
  | "internal"
  | "hygiene";
|
||||
|
||||
/** One entry of the redaction taxonomy. */
export interface RedactPattern {
  /** Stable dotted id, e.g. "aws.access_key". Used in findings + tests. */
  id: string;
  tier: Tier;
  category: Category;
  /** Human-readable one-liner for the findings table + docs. */
  description: string;
  /**
   * The detection regex. Linter-enforced linear-time. The engine adds the
   * `gm` flags; do not bake `g`/`m` into the source here (keeps `.source`
   * clean for the docs table and avoids double-global bugs).
   */
  regex: RegExp;
  /**
   * Patterns whose redaction is unambiguous enough to offer one-keystroke
   * auto-redact at MEDIUM tier (email / phone / ssn / cc). The engine wires
   * the `<REDACTED-*>` replacement token from `redactToken`.
   */
  autoRedactable?: boolean;
  /** Replacement token for auto-redact, e.g. "<REDACTED-EMAIL>". */
  redactToken?: string;
  /**
   * Extra validator run AFTER the regex matches; it must return true for the
   * match to count. Used for Luhn (credit cards), entropy (env-KV), checksum
   * (crypto wallets), RFC1918-exclusion (public IPs), etc. Receives the
   * matched secret span (group 1 or match[0]) and the full match array.
   */
  validate?: (span: string, match: RegExpExecArray) => boolean;
  /**
   * Proximity requirement: the pattern only counts if `nearRegex` also matches
   * within `nearWindow` chars of the match. Used for AWS secret keys (need
   * `aws_secret_access_key` nearby) and Twilio auth tokens (need an SID nearby).
   */
  nearRegex?: RegExp;
  nearWindow?: number;
}
|
||||
|
||||
// ── Validators ──────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Luhn checksum — credit-card validity.
 *
 * Spaces and dashes are stripped first; what remains must be 13 to 19 ASCII
 * digits, otherwise the span is rejected without computing a checksum.
 *
 * @param span Candidate card number, optionally grouped with spaces/dashes.
 * @returns True when the digits pass the Luhn check.
 */
export function luhnValid(span: string): boolean {
  const digits = span.replace(/[ \-]/g, "");
  if (!/^\d{13,19}$/.test(digits)) return false;

  let sum = 0;
  // `fromRight` is the 0-based position counted from the last digit; every
  // second digit (odd positions) is doubled, with 9 subtracted when the
  // doubled value exceeds 9.
  for (let fromRight = 0; fromRight < digits.length; fromRight++) {
    const digit = digits.charCodeAt(digits.length - 1 - fromRight) - 48;
    if (fromRight % 2 === 1) {
      const doubled = digit * 2;
      sum += doubled > 9 ? doubled - 9 : doubled;
    } else {
      sum += digit;
    }
  }
  return sum % 10 === 0;
}
|
||||
|
||||
/**
 * Shannon entropy in bits per character. Used to gate env-style KV (skip
 * placeholders).
 *
 * Characters are counted as Unicode code points, and the same count is used
 * as the denominator, so the symbol probabilities always sum to 1 even when
 * the string contains characters outside the Basic Multilingual Plane.
 *
 * @param s String to measure.
 * @returns Entropy in bits per character; 0 for the empty string.
 */
export function shannonEntropy(s: string): number {
  const counts = new Map<string, number>();
  let total = 0;
  for (const ch of s) {
    counts.set(ch, (counts.get(ch) ?? 0) + 1);
    total += 1;
  }
  if (total === 0) return 0;

  let bits = 0;
  for (const count of counts.values()) {
    const p = count / total;
    bits -= p * Math.log2(p);
  }
  return bits;
}
|
||||
|
||||
/**
 * True when an IPv4 string is a public address.
 *
 * Returns false for anything that is not four dot-separated groups of 1–3
 * digits each ≤ 255, and for these ranges: 0.0.0.0/8 (this-network),
 * 10.0.0.0/8, 100.64.0.0/10 (CGNAT), 127.0.0.0/8 (loopback), 169.254.0.0/16
 * (link-local), 172.16.0.0/12, 192.168.0.0/16, and 224.0.0.0 and above
 * (multicast / reserved).
 *
 * NOTE(review): no other special-purpose block is excluded, so an address
 * from a documentation-only range is classed as public here.
 *
 * @param ip Dotted-quad candidate.
 */
export function isPublicIPv4(ip: string): boolean {
  const parts = /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/.exec(ip);
  if (parts === null) return false;

  const octets = parts.slice(1, 5).map(Number);
  if (octets.some((n) => n > 255)) return false;

  const [a, b] = octets;
  const nonPublic =
    a === 0 || // this-network
    a === 10 || // 10.0.0.0/8
    a === 127 || // loopback
    (a === 100 && b >= 64 && b <= 127) || // CGNAT 100.64.0.0/10
    (a === 169 && b === 254) || // link-local
    (a === 172 && b >= 16 && b <= 31) || // 172.16.0.0/12
    (a === 192 && b === 168) || // 192.168.0.0/16
    a >= 224; // multicast / reserved
  return !nonPublic;
}
|
||||
|
||||
/**
 * Cheap sanity check for a crypto-wallet address.
 *
 * EIP-55 checksum is out of scope (heavy). A `0x` + 40-hex-digit span is
 * accepted unless all 40 digits are the same character (0x000…0 / 0xfff…f
 * style vanity strings), which cuts the worst false positives. Any other span
 * (bech32 / base58) gets a length sanity check only: 26 to 62 characters.
 *
 * @param span Candidate address.
 */
function looksLikeWallet(span: string): boolean {
  const isHexAddress = /^0x[a-fA-F0-9]{40}$/.test(span);
  if (!isHexAddress) {
    // bech32 / base58 — length sanity only
    return span.length >= 26 && span.length <= 62;
  }
  const hexDigits = span.slice(2).toLowerCase();
  const singleRepeatedDigit = /^(.)\1{39}$/.test(hexDigits);
  return !singleRepeatedDigit;
}
|
||||
|
||||
// ── Placeholder suppression (per-matched-span, NOT per-line) ─────────────────
|
||||
|
||||
// Structural placeholder forms — apply to ANY span (including URLs).
const PLACEHOLDER_STRUCTURAL = [
  /^your[_-]/i,
  /^<[^>]*>$/, // <REDACTED-FOO>, <your-key>
  /^\*+$/, // all-asterisks mask
  /^x{6,}$/i, // xxxxxx mask
];

// Placeholder words (example/test/dummy/...). These are NOT applied to
// compound spans containing `://` or `@`, because a legit URL/host can contain
// "example" (e.g. db.example.com) without being a placeholder secret. AWS docs
// keys like AKIAIOSFODNN7EXAMPLE are bare tokens, so the guard still catches
// them. Only `example` and `test_key`-style words match anywhere in the span;
// the remaining entries are anchored to the start (or the whole) of the span.
const PLACEHOLDER_SUBSTRING = [
  /example/i, // AKIAIOSFODNN7EXAMPLE etc — AWS docs convention
  /^changeme$/i,
  /^redacted/i,
  /^placeholder/i,
  /^dummy/i,
  /^fake/i,
  /test[_-]?(key|token|secret)/i,
];

/**
 * A finding is suppressed only if the MATCHED SPAN itself is a placeholder
 * form — not merely co-located on a line with the word EXAMPLE. This is the
 * tightened rule from the Codex review (line-based suppression was dangerous).
 *
 * @param span The matched text.
 * @returns True when the span has a structural placeholder form, or — for
 *   spans containing neither `://` nor `@` — matches a placeholder word.
 */
export function isPlaceholderSpan(span: string): boolean {
  const matchesAny = (table: RegExp[]): boolean => table.some((re) => re.test(span));

  if (matchesAny(PLACEHOLDER_STRUCTURAL)) return true;

  const isCompound = span.includes("://") || span.includes("@");
  return !isCompound && matchesAny(PLACEHOLDER_SUBSTRING);
}
|
||||
|
||||
// ── The taxonomy ─────────────────────────────────────────────────────────────
|
||||
|
||||
export const PATTERNS: RedactPattern[] = [
|
||||
// ===== HIGH — genuinely-secret credentials (block) =====
|
||||
{
|
||||
id: "aws.access_key",
|
||||
tier: "HIGH",
|
||||
category: "secret",
|
||||
description: "AWS access key ID (AKIA…)",
|
||||
regex: /\b(AKIA[0-9A-Z]{16})\b/,
|
||||
},
|
||||
{
|
||||
id: "aws.secret_key",
|
||||
tier: "HIGH",
|
||||
category: "secret",
|
||||
description: "AWS secret access key (with aws_secret_access_key nearby)",
|
||||
regex: /\b([A-Za-z0-9/+=]{40})\b/,
|
||||
nearRegex: /aws.{0,3}secret.{0,3}access.{0,3}key/i,
|
||||
nearWindow: 100,
|
||||
},
|
||||
{
|
||||
id: "github.pat",
|
||||
tier: "HIGH",
|
||||
category: "secret",
|
||||
description: "GitHub personal access token (classic)",
|
||||
regex: /\b(ghp_[A-Za-z0-9]{36})\b/,
|
||||
},
|
||||
{
|
||||
id: "github.oauth",
|
||||
tier: "HIGH",
|
||||
category: "secret",
|
||||
description: "GitHub OAuth token",
|
||||
regex: /\b(gho_[A-Za-z0-9]{36})\b/,
|
||||
},
|
||||
{
|
||||
id: "github.server",
|
||||
tier: "HIGH",
|
||||
category: "secret",
|
||||
description: "GitHub server-to-server token",
|
||||
regex: /\b(ghs_[A-Za-z0-9]{36})\b/,
|
||||
},
|
||||
{
|
||||
id: "github.fine_grained",
|
||||
tier: "HIGH",
|
||||
category: "secret",
|
||||
description: "GitHub fine-grained PAT",
|
||||
regex: /\b(github_pat_[A-Za-z0-9_]{82})\b/,
|
||||
},
|
||||
{
|
||||
id: "anthropic.key",
|
||||
tier: "HIGH",
|
||||
category: "secret",
|
||||
description: "Anthropic API key",
|
||||
regex: /\b(sk-ant-[A-Za-z0-9_\-]{20,})\b/,
|
||||
},
|
||||
{
|
||||
id: "openai.key",
|
||||
tier: "HIGH",
|
||||
category: "secret",
|
||||
description: "OpenAI API key (incl. sk-proj-)",
|
||||
regex: /\b(sk-(?:proj-)?[A-Za-z0-9]{32,})\b/,
|
||||
},
|
||||
{
|
||||
id: "sendgrid.key",
|
||||
tier: "HIGH",
|
||||
category: "secret",
|
||||
description: "SendGrid API key",
|
||||
regex: /\b(SG\.[A-Za-z0-9_\-]{22}\.[A-Za-z0-9_\-]{43})\b/,
|
||||
},
|
||||
{
|
||||
id: "stripe.secret",
|
||||
tier: "HIGH",
|
||||
category: "secret",
|
||||
description: "Stripe live SECRET key",
|
||||
regex: /\b(sk_live_[A-Za-z0-9]{24,})\b/,
|
||||
},
|
||||
{
|
||||
id: "slack.token",
|
||||
tier: "HIGH",
|
||||
category: "secret",
|
||||
description: "Slack token (bot/user/app)",
|
||||
regex: /\b(xox[baprs]-[A-Za-z0-9-]{10,})\b/,
|
||||
},
|
||||
{
|
||||
id: "slack.webhook",
|
||||
tier: "HIGH",
|
||||
category: "secret",
|
||||
description: "Slack incoming webhook URL",
|
||||
regex: /(https:\/\/hooks\.slack\.com\/services\/T[A-Z0-9]+\/B[A-Z0-9]+\/[A-Za-z0-9]{24})/,
|
||||
},
|
||||
{
|
||||
id: "discord.webhook",
|
||||
tier: "HIGH",
|
||||
category: "secret",
|
||||
description: "Discord webhook URL",
|
||||
regex: /(https:\/\/(?:canary\.|ptb\.)?discord(?:app)?\.com\/api\/webhooks\/[0-9]{17,20}\/[A-Za-z0-9_\-]{60,})/,
|
||||
},
|
||||
{
|
||||
id: "twilio.auth_token",
|
||||
tier: "HIGH",
|
||||
category: "secret",
|
||||
description: "Twilio auth token (32 hex, with an Account SID nearby)",
|
||||
regex: /\b([a-f0-9]{32})\b/,
|
||||
nearRegex: /\bAC[a-f0-9]{32}\b/,
|
||||
nearWindow: 200,
|
||||
},
|
||||
{
|
||||
id: "pem.private_key",
|
||||
tier: "HIGH",
|
||||
category: "secret",
|
||||
description: "PEM private key block",
|
||||
regex: /(-----BEGIN (?:RSA |EC |DSA |OPENSSH |PGP |ENCRYPTED )?PRIVATE KEY-----)/,
|
||||
},
|
||||
{
|
||||
id: "db.url_with_password",
|
||||
tier: "HIGH",
|
||||
category: "secret",
|
||||
description: "Database URL with embedded password",
|
||||
regex: /\b((?:postgres(?:ql)?|mysql|mongodb(?:\+srv)?|redis|amqp):\/\/[^:\s/@]+:[^@\s/]+@[^\s/]+)/,
|
||||
// Skip when the password segment is itself a placeholder.
|
||||
validate: (span) => {
|
||||
const m = span.match(/:\/\/[^:]+:([^@]+)@/);
|
||||
const pw = m?.[1] ?? "";
|
||||
return !isPlaceholderSpan(pw) && pw !== "" && !/^\$\{?[A-Z_]+\}?$/.test(pw);
|
||||
},
|
||||
},
|
||||
{
|
||||
id: "creds.basic_auth_url",
|
||||
tier: "HIGH",
|
||||
category: "secret",
|
||||
description: "HTTP(S) URL with embedded basic-auth credentials",
|
||||
regex: /(https?:\/\/[^:\s/@]+:[^@\s/]+@[^\s/]+)/,
|
||||
validate: (span) => {
|
||||
const m = span.match(/:\/\/[^:]+:([^@]+)@/);
|
||||
const pw = m?.[1] ?? "";
|
||||
return !isPlaceholderSpan(pw) && pw !== "" && !/^\$\{?[A-Z_]+\}?$/.test(pw);
|
||||
},
|
||||
},
|
||||
|
||||
// ===== MEDIUM — demoted credential-shaped (high-FP / context-variable) =====
|
||||
{
|
||||
id: "stripe.publishable",
|
||||
tier: "MEDIUM",
|
||||
category: "secret",
|
||||
description: "Stripe live publishable key (often intentionally public)",
|
||||
regex: /\b(pk_live_[A-Za-z0-9]{24,})\b/,
|
||||
},
|
||||
{
|
||||
id: "google.api_key",
|
||||
tier: "MEDIUM",
|
||||
category: "secret",
|
||||
description: "Google API key (AIza…; sometimes a public client key)",
|
||||
regex: /\b(AIza[0-9A-Za-z\-_]{35})\b/,
|
||||
},
|
||||
{
|
||||
id: "jwt",
|
||||
tier: "MEDIUM",
|
||||
category: "secret",
|
||||
description: "JSON Web Token (3-segment base64url)",
|
||||
regex: /\b(eyJ[A-Za-z0-9_\-]{8,}\.eyJ[A-Za-z0-9_\-]{8,}\.[A-Za-z0-9_\-]{8,})\b/,
|
||||
},
|
||||
{
|
||||
id: "env.kv",
|
||||
tier: "MEDIUM",
|
||||
category: "secret",
|
||||
description: "Env-style SECRET assignment with high-entropy value",
|
||||
regex: /^[ \t]*(?:export[ \t]+)?[A-Z][A-Z0-9_]*(?:KEY|TOKEN|SECRET|PASSWORD|PASSWD|CREDENTIALS?|DSN|AUTH|COOKIE|SESSION|PRIVATE)[ \t]*=[ \t]*['"]?([^\s'"]{8,})['"]?/,
|
||||
// Only fire on high-entropy values — kills `FOO_KEY=changeme` FPs.
|
||||
validate: (span) =>
|
||||
!isPlaceholderSpan(span) &&
|
||||
!/^\$\{?[A-Za-z_]/.test(span) &&
|
||||
shannonEntropy(span) >= 3.0,
|
||||
},
|
||||
|
||||
// ===== MEDIUM — PII (auto-redactable subset) =====
|
||||
{
|
||||
id: "pii.email",
|
||||
tier: "MEDIUM",
|
||||
category: "pii",
|
||||
description: "Email address",
|
||||
regex: /\b([A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,})\b/,
|
||||
autoRedactable: true,
|
||||
redactToken: "<REDACTED-EMAIL>",
|
||||
// Engine layers the email allowlist (example.com, noreply@, user's own,
|
||||
// repo-public authors) on top of this — see redact-engine.ts.
|
||||
},
|
||||
{
|
||||
id: "pii.phone.e164",
|
||||
tier: "MEDIUM",
|
||||
category: "pii",
|
||||
description: "Phone number (E.164 / common national formats; US/EU-biased)",
|
||||
regex: /(?<![\w.])(\+?[1-9]\d{0,2}[ \-.]?\(?\d{2,4}\)?[ \-.]?\d{3,4}[ \-.]?\d{3,4})(?![\w.])/,
|
||||
autoRedactable: true,
|
||||
redactToken: "<REDACTED-PHONE>",
|
||||
validate: (span) => span.replace(/\D/g, "").length >= 10,
|
||||
},
|
||||
{
|
||||
id: "pii.ssn",
|
||||
tier: "MEDIUM",
|
||||
category: "pii",
|
||||
description: "US Social Security Number",
|
||||
regex: /\b(\d{3}-\d{2}-\d{4})\b/,
|
||||
autoRedactable: true,
|
||||
redactToken: "<REDACTED-SSN>",
|
||||
// Reject values SSNs never use: an all-zero part (000 / 00 / 0000), area 666, or an area starting with 9.
|
||||
validate: (span) => {
|
||||
const [a, b, c] = span.split("-");
|
||||
return a !== "000" && b !== "00" && c !== "0000" && a !== "666" && a[0] !== "9";
|
||||
},
|
||||
},
|
||||
{
|
||||
id: "pii.cc",
|
||||
tier: "MEDIUM",
|
||||
category: "pii",
|
||||
description: "Credit-card number (Luhn-valid)",
|
||||
regex: /\b((?:\d[ \-]?){13,19})\b/,
|
||||
autoRedactable: true,
|
||||
redactToken: "<REDACTED-CC>",
|
||||
validate: (span) => luhnValid(span),
|
||||
},
|
||||
{
|
||||
id: "pii.ip_public",
|
||||
tier: "MEDIUM",
|
||||
category: "pii",
|
||||
description: "Public IPv4 address",
|
||||
regex: /\b(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\b/,
|
||||
validate: (span) => isPublicIPv4(span),
|
||||
},
|
||||
{
|
||||
id: "pii.wallet",
|
||||
tier: "MEDIUM",
|
||||
category: "pii",
|
||||
description: "Crypto wallet address (ETH/BTC)",
|
||||
regex: /\b(0x[a-fA-F0-9]{40}|bc1[a-z0-9]{25,39}|[13][a-km-zA-HJ-NP-Z1-9]{25,34})\b/,
|
||||
validate: (span) => looksLikeWallet(span),
|
||||
},
|
||||
|
||||
// ===== MEDIUM — internal-leak =====
|
||||
{
|
||||
id: "internal.hostname",
|
||||
tier: "MEDIUM",
|
||||
category: "internal",
|
||||
description: "Internal hostname (*.internal/.corp/.local/.prod/.staging)",
|
||||
regex: /\b([a-z0-9][a-z0-9\-]*\.(?:internal|corp|local|lan|prod|staging))\b/i,
|
||||
},
|
||||
{
|
||||
id: "internal.url_private",
|
||||
tier: "MEDIUM",
|
||||
category: "internal",
|
||||
description: "localhost URL with a non-trivial path",
|
||||
regex: /(https?:\/\/(?:localhost|127\.0\.0\.1):\d{2,5}\/[^\s)]+)/,
|
||||
},
|
||||
|
||||
// ===== MEDIUM — legal / damaging =====
|
||||
{
|
||||
id: "legal.nda_marker",
|
||||
tier: "MEDIUM",
|
||||
category: "legal",
|
||||
description: "Confidentiality / NDA marker",
|
||||
regex: /\b(CONFIDENTIAL|UNDER NDA|ATTORNEY[- ]CLIENT|PRIVILEGED|DO NOT DISTRIBUTE|EYES ONLY)\b/,
|
||||
},
|
||||
{
|
||||
id: "legal.named_criticism",
|
||||
tier: "MEDIUM",
|
||||
category: "legal",
|
||||
description: "Negative judgment near a capitalized full name (semantic pass is primary)",
|
||||
regex: /\b(incompetent|negligent|fraudulent|fraud|fired|terminated|harassed|underperforming)\b/i,
|
||||
// Require a Capitalized Two-Word name within the window.
|
||||
nearRegex: /\b[A-Z][a-z]+ [A-Z][a-z]+\b/,
|
||||
nearWindow: 80,
|
||||
},
|
||||
|
||||
// ===== LOW — surface only =====
|
||||
{
|
||||
id: "internal.user_path",
|
||||
tier: "LOW",
|
||||
category: "internal",
|
||||
description: "Absolute path under a user home dir",
|
||||
regex: /(\/(?:Users|home)\/[a-z][a-z0-9_\-]+\/[^\s)]*)/,
|
||||
},
|
||||
{
|
||||
id: "hygiene.todo",
|
||||
tier: "LOW",
|
||||
category: "hygiene",
|
||||
description: "TODO(owner) marker carried into the artifact",
|
||||
regex: /\b(TODO\([^)]+\))/,
|
||||
},
|
||||
];
|
||||
|
||||
/**
 * Index of the taxonomy keyed by pattern id: every entry of `PATTERNS` is
 * stored under its own `id`. Should two entries ever share an id, the later
 * one in `PATTERNS` is the one kept.
 */
export const PATTERNS_BY_ID: Record<string, RedactPattern> = Object.fromEntries(
  PATTERNS.map((pattern) => {
    const { id } = pattern;
    return [id, pattern];
  }),
);
|
||||
@@ -0,0 +1,63 @@
|
||||
/**
|
||||
* Auto-redact tests (T15) — applyRedactions() substitutes redact tokens for the
|
||||
* cleanly-substitutable PII patterns, right-to-left so offsets stay valid,
|
||||
* refuses to mangle structural tokens, and is idempotent (re-scan after = clean).
|
||||
*/
|
||||
import { describe, test, expect } from "bun:test";
|
||||
import { applyRedactions, scan } from "../lib/redact-engine";
|
||||
|
||||
describe("applyRedactions", () => {
  // Every case targets a private repo; each call gets its own options object.
  const privateRepo = () => ({ repoVisibility: "private" } as const);

  test("substitutes email + phone tokens", () => {
    const redacted = applyRedactions(
      "contact me at alice@corp.io or +14155550123 today",
      ["pii.email", "pii.phone.e164"],
      privateRepo(),
    ).body;

    // Both tokens must be present and neither raw value may survive.
    for (const token of ["<REDACTED-EMAIL>", "<REDACTED-PHONE>"]) {
      expect(redacted).toContain(token);
    }
    for (const raw of ["alice@corp.io", "4155550123"]) {
      expect(redacted).not.toContain(raw);
    }
  });

  test("multiple findings on one line redact correctly (right-to-left)", () => {
    const emails = ["a@x.io", "b@y.io", "c@z.io"];
    const result = applyRedactions(emails.join(" and "), ["pii.email"], privateRepo());
    // Three addresses in, three tokens out, separators untouched.
    expect(result.body).toBe(emails.map(() => "<REDACTED-EMAIL>").join(" and "));
  });

  test("idempotent: re-scanning the redacted body finds no PII", () => {
    const redacted = applyRedactions(
      "ssn 123-45-6789 card 4111111111111111 mail x@corp.io",
      ["pii.ssn", "pii.cc", "pii.email"],
      privateRepo(),
    );
    // A second scan over the redacted text must report nothing in the pii category.
    const rescan = scan(redacted.body, privateRepo());
    const remainingPii = rescan.findings.filter((finding) => finding.category === "pii");
    expect(remainingPii).toHaveLength(0);
  });

  test("produces an ASCII unified diff preview", () => {
    const preview = applyRedactions("reach alice@corp.io", ["pii.email"], privateRepo()).diff;
    expect(preview).toContain("- reach alice@corp.io");
    expect(preview).toContain("+ reach <REDACTED-EMAIL>");
  });

  test("refuses to redact a span inside a markdown link target (structural guard)", () => {
    const result = applyRedactions(
      "see [profile](https://x.io/u/alice@corp.io)",
      ["pii.email"],
      privateRepo(),
    );
    // Structural guard: the finding is reported as skipped and the text is left alone.
    const emailSkipped = result.skipped.some((finding) => finding.id === "pii.email");
    expect(emailSkipped).toBe(true);
    expect(result.body).toContain("alice@corp.io");
  });

  test("non-autoRedactable ids are ignored", () => {
    const original = "host db1.corp internal";
    const result = applyRedactions(original, ["internal.hostname"], privateRepo());
    // internal.hostname is not autoRedactable, so the body comes back unchanged.
    expect(result.body).toBe(original);
  });
});
|
||||
@@ -0,0 +1,283 @@
|
||||
/**
|
||||
* Unit tests for lib/redact-engine.ts + lib/redact-patterns.ts.
|
||||
*
|
||||
* One positive test per pattern, plus FP-filters, validators (Luhn/entropy/
|
||||
* RFC1918), email allowlist, no-promotion visibility semantics, tool-fence
|
||||
* degrade, normalization (zero-width / homoglyph / entity), oversize fail-closed,
|
||||
* and pure-function purity.
|
||||
*/
|
||||
import { describe, test, expect } from "bun:test";
|
||||
import {
|
||||
scan,
|
||||
exitCodeFor,
|
||||
maskPreview,
|
||||
normalizeWithMap,
|
||||
type RepoVisibility,
|
||||
} from "../lib/redact-engine";
|
||||
import {
|
||||
PATTERNS,
|
||||
luhnValid,
|
||||
shannonEntropy,
|
||||
isPublicIPv4,
|
||||
isPlaceholderSpan,
|
||||
} from "../lib/redact-patterns";
|
||||
|
||||
/**
 * Scans `text` and returns the id of every finding, in the order reported.
 *
 * @param text Input handed to `scan`.
 * @param vis Repo visibility forwarded to `scan`; defaults to "private".
 * @returns One id per finding.
 */
function ids(text: string, vis: RepoVisibility = "private"): string[] {
  const { findings } = scan(text, { repoVisibility: vis });
  const found: string[] = [];
  for (const finding of findings) {
    found.push(finding.id);
  }
  return found;
}
|
||||
|
||||
describe("HIGH credential patterns", () => {
  // One [pattern id, sample text] row per pattern. Several samples are built by
  // concatenation; keep the pieces split as they are (presumably so this file
  // itself does not trip secret scanners — confirm before merging literals).
  const samples: ReadonlyArray<readonly [string, string]> = [
    ["aws.access_key", "key = AKIA1234567890ABCDEF"],
    ["aws.secret_key", "aws_secret_access_key = AbCdEfGhIjKlMnOpQrStUvWxYz0123456789AbCd"],
    ["github.pat", "token ghp_" + "1234567890abcdefghijklmnopqrstuvwxyz"],
    ["github.oauth", "gho_" + "1234567890abcdefghijklmnopqrstuvwxyz"],
    ["github.server", "ghs_1234567890abcdefghijklmnopqrstuvwxyz"],
    ["github.fine_grained", "github_pat_" + "A".repeat(82)],
    ["anthropic.key", "sk-ant-" + "api03-abcdefghij1234567890XYZ"],
    ["openai.key", "sk-proj-" + "a".repeat(40)],
    ["sendgrid.key", "SG." + "a".repeat(22) + "." + "b".repeat(43)],
    ["stripe.secret", "sk_live_" + "a".repeat(30)],
    ["slack.token", "xox" + "b-1234567890-abcdefghijklmnop"],
    ["slack.webhook", "https://hooks.slack.com/services/T00000000/B11111111/" + "a".repeat(24)],
    ["discord.webhook", "https://discord.com/api/webhooks/123456789012345678/" + "a".repeat(60)],
    ["pem.private_key", "-----BEGIN RSA PRIVATE KEY-----"],
  ];

  samples.forEach(([patternId, sample]) => {
    test(`flags ${patternId}`, () => {
      expect(ids(sample)).toContain(patternId);
    });
  });

  test("twilio.auth_token needs an SID nearby", () => {
    const token = "b".repeat(32);
    const accountSid = "AC" + "a".repeat(32);

    const withSid = ids(`account ${accountSid} token ${token}`);
    expect(withSid).toContain("twilio.auth_token");

    // A bare 32-hex string with no SID in range must not be reported as Twilio.
    const withoutSid = ids(`random ${token} here`);
    expect(withoutSid).not.toContain("twilio.auth_token");
  });

  test("db.url_with_password flags real password, skips placeholder/env-var", () => {
    const patternId = "db.url_with_password";
    const realPassword = ids("postgres://user:s3cretP@ss@db.example.com/app");
    expect(realPassword).toContain(patternId);
    const envVarPassword = ids("postgres://user:${DB_PASSWORD}@host/app");
    expect(envVarPassword).not.toContain(patternId);
  });

  test("all HIGH patterns block (exit 3)", () => {
    const result = scan("AKIA1234567890ABCDEF", { repoVisibility: "private" });
    const exitCode = exitCodeFor(result);
    expect(exitCode).toBe(3);
  });
});
|
||||
|
||||
describe("MEDIUM demoted credential-shaped patterns (TENSION-1)", () => {
  // Tier of the first finding with the given id on a private repo, or
  // undefined when the pattern did not fire at all.
  const tierOf = (patternId: string, text: string) =>
    scan(text, { repoVisibility: "private" }).findings.find((x) => x.id === patternId)?.tier;

  test("stripe.publishable is MEDIUM not HIGH", () => {
    const sample = "pk_live_" + "a".repeat(30);
    expect(tierOf("stripe.publishable", sample)).toBe("MEDIUM");
  });

  test("google.api_key is MEDIUM", () => {
    const sample = "AIza" + "a".repeat(35);
    expect(tierOf("google.api_key", sample)).toBe("MEDIUM");
  });

  test("jwt is MEDIUM", () => {
    const sample = "eyJhbGciOiJ.eyJzdWIiOiI." + "x".repeat(20);
    expect(tierOf("jwt", sample)).toBe("MEDIUM");
  });

  test("env.kv fires on high-entropy, skips placeholder", () => {
    const highEntropy = ids("API_TOKEN=8Fk2pQ9vXz4wL7mN3rT6yB1cD5eG0hJ");
    expect(highEntropy).toContain("env.kv");

    const placeholder = ids("API_KEY=changeme");
    expect(placeholder).not.toContain("env.kv");

    const envReference = ids("API_KEY=${MY_VAR}");
    expect(envReference).not.toContain("env.kv");
  });
});
|
||||
|
||||
describe("PII patterns", () => {
  test("email flags + is autoRedactable", () => {
    const { findings } = scan("ping alice@corp.io please", { repoVisibility: "private" });
    const emailFinding = findings.find((x) => x.id === "pii.email");
    expect(emailFinding).toBeTruthy();
    expect(emailFinding?.autoRedactable).toBe(true);
  });

  test("email allowlist: example.com, noreply, self, repo-public", () => {
    // Allowlisted by domain / local part.
    for (const text of ["see user@example.com", "from noreply@github.com"]) {
      expect(ids(text)).not.toContain("pii.email");
    }

    // The caller's own address produces no findings at all.
    const selfScan = scan("me@garry.dev", { repoVisibility: "private", selfEmail: "me@garry.dev" });
    expect(selfScan.findings).toHaveLength(0);

    // An address listed in repoPublicEmails produces no findings either.
    const publicAuthorScan = scan("bob@acme.co", {
      repoVisibility: "private",
      repoPublicEmails: ["bob@acme.co"],
    });
    expect(publicAuthorScan.findings).toHaveLength(0);
  });

  test("phone E.164", () => {
    const found = ids("call +14155550123 now");
    expect(found).toContain("pii.phone.e164");
  });

  test("ssn flags valid, skips 000 octet", () => {
    const valid = ids("ssn 123-45-6789");
    expect(valid).toContain("pii.ssn");
    const zeroArea = ids("000-12-3456");
    expect(zeroArea).not.toContain("pii.ssn");
  });

  test("credit card needs Luhn", () => {
    const luhnOk = ids("card 4111111111111111");
    expect(luhnOk).toContain("pii.cc");
    const luhnBad = ids("num 4111111111111112");
    expect(luhnBad).not.toContain("pii.cc");
  });

  test("public IP flagged, RFC1918 skipped", () => {
    expect(ids("connect 8.8.8.8")).toContain("pii.ip_public");
    for (const text of ["local 192.168.1.5", "local 10.0.0.1"]) {
      expect(ids(text)).not.toContain("pii.ip_public");
    }
  });
});
|
||||
|
||||
describe("internal + legal patterns", () => {
  test("internal hostname", () => {
    expect(ids("db1.corp internal host")).toContain("internal.hostname");
  });

  test("localhost url with path", () => {
    expect(ids("hit http://localhost:8080/admin/secrets")).toContain("internal.url_private");
  });

  test("NDA marker", () => {
    expect(ids("This is CONFIDENTIAL material")).toContain("legal.nda_marker");
  });

  test("named criticism needs a capitalized full name nearby", () => {
    // Keyword next to a Capitalized Two-Word name: flagged.
    expect(ids("John Smith is incompetent at this")).toContain("legal.named_criticism");

    // Keyword present as a whole word but no capitalized full name anywhere:
    // this is the case the test title describes.
    expect(ids("the build is incompetent at this")).not.toContain("legal.named_criticism");

    // Keyword only as a prefix of a longer word ("incompetently"): the
    // pattern's word boundaries mean it must not fire.
    expect(ids("the build is incompetently configured")).not.toContain("legal.named_criticism");
  });
});
|
||||
|
||||
describe("LOW patterns surface only", () => {
  // Tier of the first finding with the given id on a private repo, or
  // undefined when the pattern did not fire.
  const tierOf = (patternId: string, text: string) => {
    const { findings } = scan(text, { repoVisibility: "private" });
    return findings.find((x) => x.id === patternId)?.tier;
  };

  test("user path is LOW", () => {
    expect(tierOf("internal.user_path", "/Users/bob/secret/config")).toBe("LOW");
  });

  test("TODO marker is LOW", () => {
    expect(tierOf("hygiene.todo", "TODO(alice) fix later")).toBe("LOW");
  });
});
|
||||
|
||||
describe("placeholder suppression (per-span)", () => {
  const AWS_KEY_ID = "aws.access_key";

  test("AWS docs EXAMPLE key not flagged", () => {
    const found = ids("AKIAIOSFODNN7EXAMPLE");
    expect(found).not.toContain(AWS_KEY_ID);
  });

  test("your_ prefix not flagged", () => {
    const treatedAsPlaceholder = isPlaceholderSpan("your_api_key");
    expect(treatedAsPlaceholder).toBe(true);
  });

  test("a real secret on a line that ALSO contains EXAMPLE still flags", () => {
    // Suppression keyed on the surrounding text would wrongly drop this key;
    // suppression keyed on the matched span must still report it.
    const found = ids("# EXAMPLE usage\nkey AKIA1234567890ABCDEF");
    expect(found).toContain(AWS_KEY_ID);
  });
});
|
||||
|
||||
describe("no visibility-based tier promotion (TENSION-2-followup)", () => {
  test("email stays MEDIUM on both private and public", () => {
    const {
      findings: [onPrivate],
    } = scan("x@corp.io", { repoVisibility: "private" });
    const {
      findings: [onPublic],
    } = scan("x@corp.io", { repoVisibility: "public" });

    expect(onPrivate.tier).toBe("MEDIUM");
    expect(onPublic.tier).toBe("MEDIUM");
    // Severity is NOT promoted to HIGH on a public repo...
    expect(onPublic.severity).toBe("MEDIUM");
    // ...the visibility is only recorded on the finding (for sterner wording).
    expect(onPublic.repoVisibility).toBe("public");
  });

  test("demoted credential patterns stay MEDIUM on public", () => {
    const sample = "pk_live_" + "a".repeat(30);
    const {
      findings: [onPublic],
    } = scan(sample, { repoVisibility: "public" });
    expect(onPublic.severity).toBe("MEDIUM");
  });

  test("unknown visibility treated as public for wording, still no promotion", () => {
    const result = scan("x@corp.io", { repoVisibility: "unknown" });
    const [first] = result.findings;
    expect(first.severity).toBe("MEDIUM");
  });
});
|
||||
|
||||
describe("tool-attributed fence WARN-degrade (TENSION-3)", () => {
  // Wraps a single line in a ```codex-review fenced block.
  const inToolFence = (line: string) => ["```codex-review", line, "```"].join("\n");

  test("placeholder-shaped credential in tool fence → WARN", () => {
    const fenced = inToolFence("found your_aws_key AKIAIOSFODNN7EXAMPLE in code");
    const { counts } = scan(fenced, { repoVisibility: "private" });
    // The EXAMPLE key is suppressed as a placeholder, so nothing here may
    // count as a blocking HIGH finding.
    expect(counts.HIGH).toBe(0);
  });

  test("live-format credential in tool fence STILL blocks", () => {
    const fenced = inToolFence("leaked AKIA1234567890ABCDEF here");
    const { counts } = scan(fenced, { repoVisibility: "private" });
    // Live-format key: the fence does not degrade it.
    expect(counts.HIGH).toBe(1);
  });

  test("AKIA outside any fence blocks", () => {
    const result = scan("AKIA1234567890ABCDEF", {});
    expect(exitCodeFor(result)).toBe(3);
  });
});
|
||||
|
||||
describe("normalization", () => {
  // The zero-width space is written as an explicit escape (U+200B) rather than
  // as a literal character: a literal one is invisible in the source and is
  // easily lost, which would leave these tests checking nothing.
  const ZERO_WIDTH_SPACE = "\u200b";

  test("zero-width chars inside a key are stripped before matching", () => {
    const zwsp = ZERO_WIDTH_SPACE;
    // A key split by an invisible character must still be recognised.
    const broken = "AKIA1234567890" + zwsp + "ABCDEF";
    expect(ids(broken)).toContain("aws.access_key");
  });

  test("HTML entity decode", () => {
    // The input carries the encoded entity; the normalized text carries the
    // decoded ampersand.
    const { normalized } = normalizeWithMap("a &amp; b");
    expect(normalized).toBe("a & b");
  });

  test("offset map points back into original", () => {
    // Original: x(0) y(1) ZWSP(2) z(3). Normalized: x(0) y(1) z(2).
    const input = "xy" + ZERO_WIDTH_SPACE + "z";
    const { normalized, map } = normalizeWithMap(input);
    expect(normalized).toBe("xyz");
    // 'z' is at normalized index 2, original index 3
    expect(map[2]).toBe(3);
  });
});
|
||||
|
||||
describe("oversize fails CLOSED", () => {
  test("input over the byte cap returns a single blocking HIGH finding", () => {
    // 2000 characters against a 1000-byte cap.
    const result = scan("a".repeat(2000), { maxBytes: 1000 });
    const { oversize, counts, findings } = result;

    expect(oversize).toBe(true);
    expect(counts.HIGH).toBe(1);
    expect(findings[0].id).toBe("engine.input_too_large");
    expect(exitCodeFor(result)).toBe(3);
  });
});
|
||||
|
||||
describe("validators", () => {
  test("luhn", () => {
    const verdicts: Array<[string, boolean]> = [
      ["4111111111111111", true],
      ["4111111111111112", false],
    ];
    for (const [digits, expected] of verdicts) {
      expect(luhnValid(digits)).toBe(expected);
    }
  });

  test("entropy", () => {
    const repetitive = shannonEntropy("aaaaaaaa");
    expect(repetitive).toBeLessThan(1);
    const mixed = shannonEntropy("8Fk2pQ9vXz4wL7mN");
    expect(mixed).toBeGreaterThan(3);
  });

  test("isPublicIPv4", () => {
    const verdicts: Array<[string, boolean]> = [
      ["8.8.8.8", true],
      ["10.1.2.3", false],
      ["172.16.5.5", false],
      ["999.1.1.1", false],
    ];
    for (const [address, expected] of verdicts) {
      expect(isPublicIPv4(address)).toBe(expected);
    }
  });
});
|
||||
|
||||
describe("masking + purity", () => {
  test("preview never leaks more than 4 leading chars", () => {
    const maskedKey = maskPreview("AKIA1234567890ABCDEF");
    expect(maskedKey).toBe("AKIA********…");
    // A three-character input comes back as-is.
    const shortInput = maskPreview("abc");
    expect(shortInput).toBe("abc");
  });

  test("scan is pure — same input twice yields identical findings", () => {
    const runOnce = () => scan("AKIA1234567890ABCDEF x@corp.io", { repoVisibility: "public" });
    const first = runOnce();
    const second = runOnce();
    expect(first).toEqual(second);
  });
});
|
||||
|
||||
describe("taxonomy integrity", () => {
  test("every pattern has a unique id", () => {
    // A duplicate id would collapse in the set and make it smaller than the list.
    const distinctIds = new Set<string>();
    for (const { id } of PATTERNS) {
      distinctIds.add(id);
    }
    expect(distinctIds.size).toBe(PATTERNS.length);
  });

  test("autoRedactable patterns have a redactToken", () => {
    PATTERNS.filter((pattern) => pattern.autoRedactable).forEach((pattern) => {
      expect(pattern.redactToken).toBeTruthy();
    });
  });
});
|
||||
@@ -0,0 +1,64 @@
|
||||
/**
|
||||
* ReDoS guard (T10) — fails CI if any taxonomy pattern has a catastrophic-
|
||||
* backtracking shape, and asserts the engine's oversize-input path fails CLOSED.
|
||||
*
|
||||
* We do two things:
|
||||
* 1. Static lint: reject nested unbounded quantifiers like (a+)+ / (a*)* /
|
||||
* (a+)* in any pattern source. These are the classic ReDoS forms.
|
||||
* 2. Runtime budget: run every pattern against a pathological input and assert
|
||||
* no single pattern takes more than a generous wall-clock budget. This
|
||||
* catches catastrophic forms the static check might miss.
|
||||
*/
|
||||
import { describe, test, expect } from "bun:test";
|
||||
import { PATTERNS } from "../lib/redact-patterns";
|
||||
import { scan } from "../lib/redact-engine";
|
||||
|
||||
// Nested-quantifier ReDoS shapes: a group ending in +/*/{n,} that is itself
// immediately quantified by +/*/{n,}. e.g. (x+)+ (x*)* (x+)* (?:x+){2,}
//
// Alternatives, in order:
//   1. inner + or *   followed by outer + or *    e.g. (x+)+  (x*)*
//   2. inner + or *   followed by outer {n} / {n,} e.g. (?:x+){2,}
//   3. inner {n,}     followed by outer + or *    e.g. (x{2,})+
//   4. inner {n,}     followed by outer {n} / {n,} e.g. (x{2,}){2,}
// The match is purely textual on the regex source: `[^)]*` stops at the first
// ")" after the "(", so only the innermost span up to that ")" is inspected.
const NESTED_QUANTIFIER =
  /\([^)]*[+*]\)[+*]|\([^)]*[+*]\)\{\d+,?\}|\([^)]*\{\d+,\}\)[+*]|\([^)]*\{\d+,\}\)\{\d+,?\}/;
|
||||
|
||||
describe("pattern lint — no catastrophic backtracking", () => {
  // One test per taxonomy entry so a failure names the offending pattern id.
  for (const p of PATTERNS) {
    test(`${p.id} has no nested unbounded quantifier`, () => {
      expect(NESTED_QUANTIFIER.test(p.regex.source)).toBe(false);
    });
  }

  test("a planted catastrophic pattern WOULD be caught by the linter", () => {
    // meta-test: prove the linter actually detects the bad shapes, so the
    // per-pattern tests above cannot pass just because the lint matches nothing.
    const plantedShapes = [
      "(a+)+", // + inside, + outside
      "(\\d*)*", // * inside, * outside
      "(x+)*", // + inside, * outside
      "(?:x+){2,}", // + inside, {n,} outside
      "(x{2,})+", // {n,} inside, + outside
    ];
    for (const shape of plantedShapes) {
      expect(NESTED_QUANTIFIER.test(shape)).toBe(true);
    }
  });
});
|
||||
|
||||
describe("runtime budget — pathological inputs do not hang", () => {
  // Wall-clock ceiling per scan, in milliseconds.
  const BUDGET_MS = 1000;

  // Inputs designed to stress backtracking on the real patterns.
  const adversarial = [
    "a".repeat(5000) + "!",
    "AKIA" + "A".repeat(5000),
    "eyJ" + "a".repeat(2000) + "." + "b".repeat(2000),
    "x@" + "a".repeat(3000),
    "/Users/" + "a".repeat(4000),
    ("1".repeat(19) + " ").repeat(200),
  ];

  adversarial.forEach((input, i) => {
    test(`adversarial input #${i} scans within budget`, () => {
      const startedAt = performance.now();
      scan(input, { repoVisibility: "private", maxBytes: 1024 * 1024 });
      const elapsedMs = performance.now() - startedAt;

      // Generous: the full taxonomy over a few-KB pathological string should
      // finish far below the budget on any CI box; a catastrophic pattern
      // would blow past it.
      expect(elapsedMs).toBeLessThan(BUDGET_MS);
    });
  });
});
|
||||
|
||||
describe("oversize fails closed (the real ReDoS backstop)", () => {
  test("input over cap returns blocking HIGH, never runs the patterns", () => {
    // 50,000 characters against a 10,000-byte cap.
    const oversizedInput = "a".repeat(50_000);
    const { oversize, counts, findings } = scan(oversizedInput, { maxBytes: 10_000 });

    expect(oversize).toBe(true);
    expect(counts.HIGH).toBe(1);
    expect(findings[0].id).toBe("engine.input_too_large");
  });
});
|
||||
Reference in New Issue
Block a user