feat: one-way door classifier (belt-and-suspenders safety fallback)

scripts/one-way-doors.ts — secondary keyword-pattern classifier that catches destructive questions even when the registry doesn't have an entry for them. The registry's door_type field (from scripts/question-registry.ts) is the PRIMARY safety gate. This classifier is the fallback for ad-hoc question_ids that agents generate at runtime. Classification priority: 1. Registry lookup by question_id → use declared door_type 2. Skill:category fallback (cso:approval, land-and-deploy:approval) 3. Keyword pattern match against question_summary 4. Default: treat as two-way (safer to log the miss than auto-decide unsafely) Covers 21 destructive patterns across: - File system (rm -rf, delete, wipe, purge, truncate) - Database (drop table/database/schema, delete from) - Git/VCS (force-push, reset --hard, checkout --, branch -D) - Deploy/infra (kubectl delete, terraform destroy, rollback) - Credentials (revoke/reset/rotate API key|token|secret|password) - Architecture (breaking change, schema migration, data model change) 7 new tests in test/plan-tune.test.ts covering: registry-first lookup, unknown-id fallthrough, keyword matching on destructive phrasings including embedded filler words ("rotate the API key"), skill-category fallback, benign questions defaulting to two-way, pattern-list non-empty. 27 pass, 0 fail. 1270 expect() calls. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-02 11:45:20 +02:00 · 2026-04-17 06:12:39 +08:00
parent 2c0c95099f
commit db3b6412b9
2 changed files with 235 additions and 0 deletions
@@ -0,0 +1,161 @@
+/**
+ * One-Way Door Classifier — belt-and-suspenders safety layer.
+ *
+ * Primary safety gate is the `door_type` field in scripts/question-registry.ts.
+ * Every registered AskUserQuestion declares whether it is one-way (always ask,
+ * never auto-decide) or two-way (can be suppressed by explicit user preference).
+ *
+ * This file is a SECONDARY keyword-pattern check for questions that fire
+ * WITHOUT a registry id (ad-hoc question_ids generated at runtime). If the
+ * question_summary contains any of the destructive keyword patterns, treat
+ * it as one-way regardless of what the (absent or unknown) registry entry says.
+ *
+ * Codex correctly pointed out (design doc Decision C) that prose-parsing is
+ * too weak to be the PRIMARY safety gate — wording can change. The registry
+ * is primary. This is the fallback for questions not yet catalogued, and it
+ * errs on the side of asking the user even when tuning preferences say skip.
+ *
+ * Ordering
+ * --------
+ * isOneWayDoor() is called by gstack-question-sensitivity --check and
+ * applies these checks in order:
+ *   1. Look up registry by id → use registry.door_type if found
+ *   2. If not in registry: skill:category fallback, then keyword patterns below
+ *   3. Default to ASK_NORMALLY (safer than AUTO_DECIDE)
+ */
+
+import { getQuestion } from './question-registry';
+
+/**
+ * Keyword patterns that identify one-way-door questions when the registry
+ * doesn't have an entry for the question_id. Each entry is a case-insensitive
+ * regex tested against the question_summary passed into AskUserQuestion.
+ *
+ * Additions here should be conservative — a false positive means the user
+ * gets asked an extra question they might have preferred to auto-decide.
+ * A false negative could mean auto-approving a destructive operation.
+ */
+const DESTRUCTIVE_PATTERNS: RegExp[] = [
+  // File system destruction
+  /\brm\s+-rf\b/i,
+  /\bdelete\b/i,
+  /\bremove\s+(directory|folder|files?)\b/i,
+  /\bwipe\b/i,
+  /\bpurge\b/i,
+  /\btruncate\b/i,
+
+  // Database destruction
+  /\bdrop\s+(table|database|schema|index|column)\b/i,
+  /\bdelete\s+from\b/i,
+
+  // Git / VCS destruction
+  /\bforce[- ]push\b/i,
+  /\bpush\s+--force\b/i,
+  /\bgit\s+reset\s+--hard\b/i,
+  /\bcheckout\s+--/i, // no trailing \b: "--" is normally followed by a space, which is not a word boundary
+  /\brestore\s+\./i, // no trailing \b: "." is normally followed by a space or the end of the string
+  /\bclean\s+(-[a-z]*f|--force\b)/i, // -f, combined short flags (-fd, -xdf) and --force
+  /\bbranch\s+-D\b/i,
+
+  // Deploy / infra destruction
+  /\bkubectl\s+delete\b/i,
+  /\bterraform\s+destroy\b/i,
+  /\brollback\b/i,
+
+  // Credentials / auth — allow filler words ("the", "my") between verb and noun; nouns may be plural
+  /\brevoke\s+[\w\s]*\b(api key|token|credential|access key|password)s?\b/i,
+  /\breset\s+[\w\s]*\b(api key|token|password|credential)s?\b/i,
+  /\brotate\s+[\w\s]*\b(api key|token|secret|credential|access key)s?\b/i,
+
+  // Scope / architecture forks (reversible with effort — still deserve confirmation)
+  /\barchitectur(e|al)\s+(change|fork|shift|decision)\b/i,
+  /\bdata\s+model\s+change\b/i,
+  /\bschema\s+migration\b/i,
+  /\bbreaking\s+change\b/i,
+];
+
+/**
+ * `skill:category` keys that are unconditionally one-way, however harmless
+ * the question text itself looks. Follows the ownership model: some skill
+ * actions are high-stakes by nature.
+ */
+const ONE_WAY_SKILL_CATEGORIES: Set<string> = new Set([
+  'cso:approval', // security-audit findings
+  'land-and-deploy:approval', // anything /land-and-deploy asks
+]);
+
+/**
+ * Input to classifyQuestion(). Every field is optional; a missing (or empty)
+ * field simply skips the check that reads it.
+ */
+export interface ClassifyInput {
+  /** Registry id OR ad-hoc id; looked up in the question registry first */
+  question_id?: string;
+  /** Skill firing the question; joined with `category` as `skill:category` for the skill-category fallback */
+  skill?: string;
+  /** Question category (approval | clarification | routing | cherry-pick | feedback-loop) */
+  category?: string;
+  /** Free-form question summary — pattern-matched against destructive keywords */
+  summary?: string;
+}
+
+/** Outcome of classifyQuestion(): the verdict plus which check produced it. */
+export interface ClassifyResult {
+  /** true = treat as one-way door (always ask, never auto-decide) */
+  oneWay: boolean;
+  /**
+   * Which check triggered the classification (for audit/debug).
+   * NOTE(review): classifyQuestion() in this file never returns 'default-safe' —
+   * confirm whether that variant is reserved for another caller or is dead.
+   */
+  reason: 'registry' | 'skill-category' | 'keyword' | 'default-safe' | 'default-two-way';
+  /** String form of the matched RegExp; set only when reason is 'keyword' */
+  matched?: string;
+}
+
+/**
+ * Decide whether a question is a one-way door (must always be asked) or a
+ * two-way door (user preferences may suppress it).
+ *
+ * Checks run in priority order and the first one that yields a verdict wins:
+ * registry entry, then the skill:category set, then a destructive-keyword
+ * scan of the summary. Only when none of them fires is the result
+ * {oneWay: false, reason: 'default-two-way'}.
+ */
+export function classifyQuestion(input: ClassifyInput): ClassifyResult {
+  const { question_id, skill, category, summary } = input;
+
+  // Registry (primary gate): a known id is answered by its declared door_type.
+  const entry = question_id ? getQuestion(question_id) : undefined;
+  if (entry) {
+    return { oneWay: entry.door_type === 'one-way', reason: 'registry' };
+  }
+
+  // Skill-category fallback: certain skill/category pairs are always one-way.
+  if (skill && category && ONE_WAY_SKILL_CATEGORIES.has(`${skill}:${category}`)) {
+    return { oneWay: true, reason: 'skill-category' };
+  }
+
+  // Keyword fallback: the first destructive pattern found in the summary decides.
+  const hit = summary ? DESTRUCTIVE_PATTERNS.find((re) => re.test(summary)) : undefined;
+  if (hit) {
+    return { oneWay: true, reason: 'keyword', matched: hit.toString() };
+  }
+
+  // Nothing fired — two-way, so the question can be preference-suppressed.
+  return { oneWay: false, reason: 'default-two-way' };
+}
+
+/**
+ * Boolean shortcut for the sensitivity check binary.
+ * True when the question must be asked regardless of user preferences.
+ */
+export function isOneWayDoor(input: ClassifyInput): boolean {
+  const { oneWay } = classifyQuestion(input);
+  return oneWay;
+}
+
+/**
+ * Public aliases of the pattern list and skill-category set, for tests and
+ * audit tooling.
+ */
+export {
+  DESTRUCTIVE_PATTERNS as DESTRUCTIVE_PATTERN_LIST,
+  ONE_WAY_SKILL_CATEGORIES as ONE_WAY_SKILL_CATEGORY_SET,
+};
@@ -21,6 +21,12 @@ import {
  getRegistryStats,
  type QuestionDef,
 } from '../scripts/question-registry';
+import {
+  classifyQuestion,
+  isOneWayDoor,
+  DESTRUCTIVE_PATTERN_LIST,
+  ONE_WAY_SKILL_CATEGORY_SET,
+} from '../scripts/one-way-doors';
 import * as fs from 'fs';
 import * as path from 'path';

@@ -253,6 +259,74 @@ describe('AskUserQuestion template coverage (informational)', () => {
  });
 });

+// -----------------------------------------------------------------------
+// One-way door classifier (belt-and-suspenders keyword fallback)
+// -----------------------------------------------------------------------
+
+describe('one-way-doors classifier', () => {
+  test('registry lookup wins when question_id is known', () => {
+    // An id the test expects the registry to declare as one-way.
+    const oneWayHit = classifyQuestion({ question_id: 'ship-test-failure-triage' });
+    expect(oneWayHit.oneWay).toBe(true);
+    expect(oneWayHit.reason).toBe('registry');
+
+    // An id the test expects the registry to declare as two-way.
+    const twoWayHit = classifyQuestion({ question_id: 'ship-changelog-voice-polish' });
+    expect(twoWayHit.oneWay).toBe(false);
+    expect(twoWayHit.reason).toBe('registry');
+  });
+
+  test('unknown question_id falls through to other checks', () => {
+    const { reason } = classifyQuestion({ question_id: 'some-ad-hoc-question-id' });
+    expect(reason).not.toBe('registry');
+  });
+
+  test('keyword fallback catches destructive summaries', () => {
+    // One phrasing per pattern family; no question_id, so only keywords can fire.
+    const destructiveSummaries = [
+      'Delete this directory and all its contents?',
+      'Run rm -rf /tmp/scratch — proceed?',
+      'Force-push main?',
+      'git reset --hard origin/main — ok?',
+      'DROP TABLE users — confirm?',
+      'kubectl delete namespace prod',
+      'terraform destroy the staging cluster',
+      'rotate the API key',
+      'breaking change to the public API — ship anyway?',
+    ];
+    destructiveSummaries.forEach((summary) => {
+      const verdict = classifyQuestion({ summary });
+      expect(verdict.oneWay).toBe(true);
+      expect(verdict.reason).toBe('keyword');
+      expect(verdict.matched).toBeDefined();
+    });
+  });
+
+  test('skill-category fallback fires for cso:approval and land-and-deploy:approval', () => {
+    const csoApproval = isOneWayDoor({ skill: 'cso', category: 'approval' });
+    const deployApproval = isOneWayDoor({ skill: 'land-and-deploy', category: 'approval' });
+    expect(csoApproval).toBe(true);
+    expect(deployApproval).toBe(true);
+  });
+
+  test('benign questions default to two-way', () => {
+    const harmlessSummaries = [
+      'Want to update the changelog voice?',
+      'Which mode should plan review use?',
+      'Open the essay in your browser?',
+    ];
+    harmlessSummaries.forEach((summary) => {
+      const verdict = classifyQuestion({ summary });
+      expect(verdict.oneWay).toBe(false);
+      expect(verdict.reason).toBe('default-two-way');
+    });
+  });
+
+  test('keyword patterns are non-empty', () => {
+    // Guards against the pattern list being accidentally gutted.
+    const patternCount = DESTRUCTIVE_PATTERN_LIST.length;
+    expect(patternCount).toBeGreaterThan(15);
+  });
+
+  test('skill-category set covers security + deploy', () => {
+    const requiredKeys = ['cso:approval', 'land-and-deploy:approval'];
+    requiredKeys.forEach((key) => {
+      expect(ONE_WAY_SKILL_CATEGORY_SET.has(key)).toBe(true);
+    });
+  });
+});
+
 function findAllTemplates(): string[] {
  const results: string[] = [];
  function walk(dir: string) {