feat: per-term anonymization output via term=>replacement

A term entered as "Anonymous=>ABC" now scrubs "Anonymous" to "ABC" instead of "XXXX-N". Lets users keep anonymized identifiers valid in source code (no hyphen) and align tokens between paper text and repo. Indexing for default-mask terms is unchanged: a list of "Alpha=>AAA", "Beta" still produces XXXX-2 for Beta. Fixes #285.
2026-05-15 06:30:26 +02:00 · 2026-05-04 09:31:31 +02:00
parent db2ac5307d
commit 4bc83db416
4 changed files with 92 additions and 20 deletions
@@ -148,7 +148,7 @@
            <div class="form-group">
              <label class="paper-field-label" for="terms">Terms to redact</label>
              <textarea class="form-control" id="terms" name="terms" rows="4" ng-model="terms" ng-model-options="{ debounce: 250 }" ng-class="{'is-invalid': anonymize.terms.$invalid}"></textarea>
-              <small class="form-text text-muted">One term per line (regex allowed). Replaced by <code>{{site_options.ANONYMIZATION_MASK}}-[N]</code>.</small>
+              <small class="form-text text-muted">One term per line (regex allowed). Replaced by <code>{{site_options.ANONYMIZATION_MASK}}-[N]</code>, or use <code>term=&gt;replacement</code> to pick your own (e.g. <code>Anonymous=&gt;ABC</code>).</small>
              <div class="warning-feedback" ng-show="anonymize.terms.$error.regex">Regex characters detected. Escape them if unintentional.</div>
              <div class="invalid-feedback" ng-show="anonymize.terms.$error.format">Terms are in an invalid format.</div>
            </div>
@@ -4,7 +4,11 @@ import { StringDecoder } from "string_decoder";
 import { isText } from "istextorbinary";

 import config from "../config";
-import { termVariants, withWordBoundaries } from "./term-matching";
+import {
+  parseTermSpec,
+  termVariants,
+  withWordBoundaries,
+} from "./term-matching";

 const urlRegex =
  /<?\b((https?|ftp|file):\/\/)[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]\b\/?>?/g;
@@ -189,11 +193,19 @@ export class ContentAnonimizer {
  private replaceTerms(content: string): string {
    const terms = this.opt.terms || [];
    for (let i = 0; i < terms.length; i++) {
-      let term = terms[i];
-      if (term.trim() == "") {
+      const spec = terms[i];
+      if (spec.trim() == "") {
        continue;
      }
-      const mask = config.ANONYMIZATION_MASK + "-" + (i + 1);
+      // #285 — entries of the form "term=>replacement" override the default
+      // XXXX-N mask so users can scrub with their preferred token (e.g.
+      // "ABC", "XYZ"), keeping anonymized identifiers valid in source code.
+      const parsed = parseTermSpec(spec);
+      let term = parsed.term;
+      const mask =
+        parsed.replacement !== null
+          ? parsed.replacement
+          : config.ANONYMIZATION_MASK + "-" + (i + 1);
      try {
        new RegExp(term, "gi");
      } catch {
@@ -239,20 +251,23 @@ export class ContentAnonimizer {

 export function anonymizePath(path: string, terms: string[]) {
  for (let i = 0; i < terms.length; i++) {
-    let term = terms[i];
-    if (term.trim() == "") {
+    const spec = terms[i];
+    if (spec.trim() == "") {
      continue;
    }
+    const parsed = parseTermSpec(spec);
+    let term = parsed.term;
+    // null replacement = no override: default mask with the 1-based list index.
+    const mask =
+      parsed.replacement ?? config.ANONYMIZATION_MASK + "-" + (i + 1);
    try {
      new RegExp(term, "gi");
    } catch {
      // escape regex characters
      term = term.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&");
    }
-    path = path.replace(
-      new RegExp(term, "gi"),
-      config.ANONYMIZATION_MASK + "-" + (i + 1)
-    );
+    // Function replacer: "$&", "$1", "$$" in a user-supplied mask stay literal.
+    path = path.replace(new RegExp(term, "gi"), () => mask);
  }
  return path;
 }
@@ -102,3 +102,27 @@ export function termVariants(escapedTerm: string): {
    { pattern: diacriticInsensitive(stripped), sniff: stripped, unicode: true },
  ];
 }
+
+// Parses a term entry that may override the default `XXXX-N` mask via
+//     <term>=><replacement>
+// e.g. "Anonymous=>ABC" replaces "Anonymous" with "ABC", letting users pick
+// strings without the hyphen that breaks identifiers. The split is at the
+// first `=>`; only the whitespace touching `=>` is dropped. An empty
+// right-hand side ("Foo=>") yields replacement "" (not null).
+// With no `=>`, or nothing but whitespace before it, the whole entry is
+// returned as the term with replacement null (caller uses the default mask).
+export function parseTermSpec(spec: string): {
+  term: string;
+  replacement: string | null;
+} {
+  const idx = spec.indexOf("=>");
+  if (idx < 0) {
+    return { term: spec, replacement: null };
+  }
+  const term = spec.slice(0, idx).replace(/\s+$/, "");
+  const replacement = spec.slice(idx + 2).replace(/^\s+/, "");
+  if (!term) {
+    return { term: spec, replacement: null };
+  }
+  return { term, replacement };
+}
@@ -5,6 +5,7 @@ require("ts-node/register/transpile-only");
 const {
  withWordBoundaries,
  termVariants,
+  parseTermSpec,
 } = require("../src/core/term-matching");

 /**
@@ -97,11 +98,16 @@ class ContentAnonimizer {
  replaceTerms(content) {
    const terms = this.opt.terms || [];
    for (let i = 0; i < terms.length; i++) {
-      let term = terms[i];
-      if (term.trim() == "") {
+      const spec = terms[i];
+      if (spec.trim() == "") {
        continue;
      }
-      const mask = ANONYMIZATION_MASK + "-" + (i + 1);
+      const parsed = parseTermSpec(spec);
+      let term = parsed.term;
+      const mask =
+        parsed.replacement !== null
+          ? parsed.replacement
+          : ANONYMIZATION_MASK + "-" + (i + 1);
      try {
        new RegExp(term, "gi");
      } catch {
@@ -140,19 +146,22 @@ class ContentAnonimizer {

 function anonymizePath(path, terms) {
  for (let i = 0; i < terms.length; i++) {
-    let term = terms[i];
-    if (term.trim() == "") {
+    const spec = terms[i];
+    if (spec.trim() == "") {
      continue;
    }
+    const parsed = parseTermSpec(spec);
+    let term = parsed.term;
+    // A null replacement means the entry has no override, so it gets the
+    // default mask numbered by its 1-based position in the list.
+    const mask = parsed.replacement ?? ANONYMIZATION_MASK + "-" + (i + 1);
    try {
      new RegExp(term, "gi");
    } catch {
      term = term.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&");
    }
-    path = path.replace(
-      new RegExp(term, "gi"),
-      ANONYMIZATION_MASK + "-" + (i + 1)
-    );
+    // Function replacer: "$&", "$1", "$$" in a user-supplied mask stay literal.
+    path = path.replace(new RegExp(term, "gi"), () => mask);
  }
  return path;
 }
@@ -236,6 +245,30 @@ describe("ContentAnonimizer", function () {
      expect(result).to.include("XXXX-1");
    });

+    // #285 — `term=>replacement` uses the user-supplied replacement
+    // instead of XXXX-N, so anonymized identifiers can stay valid in code.
+    it("uses a custom replacement when the term is 'term=>replacement'", function () {
+      const a = new ContentAnonimizer({ terms: ["Anonymous=>ABC"] });
+      const result = a.anonymize("class Anonymous extends Base {}");
+      expect(result).to.equal("class ABC extends Base {}");
+    });
+
+    it("supports custom and default-mask terms together with stable indices", function () {
+      const a = new ContentAnonimizer({
+        terms: ["Alpha=>AAA", "Beta"],
+      });
+      const result = a.anonymize("Alpha and Beta");
+      // Beta uses XXXX-2 (its 1-based index in the list), even though
+      // Alpha had a custom replacement.
+      expect(result).to.equal("AAA and XXXX-2");
+    });
+
+    it("removes the match when the replacement after '=>' is empty", function () {
+      // "Foo=>" parses to replacement "" (not null), so there is no fallback
+      // to the default mask here.
+      const a = new ContentAnonimizer({ terms: ["Foo=>"] });
+      expect(a.anonymize("Foo bar")).to.equal(" bar");
+    });
+
    // #280 — accented terms should match both the accented and unaccented
    // variants so "Davó" scrubs "Davo" (and vice versa).
    it("matches accented and unaccented variants of the same term", function () {