fix: anonymize accented and unaccented variants of the same term

When a user added "Davó" to the term list, "Davo" elsewhere in the
content was left untouched (and vice versa). Each term now also runs a
diacritic-insensitive pass: ASCII Latin letters expand to a class
covering common accented siblings, with Unicode-aware lookaround
boundaries so the trailing boundary still fires next to "ó" etc.

Pure helpers moved into src/core/term-matching so the test file can
import them instead of duplicating the logic.

Fixes #280.
This commit is contained in:
tdurieux
2026-05-03 20:18:49 +02:00
parent d138d487f2
commit 2eb19904db
3 changed files with 166 additions and 52 deletions
+39 -21
View File
@@ -1,13 +1,18 @@
const { expect } = require("chai");
const { Transform } = require("stream");
const { StringDecoder } = require("string_decoder");
require("ts-node/register/transpile-only");
const {
withWordBoundaries,
termVariants,
} = require("../src/core/term-matching");
/**
* Tests for the core anonymization utilities.
*
 * Because anonymize-utils.ts is TypeScript that imports config (which reads
 * process.env at module load time), we replicate the higher-level pieces
 * here. Pure helpers live in src/core/term-matching and are imported above.
*/
// ---------------------------------------------------------------------------
@@ -20,15 +25,6 @@ const ANONYMIZATION_MASK = "XXXX";
// Matches http(s)/ftp/file URLs, optionally wrapped in <...>: scheme prefix,
// a run of URL-body characters, and a final character class that excludes
// trailing punctuation (?, !, comma, ., ;, :) so "see https://x.com." does
// not capture the sentence period. Global flag: every URL in the content is
// visited by replace().
const urlRegex =
/<?\b((https?|ftp|file):\/\/)[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]\b\/?>?/g;
/**
 * Wrap a (pre-escaped) regex pattern in word boundaries, but only on the
 * sides where the pattern actually starts/ends with a word character —
 * otherwise `\bDavo-1\b` style patterns would never match.
 *
 * The ASCII `\b` boundary does not fire next to accented letters ("Davó"
 * ends in a non-`\w` character, so `Davó\b` cannot match). For such terms
 * the caller passes `unicode: true` and we emit Unicode-aware lookaround
 * boundaries instead (requires the `u` regex flag at compile time).
 *
 * @param {string} termPattern - regex source to wrap (may be a group).
 * @param {object} [options]
 * @param {string} [options.sniffSource] - text to inspect for boundary
 *   decisions when `termPattern` itself is a character-class expansion
 *   (e.g. pattern "Dav[oó]" sniffed via the raw term "Davó"). Defaults to
 *   `termPattern`.
 * @param {boolean} [options.unicode=false] - emit `(?<!...)`/`(?!...)`
 *   lookarounds over `\p{L}\p{N}_` instead of ASCII `\b`.
 * @returns {string} the wrapped pattern source.
 */
function withWordBoundaries(termPattern, { sniffSource, unicode = false } = {}) {
  // Strip a leading group opener "(", "(?:", "(?=", "(?!" and a trailing ")"
  // so we sniff the first/last *content* character, not group punctuation.
  const sniff = (sniffSource ?? termPattern).replace(/^\(\?[:=!]?|^\(|\)$/g, "");
  const first = sniff.charAt(0);
  const last = sniff.charAt(sniff.length - 1);
  if (unicode) {
    const isUnicodeWord = (c) => /[\p{L}\p{N}_]/u.test(c);
    // Lookarounds over the Unicode word set: unlike \b they fire correctly
    // between "ó" and a following space/punctuation.
    const lead = first && isUnicodeWord(first) ? "(?<![\\p{L}\\p{N}_])" : "";
    const trail = last && isUnicodeWord(last) ? "(?![\\p{L}\\p{N}_])" : "";
    return `${lead}${termPattern}${trail}`;
  }
  const isWord = (c) => /[A-Za-z0-9_]/.test(c);
  const lead = first && isWord(first) ? "\\b" : "";
  const trail = last && isWord(last) ? "\\b" : "";
  return `${lead}${termPattern}${trail}`;
}
class ContentAnonimizer {
constructor(opt) {
@@ -111,18 +107,24 @@ class ContentAnonimizer {
} catch {
term = term.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&");
}
const bounded = withWordBoundaries(term);
content = content.replace(urlRegex, (match) => {
if (new RegExp(bounded, "gi").test(match)) {
for (const variant of termVariants(term)) {
const bounded = withWordBoundaries(variant.pattern, {
sniffSource: variant.sniff,
unicode: variant.unicode,
});
const flags = variant.unicode ? "giu" : "gi";
content = content.replace(urlRegex, (match) => {
if (new RegExp(bounded, flags).test(match)) {
this.wasAnonymized = true;
return mask;
}
return match;
});
content = content.replace(new RegExp(bounded, flags), () => {
this.wasAnonymized = true;
return mask;
}
return match;
});
content = content.replace(new RegExp(bounded, "gi"), () => {
this.wasAnonymized = true;
return mask;
});
});
}
}
return content;
}
@@ -234,6 +236,22 @@ describe("ContentAnonimizer", function () {
expect(result).to.include("XXXX-1");
});
// #280 — accented terms should match both the accented and unaccented
// variants so "Davó" scrubs "Davo" (and vice versa).
it("matches accented and unaccented variants of the same term", function () {
  // The scrubbing must be symmetric: whichever spelling the user lists,
  // both spellings in the content are masked.
  const input = "Authors: Alice Davó and Bob Davo";
  for (const term of ["Davó", "Davo"]) {
    const anonymizer = new ContentAnonimizer({ terms: [term] });
    const result = anonymizer.anonymize(input);
    expect(result).to.not.include("Davó");
    expect(result).to.not.include("Davo");
    expect(result.match(/XXXX-1/g).length).to.equal(2);
  }
});
it("does not over-match across word boundaries when the term is word-only", function () {
// Regression: ensure withWordBoundaries still emits \b on both sides
// for ordinary alphanumeric terms.