fix: anonymize accented and unaccented variants of the same term

When a user added "Davó" to the term list, "Davo" elsewhere in the
content was left untouched (and vice versa). Each term now also runs a
diacritic-insensitive pass: ASCII Latin letters expand to a class
covering common accented siblings, with Unicode-aware lookaround
boundaries so the trailing boundary still fires next to "ó" etc.
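
For example (roughly; patterns derived from the new helpers), "Davó" now
runs two passes:

    \bDavó                                                                (gi)
    (?<![\p{L}\p{N}_])[dďđ][aàáâãäåāăąǎ]v[oòóôõöōŏőøǒ](?![\p{L}\p{N}_])   (giu)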

Pure helpers moved into src/core/term-matching so the test file can
import them instead of duplicating the logic.

Fixes #280.
Author: tdurieux
Date:   2026-05-03 20:18:49 +02:00
parent d138d487f2
commit 2eb19904db
3 changed files with 166 additions and 52 deletions
src/core/anonymize-utils.ts (+23 -31)
@@ -4,28 +4,11 @@ import { StringDecoder } from "string_decoder";
 import { isText } from "istextorbinary";
 import config from "../config";
+import { termVariants, withWordBoundaries } from "./term-matching";
 
 const urlRegex =
   /<?\b((https?|ftp|file):\/\/)[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]\b\/?>?/g;
 
-// JS regex \b only fires at a word/non-word transition, where word chars are
-// [A-Za-z0-9_]. So `\bterm\b` silently fails to match when the term begins or
-// ends with a non-word char (e.g. "@tdurieux", "Davó", "@author .*"). Only
-// emit \b on sides where the term has a word-char edge; otherwise the boundary
-// would never match.
-function withWordBoundaries(termPattern: string): string {
-  // Strip a leading group like (?:...) or (...) when sniffing the first/last
-  // significant char so users wrapping their regex in a group still get
-  // boundaries applied. Best-effort — not a full parser.
-  const sniff = termPattern.replace(/^\(\?[:=!]?|^\(|\)$/g, "");
-  const first = sniff.charAt(0);
-  const last = sniff.charAt(sniff.length - 1);
-  const isWord = (c: string) => /[A-Za-z0-9_]/.test(c);
-  const lead = first && isWord(first) ? "\\b" : "";
-  const trail = last && isWord(last) ? "\\b" : "";
-  return `${lead}${termPattern}${trail}`;
-}
-
 export function streamToString(stream: Readable): Promise<string> {
   const chunks: Buffer[] = [];
   return new Promise((resolve, reject) => {
@@ -217,21 +200,30 @@ export class ContentAnonimizer {
         // escape regex characters
         term = term.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&");
       }
-      const bounded = withWordBoundaries(term);
-      // remove whole url if it contains the term
-      content = content.replace(urlRegex, (match) => {
-        if (new RegExp(bounded, "gi").test(match)) {
-          this.wasAnonymized = true;
-          return mask;
-        }
-        return match;
-      });
-      // remove the term in the text
-      content = content.replace(new RegExp(bounded, "gi"), () => {
-        this.wasAnonymized = true;
-        return mask;
-      });
+      // Try the term verbatim first, then a diacritic-insensitive expansion
+      // so "Davo" anonymizes "Davó" (and vice versa). See term-matching.ts.
+      for (const variant of termVariants(term)) {
+        const bounded = withWordBoundaries(variant.pattern, {
+          sniffSource: variant.sniff,
+          unicode: variant.unicode,
+        });
+        const flags = variant.unicode ? "giu" : "gi";
+        // remove whole url if it contains the term
+        content = content.replace(urlRegex, (match) => {
+          if (new RegExp(bounded, flags).test(match)) {
+            this.wasAnonymized = true;
+            return mask;
+          }
+          return match;
+        });
+        // remove the term in the text
+        content = content.replace(new RegExp(bounded, flags), () => {
+          this.wasAnonymized = true;
+          return mask;
+        });
+      }
     }
     return content;
   }
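
As an illustration of the boundary problem the `unicode` option works around
(this snippet is not part of the diff; it only shows standard RegExp behavior):

// Plain \b treats "ó" as a non-word char, so a trailing boundary never fires:
/\bDav[oó]\b/.test("Davó");   // false
/\bDav[oó]\b/.test("Davo");   // true  (plain ASCII edge still works)
// A u-flagged lookaround boundary counts any Unicode letter as word-like:
/(?<![\p{L}\p{N}_])Dav[oó](?![\p{L}\p{N}_])/u.test("Davó");    // true
/(?<![\p{L}\p{N}_])Dav[oó](?![\p{L}\p{N}_])/u.test("Davós");   // false (still bounded)
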
src/core/term-matching.ts (new file, +104)
@@ -0,0 +1,104 @@
// Pure helpers for term-based anonymization. Extracted from anonymize-utils
// so unit tests can import them without pulling in the config module (which
// reads process.env at load time).
// JS regex \b only fires at a word/non-word transition, where word chars are
// [A-Za-z0-9_]. So `\bterm\b` silently fails to match when the term begins or
// ends with a non-word char (e.g. "@tdurieux", "Davó", "@author .*"). Only
// emit a boundary on sides where the term has a word-char edge; otherwise the
// boundary would never match.
//
// `sniffSource` lets callers decide boundaries from a different string than
// the actual pattern — needed when the pattern is an expanded character class
// (ends in "]") but the matched text is still a letter.
//
// `unicode: true` emits lookaround boundaries that treat any Unicode letter
// as a word char, so a trailing boundary still fires next to "ó" etc. The
// regex consuming the result must be created with the `u` flag.
export function withWordBoundaries(
  termPattern: string,
  opt: { sniffSource?: string; unicode?: boolean } = {}
): string {
  // Strip a leading group like (?:...) or (...) when sniffing the first/last
  // significant char so users wrapping their regex in a group still get
  // boundaries applied. Best-effort — not a full parser.
  const sniff = (opt.sniffSource ?? termPattern).replace(
    /^\(\?[:=!]?|^\(|\)$/g,
    ""
  );
  const first = sniff.charAt(0);
  const last = sniff.charAt(sniff.length - 1);
  const isWord = (c: string) => /[A-Za-z0-9_]/.test(c);
  const before = opt.unicode ? "(?<![\\p{L}\\p{N}_])" : "\\b";
  const after = opt.unicode ? "(?![\\p{L}\\p{N}_])" : "\\b";
  const lead = first && isWord(first) ? before : "";
  const trail = last && isWord(last) ? after : "";
  return `${lead}${termPattern}${trail}`;
}

export function stripDiacritics(s: string): string {
  return s.normalize("NFD").replace(/\p{Diacritic}/gu, "");
}
// Map of base Latin letter -> regex character class covering common accented
// variants. Used to make term matching diacritic-insensitive in both
// directions: typing "Davo" matches "Davó" in the text, and typing "Davó"
// (after stripping diacritics) does the same. Coverage focuses on Latin
// letters that show up in real names — extend as needed.
export const DIACRITIC_CLASSES: Record<string, string> = {
  a: "[aàáâãäåāăąǎ]",
  c: "[cçćĉċč]",
  d: "[dďđ]",
  e: "[eèéêëēĕėęěȩ]",
  g: "[gĝğġģǧ]",
  h: "[hĥħȟ]",
  i: "[iìíîïĩīĭįıǐ]",
  j: "[jĵǰ]",
  k: "[kķǩ]",
  l: "[lĺļľŀł]",
  n: "[nñńņňʼnŋ]",
  o: "[oòóôõöōŏőøǒ]",
  r: "[rŕŗř]",
  s: "[sśŝşšș]",
  t: "[tţťŧț]",
  u: "[uùúûüũūŭůűųǔ]",
  w: "[wŵẁẃẅ]",
  y: "[yýÿŷỳ]",
  z: "[zźżž]",
};
// Build a regex source that matches the given (already-escaped) term in a
// diacritic-insensitive way. ASCII letters are replaced with a character
// class that includes their accented siblings; other chars are left alone so
// regex metacharacters and escape sequences keep working.
export function diacriticInsensitive(escapedTerm: string): string {
  let out = "";
  let i = 0;
  while (i < escapedTerm.length) {
    const c = escapedTerm[i];
    // Pass through backslash escapes verbatim (e.g. "\." or "\d").
    if (c === "\\" && i + 1 < escapedTerm.length) {
      out += c + escapedTerm[i + 1];
      i += 2;
      continue;
    }
    const lower = c.toLowerCase();
    out += DIACRITIC_CLASSES[lower] || c;
    i += 1;
  }
  return out;
}
// Build the term variants to try for one user-provided term. Each variant
// produces a separate replacement pass.
export function termVariants(escapedTerm: string): {
  pattern: string;
  sniff: string;
  unicode: boolean;
}[] {
  const stripped = stripDiacritics(escapedTerm);
  return [
    { pattern: escapedTerm, sniff: escapedTerm, unicode: false },
    { pattern: diacriticInsensitive(stripped), sniff: stripped, unicode: true },
  ];
}
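
A minimal usage sketch of the helpers above (the import path assumes a sibling
module; the values in the comments are worked out by hand from the code, not
captured from a run):

import { termVariants, withWordBoundaries } from "./term-matching";

const [verbatim, expanded] = termVariants("Davó");
// verbatim -> { pattern: "Davó", sniff: "Davó", unicode: false }
// expanded -> { pattern: "[dďđ][aàáâãäåāăąǎ]v[oòóôõöōŏőøǒ]", sniff: "Davo", unicode: true }

const bounded = withWordBoundaries(expanded.pattern, {
  sniffSource: expanded.sniff,
  unicode: expanded.unicode,
});
// bounded (as a regex source string):
//   (?<![\p{L}\p{N}_])[dďđ][aàáâãäåāăąǎ]v[oòóôõöōŏőøǒ](?![\p{L}\p{N}_])

new RegExp(bounded, "giu").test("Alice Davo");   // true
new RegExp(bounded, "giu").test("Alice Davó");   // true
new RegExp(bounded, "giu").test("Davoid");       // false (boundaries block partial words)
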
+39 -21
@@ -1,13 +1,18 @@
 const { expect } = require("chai");
 const { Transform } = require("stream");
 const { StringDecoder } = require("string_decoder");
+require("ts-node/register/transpile-only");
+const {
+  withWordBoundaries,
+  termVariants,
+} = require("../src/core/term-matching");
 
 /**
  * Tests for the core anonymization utilities.
  *
  * Because anonymize-utils.ts is TypeScript that imports config (which reads
- * process.env at module load time), we replicate the pure logic here so the
- * tests run without compiling the full project or connecting to a database.
+ * process.env at module load time), we replicate the higher-level pieces
+ * here. Pure helpers live in src/core/term-matching and are imported above.
  */
 
 // ---------------------------------------------------------------------------
@@ -20,15 +25,6 @@ const ANONYMIZATION_MASK = "XXXX";
 const urlRegex =
   /<?\b((https?|ftp|file):\/\/)[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]\b\/?>?/g;
 
-function withWordBoundaries(termPattern) {
-  const sniff = termPattern.replace(/^\(\?[:=!]?|^\(|\)$/g, "");
-  const first = sniff.charAt(0);
-  const last = sniff.charAt(sniff.length - 1);
-  const isWord = (c) => /[A-Za-z0-9_]/.test(c);
-  const lead = first && isWord(first) ? "\\b" : "";
-  const trail = last && isWord(last) ? "\\b" : "";
-  return `${lead}${termPattern}${trail}`;
-}
 
 class ContentAnonimizer {
   constructor(opt) {
@@ -111,18 +107,24 @@ class ContentAnonimizer {
       } catch {
         term = term.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&");
       }
-      const bounded = withWordBoundaries(term);
-      content = content.replace(urlRegex, (match) => {
-        if (new RegExp(bounded, "gi").test(match)) {
-          this.wasAnonymized = true;
-          return mask;
-        }
-        return match;
-      });
-      content = content.replace(new RegExp(bounded, "gi"), () => {
-        this.wasAnonymized = true;
-        return mask;
-      });
+      for (const variant of termVariants(term)) {
+        const bounded = withWordBoundaries(variant.pattern, {
+          sniffSource: variant.sniff,
+          unicode: variant.unicode,
+        });
+        const flags = variant.unicode ? "giu" : "gi";
+        content = content.replace(urlRegex, (match) => {
+          if (new RegExp(bounded, flags).test(match)) {
+            this.wasAnonymized = true;
+            return mask;
+          }
+          return match;
+        });
+        content = content.replace(new RegExp(bounded, flags), () => {
+          this.wasAnonymized = true;
+          return mask;
+        });
+      }
     }
     return content;
   }
@@ -234,6 +236,22 @@ describe("ContentAnonimizer", function () {
     expect(result).to.include("XXXX-1");
   });
 
+  // #280 — accented terms should match both the accented and unaccented
+  // variants so "Davó" scrubs "Davo" (and vice versa).
+  it("matches accented and unaccented variants of the same term", function () {
+    const a = new ContentAnonimizer({ terms: ["Davó"] });
+    const r1 = a.anonymize("Authors: Alice Davó and Bob Davo");
+    expect(r1).to.not.include("Davó");
+    expect(r1).to.not.include("Davo");
+    expect(r1.match(/XXXX-1/g).length).to.equal(2);
+
+    const b = new ContentAnonimizer({ terms: ["Davo"] });
+    const r2 = b.anonymize("Authors: Alice Davó and Bob Davo");
+    expect(r2).to.not.include("Davó");
+    expect(r2).to.not.include("Davo");
+    expect(r2.match(/XXXX-1/g).length).to.equal(2);
+  });
+
   it("does not over-match across word boundaries when the term is word-only", function () {
     // Regression: ensure withWordBoundaries still emits \b on both sides
     // for ordinary alphanumeric terms.