fix: match terms whose edges are non-word characters

Wrapping every user term as `\b${term}\b` silently dropped matches when the term started or ended with a non-word char (e.g. `@tdurieux`, `@author .*`), because JS `\b` only fires at a word/non-word transition. Replace with `withWordBoundaries()`, which only emits `\b` on the side where the term has a word-char edge. Fixes #175, #249.
2026-06-30 02:55:30 +02:00 · 2026-05-03 18:35:00 +02:00
parent 57f2cf1b11
commit 9313c42fcf
2 changed files with 71 additions and 4 deletions
@@ -8,6 +8,24 @@ import config from "../config";
 const urlRegex =
  /<?\b((https?|ftp|file):\/\/)[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]\b\/?>?/g;

+// JS regex \b only fires at a word/non-word transition, where word chars are
+// [A-Za-z0-9_]. So `\bterm\b` silently fails to match when the term begins or
+// ends with a non-word char (e.g. "@tdurieux", "Davó", "@author .*"). Only
+// emit \b on sides where the term has a word-char edge; otherwise the boundary
+// would never match. Returns the pattern source with \b added where safe.
+function withWordBoundaries(termPattern: string): string {
+  // Ignore one opening "(", "(?:", "(?=", "(?!" and one closing ")" when
+  // sniffing the edges. An edge written as an escape counts as a word char only
+  // for \w and \d: \s, \W, \n... end in a letter but match non-word text, so a
+  // \b beside them would wrongly demand an adjacent word char. Best-effort.
+  const sniff = termPattern.replace(/^\(\?[:=!]?|^\(|\)$/g, "");
+  // True when the last char is preceded by an odd number of backslashes.
+  const tailIsEsc = /(?:^|[^\\])(?:\\\\)*\\[^\\]$/.test(sniff);
+  const lead = /^(?:\\[wd]|[A-Za-z0-9_])/.test(sniff) ? "\\b" : "";
+  const trail = (tailIsEsc ? /[wd]$/ : /[A-Za-z0-9_]$/).test(sniff) ? "\\b" : "";
+  return `${lead}${termPattern}${trail}`;
+}
+
 export function streamToString(stream: Readable): Promise<string> {
  const chunks: Buffer[] = [];
  return new Promise((resolve, reject) => {
@@ -199,9 +217,10 @@ export class ContentAnonimizer {
        // escape regex characters
        term = term.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&");
      }
+      const bounded = withWordBoundaries(term);
      // remove whole url if it contains the term
      content = content.replace(urlRegex, (match) => {
-        if (new RegExp(`\\b${term}\\b`, "gi").test(match)) {
+        if (new RegExp(bounded, "gi").test(match)) {
          this.wasAnonymized = true;
          return mask;
        }
@@ -209,7 +228,7 @@ export class ContentAnonimizer {
      });

      // remove the term in the text
-      content = content.replace(new RegExp(`\\b${term}\\b`, "gi"), () => {
+      content = content.replace(new RegExp(bounded, "gi"), () => {
        this.wasAnonymized = true;
        return mask;
      });
@@ -20,6 +20,16 @@ const ANONYMIZATION_MASK = "XXXX";
 const urlRegex =
  /<?\b((https?|ftp|file):\/\/)[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]\b\/?>?/g;

+function withWordBoundaries(termPattern) {
+  // Emit \b only beside a word-char edge; an edge escape counts as a word char
+  // only for \w / \d (\s, \W, \n... end in a letter but match non-word text).
+  const sniff = termPattern.replace(/^\(\?[:=!]?|^\(|\)$/g, "");
+  const tailIsEsc = /(?:^|[^\\])(?:\\\\)*\\[^\\]$/.test(sniff);
+  const lead = /^(?:\\[wd]|[A-Za-z0-9_])/.test(sniff) ? "\\b" : "";
+  const trail = (tailIsEsc ? /[wd]$/ : /[A-Za-z0-9_]$/).test(sniff) ? "\\b" : "";
+  return `${lead}${termPattern}${trail}`;
+}
+
 class ContentAnonimizer {
  constructor(opt) {
    this.opt = opt || {};
@@ -101,14 +111,15 @@ class ContentAnonimizer {
      } catch {
        term = term.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&");
      }
+      const bounded = withWordBoundaries(term);
      content = content.replace(urlRegex, (match) => {
-        if (new RegExp(`\\b${term}\\b`, "gi").test(match)) {
+        if (new RegExp(bounded, "gi").test(match)) {
          this.wasAnonymized = true;
          return mask;
        }
        return match;
      });
-      content = content.replace(new RegExp(`\\b${term}\\b`, "gi"), () => {
+      content = content.replace(new RegExp(bounded, "gi"), () => {
        this.wasAnonymized = true;
        return mask;
      });
@@ -195,6 +206,43 @@ describe("ContentAnonimizer", function () {
      expect(() => anon.anonymize("some foo(bar here")).to.not.throw();
    });

+    // #175 — terms starting with a non-word char (e.g. "@username") were
+    // silently skipped because \b can't match between two non-word chars.
+    it("replaces terms starting with a non-word character (e.g. @user)", function () {
+      const anon = new ContentAnonimizer({ terms: ["@tdurieux"] });
+      const result = anon.anonymize('"name": "@tdurieux/anonymous"');
+      expect(result).to.not.include("@tdurieux");
+      expect(result).to.include("XXXX-1");
+    });
+
+    // #249 — regex terms ending in non-word chars (e.g. "@author .*") were
+    // also skipped due to the trailing \b.
+    it("matches a user regex that ends with a non-word pattern", function () {
+      const anon = new ContentAnonimizer({ terms: ["@author .*"] });
+      const result = anon.anonymize("/** @author julius */");
+      expect(result).to.include("XXXX-1");
+      expect(result).to.not.include("@author julius");
+    });
+
+    // #430 — IPv4-style terms contain non-word chars (the dots) in the middle
+    // but still start and end with digits, so \b on both sides is correct.
+    // Guards against regressions now that the boundary logic has changed.
+    it("anonymizes an IP address term", function () {
+      const anon = new ContentAnonimizer({ terms: ["192\\.168\\.1\\.1"] });
+      const result = anon.anonymize("connect to 192.168.1.1 on port 80");
+      expect(result).to.not.include("192.168.1.1");
+      expect(result).to.include("XXXX-1");
+    });
+
+    it("does not over-match across word boundaries when the term is word-only", function () {
+      // Regression: ensure withWordBoundaries still emits \b on both sides
+      // for ordinary alphanumeric terms.
+      const anon = new ContentAnonimizer({ terms: ["cat"] });
+      const result = anon.anonymize("the cat sat on a category");
+      expect(result).to.include("category");
+      expect(result).to.match(/the XXXX-1 sat/);
+    });
+
    it("replaces terms inside URLs", function () {
      const anon = new ContentAnonimizer({ terms: ["myuser"] });
      const result = anon.anonymize(