// JS regex \b only fires at a word/non-word transition, where word chars are
// [A-Za-z0-9_]. So `\bterm\b` silently fails to match when the term begins or
// ends with a non-word char (e.g. "@tdurieux", "Davó", "@author .*"). The
// helpers below decide, side by side, whether a boundary can be emitted.

/** Matches exactly one character that `\b` treats as a word character. */
const WORD_CHAR = /^[A-Za-z0-9_]$/;

/**
 * Reports whether the character at `index` is preceded by an odd number of
 * backslashes, i.e. whether it is the second half of an escape sequence.
 */
function isEscapedAt(pattern: string, index: number): boolean {
  let backslashes = 0;
  for (let i = index - 1; i >= 0 && pattern.charAt(i) === "\\"; i--) {
    backslashes++;
  }
  return backslashes % 2 === 1;
}

/**
 * Splits a regex source on `|` characters that sit outside every group and
 * character class. Escaped characters are skipped, so `\|` never splits.
 */
function splitTopLevelAlternatives(pattern: string): string[] {
  const parts: string[] = [];
  let depth = 0;
  let inClass = false;
  let start = 0;
  let i = 0;
  while (i < pattern.length) {
    const c = pattern.charAt(i);
    if (c === "\\") {
      // Skip the backslash and the character it escapes.
      i += 2;
      continue;
    }
    if (inClass) {
      if (c === "]") inClass = false;
    } else if (c === "[") {
      inClass = true;
    } else if (c === "(") {
      depth++;
    } else if (c === ")") {
      depth--;
    } else if (c === "|" && depth === 0) {
      parts.push(pattern.slice(start, i));
      start = i + 1;
    }
    i++;
  }
  parts.push(pattern.slice(start));
  return parts;
}

/**
 * True when the alternative begins with something that can only match a word
 * character: a literal word char, or the `\d` / `\w` class escapes. One
 * leading group opener (`(`, `(?:`, `(?=`, `(?!`) is looked through.
 */
function startsWithWordChar(alternative: string): boolean {
  const body = alternative.replace(/^\(\?[:=!]?|^\(/, "");
  if (body.charAt(0) === "\\") {
    const escaped = body.charAt(1);
    return escaped === "d" || escaped === "w";
  }
  return WORD_CHAR.test(body.charAt(0));
}

/**
 * True when the alternative ends with something that can only match a word
 * character. One trailing unescaped `)` is looked through, and a trailing
 * quantifier that requires at least one repetition (`+`, `{n}`, `{n,}`,
 * `{n,m}` with n >= 1, optionally lazy) is skipped so the quantified atom is
 * inspected instead. Quantifiers that allow zero repetitions (`*`, `?`,
 * `{0,...}`) yield false because the real edge is then ambiguous.
 */
function endsWithWordChar(alternative: string): boolean {
  let body = alternative;
  if (body.endsWith(")") && !isEscapedAt(body, body.length - 1)) {
    body = body.slice(0, -1);
  }

  const quantifier = /(?:\+|\{(\d+)(?:,\d*)?\})\??$/.exec(body);
  if (quantifier) {
    // An escaped "+" or "{" is a literal, so the term ends with punctuation.
    if (isEscapedAt(body, quantifier.index)) return false;
    const minimum = quantifier[1];
    if (minimum !== undefined && Number(minimum) === 0) return false;
    body = body.slice(0, quantifier.index);
  }

  const lastIndex = body.length - 1;
  if (lastIndex < 0) return false;
  const last = body.charAt(lastIndex);
  if (isEscapedAt(body, lastIndex)) {
    // Of the escapes, only \d and \w are guaranteed to match a word char;
    // \s, \W, \., \b, back-references etc. must not get a boundary.
    return last === "d" || last === "w";
  }
  return WORD_CHAR.test(last);
}

/**
 * Adds `\b` word boundaries to a regex source string, but only on the sides
 * where the pattern has a word-character edge; on any other side a boundary
 * could prevent the term from ever matching.
 *
 * When the pattern contains top-level alternation (`a|b`), each alternative
 * gets its own boundaries and the result is wrapped in a non-capturing group,
 * so the boundaries apply to every alternative rather than only to the first
 * and last one. Patterns without top-level alternation are returned without
 * an extra group.
 *
 * Best-effort — not a full regex parser. Non-ASCII letters are not word
 * characters for `\b`, so an edge such as the "ó" in "Davó" gets no boundary.
 *
 * @param termPattern A valid regex source (already escaped if it was a
 *   literal term).
 * @returns The regex source with boundaries applied where they can match.
 */
function withWordBoundaries(termPattern: string): string {
  const bounded = splitTopLevelAlternatives(termPattern).map((alternative) => {
    const lead = startsWithWordChar(alternative) ? "\\b" : "";
    const trail = endsWithWordChar(alternative) ? "\\b" : "";
    return `${lead}${alternative}${trail}`;
  });
  const joined = bounded.join("|");
  return bounded.length > 1 ? `(?:${joined})` : joined;
}
"\\b" : ""; + return `${lead}${termPattern}${trail}`; +} + class ContentAnonimizer { constructor(opt) { this.opt = opt || {}; @@ -101,14 +111,15 @@ class ContentAnonimizer { } catch { term = term.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&"); } + const bounded = withWordBoundaries(term); content = content.replace(urlRegex, (match) => { - if (new RegExp(`\\b${term}\\b`, "gi").test(match)) { + if (new RegExp(bounded, "gi").test(match)) { this.wasAnonymized = true; return mask; } return match; }); - content = content.replace(new RegExp(`\\b${term}\\b`, "gi"), () => { + content = content.replace(new RegExp(bounded, "gi"), () => { this.wasAnonymized = true; return mask; }); @@ -195,6 +206,43 @@ describe("ContentAnonimizer", function () { expect(() => anon.anonymize("some foo(bar here")).to.not.throw(); }); + // #175 — terms starting with a non-word char (e.g. "@username") were + // silently skipped because \b can't match between two non-word chars. + it("replaces terms starting with a non-word character (e.g. @user)", function () { + const anon = new ContentAnonimizer({ terms: ["@tdurieux"] }); + const result = anon.anonymize('"name": "@tdurieux/anonymous"'); + expect(result).to.not.include("@tdurieux"); + expect(result).to.include("XXXX-1"); + }); + + // #249 — regex terms ending in non-word chars (e.g. "@author .*") were + // also skipped due to the trailing \b. + it("matches a user regex that ends with a non-word pattern", function () { + const anon = new ContentAnonimizer({ terms: ["@author .*"] }); + const result = anon.anonymize("/** @author julius */"); + expect(result).to.include("XXXX-1"); + expect(result).to.not.include("@author julius"); + }); + + // #430 — IPv4-style terms have non-word boundaries on each dot but still + // start/end with digits, so \b on both sides is fine — guard against + // regression now that we tweak boundary logic. 
+ it("anonymizes an IP address term", function () { + const anon = new ContentAnonimizer({ terms: ["192\\.168\\.1\\.1"] }); + const result = anon.anonymize("connect to 192.168.1.1 on port 80"); + expect(result).to.not.include("192.168.1.1"); + expect(result).to.include("XXXX-1"); + }); + + it("does not over-match across word boundaries when the term is word-only", function () { + // Regression: ensure withWordBoundaries still emits \b on both sides + // for ordinary alphanumeric terms. + const anon = new ContentAnonimizer({ terms: ["cat"] }); + const result = anon.anonymize("the cat sat on a category"); + expect(result).to.include("category"); + expect(result).to.match(/the XXXX-1 sat/); + }); + it("replaces terms inside URLs", function () { const anon = new ContentAnonimizer({ terms: ["myuser"] }); const result = anon.anonymize(