diff --git a/src/core/anonymize-utils.ts b/src/core/anonymize-utils.ts index e9bec75..155f501 100644 --- a/src/core/anonymize-utils.ts +++ b/src/core/anonymize-utils.ts @@ -4,28 +4,11 @@ import { StringDecoder } from "string_decoder"; import { isText } from "istextorbinary"; import config from "../config"; +import { termVariants, withWordBoundaries } from "./term-matching"; const urlRegex = /<?\b((https?|ftp|file):\/\/)[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|]>?/g; -// JS regex \b only fires at a word/non-word transition, where word chars are -// [A-Za-z0-9_]. So `\bterm\b` silently fails to match when the term begins or -// ends with a non-word char (e.g. "@tdurieux", "Davó", "@author .*"). Only -// emit \b on sides where the term has a word-char edge; otherwise the boundary -// would never match. -function withWordBoundaries(termPattern: string): string { - // Strip a leading group like (?:...) or (...) when sniffing the first/last - // significant char so users wrapping their regex in a group still get - // boundaries applied. Best-effort — not a full parser. - const sniff = termPattern.replace(/^\(\?[:=!]?|^\(|\)$/g, ""); - const first = sniff.charAt(0); - const last = sniff.charAt(sniff.length - 1); - const isWord = (c: string) => /[A-Za-z0-9_]/.test(c); - const lead = first && isWord(first) ? "\\b" : ""; - const trail = last && isWord(last) ? "\\b" : ""; - return `${lead}${termPattern}${trail}`; -} - export function streamToString(stream: Readable): Promise<string> { const chunks: Buffer[] = []; return new Promise((resolve, reject) => { @@ -217,21 +200,30 @@ export class ContentAnonimizer { // escape regex characters term = term.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&"); } - const bounded = withWordBoundaries(term); - // remove whole url if it contains the term - content = content.replace(urlRegex, (match) => { - if (new RegExp(bounded, "gi").test(match)) { + + // Try the term verbatim first, then a diacritic-insensitive expansion + // so "Davo" anonymizes "Davó" (and vice versa). See term-matching.ts.
+ for (const variant of termVariants(term)) { + const bounded = withWordBoundaries(variant.pattern, { + sniffSource: variant.sniff, + unicode: variant.unicode, + }); + const flags = variant.unicode ? "giu" : "gi"; + // remove whole url if it contains the term + content = content.replace(urlRegex, (match) => { + if (new RegExp(bounded, flags).test(match)) { + this.wasAnonymized = true; + return mask; + } + return match; + }); + + // remove the term in the text + content = content.replace(new RegExp(bounded, flags), () => { this.wasAnonymized = true; return mask; - } - return match; - }); - - // remove the term in the text - content = content.replace(new RegExp(bounded, "gi"), () => { - this.wasAnonymized = true; - return mask; - }); + }); + } } return content; } diff --git a/src/core/term-matching.ts b/src/core/term-matching.ts new file mode 100644 index 0000000..ec6565d --- /dev/null +++ b/src/core/term-matching.ts @@ -0,0 +1,104 @@ +// Pure helpers for term-based anonymization. Extracted from anonymize-utils +// so unit tests can import them without pulling in the config module (which +// reads process.env at load time). + +// JS regex \b only fires at a word/non-word transition, where word chars are +// [A-Za-z0-9_]. So `\bterm\b` silently fails to match when the term begins or +// ends with a non-word char (e.g. "@tdurieux", "Davó", "@author .*"). Only +// emit a boundary on sides where the term has a word-char edge; otherwise the +// boundary would never match. +// +// `sniffSource` lets callers decide boundaries from a different string than +// the actual pattern — needed when the pattern is an expanded character class +// (ends in "]") but the matched text is still a letter. +// +// `unicode: true` emits lookaround boundaries that treat any Unicode letter +// as a word char, so a trailing boundary still fires next to "ó" etc. The +// regex consuming the result must be created with the `u` flag. 
+export function withWordBoundaries( + termPattern: string, + opt: { sniffSource?: string; unicode?: boolean } = {} +): string { + // Strip a leading group like (?:...) or (...) when sniffing the first/last + // significant char so users wrapping their regex in a group still get + // boundaries applied. Best-effort — not a full parser. + const sniff = (opt.sniffSource ?? termPattern).replace( + /^\(\?[:=!]?|^\(|\)$/g, + "" + ); + const first = sniff.charAt(0); + const last = sniff.charAt(sniff.length - 1); + const isWord = (c: string) => /[A-Za-z0-9_]/.test(c); + const before = opt.unicode ? "(?<![\\p{L}\\p{N}_])" : "\\b"; + const after = opt.unicode ? "(?![\\p{L}\\p{N}_])" : "\\b"; + const lead = first && isWord(first) ? before : ""; + const trail = last && isWord(last) ? after : ""; + return `${lead}${termPattern}${trail}`; +} + +// Strip combining diacritical marks so "Davó" becomes "Davo": NFD +// normalization separates base letters from their combining accents +// (U+0300–U+036F), which are then dropped. +export function stripDiacritics(s: string): string { + return s.normalize("NFD").replace(/[\u0300-\u036f]/g, ""); +} + +// Map ASCII letter -> regex character class covering common accented +// variants. Used to make term matching diacritic-insensitive in both +// directions: typing "Davo" matches "Davó" in the text, and typing "Davó" +// (after stripping diacritics) does the same. Coverage focuses on Latin +// letters that show up in real names — extend as needed. +export const DIACRITIC_CLASSES: Record<string, string> = { + a: "[aàáâãäåāăąǎ]", + c: "[cçćĉċč]", + d: "[dďđ]", + e: "[eèéêëēĕėęěȩ]", + g: "[gĝğġģǧ]", + h: "[hĥħȟ]", + i: "[iìíîïĩīĭįıǐ]", + j: "[jĵǰ]", + k: "[kķǩ]", + l: "[lĺļľŀł]", + n: "[nñńņňʼnŋ]", + o: "[oòóôõöōŏőøǒ]", + r: "[rŕŗř]", + s: "[sśŝşšș]", + t: "[tţťŧț]", + u: "[uùúûüũūŭůűųǔ]", + w: "[wŵẁẃẅ]", + y: "[yýÿŷỳ]", + z: "[zźżž]", +}; + +// Build a regex source that matches the given (already-escaped) term in a +// diacritic-insensitive way. ASCII letters are replaced with a character +// class that includes their accented siblings; other chars are left alone so +// regex metacharacters and escape sequences keep working. +export function diacriticInsensitive(escapedTerm: string): string { + let out = ""; + let i = 0; + while (i < escapedTerm.length) { + const c = escapedTerm[i]; + // Pass through backslash escapes verbatim (e.g. "\." or "\d").
+ if (c === "\\" && i + 1 < escapedTerm.length) { + out += c + escapedTerm[i + 1]; + i += 2; + continue; + } + const lower = c.toLowerCase(); + out += DIACRITIC_CLASSES[lower] || c; + i += 1; + } + return out; +} + +// Build the term variants to try for one user-provided term. Each variant +// produces a separate replacement pass. +export function termVariants(escapedTerm: string): { + pattern: string; + sniff: string; + unicode: boolean; +}[] { + const stripped = stripDiacritics(escapedTerm); + return [ + { pattern: escapedTerm, sniff: escapedTerm, unicode: false }, + { pattern: diacriticInsensitive(stripped), sniff: stripped, unicode: true }, + ]; +} diff --git a/test/anonymize-utils.test.js b/test/anonymize-utils.test.js index 250f3fa..5ab5fed 100644 --- a/test/anonymize-utils.test.js +++ b/test/anonymize-utils.test.js @@ -1,13 +1,18 @@ const { expect } = require("chai"); const { Transform } = require("stream"); const { StringDecoder } = require("string_decoder"); +require("ts-node/register/transpile-only"); +const { + withWordBoundaries, + termVariants, +} = require("../src/core/term-matching"); /** * Tests for the core anonymization utilities. * * Because anonymize-utils.ts is TypeScript that imports config (which reads - * process.env at module load time), we replicate the pure logic here so the - * tests run without compiling the full project or connecting to a database. + * process.env at module load time), we replicate the higher-level pieces + * here. Pure helpers live in src/core/term-matching and are imported above. */ // --------------------------------------------------------------------------- @@ -20,15 +25,6 @@ const ANONYMIZATION_MASK = "XXXX"; const urlRegex = /<?\b((https?|ftp|file):\/\/)[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|]>?/g; -function withWordBoundaries(termPattern) { - const sniff = termPattern.replace(/^\(\?[:=!]?|^\(|\)$/g, ""); - const first = sniff.charAt(0); - const last = sniff.charAt(sniff.length - 1); - const isWord = (c) => /[A-Za-z0-9_]/.test(c); - const lead = first && isWord(first) ? 
"\\b" : ""; - const trail = last && isWord(last) ? "\\b" : ""; - return `${lead}${termPattern}${trail}`; -} class ContentAnonimizer { constructor(opt) { @@ -111,18 +107,24 @@ class ContentAnonimizer { } catch { term = term.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&"); } - const bounded = withWordBoundaries(term); - content = content.replace(urlRegex, (match) => { - if (new RegExp(bounded, "gi").test(match)) { + for (const variant of termVariants(term)) { + const bounded = withWordBoundaries(variant.pattern, { + sniffSource: variant.sniff, + unicode: variant.unicode, + }); + const flags = variant.unicode ? "giu" : "gi"; + content = content.replace(urlRegex, (match) => { + if (new RegExp(bounded, flags).test(match)) { + this.wasAnonymized = true; + return mask; + } + return match; + }); + content = content.replace(new RegExp(bounded, flags), () => { this.wasAnonymized = true; return mask; - } - return match; - }); - content = content.replace(new RegExp(bounded, "gi"), () => { - this.wasAnonymized = true; - return mask; - }); + }); + } } return content; } @@ -234,6 +236,22 @@ describe("ContentAnonimizer", function () { expect(result).to.include("XXXX-1"); }); + // #280 — accented terms should match both the accented and unaccented + // variants so "Davó" scrubs "Davo" (and vice versa). 
+ it("matches accented and unaccented variants of the same term", function () { + const a = new ContentAnonimizer({ terms: ["Davó"] }); + const r1 = a.anonymize("Authors: Alice Davó and Bob Davo"); + expect(r1).to.not.include("Davó"); + expect(r1).to.not.include("Davo"); + expect(r1.match(/XXXX-1/g).length).to.equal(2); + + const b = new ContentAnonimizer({ terms: ["Davo"] }); + const r2 = b.anonymize("Authors: Alice Davó and Bob Davo"); + expect(r2).to.not.include("Davó"); + expect(r2).to.not.include("Davo"); + expect(r2.match(/XXXX-1/g).length).to.equal(2); + }); + it("does not over-match across word boundaries when the term is word-only", function () { // Regression: ensure withWordBoundaries still emits \b on both sides // for ordinary alphanumeric terms.