fix: anonymize accented and unaccented variants of the same term

When a user added "Davó" to the term list, "Davo" elsewhere in the
content was left untouched (and vice versa). Each term now also runs a
diacritic-insensitive pass: ASCII Latin letters expand to a class
covering common accented siblings, with Unicode-aware lookaround
boundaries so the trailing boundary still fires next to "ó" etc.
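
For example (roughly; patterns derived from the new helpers), "Davó" now
runs two passes:

    \bDavó                                                                (gi)
    (?<![\p{L}\p{N}_])[dďđ][aàáâãäåāăąǎ]v[oòóôõöōŏőøǒ](?![\p{L}\p{N}_])   (giu)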

Pure helpers moved into src/core/term-matching so the test file can
import them instead of duplicating the logic.

Fixes #280.
Author: tdurieux
Date:   2026-05-03 20:18:49 +02:00
parent d138d487f2
commit 2eb19904db
3 changed files with 166 additions and 52 deletions
src/core/anonymize-utils.ts (+23 -31)
@@ -4,28 +4,11 @@ import { StringDecoder } from "string_decoder";
 import { isText } from "istextorbinary";
 import config from "../config";
+import { termVariants, withWordBoundaries } from "./term-matching";
 
 const urlRegex =
   /<?\b((https?|ftp|file):\/\/)[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]\b\/?>?/g;
 
-// JS regex \b only fires at a word/non-word transition, where word chars are
-// [A-Za-z0-9_]. So `\bterm\b` silently fails to match when the term begins or
-// ends with a non-word char (e.g. "@tdurieux", "Davó", "@author .*"). Only
-// emit \b on sides where the term has a word-char edge; otherwise the boundary
-// would never match.
-function withWordBoundaries(termPattern: string): string {
-  // Strip a leading group like (?:...) or (...) when sniffing the first/last
-  // significant char so users wrapping their regex in a group still get
-  // boundaries applied. Best-effort — not a full parser.
-  const sniff = termPattern.replace(/^\(\?[:=!]?|^\(|\)$/g, "");
-  const first = sniff.charAt(0);
-  const last = sniff.charAt(sniff.length - 1);
-  const isWord = (c: string) => /[A-Za-z0-9_]/.test(c);
-  const lead = first && isWord(first) ? "\\b" : "";
-  const trail = last && isWord(last) ? "\\b" : "";
-  return `${lead}${termPattern}${trail}`;
-}
-
 export function streamToString(stream: Readable): Promise<string> {
   const chunks: Buffer[] = [];
   return new Promise((resolve, reject) => {
@@ -217,21 +200,30 @@ export class ContentAnonimizer {
         // escape regex characters
         term = term.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&");
       }
-      const bounded = withWordBoundaries(term);
-      // remove whole url if it contains the term
-      content = content.replace(urlRegex, (match) => {
-        if (new RegExp(bounded, "gi").test(match)) {
-          this.wasAnonymized = true;
-          return mask;
-        }
-        return match;
-      });
-      // remove the term in the text
-      content = content.replace(new RegExp(bounded, "gi"), () => {
-        this.wasAnonymized = true;
-        return mask;
-      });
+      // Try the term verbatim first, then a diacritic-insensitive expansion
+      // so "Davo" anonymizes "Davó" (and vice versa). See term-matching.ts.
+      for (const variant of termVariants(term)) {
+        const bounded = withWordBoundaries(variant.pattern, {
+          sniffSource: variant.sniff,
+          unicode: variant.unicode,
+        });
+        const flags = variant.unicode ? "giu" : "gi";
+        // remove whole url if it contains the term
+        content = content.replace(urlRegex, (match) => {
+          if (new RegExp(bounded, flags).test(match)) {
+            this.wasAnonymized = true;
+            return mask;
+          }
+          return match;
+        });
+        // remove the term in the text
+        content = content.replace(new RegExp(bounded, flags), () => {
+          this.wasAnonymized = true;
+          return mask;
+        });
+      }
     }
     return content;
   }
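
As an illustration of the boundary problem the `unicode` option works around
(this snippet is not part of the diff; it only shows standard RegExp behavior):

// Plain \b treats "ó" as a non-word char, so a trailing boundary never fires:
/\bDav[oó]\b/.test("Davó");   // false
/\bDav[oó]\b/.test("Davo");   // true  (plain ASCII edge still works)
// A u-flagged lookaround boundary counts any Unicode letter as word-like:
/(?<![\p{L}\p{N}_])Dav[oó](?![\p{L}\p{N}_])/u.test("Davó");    // true
/(?<![\p{L}\p{N}_])Dav[oó](?![\p{L}\p{N}_])/u.test("Davós");   // false (still bounded)
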
src/core/term-matching.ts (new file, +104)
@@ -0,0 +1,104 @@
// Pure helpers for term-based anonymization. Extracted from anonymize-utils
// so unit tests can import them without pulling in the config module (which
// reads process.env at load time).
// JS regex \b only fires at a word/non-word transition, where word chars are
// [A-Za-z0-9_]. So `\bterm\b` silently fails to match when the term begins or
// ends with a non-word char (e.g. "@tdurieux", "Davó", "@author .*"). Only
// emit a boundary on sides where the term has a word-char edge; otherwise the
// boundary would never match.
//
// `sniffSource` lets callers decide boundaries from a different string than
// the actual pattern — needed when the pattern is an expanded character class
// (ends in "]") but the matched text is still a letter.
//
// `unicode: true` emits lookaround boundaries that treat any Unicode letter
// as a word char, so a trailing boundary still fires next to "ó" etc. The
// regex consuming the result must be created with the `u` flag.
export function withWordBoundaries(
  termPattern: string,
  opt: { sniffSource?: string; unicode?: boolean } = {}
): string {
  // Strip a leading group like (?:...) or (...) when sniffing the first/last
  // significant char so users wrapping their regex in a group still get
  // boundaries applied. Best-effort — not a full parser.
  const sniff = (opt.sniffSource ?? termPattern).replace(
    /^\(\?[:=!]?|^\(|\)$/g,
    ""
  );
  const first = sniff.charAt(0);
  const last = sniff.charAt(sniff.length - 1);
  const isWord = (c: string) => /[A-Za-z0-9_]/.test(c);
  const before = opt.unicode ? "(?<![\\p{L}\\p{N}_])" : "\\b";
  const after = opt.unicode ? "(?![\\p{L}\\p{N}_])" : "\\b";
  const lead = first && isWord(first) ? before : "";
  const trail = last && isWord(last) ? after : "";
  return `${lead}${termPattern}${trail}`;
}

export function stripDiacritics(s: string): string {
  return s.normalize("NFD").replace(/\p{Diacritic}/gu, "");
}
// Map of base Latin letter -> regex character class covering common accented
// variants. Used to make term matching diacritic-insensitive in both
// directions: typing "Davo" matches "Davó" in the text, and typing "Davó"
// (after stripping diacritics) does the same. Coverage focuses on Latin
// letters that show up in real names — extend as needed.
export const DIACRITIC_CLASSES: Record<string, string> = {
  a: "[aàáâãäåāăąǎ]",
  c: "[cçćĉċč]",
  d: "[dďđ]",
  e: "[eèéêëēĕėęěȩ]",
  g: "[gĝğġģǧ]",
  h: "[hĥħȟ]",
  i: "[iìíîïĩīĭįıǐ]",
  j: "[jĵǰ]",
  k: "[kķǩ]",
  l: "[lĺļľŀł]",
  n: "[nñńņňʼnŋ]",
  o: "[oòóôõöōŏőøǒ]",
  r: "[rŕŗř]",
  s: "[sśŝşšș]",
  t: "[tţťŧț]",
  u: "[uùúûüũūŭůűųǔ]",
  w: "[wŵẁẃẅ]",
  y: "[yýÿŷỳ]",
  z: "[zźżž]",
};
// Build a regex source that matches the given (already-escaped) term in a
// diacritic-insensitive way. ASCII letters are replaced with a character
// class that includes their accented siblings; other chars are left alone so
// regex metacharacters and escape sequences keep working.
export function diacriticInsensitive(escapedTerm: string): string {
  let out = "";
  let i = 0;
  while (i < escapedTerm.length) {
    const c = escapedTerm[i];
    // Pass through backslash escapes verbatim (e.g. "\." or "\d").
    if (c === "\\" && i + 1 < escapedTerm.length) {
      out += c + escapedTerm[i + 1];
      i += 2;
      continue;
    }
    const lower = c.toLowerCase();
    out += DIACRITIC_CLASSES[lower] || c;
    i += 1;
  }
  return out;
}
// Build the term variants to try for one user-provided term. Each variant
// produces a separate replacement pass.
export function termVariants(escapedTerm: string): {
  pattern: string;
  sniff: string;
  unicode: boolean;
}[] {
  const stripped = stripDiacritics(escapedTerm);
  return [
    { pattern: escapedTerm, sniff: escapedTerm, unicode: false },
    { pattern: diacriticInsensitive(stripped), sniff: stripped, unicode: true },
  ];
}
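
A minimal usage sketch of the helpers above (the import path assumes a sibling
module; the values in the comments are worked out by hand from the code, not
captured from a run):

import { termVariants, withWordBoundaries } from "./term-matching";

const [verbatim, expanded] = termVariants("Davó");
// verbatim -> { pattern: "Davó", sniff: "Davó", unicode: false }
// expanded -> { pattern: "[dďđ][aàáâãäåāăąǎ]v[oòóôõöōŏőøǒ]", sniff: "Davo", unicode: true }

const bounded = withWordBoundaries(expanded.pattern, {
  sniffSource: expanded.sniff,
  unicode: expanded.unicode,
});
// bounded (as a regex source string):
//   (?<![\p{L}\p{N}_])[dďđ][aàáâãäåāăąǎ]v[oòóôõöōŏőøǒ](?![\p{L}\p{N}_])

new RegExp(bounded, "giu").test("Alice Davo");   // true
new RegExp(bounded, "giu").test("Alice Davó");   // true
new RegExp(bounded, "giu").test("Davoid");       // false (boundaries block partial words)
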
+39 -21
@@ -1,13 +1,18 @@
 const { expect } = require("chai");
 const { Transform } = require("stream");
 const { StringDecoder } = require("string_decoder");
+require("ts-node/register/transpile-only");
+const {
+  withWordBoundaries,
+  termVariants,
+} = require("../src/core/term-matching");
 
 /**
  * Tests for the core anonymization utilities.
  *
  * Because anonymize-utils.ts is TypeScript that imports config (which reads
- * process.env at module load time), we replicate the pure logic here so the
- * tests run without compiling the full project or connecting to a database.
+ * process.env at module load time), we replicate the higher-level pieces
+ * here. Pure helpers live in src/core/term-matching and are imported above.
  */
 
 // ---------------------------------------------------------------------------
@@ -20,15 +25,6 @@ const ANONYMIZATION_MASK = "XXXX";
 const urlRegex =
   /<?\b((https?|ftp|file):\/\/)[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]\b\/?>?/g;
 
-function withWordBoundaries(termPattern) {
-  const sniff = termPattern.replace(/^\(\?[:=!]?|^\(|\)$/g, "");
-  const first = sniff.charAt(0);
-  const last = sniff.charAt(sniff.length - 1);
-  const isWord = (c) => /[A-Za-z0-9_]/.test(c);
-  const lead = first && isWord(first) ? "\\b" : "";
-  const trail = last && isWord(last) ? "\\b" : "";
-  return `${lead}${termPattern}${trail}`;
-}
 
 class ContentAnonimizer {
   constructor(opt) {
@@ -111,18 +107,24 @@ class ContentAnonimizer {
       } catch {
         term = term.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&");
       }
-      const bounded = withWordBoundaries(term);
-      content = content.replace(urlRegex, (match) => {
-        if (new RegExp(bounded, "gi").test(match)) {
-          this.wasAnonymized = true;
-          return mask;
-        }
-        return match;
-      });
-      content = content.replace(new RegExp(bounded, "gi"), () => {
-        this.wasAnonymized = true;
-        return mask;
-      });
+      for (const variant of termVariants(term)) {
+        const bounded = withWordBoundaries(variant.pattern, {
+          sniffSource: variant.sniff,
+          unicode: variant.unicode,
+        });
+        const flags = variant.unicode ? "giu" : "gi";
+        content = content.replace(urlRegex, (match) => {
+          if (new RegExp(bounded, flags).test(match)) {
+            this.wasAnonymized = true;
+            return mask;
+          }
+          return match;
+        });
+        content = content.replace(new RegExp(bounded, flags), () => {
+          this.wasAnonymized = true;
+          return mask;
+        });
+      }
     }
     return content;
   }
@@ -234,6 +236,22 @@ describe("ContentAnonimizer", function () {
     expect(result).to.include("XXXX-1");
   });
 
+  // #280 — accented terms should match both the accented and unaccented
+  // variants so "Davó" scrubs "Davo" (and vice versa).
+  it("matches accented and unaccented variants of the same term", function () {
+    const a = new ContentAnonimizer({ terms: ["Davó"] });
+    const r1 = a.anonymize("Authors: Alice Davó and Bob Davo");
+    expect(r1).to.not.include("Davó");
+    expect(r1).to.not.include("Davo");
+    expect(r1.match(/XXXX-1/g).length).to.equal(2);
+
+    const b = new ContentAnonimizer({ terms: ["Davo"] });
+    const r2 = b.anonymize("Authors: Alice Davó and Bob Davo");
+    expect(r2).to.not.include("Davó");
+    expect(r2).to.not.include("Davo");
+    expect(r2.match(/XXXX-1/g).length).to.equal(2);
+  });
+
   it("does not over-match across word boundaries when the term is word-only", function () {
     // Regression: ensure withWordBoundaries still emits \b on both sides
     // for ordinary alphanumeric terms.