mirror of
https://github.com/tdurieux/anonymous_github.git
synced 2026-05-15 14:38:03 +02:00
fix: anonymize accented and unaccented variants of the same term
When a user added "Davó" to the term list, "Davo" elsewhere in the content was left untouched (and vice versa). Each term now also runs a diacritic-insensitive pass: ASCII Latin letters expand to a class covering common accented siblings, with Unicode-aware lookaround boundaries so the trailing boundary still fires next to "ó" etc. Pure helpers moved into src/core/term-matching so the test file can import them instead of duplicating the logic. Fixes #280.
This commit is contained in:
+23
-31
@@ -4,28 +4,11 @@ import { StringDecoder } from "string_decoder";
|
|||||||
import { isText } from "istextorbinary";
|
import { isText } from "istextorbinary";
|
||||||
|
|
||||||
import config from "../config";
|
import config from "../config";
|
||||||
|
import { termVariants, withWordBoundaries } from "./term-matching";
|
||||||
|
|
||||||
const urlRegex =
|
const urlRegex =
|
||||||
/<?\b((https?|ftp|file):\/\/)[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]\b\/?>?/g;
|
/<?\b((https?|ftp|file):\/\/)[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]\b\/?>?/g;
|
||||||
|
|
||||||
// JS regex \b only fires at a word/non-word transition, where word chars are
|
|
||||||
// [A-Za-z0-9_]. So `\bterm\b` silently fails to match when the term begins or
|
|
||||||
// ends with a non-word char (e.g. "@tdurieux", "Davó", "@author .*"). Only
|
|
||||||
// emit \b on sides where the term has a word-char edge; otherwise the boundary
|
|
||||||
// would never match.
|
|
||||||
function withWordBoundaries(termPattern: string): string {
|
|
||||||
// Strip a leading group like (?:...) or (...) when sniffing the first/last
|
|
||||||
// significant char so users wrapping their regex in a group still get
|
|
||||||
// boundaries applied. Best-effort — not a full parser.
|
|
||||||
const sniff = termPattern.replace(/^\(\?[:=!]?|^\(|\)$/g, "");
|
|
||||||
const first = sniff.charAt(0);
|
|
||||||
const last = sniff.charAt(sniff.length - 1);
|
|
||||||
const isWord = (c: string) => /[A-Za-z0-9_]/.test(c);
|
|
||||||
const lead = first && isWord(first) ? "\\b" : "";
|
|
||||||
const trail = last && isWord(last) ? "\\b" : "";
|
|
||||||
return `${lead}${termPattern}${trail}`;
|
|
||||||
}
|
|
||||||
|
|
||||||
export function streamToString(stream: Readable): Promise<string> {
|
export function streamToString(stream: Readable): Promise<string> {
|
||||||
const chunks: Buffer[] = [];
|
const chunks: Buffer[] = [];
|
||||||
return new Promise((resolve, reject) => {
|
return new Promise((resolve, reject) => {
|
||||||
@@ -217,21 +200,30 @@ export class ContentAnonimizer {
|
|||||||
// escape regex characters
|
// escape regex characters
|
||||||
term = term.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&");
|
term = term.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&");
|
||||||
}
|
}
|
||||||
const bounded = withWordBoundaries(term);
|
|
||||||
// remove whole url if it contains the term
|
// Try the term verbatim first, then a diacritic-insensitive expansion
|
||||||
content = content.replace(urlRegex, (match) => {
|
// so "Davo" anonymizes "Davó" (and vice versa). See term-matching.ts.
|
||||||
if (new RegExp(bounded, "gi").test(match)) {
|
for (const variant of termVariants(term)) {
|
||||||
|
const bounded = withWordBoundaries(variant.pattern, {
|
||||||
|
sniffSource: variant.sniff,
|
||||||
|
unicode: variant.unicode,
|
||||||
|
});
|
||||||
|
const flags = variant.unicode ? "giu" : "gi";
|
||||||
|
// remove whole url if it contains the term
|
||||||
|
content = content.replace(urlRegex, (match) => {
|
||||||
|
if (new RegExp(bounded, flags).test(match)) {
|
||||||
|
this.wasAnonymized = true;
|
||||||
|
return mask;
|
||||||
|
}
|
||||||
|
return match;
|
||||||
|
});
|
||||||
|
|
||||||
|
// remove the term in the text
|
||||||
|
content = content.replace(new RegExp(bounded, flags), () => {
|
||||||
this.wasAnonymized = true;
|
this.wasAnonymized = true;
|
||||||
return mask;
|
return mask;
|
||||||
}
|
});
|
||||||
return match;
|
}
|
||||||
});
|
|
||||||
|
|
||||||
// remove the term in the text
|
|
||||||
content = content.replace(new RegExp(bounded, "gi"), () => {
|
|
||||||
this.wasAnonymized = true;
|
|
||||||
return mask;
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
return content;
|
return content;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,104 @@
|
|||||||
|
// Pure helpers for term-based anonymization. Extracted from anonymize-utils
|
||||||
|
// so unit tests can import them without pulling in the config module (which
|
||||||
|
// reads process.env at load time).
|
||||||
|
|
||||||
|
// JS regex \b only fires at a word/non-word transition, where word chars are
|
||||||
|
// [A-Za-z0-9_]. So `\bterm\b` silently fails to match when the term begins or
|
||||||
|
// ends with a non-word char (e.g. "@tdurieux", "Davó", "@author .*"). Only
|
||||||
|
// emit a boundary on sides where the term has a word-char edge; otherwise the
|
||||||
|
// boundary would never match.
|
||||||
|
//
|
||||||
|
// `sniffSource` lets callers decide boundaries from a different string than
|
||||||
|
// the actual pattern — needed when the pattern is an expanded character class
|
||||||
|
// (ends in "]") but the matched text is still a letter.
|
||||||
|
//
|
||||||
|
// `unicode: true` emits lookaround boundaries that treat any Unicode letter
|
||||||
|
// as a word char, so a trailing boundary still fires next to "ó" etc. The
|
||||||
|
// regex consuming the result must be created with the `u` flag.
|
||||||
|
export function withWordBoundaries(
|
||||||
|
termPattern: string,
|
||||||
|
opt: { sniffSource?: string; unicode?: boolean } = {}
|
||||||
|
): string {
|
||||||
|
// Strip a leading group like (?:...) or (...) when sniffing the first/last
|
||||||
|
// significant char so users wrapping their regex in a group still get
|
||||||
|
// boundaries applied. Best-effort — not a full parser.
|
||||||
|
const sniff = (opt.sniffSource ?? termPattern).replace(
|
||||||
|
/^\(\?[:=!]?|^\(|\)$/g,
|
||||||
|
""
|
||||||
|
);
|
||||||
|
const first = sniff.charAt(0);
|
||||||
|
const last = sniff.charAt(sniff.length - 1);
|
||||||
|
const isWord = (c: string) => /[A-Za-z0-9_]/.test(c);
|
||||||
|
const before = opt.unicode ? "(?<![\\p{L}\\p{N}_])" : "\\b";
|
||||||
|
const after = opt.unicode ? "(?![\\p{L}\\p{N}_])" : "\\b";
|
||||||
|
const lead = first && isWord(first) ? before : "";
|
||||||
|
const trail = last && isWord(last) ? after : "";
|
||||||
|
return `${lead}${termPattern}${trail}`;
|
||||||
|
}
|
||||||
|
|
||||||
|
export function stripDiacritics(s: string): string {
|
||||||
|
return s.normalize("NFD").replace(/\p{Diacritic}/gu, "");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Map of base Latin letter -> regex character class covering common accented
|
||||||
|
// variants. Used to make term matching diacritic-insensitive in both
|
||||||
|
// directions: typing "Davo" matches "Davó" in the text, and typing "Davó"
|
||||||
|
// (after stripping diacritics) does the same. Coverage focuses on Latin
|
||||||
|
// letters that show up in real names — extend as needed.
|
||||||
|
export const DIACRITIC_CLASSES: Record<string, string> = {
  // Keys are lowercase only by design: diacriticInsensitive() lowercases each
  // char before the lookup, and the resulting pattern is compiled with the
  // "i" flag, so uppercase input is covered without duplicate entries.
  // Each value is a ready-to-embed regex character class (base letter first).
  a: "[aàáâãäåāăąǎ]",
  c: "[cçćĉċč]",
  d: "[dďđ]",
  e: "[eèéêëēĕėęěȩ]",
  g: "[gĝğġģǧ]",
  h: "[hĥħȟ]",
  i: "[iìíîïĩīĭįıǐ]",
  j: "[jĵǰ]",
  k: "[kķǩ]",
  l: "[lĺļľŀł]",
  n: "[nñńņňʼnŋ]",
  o: "[oòóôõöōŏőøǒ]",
  r: "[rŕŗř]",
  s: "[sśŝşšș]",
  t: "[tţťŧț]",
  u: "[uùúûüũūŭůűųǔ]",
  w: "[wŵẁẃẅ]",
  y: "[yýÿŷỳ]",
  z: "[zźżž]",
};
|
||||||
|
|
||||||
|
// Build a regex source that matches the given (already-escaped) term in a
|
||||||
|
// diacritic-insensitive way. ASCII letters are replaced with a character
|
||||||
|
// class that includes their accented siblings; other chars are left alone so
|
||||||
|
// regex metacharacters and escape sequences keep working.
|
||||||
|
export function diacriticInsensitive(escapedTerm: string): string {
|
||||||
|
let out = "";
|
||||||
|
let i = 0;
|
||||||
|
while (i < escapedTerm.length) {
|
||||||
|
const c = escapedTerm[i];
|
||||||
|
// Pass through backslash escapes verbatim (e.g. "\." or "\d").
|
||||||
|
if (c === "\\" && i + 1 < escapedTerm.length) {
|
||||||
|
out += c + escapedTerm[i + 1];
|
||||||
|
i += 2;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
const lower = c.toLowerCase();
|
||||||
|
out += DIACRITIC_CLASSES[lower] || c;
|
||||||
|
i += 1;
|
||||||
|
}
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build the term variants to try for one user-provided term. Each variant
|
||||||
|
// produces a separate replacement pass.
|
||||||
|
export function termVariants(escapedTerm: string): {
|
||||||
|
pattern: string;
|
||||||
|
sniff: string;
|
||||||
|
unicode: boolean;
|
||||||
|
}[] {
|
||||||
|
const stripped = stripDiacritics(escapedTerm);
|
||||||
|
return [
|
||||||
|
{ pattern: escapedTerm, sniff: escapedTerm, unicode: false },
|
||||||
|
{ pattern: diacriticInsensitive(stripped), sniff: stripped, unicode: true },
|
||||||
|
];
|
||||||
|
}
|
||||||
@@ -1,13 +1,18 @@
|
|||||||
const { expect } = require("chai");
|
const { expect } = require("chai");
|
||||||
const { Transform } = require("stream");
|
const { Transform } = require("stream");
|
||||||
const { StringDecoder } = require("string_decoder");
|
const { StringDecoder } = require("string_decoder");
|
||||||
|
require("ts-node/register/transpile-only");
|
||||||
|
const {
|
||||||
|
withWordBoundaries,
|
||||||
|
termVariants,
|
||||||
|
} = require("../src/core/term-matching");
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tests for the core anonymization utilities.
|
* Tests for the core anonymization utilities.
|
||||||
*
|
*
|
||||||
* Because anonymize-utils.ts is TypeScript that imports config (which reads
|
* Because anonymize-utils.ts is TypeScript that imports config (which reads
|
||||||
* process.env at module load time), we replicate the pure logic here so the
|
* process.env at module load time), we replicate the higher-level pieces
|
||||||
* tests run without compiling the full project or connecting to a database.
|
* here. Pure helpers live in src/core/term-matching and are imported above.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
@@ -20,15 +25,6 @@ const ANONYMIZATION_MASK = "XXXX";
|
|||||||
const urlRegex =
|
const urlRegex =
|
||||||
/<?\b((https?|ftp|file):\/\/)[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]\b\/?>?/g;
|
/<?\b((https?|ftp|file):\/\/)[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]\b\/?>?/g;
|
||||||
|
|
||||||
// Wrap a term pattern in \b boundaries, but only on sides whose edge char is
// a word char — \b can never match next to a non-word edge.
function withWordBoundaries(termPattern) {
  // Peel one wrapping group like (?:...) before sniffing the edges.
  const core = termPattern.replace(/^\(\?[:=!]?|^\(|\)$/g, "");
  const wordEdge = /[A-Za-z0-9_]/;
  const head = core && wordEdge.test(core.charAt(0)) ? "\\b" : "";
  const tail = core && wordEdge.test(core.charAt(core.length - 1)) ? "\\b" : "";
  return head + termPattern + tail;
}
|
|
||||||
|
|
||||||
class ContentAnonimizer {
|
class ContentAnonimizer {
|
||||||
constructor(opt) {
|
constructor(opt) {
|
||||||
@@ -111,18 +107,24 @@ class ContentAnonimizer {
|
|||||||
} catch {
|
} catch {
|
||||||
term = term.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&");
|
term = term.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&");
|
||||||
}
|
}
|
||||||
const bounded = withWordBoundaries(term);
|
for (const variant of termVariants(term)) {
|
||||||
content = content.replace(urlRegex, (match) => {
|
const bounded = withWordBoundaries(variant.pattern, {
|
||||||
if (new RegExp(bounded, "gi").test(match)) {
|
sniffSource: variant.sniff,
|
||||||
|
unicode: variant.unicode,
|
||||||
|
});
|
||||||
|
const flags = variant.unicode ? "giu" : "gi";
|
||||||
|
content = content.replace(urlRegex, (match) => {
|
||||||
|
if (new RegExp(bounded, flags).test(match)) {
|
||||||
|
this.wasAnonymized = true;
|
||||||
|
return mask;
|
||||||
|
}
|
||||||
|
return match;
|
||||||
|
});
|
||||||
|
content = content.replace(new RegExp(bounded, flags), () => {
|
||||||
this.wasAnonymized = true;
|
this.wasAnonymized = true;
|
||||||
return mask;
|
return mask;
|
||||||
}
|
});
|
||||||
return match;
|
}
|
||||||
});
|
|
||||||
content = content.replace(new RegExp(bounded, "gi"), () => {
|
|
||||||
this.wasAnonymized = true;
|
|
||||||
return mask;
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
return content;
|
return content;
|
||||||
}
|
}
|
||||||
@@ -234,6 +236,22 @@ describe("ContentAnonimizer", function () {
|
|||||||
expect(result).to.include("XXXX-1");
|
expect(result).to.include("XXXX-1");
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// #280 — accented terms should match both the accented and unaccented
|
||||||
|
// variants so "Davó" scrubs "Davo" (and vice versa).
|
||||||
|
// #280 — a single term must scrub both its accented and unaccented
// spellings, in both directions.
it("matches accented and unaccented variants of the same term", function () {
  for (const term of ["Davó", "Davo"]) {
    const anonymizer = new ContentAnonimizer({ terms: [term] });
    const result = anonymizer.anonymize("Authors: Alice Davó and Bob Davo");
    expect(result).to.not.include("Davó");
    expect(result).to.not.include("Davo");
    expect(result.match(/XXXX-1/g).length).to.equal(2);
  }
});
|
||||||
|
|
||||||
it("does not over-match across word boundaries when the term is word-only", function () {
|
it("does not over-match across word boundaries when the term is word-only", function () {
|
||||||
// Regression: ensure withWordBoundaries still emits \b on both sides
|
// Regression: ensure withWordBoundaries still emits \b on both sides
|
||||||
// for ordinary alphanumeric terms.
|
// for ordinary alphanumeric terms.
|
||||||
|
|||||||
Reference in New Issue
Block a user