mirror of
https://github.com/tdurieux/anonymous_github.git
synced 2026-05-15 14:38:03 +02:00
fix: match terms whose edges are non-word characters
Wrapping every user term as `\b${term}\b` silently dropped matches when
the term started or ended with a non-word char (e.g. `@tdurieux`,
`@author .*`), because JS `\b` only fires at a word/non-word transition.
Replace with `withWordBoundaries()`, which only emits `\b` on the side
where the term has a word-char edge.
Fixes #175, #249.
This commit is contained in:
@@ -8,6 +8,24 @@ import config from "../config";
|
||||
const urlRegex =
|
||||
/<?\b((https?|ftp|file):\/\/)[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]\b\/?>?/g;
|
||||
|
||||
// JS regex \b only fires at a word/non-word transition, where word chars are
// [A-Za-z0-9_]. So a fixed `\bterm\b` wrapper silently fails to match when the
// term begins or ends with a non-word char (e.g. "@tdurieux", "Davó",
// "@author .*").
//
// Instead of guessing the term's edges from its source text (which misreads
// escapes such as `\s` / `\d`, trailing quantifiers such as `tests?`, and
// character classes), each side gets a boundary that is decided at match time:
// it succeeds at every position EXCEPT one that sits between two word chars.
//   - edge of the match is a word char     -> behaves exactly like \b
//   - edge of the match is a non-word char -> always succeeds
//
// The term is wrapped in a non-capturing group so a top-level alternation
// ("foo|bar") is bounded on both sides of every branch, without shifting the
// numbering of any capture groups the user wrote.
//
// @param termPattern  regex source for one term (already escaped by the caller
//                     when it is meant literally); must be a valid pattern.
// @returns            regex source to pass to `new RegExp(...)`; an empty
//                     input is returned unchanged.
function withWordBoundaries(termPattern: string): string {
  if (termPattern === "") {
    return "";
  }
  // Explicit class rather than \w so the meaning cannot drift with regex flags.
  const word = "[A-Za-z0-9_]";
  // "previous char is not a word char" OR "next char is not a word char".
  const edge = `(?:(?<!${word})|(?!${word}))`;
  return `${edge}(?:${termPattern})${edge}`;
}
|
||||
|
||||
export function streamToString(stream: Readable): Promise<string> {
|
||||
const chunks: Buffer[] = [];
|
||||
return new Promise((resolve, reject) => {
|
||||
@@ -199,9 +217,10 @@ export class ContentAnonimizer {
|
||||
// escape regex characters
|
||||
term = term.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&");
|
||||
}
|
||||
const bounded = withWordBoundaries(term);
|
||||
// remove whole url if it contains the term
|
||||
content = content.replace(urlRegex, (match) => {
|
||||
if (new RegExp(`\\b${term}\\b`, "gi").test(match)) {
|
||||
if (new RegExp(bounded, "gi").test(match)) {
|
||||
this.wasAnonymized = true;
|
||||
return mask;
|
||||
}
|
||||
@@ -209,7 +228,7 @@ export class ContentAnonimizer {
|
||||
});
|
||||
|
||||
// remove the term in the text
|
||||
content = content.replace(new RegExp(`\\b${term}\\b`, "gi"), () => {
|
||||
content = content.replace(new RegExp(bounded, "gi"), () => {
|
||||
this.wasAnonymized = true;
|
||||
return mask;
|
||||
});
|
||||
|
||||
@@ -20,6 +20,16 @@ const ANONYMIZATION_MASK = "XXXX";
|
||||
const urlRegex =
|
||||
/<?\b((https?|ftp|file):\/\/)[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]\b\/?>?/g;
|
||||
|
||||
// JS regex \b only fires at a transition between a word char ([A-Za-z0-9_])
// and a non-word char, so a fixed `\bterm\b` wrapper never matches terms that
// begin or end with a non-word char (e.g. "@tdurieux", "@author .*").
//
// Rather than guessing the term's edges from its source text (which misreads
// escapes such as `\s` / `\d`, trailing quantifiers such as `tests?`, and
// character classes), each side gets a boundary decided at match time: it
// succeeds at every position EXCEPT one that sits between two word chars.
//   - edge of the match is a word char     -> behaves exactly like \b
//   - edge of the match is a non-word char -> always succeeds
//
// The term is wrapped in a non-capturing group so a top-level alternation
// ("foo|bar") is bounded on both sides of every branch, without shifting the
// numbering of any capture groups the user wrote.
//
// termPattern: regex source for one term; must be a valid pattern.
// Returns regex source for `new RegExp(...)`; an empty input is returned as-is.
function withWordBoundaries(termPattern) {
  if (termPattern === "") {
    return "";
  }
  // Explicit class rather than \w so the meaning cannot drift with regex flags.
  const word = "[A-Za-z0-9_]";
  // "previous char is not a word char" OR "next char is not a word char".
  const edge = `(?:(?<!${word})|(?!${word}))`;
  return `${edge}(?:${termPattern})${edge}`;
}
|
||||
|
||||
class ContentAnonimizer {
|
||||
constructor(opt) {
|
||||
this.opt = opt || {};
|
||||
@@ -101,14 +111,15 @@ class ContentAnonimizer {
|
||||
} catch {
|
||||
term = term.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&");
|
||||
}
|
||||
const bounded = withWordBoundaries(term);
|
||||
content = content.replace(urlRegex, (match) => {
|
||||
if (new RegExp(`\\b${term}\\b`, "gi").test(match)) {
|
||||
if (new RegExp(bounded, "gi").test(match)) {
|
||||
this.wasAnonymized = true;
|
||||
return mask;
|
||||
}
|
||||
return match;
|
||||
});
|
||||
content = content.replace(new RegExp(`\\b${term}\\b`, "gi"), () => {
|
||||
content = content.replace(new RegExp(bounded, "gi"), () => {
|
||||
this.wasAnonymized = true;
|
||||
return mask;
|
||||
});
|
||||
@@ -195,6 +206,43 @@ describe("ContentAnonimizer", function () {
|
||||
expect(() => anon.anonymize("some foo(bar here")).to.not.throw();
|
||||
});
|
||||
|
||||
// #175 — terms starting with a non-word char (e.g. "@username") were
|
||||
// silently skipped because \b can't match between two non-word chars.
|
||||
it("replaces terms starting with a non-word character (e.g. @user)", function () {
|
||||
const anon = new ContentAnonimizer({ terms: ["@tdurieux"] });
|
||||
const result = anon.anonymize('"name": "@tdurieux/anonymous"');
|
||||
expect(result).to.not.include("@tdurieux");
|
||||
expect(result).to.include("XXXX-1");
|
||||
});
|
||||
|
||||
// #249 — regex terms ending in non-word chars (e.g. "@author .*") were
|
||||
// also skipped due to the trailing \b.
|
||||
it("matches a user regex that ends with a non-word pattern", function () {
|
||||
const anon = new ContentAnonimizer({ terms: ["@author .*"] });
|
||||
const result = anon.anonymize("/** @author julius */");
|
||||
expect(result).to.include("XXXX-1");
|
||||
expect(result).to.not.include("@author julius");
|
||||
});
|
||||
|
||||
// #430 — IPv4-style terms have non-word boundaries on each dot but still
|
||||
// start/end with digits, so \b on both sides is fine — guard against
|
||||
// regression now that we tweak boundary logic.
|
||||
it("anonymizes an IP address term", function () {
|
||||
const anon = new ContentAnonimizer({ terms: ["192\\.168\\.1\\.1"] });
|
||||
const result = anon.anonymize("connect to 192.168.1.1 on port 80");
|
||||
expect(result).to.not.include("192.168.1.1");
|
||||
expect(result).to.include("XXXX-1");
|
||||
});
|
||||
|
||||
it("does not over-match across word boundaries when the term is word-only", function () {
|
||||
// Regression: ensure withWordBoundaries still emits \b on both sides
|
||||
// for ordinary alphanumeric terms.
|
||||
const anon = new ContentAnonimizer({ terms: ["cat"] });
|
||||
const result = anon.anonymize("the cat sat on a category");
|
||||
expect(result).to.include("category");
|
||||
expect(result).to.match(/the XXXX-1 sat/);
|
||||
});
|
||||
|
||||
it("replaces terms inside URLs", function () {
|
||||
const anon = new ContentAnonimizer({ terms: ["myuser"] });
|
||||
const result = anon.anonymize(
|
||||
|
||||
Reference in New Issue
Block a user