mirror of
https://github.com/tdurieux/anonymous_github.git
synced 2026-05-15 14:38:03 +02:00
79f555769d
Files like .jsonl that mime-types doesn't know fell through to application/octet-stream and rendered as "Unsupported binary file" in the viewer. Replace istextorbinary with isbinaryfile for content-based detection, and use mime-types for name-based classification with a textual application/* allowlist. The streaming transformer now defers classification when the name is inconclusive and sniffs the first chunk before emitting "transform", so route.ts and AnonymizedFile.ts get a content-aware Content-Type. Whitelists .jsonl and .ndjson to short-circuit dataset files. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
704 lines
26 KiB
JavaScript
704 lines
26 KiB
JavaScript
const { expect } = require("chai");
|
|
const { Transform } = require("stream");
|
|
const { StringDecoder } = require("string_decoder");
|
|
require("ts-node/register/transpile-only");
|
|
const {
|
|
withWordBoundaries,
|
|
termVariants,
|
|
parseTermSpec,
|
|
} = require("../src/core/term-matching");
|
|
|
|
/**
 * Tests for the core anonymization utilities.
 *
 * Because anonymize-utils.ts is TypeScript that imports config (which reads
 * process.env at module load time), we replicate the higher-level pieces
 * here. Pure helpers live in src/core/term-matching and are imported above.
 */
|
|
|
|
// ---------------------------------------------------------------------------
// Minimal replica of the anonymization logic under test
// (mirrors src/core/anonymize-utils.ts)
// ---------------------------------------------------------------------------

// Placeholder written in place of every anonymized span. Term masks append
// "-<n>" to it.
const ANONYMIZATION_MASK = "XXXX";

// Matches http(s)/ftp/file URLs, optionally wrapped in angle brackets.
// The regex carries the `g` flag, so `.test()`/`.exec()` on it would be
// stateful through `lastIndex`; in this file it is only handed to
// String.prototype.replace, which resets that state.
const urlRegex =
  /<?\b((https?|ftp|file):\/\/)[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]\b\/?>?/g;
|
|
|
|
|
|
/**
 * Test-local replica of the content anonymizer.
 *
 * Options read from `opt`:
 *  - `image`      images are masked only when this is exactly `false`
 *  - `link`       URLs are masked only when this is exactly `false`
 *  - `repoName`, `branchName`, `repoId`  drive GitHub self-link rewriting
 *  - `terms`      list of term specs to mask
 *
 * `wasAnonymized` flips to `true` as soon as any replacement fires.
 */
class ContentAnonimizer {
  /**
   * @param {object} [opt] anonymization options; defaults to `{}`.
   */
  constructor(opt) {
    this.opt = opt || {};
    this.wasAnonymized = false;
  }

  /**
   * Masks markdown image references when `opt.image === false`.
   * @param {string} content
   * @returns {string}
   */
  removeImage(content) {
    if (this.opt.image !== false) {
      return content;
    }
    return content.replace(
      /!\[[^\]]*\]\((?<filename>.*?)(?="|\))(?<optionalpart>".*")?\)/g,
      () => {
        this.wasAnonymized = true;
        return ANONYMIZATION_MASK;
      }
    );
  }

  /**
   * Masks every URL matched by `urlRegex` when `opt.link === false`.
   * @param {string} content
   * @returns {string}
   */
  removeLink(content) {
    if (this.opt.link !== false) {
      return content;
    }
    return content.replace(urlRegex, () => {
      this.wasAnonymized = true;
      return ANONYMIZATION_MASK;
    });
  }

  /**
   * Rewrites links that point back at the source repository so they target
   * the anonymized copy (`/r/<repoId>`). No-op unless both `opt.repoName`
   * and `opt.branchName` are set.
   *
   * NOTE(review): repoName/branchName are interpolated into the patterns
   * unescaped, so regex metacharacters in them are treated as regex syntax
   * (e.g. `.` matches any character).
   *
   * @param {string} content
   * @returns {string}
   */
  replaceGitHubSelfLinks(content) {
    const { repoName, branchName } = this.opt;
    if (!repoName || !branchName) {
      return content;
    }
    const APP_HOSTNAME = "anonymous.4open.science";

    const replaceCallback = () => {
      this.wasAnonymized = true;
      return `https://${APP_HOSTNAME}/r/${this.opt.repoId}`;
    };

    // Order matters: the branch-qualified forms must be consumed before the
    // bare repository URL, which is a prefix of the blob/tree forms.
    const sources = [
      `https://raw.githubusercontent.com/${repoName}/${branchName}\\b`,
      `https://github.com/${repoName}/blob/${branchName}\\b`,
      `https://github.com/${repoName}/tree/${branchName}\\b`,
      `https://github.com/${repoName}`,
    ];
    let rewritten = content;
    for (const source of sources) {
      rewritten = rewritten.replace(new RegExp(source, "gi"), replaceCallback);
    }
    return rewritten;
  }

  /**
   * Masks every configured term. Blank specs are skipped but still consume
   * an index, so the default mask of the n-th spec is always `XXXX-n`.
   * @param {string} content
   * @returns {string}
   */
  replaceTerms(content) {
    const terms = this.opt.terms || [];
    for (const [index, spec] of terms.entries()) {
      if (spec.trim() === "") {
        continue;
      }
      const parsed = parseTermSpec(spec);
      let term = parsed.term;
      const mask =
        parsed.replacement !== null
          ? parsed.replacement
          : ANONYMIZATION_MASK + "-" + (index + 1);
      try {
        // Probe only: a term that is not a valid regex is matched literally.
        new RegExp(term, "gi");
      } catch {
        term = term.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&");
      }
      for (const variant of termVariants(term)) {
        const bounded = withWordBoundaries(variant.pattern, {
          sniffSource: variant.sniff,
          unicode: variant.unicode,
        });
        const flags = variant.unicode ? "giu" : "gi";
        // First pass: a URL containing the term is replaced as a whole.
        content = content.replace(urlRegex, (match) => {
          // A fresh regex per call keeps the `g` flag's lastIndex at 0.
          if (new RegExp(bounded, flags).test(match)) {
            this.wasAnonymized = true;
            return mask;
          }
          return match;
        });
        // Second pass: remaining occurrences outside URLs.
        content = content.replace(new RegExp(bounded, flags), () => {
          this.wasAnonymized = true;
          return mask;
        });
      }
    }
    return content;
  }

  /**
   * Runs all passes in order: images, links, self-links, terms.
   * @param {string} content
   * @returns {string}
   */
  anonymize(content) {
    content = this.removeImage(content);
    content = this.removeLink(content);
    content = this.replaceGitHubSelfLinks(content);
    content = this.replaceTerms(content);
    return content;
  }
}
|
|
|
|
/**
 * Masks every configured term inside a file path.
 *
 * Unlike ContentAnonimizer#replaceTerms, the term is used as-is in the
 * regex (no word-boundary wrapping), so it also matches inside longer
 * path segments. Blank specs are skipped but still consume an index, so
 * the default mask of the n-th spec is always `XXXX-n`.
 *
 * @param {string} path file path to anonymize
 * @param {string[]} terms term specs, in the form accepted by parseTermSpec
 * @returns {string} the path with all matches replaced
 */
function anonymizePath(path, terms) {
  let anonymized = path;
  for (const [index, spec] of terms.entries()) {
    if (spec.trim() === "") {
      continue;
    }
    const parsed = parseTermSpec(spec);
    let term = parsed.term;
    const mask =
      parsed.replacement !== null
        ? parsed.replacement
        : ANONYMIZATION_MASK + "-" + (index + 1);
    try {
      // Probe only: a term that is not a valid regex is matched literally.
      new RegExp(term, "gi");
    } catch {
      term = term.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&");
    }
    anonymized = anonymized.replace(new RegExp(term, "gi"), mask);
  }
  return anonymized;
}
|
|
|
|
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

describe("ContentAnonimizer", function () {
  // ---------------------------------------------------------------
  // Term replacement
  // ---------------------------------------------------------------
  describe("replaceTerms", function () {
    it("replaces a single term with a numbered mask", function () {
      const anon = new ContentAnonimizer({ terms: ["secret"] });
      const result = anon.anonymize("this is a secret value");
      expect(result).to.equal("this is a XXXX-1 value");
      expect(anon.wasAnonymized).to.be.true;
    });

    it("replaces multiple terms with distinct masks", function () {
      const anon = new ContentAnonimizer({ terms: ["alice", "bob"] });
      const result = anon.anonymize("alice met bob");
      expect(result).to.equal("XXXX-1 met XXXX-2");
    });

    it("is case-insensitive", function () {
      const anon = new ContentAnonimizer({ terms: ["Secret"] });
      const result = anon.anonymize("a SECRET message and a secret one");
      expect(result).to.not.include("SECRET");
      expect(result).to.not.include("secret");
    });

    it("respects word boundaries", function () {
      const anon = new ContentAnonimizer({ terms: ["cat"] });
      const result = anon.anonymize("the cat sat on a category");
      expect(result).to.include("XXXX-1");
      // "category" should NOT be replaced because \b prevents partial match
      expect(result).to.include("category");
    });

    it("skips empty/whitespace-only terms", function () {
      const anon = new ContentAnonimizer({ terms: ["", " ", "real"] });
      const result = anon.anonymize("a real term");
      // Skipped entries still consume an index, hence XXXX-3.
      expect(result).to.equal("a XXXX-3 term");
    });

    it("handles terms that are invalid regex by escaping them", function () {
      const anon = new ContentAnonimizer({ terms: ["foo(bar"] });
      // "foo(bar" is invalid regex; the code should escape it
      // Since \b won't match around '(' properly, the replacement may not fire
      // on the raw term, but crucially it must not throw
      expect(() => anon.anonymize("some foo(bar here")).to.not.throw();
    });

    // #175 — terms starting with a non-word char (e.g. "@username") were
    // silently skipped because \b can't match between two non-word chars.
    it("replaces terms starting with a non-word character (e.g. @user)", function () {
      const anon = new ContentAnonimizer({ terms: ["@tdurieux"] });
      const result = anon.anonymize('"name": "@tdurieux/anonymous"');
      expect(result).to.not.include("@tdurieux");
      expect(result).to.include("XXXX-1");
    });

    // #249 — regex terms ending in non-word chars (e.g. "@author .*") were
    // also skipped due to the trailing \b.
    it("matches a user regex that ends with a non-word pattern", function () {
      const anon = new ContentAnonimizer({ terms: ["@author .*"] });
      const result = anon.anonymize("/** @author julius */");
      expect(result).to.include("XXXX-1");
      expect(result).to.not.include("@author julius");
    });

    // #430 — IPv4-style terms have non-word boundaries on each dot but still
    // start/end with digits, so \b on both sides is fine — guard against
    // regression now that we tweak boundary logic.
    it("anonymizes an IP address term", function () {
      const anon = new ContentAnonimizer({ terms: ["192\\.168\\.1\\.1"] });
      const result = anon.anonymize("connect to 192.168.1.1 on port 80");
      expect(result).to.not.include("192.168.1.1");
      expect(result).to.include("XXXX-1");
    });

    // #285 — `term=>replacement` uses the user-supplied replacement
    // instead of XXXX-N, so anonymized identifiers can stay valid in code.
    it("uses a custom replacement when the term is 'term=>replacement'", function () {
      const a = new ContentAnonimizer({ terms: ["Anonymous=>ABC"] });
      const result = a.anonymize("class Anonymous extends Base {}");
      expect(result).to.equal("class ABC extends Base {}");
    });

    it("supports custom and default-mask terms together with stable indices", function () {
      const a = new ContentAnonimizer({
        terms: ["Alpha=>AAA", "Beta"],
      });
      const result = a.anonymize("Alpha and Beta");
      // Beta uses XXXX-2 (its 1-based index in the list), even though
      // Alpha had a custom replacement.
      expect(result).to.equal("AAA and XXXX-2");
    });

    // NOTE(review): the title says "default mask", but the assertion expects
    // the term to be replaced by an empty string — confirm against
    // parseTermSpec which of the two is the intended contract.
    it("falls back to the default mask when the entry has no replacement", function () {
      const a = new ContentAnonimizer({ terms: ["Foo=>"] });
      const result = a.anonymize("Foo bar");
      expect(result).to.equal(" bar");
    });

    // #280 — accented terms should match both the accented and unaccented
    // variants so "Davó" scrubs "Davo" (and vice versa).
    it("matches accented and unaccented variants of the same term", function () {
      const a = new ContentAnonimizer({ terms: ["Davó"] });
      const r1 = a.anonymize("Authors: Alice Davó and Bob Davo");
      expect(r1).to.not.include("Davó");
      expect(r1).to.not.include("Davo");
      expect(r1.match(/XXXX-1/g).length).to.equal(2);

      const b = new ContentAnonimizer({ terms: ["Davo"] });
      const r2 = b.anonymize("Authors: Alice Davó and Bob Davo");
      expect(r2).to.not.include("Davó");
      expect(r2).to.not.include("Davo");
      expect(r2.match(/XXXX-1/g).length).to.equal(2);
    });

    it("does not over-match across word boundaries when the term is word-only", function () {
      // Regression: ensure withWordBoundaries still emits \b on both sides
      // for ordinary alphanumeric terms.
      const anon = new ContentAnonimizer({ terms: ["cat"] });
      const result = anon.anonymize("the cat sat on a category");
      expect(result).to.include("category");
      expect(result).to.match(/the XXXX-1 sat/);
    });

    it("replaces terms inside URLs", function () {
      const anon = new ContentAnonimizer({ terms: ["myuser"] });
      const result = anon.anonymize(
        "visit https://github.com/myuser/project for details"
      );
      expect(result).to.not.include("myuser");
    });

    it("does not modify content when no terms provided", function () {
      const anon = new ContentAnonimizer({ terms: [] });
      const original = "nothing changes here";
      const result = anon.anonymize(original);
      expect(result).to.equal(original);
      expect(anon.wasAnonymized).to.be.false;
    });
  });

  // ---------------------------------------------------------------
  // Image removal
  // ---------------------------------------------------------------
  describe("removeImage", function () {
    it("removes markdown images when image option is false", function () {
      const anon = new ContentAnonimizer({ image: false });
      const result = anon.anonymize("![alt](http://example.com/img.png)");
      expect(result).to.equal(ANONYMIZATION_MASK);
      expect(anon.wasAnonymized).to.be.true;
    });

    it("keeps markdown images when image option is true", function () {
      const anon = new ContentAnonimizer({ image: true });
      const result = anon.anonymize("![alt](http://example.com/img.png)");
      expect(result).to.include("![alt]");
    });

    it("keeps markdown images when image option is undefined (default)", function () {
      const anon = new ContentAnonimizer({});
      const result = anon.anonymize("![alt](http://example.com/img.png)");
      expect(result).to.include("![alt]");
    });

    it("removes multiple images in the same content", function () {
      const anon = new ContentAnonimizer({ image: false });
      const result = anon.anonymize("![a](a.png) text ![b](b.png)");
      expect(result).to.not.include("![");
    });
  });

  // ---------------------------------------------------------------
  // Link removal
  // ---------------------------------------------------------------
  describe("removeLink", function () {
    it("removes URLs when link option is false", function () {
      const anon = new ContentAnonimizer({ link: false });
      const result = anon.anonymize("visit https://example.com for info");
      expect(result).to.not.include("https://example.com");
      expect(result).to.include(ANONYMIZATION_MASK);
      expect(anon.wasAnonymized).to.be.true;
    });

    it("keeps URLs when link option is true", function () {
      const anon = new ContentAnonimizer({ link: true });
      const result = anon.anonymize("visit https://example.com for info");
      expect(result).to.include("https://example.com");
    });

    it("keeps URLs when link option is undefined (default)", function () {
      const anon = new ContentAnonimizer({});
      const result = anon.anonymize("visit https://example.com for info");
      expect(result).to.include("https://example.com");
    });

    it("removes ftp and file URLs when link is false", function () {
      const anon = new ContentAnonimizer({ link: false });
      const result = anon.anonymize(
        "ftp://files.example.com/a and file:///home/user/doc"
      );
      expect(result).to.not.include("ftp://");
      expect(result).to.not.include("file:///");
    });
  });

  // ---------------------------------------------------------------
  // GitHub self-link replacement
  // ---------------------------------------------------------------
  describe("replaceGitHubSelfLinks", function () {
    // Shared by every case that needs a fully configured repository; the
    // anonymizer only reads from its options, so sharing the object is safe.
    const selfLinkOpts = {
      repoName: "owner/repo",
      branchName: "main",
      repoId: "abc123",
    };

    it("replaces raw.githubusercontent.com links", function () {
      const anon = new ContentAnonimizer(selfLinkOpts);
      const result = anon.anonymize(
        "https://raw.githubusercontent.com/owner/repo/main/README.md"
      );
      expect(result).to.include("anonymous.4open.science/r/abc123");
      expect(result).to.not.include("raw.githubusercontent.com");
    });

    it("replaces github.com/blob links", function () {
      const anon = new ContentAnonimizer(selfLinkOpts);
      const result = anon.anonymize(
        "https://github.com/owner/repo/blob/main/src/file.ts"
      );
      expect(result).to.include("anonymous.4open.science/r/abc123");
    });

    it("replaces github.com/tree links", function () {
      const anon = new ContentAnonimizer(selfLinkOpts);
      const result = anon.anonymize(
        "https://github.com/owner/repo/tree/main/src"
      );
      expect(result).to.include("anonymous.4open.science/r/abc123");
    });

    it("replaces generic github.com repo links", function () {
      const anon = new ContentAnonimizer(selfLinkOpts);
      const result = anon.anonymize("https://github.com/owner/repo");
      expect(result).to.include("anonymous.4open.science/r/abc123");
    });

    it("is case-insensitive for repo name", function () {
      const anon = new ContentAnonimizer({
        ...selfLinkOpts,
        repoName: "Owner/Repo",
      });
      const result = anon.anonymize(
        "https://github.com/owner/repo/blob/main/file"
      );
      expect(result).to.include("anonymous.4open.science/r/abc123");
    });

    it("does not replace when repoName is not set", function () {
      const anon = new ContentAnonimizer({
        branchName: "main",
        repoId: "abc123",
      });
      const original = "https://github.com/owner/repo";
      const result = anon.anonymize(original);
      expect(result).to.equal(original);
    });

    it("does not replace when branchName is not set", function () {
      const anon = new ContentAnonimizer({
        repoName: "owner/repo",
        repoId: "abc123",
      });
      const original = "https://github.com/owner/repo/blob/main/file";
      const result = anon.anonymize(original);
      expect(result).to.equal(original);
    });
  });

  // ---------------------------------------------------------------
  // Combined anonymization
  // ---------------------------------------------------------------
  describe("anonymize (combined)", function () {
    it("applies all transformations in sequence", function () {
      const anon = new ContentAnonimizer({
        image: false,
        link: false,
        terms: ["author"],
        repoName: "author/project",
        branchName: "main",
        repoId: "xyz",
      });
      const input =
        "by author: ![pic](http://example.com/pic.png) see https://github.com/author/project";
      const result = anon.anonymize(input);
      expect(result).to.not.include("author");
      expect(result).to.not.include("![pic]");
      expect(result).to.not.include("example.com");
    });

    it("sets wasAnonymized to false when nothing changes", function () {
      const anon = new ContentAnonimizer({
        image: true,
        link: true,
        terms: ["nonexistent"],
      });
      anon.anonymize("plain text without any matching content");
      expect(anon.wasAnonymized).to.be.false;
    });
  });
});
|
|
|
|
// ---------------------------------------------------------------------------
// AnonymizeTransformer (streaming) — replica of src/core/anonymize-utils.ts
// ---------------------------------------------------------------------------

// Extensions treated as text by the test-only classifier below. Built once
// instead of on every call.
const TEXT_FILE_EXTENSIONS = new Set([
  "txt", "md", "js", "ts", "tsx", "jsx", "py", "rb", "go", "java",
  "c", "h", "cpp", "json", "yml", "yaml", "html", "htm", "css",
]);

/**
 * Mirror of isTextFile that relies on the file extension only — the real
 * impl additionally consults mime-types and isbinaryfile, but for these
 * tests checking the suffix is enough to demonstrate the
 * constructor-vs-post-assignment bug.
 *
 * The "extension" is whatever follows the last "." (lower-cased); a path
 * without any dot is compared as a whole.
 *
 * @param {string} [filePath]
 * @returns {boolean} true when the suffix is in the text allowlist; false
 *   for a missing/empty path.
 */
function _isTextFileFromPath(filePath) {
  if (!filePath) return false;
  const ext = String(filePath).split(".").pop().toLowerCase();
  return TEXT_FILE_EXTENSIONS.has(ext);
}
|
|
|
|
/**
 * Streaming wrapper around ContentAnonimizer (test-local replica).
 *
 * Non-text input is passed through untouched. Text input is decoded as
 * UTF-8 and buffered; everything except the trailing OVERLAP characters is
 * anonymized and emitted, so text arriving split across two writes is
 * re-joined before it is scanned. The held-back tail is anonymized in
 * `_flush`.
 *
 * NOTE(review): the text before the split point is anonymized independently
 * of the text after it, so a match that straddles the split point itself is
 * not seen as a whole.
 */
class AnonymizeTransformer extends Transform {
  /**
   * @param {object} [opt] anonymizer options plus `filePath`, which decides
   *   whether the stream is treated as text.
   */
  constructor(opt) {
    super();
    this.opt = opt || {};
    // Mirror src/core/anonymize-utils.ts: isText is derived from
    // opt.filePath at construction time. Mutating opt.filePath after the
    // constructor runs has no effect (this was the cause of #342/#349).
    this.isText = _isTextFileFromPath(this.opt.filePath);
    this.anonimizer = new ContentAnonimizer(this.opt);
    // StringDecoder keeps multi-byte UTF-8 sequences that are split across
    // chunks intact.
    this.decoder = new StringDecoder("utf8");
    this.pending = "";
  }

  // Number of trailing UTF-16 code units held back between writes.
  static OVERLAP = 4096;

  _transform(chunk, encoding, callback) {
    if (!this.isText) {
      this.push(chunk);
      return callback();
    }
    this.pending += this.decoder.write(chunk);
    const overflow = this.pending.length - AnonymizeTransformer.OVERLAP;
    if (overflow > 0) {
      let split = overflow;
      // Never cut between the two halves of a surrogate pair: if the split
      // would land on a low surrogate, move it before the high surrogate.
      const code = this.pending.charCodeAt(split);
      if (code >= 0xdc00 && code <= 0xdfff) split -= 1;
      const toProcess = this.pending.slice(0, split);
      this.pending = this.pending.slice(split);
      const out = this.anonimizer.anonymize(toProcess);
      this.push(Buffer.from(out, "utf8"));
    }
    callback();
  }

  _flush(callback) {
    if (this.isText) {
      // Drain any bytes the decoder is still holding, then emit the tail.
      this.pending += this.decoder.end();
      if (this.pending) {
        const out = this.anonimizer.anonymize(this.pending);
        this.pending = "";
        this.push(Buffer.from(out, "utf8"));
      }
    }
    callback();
  }
}
|
|
|
|
/**
 * Feeds `input` through a fresh AnonymizeTransformer in UTF-8 byte chunks of
 * `chunkSize` and resolves with the concatenated output decoded as UTF-8.
 *
 * @param {string} input text to stream
 * @param {number} chunkSize size in bytes of each write
 * @param {object} [opt] transformer options; `filePath` defaults to a text
 *   fixture so the anonymization path is exercised
 * @returns {Promise<string>} rejects if the stream emits "error"
 */
function runStream(input, chunkSize, opt) {
  // Default to a text filePath so existing tests keep exercising the
  // anonymization path; tests verifying binary passthrough pass their own.
  const merged = { filePath: "fixture.txt", ...opt };
  return new Promise((resolve, reject) => {
    const transformer = new AnonymizeTransformer(merged);
    const received = [];
    transformer.on("data", (b) => received.push(Buffer.from(b)));
    transformer.on("end", () =>
      resolve(Buffer.concat(received).toString("utf8"))
    );
    transformer.on("error", reject);

    const bytes = Buffer.from(input, "utf8");
    for (let offset = 0; offset < bytes.length; offset += chunkSize) {
      // subarray replaces the deprecated Buffer#slice; both return a view,
      // and the end index is clamped to the buffer length.
      transformer.write(
        bytes.subarray(offset, Math.min(offset + chunkSize, bytes.length))
      );
    }
    transformer.end();
  });
}
|
|
|
|
describe("AnonymizeTransformer (streaming)", function () {
  it("replaces all occurrences of a term across many small chunks", async function () {
    // Reproduces the bug: 'Created by Alice at YYYY/MM/DD' lines split across
    // chunk boundaries previously failed to match after the first ~14
    // occurrences when the stream's default 16 KiB chunking aligned mid-term.
    const line = "Created by Alice at 2025/01/01\n" + "x".repeat(1000) + "\n";
    const input = line.repeat(50);
    const expectedCount = 50;

    const result = await runStream(input, 1024, { terms: ["Alice"] });
    const matches = result.match(/XXXX-1/g) || [];
    expect(matches.length).to.equal(expectedCount);
    expect(result).to.not.include("Alice");
  });

  it("matches a term that lands exactly on a chunk boundary", async function () {
    // Force the term 'Alice' to be split between two writes.
    const prefix = "header ";
    const term = "Alice";
    const suffix = " trailer";
    const input = prefix + term + suffix;

    // First chunk ends after 'Ali', second starts at 'ce'
    const splitAt = prefix.length + 3;
    const t = new AnonymizeTransformer({
      filePath: "fixture.txt",
      terms: ["Alice"],
    });
    const out = [];
    const done = new Promise((resolve, reject) => {
      t.on("data", (b) => out.push(Buffer.from(b)));
      t.on("end", () => resolve(Buffer.concat(out).toString("utf8")));
      t.on("error", reject);
    });
    t.write(Buffer.from(input.slice(0, splitAt), "utf8"));
    t.write(Buffer.from(input.slice(splitAt), "utf8"));
    t.end();

    const result = await done;
    expect(result).to.equal("header XXXX-1 trailer");
  });

  it("preserves byte content for non-anonymized streams", async function () {
    const input = "no terms match here\n".repeat(100);
    const result = await runStream(input, 64, { terms: ["zzzz"] });
    expect(result).to.equal(input);
  });

  it("flushes remaining buffered content on end", async function () {
    // Total input smaller than OVERLAP — must still be processed in _flush.
    const input = "Created by Alice at 2025/01/01";
    const result = await runStream(input, 8, { terms: ["Alice"] });
    expect(result).to.equal("Created by XXXX-1 at 2025/01/01");
  });

  // Regression for #342/#349: zip download was constructing the transformer
  // and then assigning opt.filePath after the fact, but isText is decided in
  // the constructor — so every entry was treated as binary and passed through
  // unanonymized. Ensure the filePath must be set at construction time.
  it("decides isText from the filePath passed at construction", function () {
    const beforeFix = new AnonymizeTransformer({ terms: ["Alice"] });
    beforeFix.opt.filePath = "fixture.txt"; // post-construction — too late
    expect(beforeFix.isText).to.equal(false);

    const afterFix = new AnonymizeTransformer({
      filePath: "fixture.txt",
      terms: ["Alice"],
    });
    expect(afterFix.isText).to.equal(true);
  });

  it("anonymizes a text file when filePath is supplied at construction", async function () {
    const input = "Hello Alice, how are you?";
    const result = await runStream(input, 8, {
      filePath: "fixture.txt",
      terms: ["Alice"],
    });
    expect(result).to.equal("Hello XXXX-1, how are you?");
  });
});
|
|
|
|
// ---------------------------------------------------------------------------
// anonymizePath
// ---------------------------------------------------------------------------
describe("anonymizePath", function () {
  it("replaces a term in a file path", function () {
    const result = anonymizePath("src/myproject/index.ts", ["myproject"]);
    expect(result).to.equal("src/XXXX-1/index.ts");
  });

  it("replaces multiple terms with distinct masks", function () {
    const result = anonymizePath("owner/repo/file.txt", ["owner", "repo"]);
    expect(result).to.equal("XXXX-1/XXXX-2/file.txt");
  });

  it("is case-insensitive", function () {
    const result = anonymizePath("SRC/MyProject/Main.ts", ["myproject"]);
    expect(result).to.include("XXXX-1");
    expect(result).to.not.include("MyProject");
  });

  it("skips empty terms", function () {
    const result = anonymizePath("src/project/file.ts", ["", "project"]);
    // The skipped entry still consumes an index, hence XXXX-2.
    expect(result).to.equal("src/XXXX-2/file.ts");
  });

  it("handles terms with regex special characters", function () {
    const result = anonymizePath("src/my.project/file.ts", ["my.project"]);
    // "my.project" is valid regex where . matches any char, so it matches as-is
    expect(result).to.include("XXXX-1");
  });

  it("replaces all occurrences of the same term", function () {
    const result = anonymizePath("lib/secret/test/secret/a.js", ["secret"]);
    expect(result).to.not.include("secret");
  });

  it("replaces partial matches (unlike replaceTerms, anonymizePath has no word boundary)", function () {
    // anonymizePath uses term directly in regex without \b,
    // so "cat" inside "category" WILL be replaced in paths
    const result = anonymizePath("category/cat.txt", ["cat"]);
    // Both occurrences are replaced since there are no word boundaries
    expect(result).to.equal("XXXX-1egory/XXXX-1.txt");
  });

  it("returns path unchanged when terms array is empty", function () {
    const result = anonymizePath("src/file.ts", []);
    expect(result).to.equal("src/file.ts");
  });
});
|