From d8b129c670f9f84a9ca992852b0de1967acd507b Mon Sep 17 00:00:00 2001 From: tdurieux Date: Sun, 3 May 2026 19:47:10 +0200 Subject: [PATCH] fix: anonymize entries when downloading the full repo as a zip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The streaming zip pipeline was constructing AnonymizeTransformer first and then assigning opt.filePath afterwards. AnonymizeTransformer determines isText in its constructor from opt.filePath, so every entry was classified as binary and passed through unchanged — the downloaded zip leaked the original (un-anonymized) terms even though the web view scrubbed them. Pass filePath via the constructor so isText is computed correctly. Fixes #342, #349. --- src/core/zipStream.ts | 10 ++++++-- test/anonymize-utils.test.js | 49 +++++++++++++++++++++++++++++++++--- 2 files changed, 54 insertions(+), 5 deletions(-) diff --git a/src/core/zipStream.ts b/src/core/zipStream.ts index 1d68cef..26c4229 100644 --- a/src/core/zipStream.ts +++ b/src/core/zipStream.ts @@ -71,8 +71,14 @@ export async function streamAnonymizedZip( entry.path.substring(entry.path.indexOf("/") + 1), opt.anonymizerOptions.terms || [] ); - const anonymizer = new AnonymizeTransformer(opt.anonymizerOptions); - anonymizer.opt.filePath = fileName; + // Pass filePath via the constructor — AnonymizeTransformer reads it + // there to decide whether the entry is text (and therefore should be + // anonymized) vs binary (passthrough). Assigning afterwards leaves + // isText=false for every file, so the zip ships unanonymized. + const anonymizer = new AnonymizeTransformer({ + ...opt.anonymizerOptions, + filePath: fileName, + }); const st = entry.pipe(anonymizer); archive.append(st, { name: fileName }); } catch (error) { diff --git a/test/anonymize-utils.test.js b/test/anonymize-utils.test.js index e9dee7e..250f3fa 100644 --- a/test/anonymize-utils.test.js +++ b/test/anonymize-utils.test.js @@ -447,11 +447,26 @@ describe("ContentAnonimizer", function () { // AnonymizeTransformer (streaming) — replica of src/core/anonymize-utils.ts // --------------------------------------------------------------------------- +// Mirror of isTextFile that relies on the file extension only — the real +// impl additionally calls istextorbinary, but for these tests checking the +// suffix is enough to demonstrate the constructor-vs-post-assignment bug. +function _isTextFileFromPath(filePath) { + if (!filePath) return false; + const ext = String(filePath).split(".").pop().toLowerCase(); + return new Set([ + "txt", "md", "js", "ts", "tsx", "jsx", "py", "rb", "go", "java", + "c", "h", "cpp", "json", "yml", "yaml", "html", "htm", "css", + ]).has(ext); +} + class AnonymizeTransformer extends Transform { constructor(opt) { super(); this.opt = opt || {}; - this.isText = true; // tests always feed text + // Mirror src/core/anonymize-utils.ts: isText is derived from + // opt.filePath at construction time. Mutating opt.filePath after the + // constructor runs has no effect (this was the cause of #342/#349). + this.isText = _isTextFileFromPath(this.opt.filePath); this.anonimizer = new ContentAnonimizer(this.opt); this.decoder = new StringDecoder("utf8"); this.pending = ""; @@ -490,8 +505,11 @@ class AnonymizeTransformer extends Transform { } function runStream(input, chunkSize, opt) { + // Default to a text filePath so existing tests keep exercising the + // anonymization path; tests verifying binary passthrough pass their own. + const merged = { filePath: "fixture.txt", ...opt }; return new Promise((resolve, reject) => { - const t = new AnonymizeTransformer(opt); + const t = new AnonymizeTransformer(merged); const out = []; t.on("data", (b) => out.push(Buffer.from(b))); t.on("end", () => resolve(Buffer.concat(out).toString("utf8"))); @@ -528,7 +546,7 @@ describe("AnonymizeTransformer (streaming)", function () { // First chunk ends after 'Ali', second starts at 'ce' const splitAt = prefix.length + 3; - const t = new AnonymizeTransformer({ terms: ["Alice"] }); + const t = new AnonymizeTransformer({ filePath: "fixture.txt", terms: ["Alice"] }); const out = []; const done = new Promise((resolve, reject) => { t.on("data", (b) => out.push(Buffer.from(b))); @@ -555,6 +573,31 @@ describe("AnonymizeTransformer (streaming)", function () { const result = await runStream(input, 8, { terms: ["Alice"] }); expect(result).to.equal("Created by XXXX-1 at 2025/01/01"); }); + + // Regression for #342/#349: zip download was constructing the transformer + // and then assigning opt.filePath after the fact, but isText is decided in + // the constructor — so every entry was treated as binary and passed through + // unanonymized. Ensure the filePath must be set at construction time. + it("decides isText from the filePath passed at construction", function () { + const beforeFix = new AnonymizeTransformer({ terms: ["Alice"] }); + beforeFix.opt.filePath = "fixture.txt"; // post-construction — too late + expect(beforeFix.isText).to.equal(false); + + const afterFix = new AnonymizeTransformer({ + filePath: "fixture.txt", + terms: ["Alice"], + }); + expect(afterFix.isText).to.equal(true); + }); + + it("anonymizes a text file when filePath is supplied at construction", async function () { + const input = "Hello Alice, how are you?"; + const result = await runStream(input, 8, { + filePath: "fixture.txt", + terms: ["Alice"], + }); + expect(result).to.equal("Hello XXXX-1, how are you?"); + }); }); // ---------------------------------------------------------------------------