diff --git a/public/i18n/locale-en.json b/public/i18n/locale-en.json index 28da65f..d23dd6c 100644 --- a/public/i18n/locale-en.json +++ b/public/i18n/locale-en.json @@ -38,6 +38,7 @@ "options_not_provided": "Anonymization options are mandatory.", "terms_not_specified": "Anonymization terms must be specified.", "invalid_terms_format": "Terms are in an invalid format.", + "missing_content": "No content was provided to the anonymization preview.", "unable_to_anonymize": "An error happened during the anonymization process. Please try later or report the issue.", "non_supported_mode": "The selected anonymization mode is invalid, only download and stream are supported.", "invalid_path": "The provided path is invalid or missing.", diff --git a/src/core/anonymize-utils.ts b/src/core/anonymize-utils.ts index 7d54e90..be7ba86 100644 --- a/src/core/anonymize-utils.ts +++ b/src/core/anonymize-utils.ts @@ -69,6 +69,13 @@ export class AnonymizeTransformer extends Transform { // markdown image patterns straddling a stream chunk boundary still match. // Must exceed the longest pattern we replace (terms + URLs + images). private pending = ""; + // Bytes backing `pending` (plus any partial UTF-8 sequence currently + // buffered by the decoder): the raw input bytes, until a lossy mid-stream + // chunk resyncs them to the re-encoded text. Kept so the original buffer + // can be emitted verbatim when anonymization changed nothing — that way a + // binary file misclassified as text, or text with a stray non-UTF-8 byte, + // isn't silently corrupted by the StringDecoder round-trip. See #493. + private pendingBytes: Buffer = Buffer.alloc(0); private static readonly OVERLAP = 4096; constructor( @@ -89,6 +96,14 @@ export class AnonymizeTransformer extends Transform { return this.anonimizer.wasAnonymized; } + // Returns true when `text`, re-encoded as UTF-8, is byte-for-byte equal + // to `candidate` (a slice of the buffered original bytes). The streaming + // path uses this to confirm byte-length slicing is safe.
+ private decodeIsLossless(text: string, candidate: Buffer): boolean { + const reencoded = Buffer.from(text, "utf8"); + return reencoded.length === candidate.length && reencoded.equals(candidate); + } + _transform(chunk: Buffer, encoding: string, callback: () => void) { if (!this.isText) { this.emit("transform", { @@ -103,6 +118,7 @@ export class AnonymizeTransformer extends Transform { // StringDecoder buffers trailing partial UTF-8 sequences across chunk // boundaries so we never decode half a codepoint into U+FFFD. this.pending += this.decoder.write(chunk); + this.pendingBytes = Buffer.concat([this.pendingBytes, chunk]); if (this.pending.length > AnonymizeTransformer.OVERLAP) { let split = this.pending.length - AnonymizeTransformer.OVERLAP; @@ -114,8 +130,30 @@ export class AnonymizeTransformer extends Transform { const toProcess = this.pending.slice(0, split); this.pending = this.pending.slice(split); + // Pair the decoded text with the matching prefix of the original bytes. + // When re-encoding reproduces those bytes (lossless) and nothing changed, + // the original slice is emitted — byte-identical to the re-encoded text. + // Otherwise the re-encoded output is emitted. A lossy chunk (invalid UTF-8) + // resyncs pendingBytes to the re-encoding of `pending`. NOTE(review): this + // drops partial-sequence bytes still buffered in the decoder — confirm.
+ const toProcessBytes = Buffer.from(toProcess, "utf8"); + const candidateOriginal = this.pendingBytes.slice( + 0, + toProcessBytes.length + ); const out = this.anonimizer.anonymize(toProcess); - const outChunk = Buffer.from(out, "utf8"); + const lossless = this.decodeIsLossless(toProcess, candidateOriginal); + let outChunk: Buffer; + if (out === toProcess && lossless) { + outChunk = candidateOriginal; + } else { + outChunk = Buffer.from(out, "utf8"); + } + if (lossless) { + this.pendingBytes = this.pendingBytes.slice(toProcessBytes.length); + } else { + this.pendingBytes = Buffer.from(this.pending, "utf8"); + } this.emit("transform", { isText: this.isText, @@ -132,8 +170,16 @@ export class AnonymizeTransformer extends Transform { this.pending += this.decoder.end(); if (this.pending) { const out = this.anonimizer.anonymize(this.pending); + // At end-of-stream pendingBytes backs all text not yet emitted. If nothing + // changed, emit it verbatim even when the decode was lossy, so invalid-UTF-8 + // or binary content that matched no term survives. NOTE(review): after a + // lossy mid-stream resync these are re-encoded bytes, not the originals. + const outChunk = + out === this.pending + ?
+ this.pendingBytes + : Buffer.from(out, "utf8"); this.pending = ""; - const outChunk = Buffer.from(out, "utf8"); + this.pendingBytes = Buffer.alloc(0); this.emit("transform", { isText: this.isText, wasAnonimized: this.wasAnonimized, diff --git a/test/anonymize-byte-safety.test.js b/test/anonymize-byte-safety.test.js new file mode 100644 index 0000000..6282977 --- /dev/null +++ b/test/anonymize-byte-safety.test.js @@ -0,0 +1,97 @@ +const { expect } = require("chai"); +require("ts-node/register/transpile-only"); +const { AnonymizeTransformer } = require("../src/core/anonymize-utils"); + +function runRaw(chunks, opt) { + return new Promise((resolve, reject) => { + const t = new AnonymizeTransformer(opt); + const out = []; + t.on("data", (b) => out.push(Buffer.from(b))); + t.on("end", () => resolve(Buffer.concat(out))); + t.on("error", reject); + for (const chunk of chunks) { + t.write(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk)); + } + t.end(); + }); +} + +describe("AnonymizeTransformer byte safety", function () { + // A binary file (or text with stray bytes) misclassified as text used to + // round-trip through StringDecoder and lose its non-UTF-8 bytes — each + // invalid byte (e.g. a 0xC3 lead with no continuation) became EF BF BD + // (U+FFFD). With no anonymization, the original buffer must come out unchanged. + it("preserves invalid UTF-8 bytes when nothing matches", async function () { + // 0xC3 0x28 is a classic invalid sequence (lead byte without a + // continuation); 0xFF and 0xFE never occur in UTF-8. Surrounded by plain + // ASCII; the .txt filePath is presumably what marks it as text — confirm.
+ const input = Buffer.concat([ + Buffer.from("hello "), + Buffer.from([0xc3, 0x28, 0xff, 0xfe]), + Buffer.from(" world"), + ]); + + const out = await runRaw([input], { + filePath: "fixture.txt", + terms: ["zzzz"], + }); + expect(out.equals(input)).to.equal(true); + }); + + it("preserves a UTF-16 BOM-like prefix when nothing matches", async function () { + const input = Buffer.from([0xff, 0xfe, ...Buffer.from("payload")]); + const out = await runRaw([input], { + filePath: "fixture.txt", + terms: [], + }); + expect(out.equals(input)).to.equal(true); + }); + + it("still anonymizes valid UTF-8 text", async function () { + const input = "hello Alice and Bob"; + const out = await runRaw([input], { + filePath: "fixture.txt", + terms: ["Alice"], + }); + expect(out.toString("utf8")).to.equal("hello XXXX-1 and Bob"); + }); + + it("preserves valid-UTF-8 bytes across many chunks when no anonymization", async function () { + // Valid UTF-8 mixing ASCII (incl. CRLF) with multi-byte chars (em-dash, + // accented letters), split into many 137-byte writes — exercises the + // OVERLAP-based slicing through the lossless path. + const seg = "plain — segment with CRLF\r\nDavó café résumé "; + const big = Buffer.from(seg.repeat(500), "utf8"); + const chunks = []; + for (let i = 0; i < big.length; i += 137) { + chunks.push(big.slice(i, i + 137)); + } + const out = await runRaw(chunks, { + filePath: "fixture.txt", + terms: ["nope"], + }); + expect(out.equals(big)).to.equal(true); + }); + + // Mid-stream chunks that decode lossily (invalid UTF-8 before the OVERLAP + // tail) currently fall back to the re-encoded form — realigning decoded + // text with the original bytes would need per-character byte tracking. + // Lossy bytes still buffered at end-of-stream are preserved when nothing + // changed (covered by the tests above).
+ + it("encodes output when anonymization does change the text, even with invalid bytes elsewhere", async function () { + // A term matches, so the output is the re-encoded text: the invalid 0xC3 + // byte comes out as the UTF-8 replacement char (EF BF BD). That's the + // documented trade-off — only the no-change path is byte-preserving. + const input = Buffer.concat([ + Buffer.from("Alice "), + Buffer.from([0xc3, 0x28]), + Buffer.from(" trailer"), + ]); + const out = await runRaw([input], { + filePath: "fixture.txt", + terms: ["Alice"], + }); + expect(out.toString("utf8")).to.match(/^XXXX-1 .* trailer$/); + }); +});