feat: preserve raw bytes when anonymization is a no-op

When the anonymizer doesn't change a slice's text, the streamer used
to push Buffer.from(out, "utf8") — which loses any invalid-UTF-8 bytes
in the input (replaced by U+FFFD via StringDecoder). Files
mistakenly classified as text (binary blobs without a known extension,
text with stray non-UTF-8 bytes, BOMs) came out corrupted even though
nothing in the term list matched.
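
A minimal standalone repro of the lossy round-trip (illustrative
sketch, not part of this change):

    import { StringDecoder } from "string_decoder";

    const input = Buffer.from([0x68, 0x69, 0xc3, 0x28]); // "hi" + invalid sequence
    const decoder = new StringDecoder("utf8");
    const text = decoder.write(input) + decoder.end();
    // Re-encoding what the decoder produced no longer matches the input:
    console.log(input);                     // <Buffer 68 69 c3 28>
    console.log(Buffer.from(text, "utf8")); // <Buffer 68 69 ef bf bd 28>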

Track the raw chunk bytes alongside the decoded `pending`. On flush —
where we have every byte buffered — emit the original buffer directly
when the output equals the input, so a pure passthrough is bit-exact.
In the streaming OVERLAP path, do the same when the decode for that
slice round-trips losslessly; otherwise fall back to the encoded
output (that path is unchanged from before).
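
Condensed, the emit decision in the streaming path looks like this
(variable names as in the diff below; a sketch, not the full code):

    // Emitting the original bytes is safe only when anonymization changed
    // nothing AND re-encoding the decoded slice reproduces those bytes.
    const toProcessBytes = Buffer.from(toProcess, "utf8");
    const candidateOriginal = pendingBytes.slice(0, toProcessBytes.length);
    const lossless = toProcessBytes.equals(candidateOriginal);
    const outChunk =
      out === toProcess && lossless
        ? candidateOriginal
        : Buffer.from(out, "utf8");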

Also add the "missing_content" locale entry for the
/api/anonymize-preview route.
Author: tdurieux
Date:   2026-05-04 11:52:03 +02:00
Parent: 4aeecd0fdb
Commit: ef78e8ff3c

3 changed files with 146 additions and 2 deletions
@@ -38,6 +38,7 @@
"options_not_provided": "Anonymization options are mandatory.",
"terms_not_specified": "Anonymization terms must be specified.",
"invalid_terms_format": "Terms are in an invalid format.",
"missing_content": "No content was provided to the anonymization preview.",
"unable_to_anonymize": "An error happened during the anonymization process. Please try later or report the issue.",
"non_supported_mode": "The selected anonymization mode is invalid, only download and stream are supported.",
"invalid_path": "The provided path is invalid or missing.",
@@ -69,6 +69,13 @@ export class AnonymizeTransformer extends Transform {
// markdown image patterns straddling a stream chunk boundary still match.
// Must exceed the longest pattern we replace (terms + URLs + images).
private pending = "";
// Raw bytes corresponding to `pending` (plus any partial UTF-8 sequence
// currently buffered by the decoder). Kept so we can emit the original
// buffer verbatim when anonymization didn't change anything — that way
// a binary file misclassified as text, or text with a stray non-UTF-8
// byte, isn't silently corrupted by a UTF-8 round-trip through the
// StringDecoder. See discussion in #493.
private pendingBytes: Buffer = Buffer.alloc(0);
private static readonly OVERLAP = 4096;
constructor(
@@ -89,6 +96,14 @@ export class AnonymizeTransformer extends Transform {
return this.anonimizer.wasAnonymized;
}
// Whether the candidate original bytes round-trip to the same byte
// sequence as `text` re-encoded. Used by the streaming path to confirm
// it can safely use byte-length slicing.
private decodeIsLossless(text: string, candidate: Buffer): boolean {
const reencoded = Buffer.from(text, "utf8");
return reencoded.length === candidate.length && reencoded.equals(candidate);
}
_transform(chunk: Buffer, encoding: string, callback: () => void) {
if (!this.isText) {
this.emit("transform", {
@@ -103,6 +118,7 @@ export class AnonymizeTransformer extends Transform {
// StringDecoder buffers trailing partial UTF-8 sequences across chunk
// boundaries so we never decode half a codepoint into U+FFFD.
this.pending += this.decoder.write(chunk);
this.pendingBytes = Buffer.concat([this.pendingBytes, chunk]);
if (this.pending.length > AnonymizeTransformer.OVERLAP) {
let split = this.pending.length - AnonymizeTransformer.OVERLAP;
@@ -114,8 +130,30 @@ export class AnonymizeTransformer extends Transform {
const toProcess = this.pending.slice(0, split);
this.pending = this.pending.slice(split);
// Try to keep the original byte slice alongside the decoded text. If
// the re-encoded text matches those bytes, the decode was lossless and
// we can safely emit the original buffer when nothing changed —
// preserving lone CRs, BOMs, etc. If it doesn't match (invalid UTF-8
// somewhere in the chunk), fall back to encoded output and resync
// pendingBytes to the canonical re-encoding of what's left.
const toProcessBytes = Buffer.from(toProcess, "utf8");
const candidateOriginal = this.pendingBytes.slice(
0,
toProcessBytes.length
);
const out = this.anonimizer.anonymize(toProcess);
const lossless = this.decodeIsLossless(toProcess, candidateOriginal);
let outChunk: Buffer;
if (out === toProcess && lossless) {
outChunk = candidateOriginal;
} else {
outChunk = Buffer.from(out, "utf8");
}
if (lossless) {
this.pendingBytes = this.pendingBytes.slice(toProcessBytes.length);
} else {
this.pendingBytes = Buffer.from(this.pending, "utf8");
}
this.emit("transform", {
isText: this.isText,
@@ -132,8 +170,16 @@ export class AnonymizeTransformer extends Transform {
this.pending += this.decoder.end();
if (this.pending) {
const out = this.anonimizer.anonymize(this.pending);
// At end-of-stream we have every original byte buffered. If nothing
// changed, emit them verbatim regardless of whether the decode was
// lossy — preserves invalid-UTF-8 / binary content that happened
// to be classified as text and didn't match any term.
const outChunk =
out === this.pending
? this.pendingBytes
: Buffer.from(out, "utf8");
this.pending = "";
this.pendingBytes = Buffer.alloc(0);
this.emit("transform", {
isText: this.isText,
wasAnonimized: this.wasAnonimized,
@@ -0,0 +1,97 @@
const { expect } = require("chai");
require("ts-node/register/transpile-only");
const { AnonymizeTransformer } = require("../src/core/anonymize-utils");
function runRaw(chunks, opt) {
return new Promise((resolve, reject) => {
const t = new AnonymizeTransformer(opt);
const out = [];
t.on("data", (b) => out.push(Buffer.from(b)));
t.on("end", () => resolve(Buffer.concat(out)));
t.on("error", reject);
for (const chunk of chunks) {
t.write(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
}
t.end();
});
}
describe("AnonymizeTransformer byte safety", function () {
// A binary file (or text with stray bytes) misclassified as text used to
// round-trip through StringDecoder and lose its non-UTF-8 bytes: every
// invalid sequence became EF BF BD (U+FFFD). When no anonymization
// happens, the original buffer should now come out unchanged.
it("preserves invalid UTF-8 bytes when nothing matches", async function () {
// 0xC3 0x28 is a classic invalid sequence: lead byte without
// continuation. Mixed with valid ASCII so the file still passes the
// filename text-check.
const input = Buffer.concat([
Buffer.from("hello "),
Buffer.from([0xc3, 0x28, 0xff, 0xfe]),
Buffer.from(" world"),
]);
const out = await runRaw([input], {
filePath: "fixture.txt",
terms: ["zzzz"],
});
expect(out.equals(input)).to.equal(true);
});
it("preserves a UTF-16 BOM-like prefix when nothing matches", async function () {
const input = Buffer.from([0xff, 0xfe, ...Buffer.from("payload")]);
const out = await runRaw([input], {
filePath: "fixture.txt",
terms: [],
});
expect(out.equals(input)).to.equal(true);
});
it("still anonymizes valid UTF-8 text", async function () {
const input = "hello Alice and Bob";
const out = await runRaw([input], {
filePath: "fixture.txt",
terms: ["Alice"],
});
expect(out.toString("utf8")).to.equal("hello XXXX-1 and Bob");
});
it("preserves valid-UTF-8 bytes across many chunks when no anonymization", async function () {
// Non-ASCII but valid UTF-8 (CRLF, em-dash, accented chars) split into
// many small writes — exercises the OVERLAP-based slicing through the
// lossless path.
const seg = "plain — segment with CRLF\r\nDavó café résumé ";
const big = Buffer.from(seg.repeat(500), "utf8");
const chunks = [];
for (let i = 0; i < big.length; i += 137) {
chunks.push(big.slice(i, i + 137));
}
const out = await runRaw(chunks, {
filePath: "fixture.txt",
terms: ["nope"],
});
expect(out.equals(big)).to.equal(true);
});
// Mid-stream chunks that are lossy (invalid UTF-8 splitting across the
// OVERLAP boundary) currently fall back to the encoded form — byte
// alignment between the decoded text and the original bytes is impossible
// to recover without per-character byte tracking. End-of-stream lossy
// bytes are still preserved (covered by the tests above).
it("encodes output when anonymization does change the text, even with invalid bytes elsewhere", async function () {
// Anonymization happens; the price is that the invalid byte becomes the
// UTF-8 replacement char in the encoded output. That's the documented
// trade-off — only the no-change path is byte-preserving.
const input = Buffer.concat([
Buffer.from("Alice "),
Buffer.from([0xc3, 0x28]),
Buffer.from(" trailer"),
]);
const out = await runRaw([input], {
filePath: "fixture.txt",
terms: ["Alice"],
});
expect(out.toString("utf8")).to.match(/^XXXX-1 .* trailer$/);
});
});