// anonymous_github/test/anonymize-byte-safety.test.js

const { expect } = require("chai");
require("ts-node/register/transpile-only");
const { AnonymizeTransformer } = require("../src/core/anonymize-utils");

/**
 * Pipes `chunks` through a fresh AnonymizeTransformer and gathers its output.
 *
 * Each chunk that is not already a Buffer is converted with `Buffer.from`
 * before being written. Every emitted "data" payload is copied into a new
 * Buffer, and all copies are concatenated once the stream emits "end".
 *
 * @param {Iterable<Buffer|string>} chunks - Pieces written to the transformer, in order.
 * @param {object} opt - Options forwarded unchanged to the AnonymizeTransformer constructor.
 * @returns {Promise<Buffer>} Resolves with the concatenated output; rejects
 *   when the transformer emits "error" (or when setup throws).
 */
function runRaw(chunks, opt) {
  const asBuffer = (piece) => {
    if (Buffer.isBuffer(piece)) {
      return piece;
    }
    return Buffer.from(piece);
  };

  return new Promise((resolve, reject) => {
    const transformer = new AnonymizeTransformer(opt);
    const collected = [];

    const onData = (emitted) => {
      // Copy so later reuse of the emitted buffer cannot alter what we kept.
      collected.push(Buffer.from(emitted));
    };
    const onEnd = () => {
      resolve(Buffer.concat(collected));
    };

    transformer.on("data", onData);
    transformer.on("end", onEnd);
    transformer.on("error", reject);

    for (const piece of chunks) {
      transformer.write(asBuffer(piece));
    }
    transformer.end();
  });
}

// Byte-safety regression suite for AnonymizeTransformer: input that is not
// valid UTF-8 must come out unchanged when no anonymization takes place, and
// ordinary text must still be anonymized.
describe("AnonymizeTransformer byte safety", function () {
  // Size (in bytes) of each write in the multi-chunk test below.
  const CHUNK_SIZE = 137;

  /**
   * Asserts that `actual` holds exactly the same bytes as `expected`.
   * The explicit message replaces chai's unhelpful default
   * ("expected false to equal true") when the buffers differ.
   *
   * @param {Buffer} actual - Bytes produced by the transformer.
   * @param {Buffer} expected - Bytes that were fed in.
   */
  function expectSameBytes(actual, expected) {
    expect(
      actual.equals(expected),
      `expected the ${expected.length} input bytes to come out unchanged, got ${actual.length} bytes that differ`
    ).to.equal(true);
  }

  // A binary file (or text with stray bytes) misclassified as text used to
  // round-trip through StringDecoder and lose its non-UTF-8 bytes — each
  // undecodable byte (e.g. a 0xC3 lead byte with no continuation byte) became
  // EF BF BD ("U+FFFD"). When no anonymization happens, the original buffer
  // should now come out unchanged.
  it("preserves invalid UTF-8 bytes when nothing matches", async function () {
    // 0xC3 0x28 is a classic invalid sequence: lead byte without
    // continuation. 0xFF and 0xFE never appear in well-formed UTF-8. Mixed
    // with valid ASCII so the file still passes the filename text-check.
    const input = Buffer.concat([
      Buffer.from("hello "),
      Buffer.from([0xc3, 0x28, 0xff, 0xfe]),
      Buffer.from(" world"),
    ]);

    const out = await runRaw([input], {
      filePath: "fixture.txt",
      terms: ["zzzz"],
    });
    expectSameBytes(out, input);
  });

  it("preserves a UTF-16 BOM-like prefix when nothing matches", async function () {
    // FF FE is the UTF-16LE byte-order mark; neither byte is valid UTF-8.
    const input = Buffer.from([0xff, 0xfe, ...Buffer.from("payload")]);
    const out = await runRaw([input], {
      filePath: "fixture.txt",
      terms: [],
    });
    expectSameBytes(out, input);
  });

  it("still anonymizes valid UTF-8 text", async function () {
    const input = "hello Alice and Bob";
    const out = await runRaw([input], {
      filePath: "fixture.txt",
      terms: ["Alice"],
    });
    expect(out.toString("utf8")).to.equal("hello XXXX-1 and Bob");
  });

  it("preserves valid-UTF-8 bytes across many chunks when no anonymization", async function () {
    // Non-ASCII but valid UTF-8 (CRLF, em-dash, accented chars) split into
    // many small writes — exercises the OVERLAP-based slicing through the
    // lossless path.
    const seg = "plain — segment with CRLF\r\nDavó café résumé ";
    const big = Buffer.from(seg.repeat(500), "utf8");
    const chunks = [];
    for (let offset = 0; offset < big.length; offset += CHUNK_SIZE) {
      // subarray() is the non-deprecated equivalent of Buffer#slice(): it
      // returns a view over the same memory, clamped to the buffer's end.
      chunks.push(big.subarray(offset, offset + CHUNK_SIZE));
    }
    const out = await runRaw(chunks, {
      filePath: "fixture.txt",
      terms: ["nope"],
    });
    expectSameBytes(out, big);
  });

  // Mid-stream chunks that are lossy (invalid UTF-8 splitting across the
  // OVERLAP boundary) currently fall back to the encoded form — byte
  // alignment between the decoded text and the original bytes is impossible
  // to recover without per-character byte tracking. End-of-stream lossy
  // bytes are still preserved (covered by the tests above).

  it("encodes output when anonymization does change the text, even with invalid bytes elsewhere", async function () {
    // Anonymization happens; the price is that the invalid byte becomes the
    // UTF-8 replacement char in the encoded output. That's the documented
    // trade-off — only the no-change path is byte-preserving.
    const input = Buffer.concat([
      Buffer.from("Alice "),
      Buffer.from([0xc3, 0x28]),
      Buffer.from(" trailer"),
    ]);
    const out = await runRaw([input], {
      filePath: "fixture.txt",
      terms: ["Alice"],
    });
    // Only the anonymized prefix and the untouched suffix are pinned; the
    // exact rendering of the invalid byte in between is left unconstrained.
    expect(out.toString("utf8")).to.match(/^XXXX-1 .* trailer$/);
  });
});