mirror of
https://github.com/tdurieux/anonymous_github.git
synced 2026-05-15 14:38:03 +02:00
feat: preserve raw bytes when anonymization is a no-op
When the anonymizer doesn't change a slice's text, the streamer used to push Buffer.from(out, "utf8") — which loses any invalid-UTF-8 bytes in the input (replaced by U+FFFD via StringDecoder). Files mistakenly classified as text (binary blobs without a known extension, text with stray non-UTF-8 bytes, BOMs) came out corrupted even though nothing in the term list matched. Track the raw chunk bytes alongside the decoded `pending`. On flush — where we have every byte buffered — emit the original buffer directly when the output equals the input, so a pure passthrough is bit-exact. In the streaming OVERLAP path, do the same when the decode for that slice round-trips losslessly; fall back to encoded output otherwise (unchanged from before for that case). Also add the "missing_content" locale entry for the /api/anonymize-preview route.
This commit is contained in:
@@ -0,0 +1,97 @@
|
||||
const { expect } = require("chai");
|
||||
require("ts-node/register/transpile-only");
|
||||
const { AnonymizeTransformer } = require("../src/core/anonymize-utils");
|
||||
|
||||
/**
 * Feed `chunks` through a fresh AnonymizeTransformer and gather its raw output.
 *
 * @param {Array<Buffer|string>} chunks - Pieces written to the transformer in
 *   order; anything that is not already a Buffer is converted with Buffer.from.
 * @param {object} opt - Options handed unchanged to the AnonymizeTransformer
 *   constructor.
 * @returns {Promise<Buffer>} Resolves on "end" with every emitted chunk
 *   concatenated; rejects with the error if the transformer emits "error".
 */
function runRaw(chunks, opt) {
  const asBuffer = (piece) => {
    if (Buffer.isBuffer(piece)) {
      return piece;
    }
    return Buffer.from(piece);
  };

  return new Promise((resolve, reject) => {
    const transformer = new AnonymizeTransformer(opt);
    const collected = [];

    // Store a copy of each emitted chunk rather than the emitted object itself.
    transformer.on("data", (emitted) => {
      collected.push(Buffer.from(emitted));
    });
    transformer.on("end", () => {
      resolve(Buffer.concat(collected));
    });
    transformer.on("error", reject);

    for (const piece of chunks) {
      transformer.write(asBuffer(piece));
    }
    transformer.end();
  });
}
|
||||
|
||||
describe("AnonymizeTransformer byte safety", function () {
  // A binary file (or text with stray bytes) misclassified as text used to
  // round-trip through StringDecoder and lose its non-UTF-8 bytes: every
  // invalid sequence (e.g. a 0xC3 lead byte with no continuation byte) became
  // EF BF BD (U+FFFD). When no anonymization happens, the original buffer
  // should now come out unchanged.
  it("preserves invalid UTF-8 bytes when nothing matches", async function () {
    // 0xC3 0x28 is a classic invalid sequence: lead byte without
    // continuation. Mixed with valid ASCII so the file still passes the
    // filename text-check.
    const input = Buffer.concat([
      Buffer.from("hello "),
      Buffer.from([0xc3, 0x28, 0xff, 0xfe]),
      Buffer.from(" world"),
    ]);

    const out = await runRaw([input], {
      filePath: "fixture.txt",
      terms: ["zzzz"],
    });
    expect(out.equals(input)).to.equal(true);
  });

  it("preserves a UTF-16 BOM-like prefix when nothing matches", async function () {
    // 0xFF 0xFE is not valid UTF-8, so this also exercises the passthrough
    // of undecodable leading bytes.
    const input = Buffer.from([0xff, 0xfe, ...Buffer.from("payload")]);
    const out = await runRaw([input], {
      filePath: "fixture.txt",
      terms: [],
    });
    expect(out.equals(input)).to.equal(true);
  });

  it("still anonymizes valid UTF-8 text", async function () {
    const input = "hello Alice and Bob";
    const out = await runRaw([input], {
      filePath: "fixture.txt",
      terms: ["Alice"],
    });
    expect(out.toString("utf8")).to.equal("hello XXXX-1 and Bob");
  });

  it("preserves valid-UTF-8 bytes across many chunks when no anonymization", async function () {
    // Non-ASCII but valid UTF-8 (CRLF, em-dash, accented chars) split into
    // many small writes — exercises the OVERLAP-based slicing through the
    // lossless path.
    const seg = "plain — segment with CRLF\r\nDavó café résumé ";
    const big = Buffer.from(seg.repeat(500), "utf8");
    const chunks = [];
    for (let i = 0; i < big.length; i += 137) {
      // subarray is the non-deprecated equivalent of Buffer#slice; the byte
      // offsets deliberately ignore character boundaries, so multi-byte
      // characters get split across writes.
      chunks.push(big.subarray(i, i + 137));
    }
    const out = await runRaw(chunks, {
      filePath: "fixture.txt",
      terms: ["nope"],
    });
    expect(out.equals(big)).to.equal(true);
  });

  // Mid-stream chunks that are lossy (invalid UTF-8 splitting across the
  // OVERLAP boundary) currently fall back to the encoded form — byte
  // alignment between the decoded text and the original bytes is impossible
  // to recover without per-character byte tracking. End-of-stream lossy
  // bytes are still preserved (covered by the tests above).

  it("encodes output when anonymization does change the text, even with invalid bytes elsewhere", async function () {
    // Anonymization happens; the price is that the invalid byte becomes the
    // UTF-8 replacement char in the encoded output. That's the documented
    // trade-off — only the no-change path is byte-preserving.
    const input = Buffer.concat([
      Buffer.from("Alice "),
      Buffer.from([0xc3, 0x28]),
      Buffer.from(" trailer"),
    ]);
    const out = await runRaw([input], {
      filePath: "fixture.txt",
      terms: ["Alice"],
    });
    // Only the anonymized prefix and the intact trailer are pinned; the
    // exact bytes produced for the invalid sequence are left unasserted.
    expect(out.toString("utf8")).to.match(/^XXXX-1 .* trailer$/);
  });
});
|
||||
Reference in New Issue
Block a user