improve binary file detection: content sniffing + jsonl support

Files like .jsonl that mime-types doesn't know fell through to application/octet-stream and rendered as "Unsupported binary file" in the viewer. Replace istextorbinary with isbinaryfile for content-based detection, and use mime-types for name-based classification with an allowlist of textual application/* types. The streaming transformer now defers classification when the name is inconclusive and sniffs the first chunk before emitting "transform", so route.ts and AnonymizedFile.ts get a content-aware Content-Type. Also whitelist .jsonl and .ndjson to short-circuit dataset files.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-15 14:38:03 +02:00 · 2026-05-06 07:52:48 +03:00
parent 18ce39e019
commit 79f555769d
6 changed files with 154 additions and 158 deletions
@@ -499,8 +499,9 @@ describe("ContentAnonimizer", function () {
 // ---------------------------------------------------------------------------

 // Mirror of isTextFile that relies on the file extension only — the real
-// impl additionally calls istextorbinary, but for these tests checking the
-// suffix is enough to demonstrate the constructor-vs-post-assignment bug.
+// impl additionally consults mime-types and isbinaryfile, but for these
+// tests checking the suffix is enough to demonstrate the
+// constructor-vs-post-assignment bug.
 function _isTextFileFromPath(filePath) {
  if (!filePath) return false;
  const ext = String(filePath).split(".").pop().toLowerCase();
@@ -3,9 +3,9 @@ require("ts-node/register/transpile-only");
 const { isTextFile } = require("../src/core/anonymize-utils");

 describe("isTextFile", function () {
-  // #493 — istextorbinary returns null for files with no extension, so a
-  // bare LICENSE / COPYING / etc. used to be classified as binary and
-  // never anonymized. Whitelist the conventional plaintext filenames.
+  // #493 — bare LICENSE / COPYING / etc. have no extension and no MIME, so
+  // we whitelist the conventional plaintext filenames to short-circuit them
+  // before falling through to content sniffing (which fails on empty files).
  it("recognizes conventional no-extension plaintext filenames", function () {
    expect(isTextFile("LICENSE")).to.equal(true);
    expect(isTextFile("license")).to.equal(true);
@@ -27,4 +27,27 @@ describe("isTextFile", function () {
    expect(isTextFile("foo.png")).to.equal(false);
    expect(isTextFile("foo.zip")).to.equal(false);
  });
+
+  it("recognizes jsonl-family dataset extensions", function () {
+    expect(isTextFile("data.jsonl")).to.equal(true);
+    expect(isTextFile("data.ndjson")).to.equal(true);
+  });
+
+  it("falls back to content sniffing for unknown extensions", function () {
+    expect(
+      isTextFile("foo.unknown", Buffer.from("hello world\nline two\n", "utf8"))
+    ).to.equal(true);
+    expect(
+      isTextFile("foo.unknown", Buffer.from([0x00, 0x01, 0x02, 0x03, 0x00, 0x05]))
+    ).to.equal(false);
+    const random = Buffer.alloc(512);
+    for (let i = 0; i < random.length; i++) random[i] = (i * 31 + 7) % 32;
+    expect(isTextFile("foo.unknown", random)).to.equal(false);
+  });
+
+  it("does not let content sniffing override a known binary extension", function () {
+    expect(
+      isTextFile("foo.png", Buffer.from("plain ascii pretending to be a png"))
+    ).to.equal(false);
+  });
 });