improve binary file detection: content sniffing + jsonl support

Files like .jsonl that mime-types doesn't know fell through to
application/octet-stream and rendered as "Unsupported binary file" in
the viewer. Replace istextorbinary with isbinaryfile for content-based
detection, and use mime-types for name-based classification with a
textual application/* allowlist.

The streaming transformer now defers classification when the name is
inconclusive and sniffs the first chunk before emitting "transform",
so route.ts and AnonymizedFile.ts get a content-aware Content-Type.
Also whitelists .jsonl and .ndjson so that dataset files are classified as text by name alone, without content sniffing.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
tdurieux
2026-05-06 07:52:48 +03:00
parent 18ce39e019
commit 79f555769d
6 changed files with 154 additions and 158 deletions
+18 -109
View File
@@ -28,7 +28,7 @@
"express-slow-down": "^2.0.1",
"got": "^11.8.6",
"inquirer": "^8.2.6",
"istextorbinary": "^9.5.0",
"isbinaryfile": "^6.0.0",
"marked": "^5.1.2",
"mime-types": "^2.1.35",
"mongoose": "^7.6.10",
@@ -8150,20 +8150,6 @@
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/binaryextensions": {
"version": "6.11.0",
"resolved": "https://registry.npmjs.org/binaryextensions/-/binaryextensions-6.11.0.tgz",
"integrity": "sha512-sXnYK/Ij80TO3lcqZVV2YgfKN5QjUWIRk/XSm2J/4bd/lPko3lvk0O4ZppH6m+6hB2/GTu+ptNwVFe1xh+QLQw==",
"dependencies": {
"editions": "^6.21.0"
},
"engines": {
"node": ">=4"
},
"funding": {
"url": "https://bevry.me/fund"
}
},
"node_modules/bl": {
"version": "4.1.0",
"resolved": "https://registry.npmjs.org/bl/-/bl-4.1.0.tgz",
@@ -9292,20 +9278,6 @@
"node": ">=0.10.0"
}
},
"node_modules/editions": {
"version": "6.21.0",
"resolved": "https://registry.npmjs.org/editions/-/editions-6.21.0.tgz",
"integrity": "sha512-ofkXJtn7z0urokN62DI3SBo/5xAtF0rR7tn+S/bSYV79Ka8pTajIIl+fFQ1q88DQEImymmo97M4azY3WX/nUdg==",
"dependencies": {
"version-range": "^4.13.0"
},
"engines": {
"node": ">=4"
},
"funding": {
"url": "https://bevry.me/fund"
}
},
"node_modules/ee-first": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/ee-first/-/ee-first-1.1.1.tgz",
@@ -11378,6 +11350,18 @@
"resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz",
"integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ=="
},
"node_modules/isbinaryfile": {
"version": "6.0.0",
"resolved": "https://registry.npmjs.org/isbinaryfile/-/isbinaryfile-6.0.0.tgz",
"integrity": "sha512-2FN2B8MAqKv6d5TaKsLvMrwMcghxwHTpcKy0L5mhNbRqjNqo2++SpCqN6eG1lCC1GmTQgvrYJYXv2+Chvyevag==",
"license": "MIT",
"engines": {
"node": ">= 24.0.0"
},
"funding": {
"url": "https://github.com/sponsors/gjtorikian/"
}
},
"node_modules/isobject": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz",
@@ -11387,22 +11371,6 @@
"node": ">=0.10.0"
}
},
"node_modules/istextorbinary": {
"version": "9.5.0",
"resolved": "https://registry.npmjs.org/istextorbinary/-/istextorbinary-9.5.0.tgz",
"integrity": "sha512-5mbUj3SiZXCuRf9fT3ibzbSSEWiy63gFfksmGfdOzujPjW3k+z8WvIBxcJHBoQNlaZaiyB25deviif2+osLmLw==",
"dependencies": {
"binaryextensions": "^6.11.0",
"editions": "^6.21.0",
"textextensions": "^6.11.0"
},
"engines": {
"node": ">=4"
},
"funding": {
"url": "https://bevry.me/fund"
}
},
"node_modules/jiti": {
"version": "2.6.1",
"resolved": "https://registry.npmjs.org/jiti/-/jiti-2.6.1.tgz",
@@ -13910,20 +13878,6 @@
"streamx": "^2.12.5"
}
},
"node_modules/textextensions": {
"version": "6.11.0",
"resolved": "https://registry.npmjs.org/textextensions/-/textextensions-6.11.0.tgz",
"integrity": "sha512-tXJwSr9355kFJI3lbCkPpUH5cP8/M0GGy2xLO34aZCjMXBaK3SoPnZwr/oWmo1FdCnELcs4npdCIOFtq9W3ruQ==",
"dependencies": {
"editions": "^6.21.0"
},
"engines": {
"node": ">=4"
},
"funding": {
"url": "https://bevry.me/fund"
}
},
"node_modules/through": {
"version": "2.3.8",
"resolved": "https://registry.npmjs.org/through/-/through-2.3.8.tgz",
@@ -14392,17 +14346,6 @@
"node": ">= 0.8"
}
},
"node_modules/version-range": {
"version": "4.14.0",
"resolved": "https://registry.npmjs.org/version-range/-/version-range-4.14.0.tgz",
"integrity": "sha512-gjb0ARm9qlcBAonU4zPwkl9ecKkas+tC2CGwFfptTCWWIVTWY1YUbT2zZKsOAF1jR/tNxxyLwwG0cb42XlYcTg==",
"engines": {
"node": ">=4"
},
"funding": {
"url": "https://bevry.me/fund"
}
},
"node_modules/vinyl": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/vinyl/-/vinyl-3.0.0.tgz",
@@ -20607,14 +20550,6 @@
"integrity": "sha512-Ceh+7ox5qe7LJuLHoY0feh3pHuUDHAcRUeyL2VYghZwfpkNIy/+8Ocg0a3UuSoYzavmylwuLWQOf3hl0jjMMIw==",
"dev": true
},
"binaryextensions": {
"version": "6.11.0",
"resolved": "https://registry.npmjs.org/binaryextensions/-/binaryextensions-6.11.0.tgz",
"integrity": "sha512-sXnYK/Ij80TO3lcqZVV2YgfKN5QjUWIRk/XSm2J/4bd/lPko3lvk0O4ZppH6m+6hB2/GTu+ptNwVFe1xh+QLQw==",
"requires": {
"editions": "^6.21.0"
}
},
"bl": {
"version": "4.1.0",
"resolved": "https://registry.npmjs.org/bl/-/bl-4.1.0.tgz",
@@ -21431,14 +21366,6 @@
}
}
},
"editions": {
"version": "6.21.0",
"resolved": "https://registry.npmjs.org/editions/-/editions-6.21.0.tgz",
"integrity": "sha512-ofkXJtn7z0urokN62DI3SBo/5xAtF0rR7tn+S/bSYV79Ka8pTajIIl+fFQ1q88DQEImymmo97M4azY3WX/nUdg==",
"requires": {
"version-range": "^4.13.0"
}
},
"ee-first": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/ee-first/-/ee-first-1.1.1.tgz",
@@ -22919,22 +22846,17 @@
"resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz",
"integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ=="
},
"isbinaryfile": {
"version": "6.0.0",
"resolved": "https://registry.npmjs.org/isbinaryfile/-/isbinaryfile-6.0.0.tgz",
"integrity": "sha512-2FN2B8MAqKv6d5TaKsLvMrwMcghxwHTpcKy0L5mhNbRqjNqo2++SpCqN6eG1lCC1GmTQgvrYJYXv2+Chvyevag=="
},
"isobject": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz",
"integrity": "sha512-WhB9zCku7EGTj/HQQRz5aUQEUeoQZH2bWcltRErOpymJ4boYE6wL9Tbr23krRPSZ+C5zqNSrSw+Cc7sZZ4b7vg==",
"dev": true
},
"istextorbinary": {
"version": "9.5.0",
"resolved": "https://registry.npmjs.org/istextorbinary/-/istextorbinary-9.5.0.tgz",
"integrity": "sha512-5mbUj3SiZXCuRf9fT3ibzbSSEWiy63gFfksmGfdOzujPjW3k+z8WvIBxcJHBoQNlaZaiyB25deviif2+osLmLw==",
"requires": {
"binaryextensions": "^6.11.0",
"editions": "^6.21.0",
"textextensions": "^6.11.0"
}
},
"jiti": {
"version": "2.6.1",
"resolved": "https://registry.npmjs.org/jiti/-/jiti-2.6.1.tgz",
@@ -24719,14 +24641,6 @@
"streamx": "^2.12.5"
}
},
"textextensions": {
"version": "6.11.0",
"resolved": "https://registry.npmjs.org/textextensions/-/textextensions-6.11.0.tgz",
"integrity": "sha512-tXJwSr9355kFJI3lbCkPpUH5cP8/M0GGy2xLO34aZCjMXBaK3SoPnZwr/oWmo1FdCnELcs4npdCIOFtq9W3ruQ==",
"requires": {
"editions": "^6.21.0"
}
},
"through": {
"version": "2.3.8",
"resolved": "https://registry.npmjs.org/through/-/through-2.3.8.tgz",
@@ -25046,11 +24960,6 @@
"resolved": "https://registry.npmjs.org/vary/-/vary-1.1.2.tgz",
"integrity": "sha512-BNGbWLfd0eUPabhkXUVm0j8uuvREyTh5ovRa/dyow/BqAbZJyC+5fU+IzQOzmAKzYqYRAISoRhdQr3eIZ/PXqg=="
},
"version-range": {
"version": "4.14.0",
"resolved": "https://registry.npmjs.org/version-range/-/version-range-4.14.0.tgz",
"integrity": "sha512-gjb0ARm9qlcBAonU4zPwkl9ecKkas+tC2CGwFfptTCWWIVTWY1YUbT2zZKsOAF1jR/tNxxyLwwG0cb42XlYcTg=="
},
"vinyl": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/vinyl/-/vinyl-3.0.0.tgz",
+1 -1
View File
@@ -51,7 +51,7 @@
"express-slow-down": "^2.0.1",
"got": "^11.8.6",
"inquirer": "^8.2.6",
"istextorbinary": "^9.5.0",
"isbinaryfile": "^6.0.0",
"marked": "^5.1.2",
"mime-types": "^2.1.35",
"mongoose": "^7.6.10",
+2
View File
@@ -69,6 +69,8 @@ const config: Config = {
"out",
"sol",
"in",
"jsonl",
"ndjson",
],
STORAGE: "filesystem",
STREAMER_ENTRYPOINT: null,
+104 -43
View File
@@ -1,7 +1,8 @@
import { basename } from "path";
import { Transform, Readable } from "stream";
import { StringDecoder } from "string_decoder";
import { isText } from "istextorbinary";
import { isBinaryFileSync } from "isbinaryfile";
import { lookup as lookupMime } from "mime-types";
import config from "../config";
import {
@@ -22,47 +23,93 @@ export function streamToString(stream: Readable): Promise<string> {
});
}
// Common conventional plaintext filenames that have no extension. The
// istextorbinary package returns null (unknown) for these, which our
// `=== true` check then treats as binary — so terms in LICENSE, COPYING,
// etc. silently went through unchanged (#493).
const KNOWN_TEXT_FILENAMES = new Set(
[
"license",
"licence",
"copying",
"copyright",
"authors",
"contributors",
"readme",
"changelog",
"changes",
"notice",
"install",
"todo",
"version",
"manifest",
]
);
// Extension-less filenames that are plaintext by convention and for which
// mime-types has no entry. Listing them lets the name-based check answer
// "text" directly instead of deferring to content sniffing, which works for
// non-empty files but cannot decide for zero-byte ones (#493). Entries are
// lowercase.
const KNOWN_TEXT_FILENAMES = new Set([
  "license", "licence", "copying", "copyright",
  "authors", "contributors", "readme",
  "changelog", "changes", "notice",
  "install", "todo", "version", "manifest",
]);
export function isTextFile(filePath: string, content?: Buffer) {
const filename = basename(filePath);
const extensions = filename.split(".").reverse();
const extension = extensions[0].toLowerCase();
if (config.additionalExtensions.includes(extension)) {
return true;
}
if (KNOWN_TEXT_FILENAMES.has(filename.toLowerCase())) {
return true;
}
if (isText(filename)) {
return true;
}
return isText(filename, content);
// application/* MIME types whose payload is text. text/* is always treated as
// text, but application/* needs an explicit allowlist because most of it is
// binary (zip, pdf, octet-stream). Anything mime-types resolves to a type that
// is neither text/* nor listed here is classified as binary by name and never
// reaches content sniffing, so textual source formats must be listed.
const TEXTUAL_APPLICATION_MIMES = new Set([
  "application/json",
  "application/json5",
  "application/ld+json",
  "application/xml",
  "application/xml-dtd",
  "application/javascript",
  "application/ecmascript",
  // Registered type for Node.js CommonJS sources (.cjs).
  "application/node",
  "application/typescript",
  "application/toml",
  "application/sql",
  "application/x-sql",
  "application/x-sh",
  "application/x-csh",
  "application/x-yaml",
  "application/yaml",
  "application/x-httpd-php",
  "application/graphql",
  "application/x-tex",
  "application/x-latex",
  "application/x-perl",
  "application/x-ruby",
  "application/x-python",
  "application/x-tcl",
]);
// Reports whether a MIME type denotes a text payload. A type qualifies when
// it is text/*, when it is listed in TEXTUAL_APPLICATION_MIMES, or when it
// ends in a +json, +xml or +yaml suffix (e.g. application/*+json).
function isTextualMime(mime: string): boolean {
  return (
    mime.startsWith("text/") ||
    TEXTUAL_APPLICATION_MIMES.has(mime) ||
    /\+(json|xml|yaml)$/.test(mime)
  );
}
// Classifies a path from its basename alone. The result is tri-state:
//   true  - the name marks the file as text,
//   false - mime-types maps the name to a non-textual type,
//   null  - mime-types has no entry for the name, so the decision is left to
//           a caller that can inspect the content.
function classifyByName(filePath: string): boolean | null {
  const fileName = basename(filePath);
  // Text after the last dot, lowercased; a dot-less name yields the whole name.
  const suffix = fileName.slice(fileName.lastIndexOf(".") + 1).toLowerCase();

  const whitelisted =
    config.additionalExtensions.includes(suffix) ||
    KNOWN_TEXT_FILENAMES.has(fileName.toLowerCase());
  if (whitelisted) {
    return true;
  }

  const mimeType = lookupMime(fileName);
  if (mimeType === false) {
    return null;
  }

  // mime-types resolves `.ts` to video/mp2t; TypeScript is the typical case
  // here, so the ambiguous extension is settled in favour of text.
  return suffix === "ts" || isTextualMime(mimeType);
}
/**
 * Decides whether a file should be handled as text.
 *
 * The name is consulted first via classifyByName; a decisive name verdict
 * (text or binary) is returned as-is and the content is never inspected.
 * Only when the name is inconclusive is the optional buffer used.
 *
 * @param filePath Path or bare filename; only its basename is considered.
 * @param content  Optional file bytes (typically the leading portion), used
 *                 only when the name alone cannot decide.
 * @returns true for text, false for binary or when nothing can be decided.
 */
export function isTextFile(filePath: string, content?: Buffer): boolean {
  const byName = classifyByName(filePath);
  if (byName !== null) return byName;

  // Inconclusive name and nothing to sniff: stay conservative.
  if (!content) return false;

  // A zero-byte payload has nothing that could be corrupted by text handling.
  // AnonymizeTransformer treats empty chunks/streams as text, so do the same
  // here to keep both classification paths in agreement.
  if (content.length === 0) return true;

  // isbinaryfile inspects the leading bytes of the buffer (null bytes,
  // non-printable ratio) and returns a decisive boolean.
  return !isBinaryFileSync(content);
}
export class AnonymizeTransformer extends Transform {
public isText: boolean;
// Set in the constructor for known extensions; left null until the first
// chunk arrives for unknown extensions, where it's resolved by sniffing.
// Consumers of the "transform" event always see a resolved boolean — we
// sniff before emitting.
public isText!: boolean;
private nameVerdict: boolean | null;
anonimizer: ContentAnonimizer;
private decoder = new StringDecoder("utf8");
// Trailing decoded text held back between chunks so that terms, URLs, or
@@ -84,11 +131,13 @@ export class AnonymizeTransformer extends Transform {
} & ConstructorParameters<typeof ContentAnonimizer>[0]
) {
super();
// isTextFile may return null for unknown extensions; treat unknown as
// binary. Sniffing from chunk content is unsafe — split archives,
// compressed blobs, etc. can have an ASCII-looking first 64 KB and get
// misclassified as text, which then UTF-8-round-trips and corrupts them.
this.isText = isTextFile(this.opt.filePath) === true;
// Tri-state: name-based check returns true (known text), false (known
// binary), or null (name inconclusive). For null we defer to a content
// sniff on the first chunk in _transform — known binary extensions
// (archives, compressed blobs, images) are resolved here and never
// reach the sniff path (#493).
this.nameVerdict = classifyByName(this.opt.filePath);
if (this.nameVerdict !== null) this.isText = this.nameVerdict;
this.anonimizer = new ContentAnonimizer(this.opt);
}
@@ -105,6 +154,12 @@ export class AnonymizeTransformer extends Transform {
}
_transform(chunk: Buffer, encoding: string, callback: () => void) {
if (this.nameVerdict === null) {
// Name didn't decide. isbinaryfile inspects the first 512 bytes for
// null bytes and non-printable ratio and returns a decisive boolean.
this.isText = chunk.length === 0 ? true : !isBinaryFileSync(chunk);
this.nameVerdict = this.isText;
}
if (!this.isText) {
this.emit("transform", {
isText: this.isText,
@@ -166,6 +221,12 @@ export class AnonymizeTransformer extends Transform {
}
_flush(callback: () => void) {
// Empty file with an unknown extension: no chunk arrived to trigger
// sniffing. Treat as text — there's nothing to corrupt.
if (this.nameVerdict === null) {
this.isText = true;
this.nameVerdict = true;
}
if (this.isText) {
this.pending += this.decoder.end();
if (this.pending) {
+3 -2
View File
@@ -499,8 +499,9 @@ describe("ContentAnonimizer", function () {
// ---------------------------------------------------------------------------
// Mirror of isTextFile that relies on the file extension only — the real
// impl additionally calls istextorbinary, but for these tests checking the
// suffix is enough to demonstrate the constructor-vs-post-assignment bug.
// impl additionally consults mime-types and isbinaryfile, but for these
// tests checking the suffix is enough to demonstrate the
// constructor-vs-post-assignment bug.
function _isTextFileFromPath(filePath) {
if (!filePath) return false;
const ext = String(filePath).split(".").pop().toLowerCase();
+26 -3
View File
@@ -3,9 +3,9 @@ require("ts-node/register/transpile-only");
const { isTextFile } = require("../src/core/anonymize-utils");
describe("isTextFile", function () {
// #493 — istextorbinary returns null for files with no extension, so a
// bare LICENSE / COPYING / etc. used to be classified as binary and
// never anonymized. Whitelist the conventional plaintext filenames.
// #493 — bare LICENSE / COPYING / etc. have no extension and no MIME, so
// we whitelist the conventional plaintext filenames to short-circuit them
// before falling through to content sniffing (which fails on empty files).
it("recognizes conventional no-extension plaintext filenames", function () {
expect(isTextFile("LICENSE")).to.equal(true);
expect(isTextFile("license")).to.equal(true);
@@ -27,4 +27,27 @@ describe("isTextFile", function () {
expect(isTextFile("foo.png")).to.equal(false);
expect(isTextFile("foo.zip")).to.equal(false);
});
// Dataset extensions must be classified as text from the name alone.
it("recognizes jsonl-family dataset extensions", function () {
  for (const name of ["data.jsonl", "data.ndjson"]) {
    expect(isTextFile(name)).to.equal(true);
  }
});
// With an extension the name check cannot resolve, the verdict must come
// from the supplied bytes.
it("falls back to content sniffing for unknown extensions", function () {
  const path = "foo.unknown";

  const plainText = Buffer.from("hello world\nline two\n", "utf8");
  expect(isTextFile(path, plainText)).to.equal(true);

  const withNulBytes = Buffer.from([0x00, 0x01, 0x02, 0x03, 0x00, 0x05]);
  expect(isTextFile(path, withNulBytes)).to.equal(false);

  // 512 deterministic bytes, every one in the range 0..31.
  const controlBytes = Buffer.from(
    Array.from({ length: 512 }, (_, i) => (i * 31 + 7) % 32)
  );
  expect(isTextFile(path, controlBytes)).to.equal(false);
});
// A decisive name verdict must take precedence over text-looking content.
it("does not let content sniffing override a known binary extension", function () {
  const asciiPayload = Buffer.from("plain ascii pretending to be a png");
  expect(isTextFile("foo.png", asciiPayload)).to.equal(false);
});
});