From 79f555769dccc05bcd0080b260ff7bb3ac6fb8a9 Mon Sep 17 00:00:00 2001 From: tdurieux Date: Wed, 6 May 2026 07:52:48 +0300 Subject: [PATCH] improve binary file detection: content sniffing + jsonl support Files like .jsonl that mime-types doesn't know fell through to application/octet-stream and rendered as "Unsupported binary file" in the viewer. Replace istextorbinary with isbinaryfile for content-based detection, and use mime-types for name-based classification with a textual application/* allowlist. The streaming transformer now defers classification when the name is inconclusive and sniffs the first chunk before emitting "transform", so route.ts and AnonymizedFile.ts get a content-aware Content-Type. Whitelists .jsonl and .ndjson to short-circuit dataset files. Co-Authored-By: Claude Opus 4.7 --- package-lock.json | 127 +++++------------------------- package.json | 2 +- src/config.ts | 2 + src/core/anonymize-utils.ts | 147 +++++++++++++++++++++++++---------- test/anonymize-utils.test.js | 5 +- test/is-text-file.test.js | 29 ++++++- 6 files changed, 154 insertions(+), 158 deletions(-) diff --git a/package-lock.json b/package-lock.json index 82b1620..8c034dd 100644 --- a/package-lock.json +++ b/package-lock.json @@ -28,7 +28,7 @@ "express-slow-down": "^2.0.1", "got": "^11.8.6", "inquirer": "^8.2.6", - "istextorbinary": "^9.5.0", + "isbinaryfile": "^6.0.0", "marked": "^5.1.2", "mime-types": "^2.1.35", "mongoose": "^7.6.10", @@ -8150,20 +8150,6 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/binaryextensions": { - "version": "6.11.0", - "resolved": "https://registry.npmjs.org/binaryextensions/-/binaryextensions-6.11.0.tgz", - "integrity": "sha512-sXnYK/Ij80TO3lcqZVV2YgfKN5QjUWIRk/XSm2J/4bd/lPko3lvk0O4ZppH6m+6hB2/GTu+ptNwVFe1xh+QLQw==", - "dependencies": { - "editions": "^6.21.0" - }, - "engines": { - "node": ">=4" - }, - "funding": { - "url": "https://bevry.me/fund" - } - }, "node_modules/bl": { "version": "4.1.0", "resolved": "https://registry.npmjs.org/bl/-/bl-4.1.0.tgz", @@ -9292,20 +9278,6 @@ "node": ">=0.10.0" } }, - "node_modules/editions": { - "version": "6.21.0", - "resolved": "https://registry.npmjs.org/editions/-/editions-6.21.0.tgz", - "integrity": "sha512-ofkXJtn7z0urokN62DI3SBo/5xAtF0rR7tn+S/bSYV79Ka8pTajIIl+fFQ1q88DQEImymmo97M4azY3WX/nUdg==", - "dependencies": { - "version-range": "^4.13.0" - }, - "engines": { - "node": ">=4" - }, - "funding": { - "url": "https://bevry.me/fund" - } - }, "node_modules/ee-first": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/ee-first/-/ee-first-1.1.1.tgz", @@ -11378,6 +11350,18 @@ "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", "integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==" }, + "node_modules/isbinaryfile": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/isbinaryfile/-/isbinaryfile-6.0.0.tgz", + "integrity": "sha512-2FN2B8MAqKv6d5TaKsLvMrwMcghxwHTpcKy0L5mhNbRqjNqo2++SpCqN6eG1lCC1GmTQgvrYJYXv2+Chvyevag==", + "license": "MIT", + "engines": { + "node": ">= 24.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/gjtorikian/" + } + }, "node_modules/isobject": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", @@ -11387,22 +11371,6 @@ "node": ">=0.10.0" } }, - "node_modules/istextorbinary": { - "version": "9.5.0", - "resolved": "https://registry.npmjs.org/istextorbinary/-/istextorbinary-9.5.0.tgz", - "integrity": "sha512-5mbUj3SiZXCuRf9fT3ibzbSSEWiy63gFfksmGfdOzujPjW3k+z8WvIBxcJHBoQNlaZaiyB25deviif2+osLmLw==", - "dependencies": { - "binaryextensions": "^6.11.0", - "editions": "^6.21.0", - "textextensions": "^6.11.0" - }, - "engines": { - "node": ">=4" - }, - "funding": { - "url": "https://bevry.me/fund" - } - }, "node_modules/jiti": { "version": "2.6.1", "resolved": "https://registry.npmjs.org/jiti/-/jiti-2.6.1.tgz", @@ -13910,20 +13878,6 @@ "streamx": "^2.12.5" } }, - "node_modules/textextensions": { - "version": "6.11.0", - "resolved": "https://registry.npmjs.org/textextensions/-/textextensions-6.11.0.tgz", - "integrity": "sha512-tXJwSr9355kFJI3lbCkPpUH5cP8/M0GGy2xLO34aZCjMXBaK3SoPnZwr/oWmo1FdCnELcs4npdCIOFtq9W3ruQ==", - "dependencies": { - "editions": "^6.21.0" - }, - "engines": { - "node": ">=4" - }, - "funding": { - "url": "https://bevry.me/fund" - } - }, "node_modules/through": { "version": "2.3.8", "resolved": "https://registry.npmjs.org/through/-/through-2.3.8.tgz", @@ -14392,17 +14346,6 @@ "node": ">= 0.8" } }, - "node_modules/version-range": { - "version": "4.14.0", - "resolved": "https://registry.npmjs.org/version-range/-/version-range-4.14.0.tgz", - "integrity": "sha512-gjb0ARm9qlcBAonU4zPwkl9ecKkas+tC2CGwFfptTCWWIVTWY1YUbT2zZKsOAF1jR/tNxxyLwwG0cb42XlYcTg==", - "engines": { - "node": ">=4" - }, - "funding": { - "url": "https://bevry.me/fund" - } - }, "node_modules/vinyl": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/vinyl/-/vinyl-3.0.0.tgz", @@ -20607,14 +20550,6 @@ "integrity": "sha512-Ceh+7ox5qe7LJuLHoY0feh3pHuUDHAcRUeyL2VYghZwfpkNIy/+8Ocg0a3UuSoYzavmylwuLWQOf3hl0jjMMIw==", "dev": true }, - "binaryextensions": { - "version": "6.11.0", - "resolved": "https://registry.npmjs.org/binaryextensions/-/binaryextensions-6.11.0.tgz", - "integrity": "sha512-sXnYK/Ij80TO3lcqZVV2YgfKN5QjUWIRk/XSm2J/4bd/lPko3lvk0O4ZppH6m+6hB2/GTu+ptNwVFe1xh+QLQw==", - "requires": { - "editions": "^6.21.0" - } - }, "bl": { "version": "4.1.0", "resolved": "https://registry.npmjs.org/bl/-/bl-4.1.0.tgz", @@ -21431,14 +21366,6 @@ } } }, - "editions": { - "version": "6.21.0", - "resolved": "https://registry.npmjs.org/editions/-/editions-6.21.0.tgz", - "integrity": "sha512-ofkXJtn7z0urokN62DI3SBo/5xAtF0rR7tn+S/bSYV79Ka8pTajIIl+fFQ1q88DQEImymmo97M4azY3WX/nUdg==", - "requires": { - "version-range": "^4.13.0" - } - }, "ee-first": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/ee-first/-/ee-first-1.1.1.tgz", @@ -22919,22 +22846,17 @@ "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", "integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==" }, + "isbinaryfile": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/isbinaryfile/-/isbinaryfile-6.0.0.tgz", + "integrity": "sha512-2FN2B8MAqKv6d5TaKsLvMrwMcghxwHTpcKy0L5mhNbRqjNqo2++SpCqN6eG1lCC1GmTQgvrYJYXv2+Chvyevag==" + }, "isobject": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", "integrity": "sha512-WhB9zCku7EGTj/HQQRz5aUQEUeoQZH2bWcltRErOpymJ4boYE6wL9Tbr23krRPSZ+C5zqNSrSw+Cc7sZZ4b7vg==", "dev": true }, - "istextorbinary": { - "version": "9.5.0", - "resolved": "https://registry.npmjs.org/istextorbinary/-/istextorbinary-9.5.0.tgz", - "integrity": "sha512-5mbUj3SiZXCuRf9fT3ibzbSSEWiy63gFfksmGfdOzujPjW3k+z8WvIBxcJHBoQNlaZaiyB25deviif2+osLmLw==", - "requires": { - "binaryextensions": "^6.11.0", - "editions": "^6.21.0", - "textextensions": "^6.11.0" - } - }, "jiti": { "version": "2.6.1", "resolved": "https://registry.npmjs.org/jiti/-/jiti-2.6.1.tgz", @@ -24719,14 +24641,6 @@ "streamx": "^2.12.5" } }, - "textextensions": { - "version": "6.11.0", - "resolved": "https://registry.npmjs.org/textextensions/-/textextensions-6.11.0.tgz", - "integrity": "sha512-tXJwSr9355kFJI3lbCkPpUH5cP8/M0GGy2xLO34aZCjMXBaK3SoPnZwr/oWmo1FdCnELcs4npdCIOFtq9W3ruQ==", - "requires": { - "editions": "^6.21.0" - } - }, "through": { "version": "2.3.8", "resolved": "https://registry.npmjs.org/through/-/through-2.3.8.tgz", @@ -25046,11 +24960,6 @@ "resolved": "https://registry.npmjs.org/vary/-/vary-1.1.2.tgz", "integrity": "sha512-BNGbWLfd0eUPabhkXUVm0j8uuvREyTh5ovRa/dyow/BqAbZJyC+5fU+IzQOzmAKzYqYRAISoRhdQr3eIZ/PXqg==" }, - "version-range": { - "version": "4.14.0", - "resolved": "https://registry.npmjs.org/version-range/-/version-range-4.14.0.tgz", - "integrity": "sha512-gjb0ARm9qlcBAonU4zPwkl9ecKkas+tC2CGwFfptTCWWIVTWY1YUbT2zZKsOAF1jR/tNxxyLwwG0cb42XlYcTg==" - }, "vinyl": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/vinyl/-/vinyl-3.0.0.tgz", diff --git a/package.json b/package.json index 1404e88..d3257a0 100644 --- a/package.json +++ b/package.json @@ -51,7 +51,7 @@ "express-slow-down": "^2.0.1", "got": "^11.8.6", "inquirer": "^8.2.6", - "istextorbinary": "^9.5.0", + "isbinaryfile": "^6.0.0", "marked": "^5.1.2", "mime-types": "^2.1.35", "mongoose": "^7.6.10", diff --git a/src/config.ts b/src/config.ts index 3889507..d331896 100644 --- a/src/config.ts +++ b/src/config.ts @@ -69,6 +69,8 @@ const config: Config = { "out", "sol", "in", + "jsonl", + "ndjson", ], STORAGE: "filesystem", STREAMER_ENTRYPOINT: null, diff --git a/src/core/anonymize-utils.ts b/src/core/anonymize-utils.ts index ff1bdfd..7a17e01 100644 --- a/src/core/anonymize-utils.ts +++ b/src/core/anonymize-utils.ts @@ -1,7 +1,8 @@ import { basename } from "path"; import { Transform, Readable } from "stream"; import { StringDecoder } from "string_decoder"; -import { isText } from "istextorbinary"; +import { isBinaryFileSync } from "isbinaryfile"; +import { lookup as lookupMime } from "mime-types"; import config from "../config"; import { @@ -22,47 +23,93 @@ export function streamToString(stream: Readable): Promise { }); } -// Common conventional plaintext filenames that have no extension. The -// istextorbinary package returns null (unknown) for these, which our -// `=== true` check then treats as binary — so terms in LICENSE, COPYING, -// etc. silently went through unchanged (#493). -const KNOWN_TEXT_FILENAMES = new Set( - [ - "license", - "licence", - "copying", - "copyright", - "authors", - "contributors", - "readme", - "changelog", - "changes", - "notice", - "install", - "todo", - "version", - "manifest", - ] -); +// Common conventional plaintext filenames that have no extension and no MIME +// match. Without this whitelist a bare LICENSE / COPYING / etc. would fall +// through to content sniffing, which is fine for non-empty files but breaks +// on zero-byte ones — so we short-circuit them here (#493). +const KNOWN_TEXT_FILENAMES = new Set([ + "license", + "licence", + "copying", + "copyright", + "authors", + "contributors", + "readme", + "changelog", + "changes", + "notice", + "install", + "todo", + "version", + "manifest", +]); -export function isTextFile(filePath: string, content?: Buffer) { - const filename = basename(filePath); - const extensions = filename.split(".").reverse(); - const extension = extensions[0].toLowerCase(); - if (config.additionalExtensions.includes(extension)) { - return true; - } - if (KNOWN_TEXT_FILENAMES.has(filename.toLowerCase())) { - return true; - } - if (isText(filename)) { - return true; - } - return isText(filename, content); +// Application/* MIME types that carry text payloads. text/* is always text, +// application/* needs an allowlist (most are binary: zip, pdf, octet-stream). +const TEXTUAL_APPLICATION_MIMES = new Set([ + "application/json", + "application/ld+json", + "application/xml", + "application/javascript", + "application/ecmascript", + "application/typescript", + "application/toml", + "application/sql", + "application/x-sql", + "application/x-sh", + "application/x-csh", + "application/x-yaml", + "application/yaml", + "application/x-httpd-php", + "application/graphql", + "application/x-tex", + "application/x-latex", + "application/x-perl", + "application/x-ruby", + "application/x-python", +]); + +function isTextualMime(mime: string): boolean { + if (mime.startsWith("text/")) return true; + if (TEXTUAL_APPLICATION_MIMES.has(mime)) return true; + // application/*+json, application/*+xml, application/*+yaml + return /\+(json|xml|yaml)$/.test(mime); +} + +// Name-only classification: returns true (known text), false (known binary), +// or null when the name alone is inconclusive. The streaming transformer +// resolves null by sniffing the first chunk with isbinaryfile. +function classifyByName(filePath: string): boolean | null { + const name = basename(filePath); + const extension = name.split(".").reverse()[0].toLowerCase(); + if (config.additionalExtensions.includes(extension)) return true; + if (KNOWN_TEXT_FILENAMES.has(name.toLowerCase())) return true; + const mime = lookupMime(name); + if (mime === false) return null; + // mime-types treats `.ts` as video/mp2t; route.ts already special-cases it. + // Prefer text for the ambiguous extension since it matches our typical use. + if (extension === "ts") return true; + return isTextualMime(mime); +} + +export function isTextFile(filePath: string, content?: Buffer): boolean { + const byName = classifyByName(filePath); + if (byName === true) return true; + if (byName === false) return false; + // Name was inconclusive — sniff the buffer if we have one. isbinaryfile + // checks for null bytes / non-printable ratio in the first 512 bytes + // and returns a decisive boolean. + if (content && content.length > 0) return !isBinaryFileSync(content); + return false; } export class AnonymizeTransformer extends Transform { - public isText: boolean; + // Set in the constructor for known extensions; left null until the first + // chunk arrives for unknown extensions, where it's resolved by sniffing. + // Consumers of the "transform" event always see a resolved boolean — we + // sniff before emitting. + public isText!: boolean; + private nameVerdict: boolean | null; anonimizer: ContentAnonimizer; private decoder = new StringDecoder("utf8"); // Trailing decoded text held back between chunks so that terms, URLs, or @@ -84,11 +131,13 @@ export class AnonymizeTransformer extends Transform { } & ConstructorParameters[0] ) { super(); - // isTextFile may return null for unknown extensions; treat unknown as - // binary. Sniffing from chunk content is unsafe — split archives, - // compressed blobs, etc. can have an ASCII-looking first 64 KB and get - // misclassified as text, which then UTF-8-round-trips and corrupts them. - this.isText = isTextFile(this.opt.filePath) === true; + // Tri-state: name-based check returns true (known text), false (known + // binary), or null (name inconclusive). For null we defer to a content + // sniff on the first chunk in _transform — known binary extensions + // (archives, compressed blobs, images) are resolved here and never + // reach the sniff path (#493). + this.nameVerdict = classifyByName(this.opt.filePath); + if (this.nameVerdict !== null) this.isText = this.nameVerdict; this.anonimizer = new ContentAnonimizer(this.opt); } @@ -105,6 +154,12 @@ export class AnonymizeTransformer extends Transform { } _transform(chunk: Buffer, encoding: string, callback: () => void) { + if (this.nameVerdict === null) { + // Name didn't decide. isbinaryfile inspects the first 512 bytes for + // null bytes and non-printable ratio and returns a decisive boolean. + this.isText = chunk.length === 0 ? true : !isBinaryFileSync(chunk); + this.nameVerdict = this.isText; + } if (!this.isText) { this.emit("transform", { isText: this.isText, @@ -166,6 +221,12 @@ export class AnonymizeTransformer extends Transform { } _flush(callback: () => void) { + // Empty file with an unknown extension: no chunk arrived to trigger + // sniffing. Treat as text — there's nothing to corrupt. + if (this.nameVerdict === null) { + this.isText = true; + this.nameVerdict = true; + } if (this.isText) { this.pending += this.decoder.end(); if (this.pending) { diff --git a/test/anonymize-utils.test.js b/test/anonymize-utils.test.js index 550e4e1..469cd69 100644 --- a/test/anonymize-utils.test.js +++ b/test/anonymize-utils.test.js @@ -499,8 +499,9 @@ describe("ContentAnonimizer", function () { // --------------------------------------------------------------------------- // Mirror of isTextFile that relies on the file extension only — the real -// impl additionally calls istextorbinary, but for these tests checking the -// suffix is enough to demonstrate the constructor-vs-post-assignment bug. +// impl additionally consults mime-types and isbinaryfile, but for these +// tests checking the suffix is enough to demonstrate the +// constructor-vs-post-assignment bug. function _isTextFileFromPath(filePath) { if (!filePath) return false; const ext = String(filePath).split(".").pop().toLowerCase(); diff --git a/test/is-text-file.test.js b/test/is-text-file.test.js index 8c94dac..8a96990 100644 --- a/test/is-text-file.test.js +++ b/test/is-text-file.test.js @@ -3,9 +3,9 @@ require("ts-node/register/transpile-only"); const { isTextFile } = require("../src/core/anonymize-utils"); describe("isTextFile", function () { - // #493 — istextorbinary returns null for files with no extension, so a - // bare LICENSE / COPYING / etc. used to be classified as binary and - // never anonymized. Whitelist the conventional plaintext filenames. + // #493 — bare LICENSE / COPYING / etc. have no extension and no MIME, so + // we whitelist the conventional plaintext filenames to short-circuit them + // before falling through to content sniffing (which fails on empty files). it("recognizes conventional no-extension plaintext filenames", function () { expect(isTextFile("LICENSE")).to.equal(true); expect(isTextFile("license")).to.equal(true); @@ -27,4 +27,27 @@ describe("isTextFile", function () { expect(isTextFile("foo.png")).to.equal(false); expect(isTextFile("foo.zip")).to.equal(false); }); + + it("recognizes jsonl-family dataset extensions", function () { + expect(isTextFile("data.jsonl")).to.equal(true); + expect(isTextFile("data.ndjson")).to.equal(true); + }); + + it("falls back to content sniffing for unknown extensions", function () { + expect( + isTextFile("foo.unknown", Buffer.from("hello world\nline two\n", "utf8")) + ).to.equal(true); + expect( + isTextFile("foo.unknown", Buffer.from([0x00, 0x01, 0x02, 0x03, 0x00, 0x05])) + ).to.equal(false); + const random = Buffer.alloc(512); + for (let i = 0; i < random.length; i++) random[i] = (i * 31 + 7) % 32; + expect(isTextFile("foo.unknown", random)).to.equal(false); + }); + + it("does not let content sniffing override a known binary extension", function () { + expect( + isTextFile("foo.png", Buffer.from("plain ascii pretending to be a png")) + ).to.equal(false); + }); });