improve binary file detection: content sniffing + jsonl support

Files like .jsonl that mime-types doesn't know fell through to
application/octet-stream and rendered as "Unsupported binary file" in
the viewer. Replace istextorbinary with isbinaryfile for content-based
detection, and use mime-types for name-based classification with a
textual application/* allowlist.

The streaming transformer now defers classification when the name is
inconclusive and sniffs the first chunk before emitting "transform",
so route.ts and AnonymizedFile.ts get a content-aware Content-Type.
Whitelists .jsonl and .ndjson to short-circuit dataset files.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
tdurieux
2026-05-06 07:52:48 +03:00
parent 18ce39e019
commit 79f555769d
6 changed files with 154 additions and 158 deletions
+2
View File
@@ -69,6 +69,8 @@ const config: Config = {
"out",
"sol",
"in",
"jsonl",
"ndjson",
],
STORAGE: "filesystem",
STREAMER_ENTRYPOINT: null,
+104 -43
View File
@@ -1,7 +1,8 @@
import { basename } from "path";
import { Transform, Readable } from "stream";
import { StringDecoder } from "string_decoder";
import { isText } from "istextorbinary";
import { isBinaryFileSync } from "isbinaryfile";
import { lookup as lookupMime } from "mime-types";
import config from "../config";
import {
@@ -22,47 +23,93 @@ export function streamToString(stream: Readable): Promise<string> {
});
}
// Common conventional plaintext filenames that have no extension. The
// istextorbinary package returns null (unknown) for these, which our
// `=== true` check then treats as binary — so terms in LICENSE, COPYING,
// etc. silently went through unchanged (#493).
const KNOWN_TEXT_FILENAMES = new Set(
[
"license",
"licence",
"copying",
"copyright",
"authors",
"contributors",
"readme",
"changelog",
"changes",
"notice",
"install",
"todo",
"version",
"manifest",
]
);
// Common conventional plaintext filenames that have no extension and no MIME
// match. Without this whitelist a bare LICENSE / COPYING / etc. would fall
// through to content sniffing, which is fine for non-empty files but breaks
// on zero-byte ones — so we short-circuit them here (#493).
const KNOWN_TEXT_FILENAMES = new Set([
"license",
"licence",
"copying",
"copyright",
"authors",
"contributors",
"readme",
"changelog",
"changes",
"notice",
"install",
"todo",
"version",
"manifest",
]);
export function isTextFile(filePath: string, content?: Buffer) {
const filename = basename(filePath);
const extensions = filename.split(".").reverse();
const extension = extensions[0].toLowerCase();
if (config.additionalExtensions.includes(extension)) {
return true;
}
if (KNOWN_TEXT_FILENAMES.has(filename.toLowerCase())) {
return true;
}
if (isText(filename)) {
return true;
}
return isText(filename, content);
// Application/* MIME types that carry text payloads. text/* is always text,
// application/* needs an allowlist (most are binary: zip, pdf, octet-stream).
const TEXTUAL_APPLICATION_MIMES = new Set([
"application/json",
"application/ld+json",
"application/xml",
"application/javascript",
"application/ecmascript",
"application/typescript",
"application/toml",
"application/sql",
"application/x-sql",
"application/x-sh",
"application/x-csh",
"application/x-yaml",
"application/yaml",
"application/x-httpd-php",
"application/graphql",
"application/x-tex",
"application/x-latex",
"application/x-perl",
"application/x-ruby",
"application/x-python",
]);
function isTextualMime(mime: string): boolean {
if (mime.startsWith("text/")) return true;
if (TEXTUAL_APPLICATION_MIMES.has(mime)) return true;
// application/*+json, application/*+xml, application/*+yaml
return /\+(json|xml|yaml)$/.test(mime);
}
// Name-only classification: returns true (known text), false (known binary),
// or null when the name alone is inconclusive. The streaming transformer
// resolves null by sniffing the first chunk with isbinaryfile.
function classifyByName(filePath: string): boolean | null {
const name = basename(filePath);
const extension = name.split(".").reverse()[0].toLowerCase();
if (config.additionalExtensions.includes(extension)) return true;
if (KNOWN_TEXT_FILENAMES.has(name.toLowerCase())) return true;
const mime = lookupMime(name);
if (mime === false) return null;
// mime-types treats `.ts` as video/mp2t; route.ts already special-cases it.
// Prefer text for the ambiguous extension since it matches our typical use.
if (extension === "ts") return true;
return isTextualMime(mime);
}
export function isTextFile(filePath: string, content?: Buffer): boolean {
const byName = classifyByName(filePath);
if (byName === true) return true;
if (byName === false) return false;
// Name was inconclusive — sniff the buffer if we have one. isbinaryfile
// checks for null bytes / non-printable ratio in the first 512 bytes
// and returns a decisive boolean.
if (content && content.length > 0) return !isBinaryFileSync(content);
return false;
}
export class AnonymizeTransformer extends Transform {
public isText: boolean;
// Set in the constructor for known extensions; left null until the first
// chunk arrives for unknown extensions, where it's resolved by sniffing.
// Consumers of the "transform" event always see a resolved boolean — we
// sniff before emitting.
public isText!: boolean;
private nameVerdict: boolean | null;
anonimizer: ContentAnonimizer;
private decoder = new StringDecoder("utf8");
// Trailing decoded text held back between chunks so that terms, URLs, or
@@ -84,11 +131,13 @@ export class AnonymizeTransformer extends Transform {
} & ConstructorParameters<typeof ContentAnonimizer>[0]
) {
super();
// isTextFile may return null for unknown extensions; treat unknown as
// binary. Sniffing from chunk content is unsafe — split archives,
// compressed blobs, etc. can have an ASCII-looking first 64 KB and get
// misclassified as text, which then UTF-8-round-trips and corrupts them.
this.isText = isTextFile(this.opt.filePath) === true;
// Tri-state: name-based check returns true (known text), false (known
// binary), or null (name inconclusive). For null we defer to a content
// sniff on the first chunk in _transform — known binary extensions
// (archives, compressed blobs, images) are resolved here and never
// reach the sniff path (#493).
this.nameVerdict = classifyByName(this.opt.filePath);
if (this.nameVerdict !== null) this.isText = this.nameVerdict;
this.anonimizer = new ContentAnonimizer(this.opt);
}
@@ -105,6 +154,12 @@ export class AnonymizeTransformer extends Transform {
}
_transform(chunk: Buffer, encoding: string, callback: () => void) {
if (this.nameVerdict === null) {
// Name didn't decide. isbinaryfile inspects the first 512 bytes for
// null bytes and non-printable ratio and returns a decisive boolean.
this.isText = chunk.length === 0 ? true : !isBinaryFileSync(chunk);
this.nameVerdict = this.isText;
}
if (!this.isText) {
this.emit("transform", {
isText: this.isText,
@@ -166,6 +221,12 @@ export class AnonymizeTransformer extends Transform {
}
_flush(callback: () => void) {
// Empty file with an unknown extension: no chunk arrived to trigger
// sniffing. Treat as text — there's nothing to corrupt.
if (this.nameVerdict === null) {
this.isText = true;
this.nameVerdict = true;
}
if (this.isText) {
this.pending += this.decoder.end();
if (this.pending) {