improve binary file detection: content sniffing + jsonl support

Files like .jsonl that mime-types doesn't know fell through to application/octet-stream and rendered as "Unsupported binary file" in the viewer. Replace istextorbinary with isbinaryfile for content-based detection, and use mime-types for name-based classification with a textual application/* allowlist. The streaming transformer now defers classification when the name is inconclusive and sniffs the first chunk before emitting "transform", so route.ts and AnonymizedFile.ts get a content-aware Content-Type. Whitelists .jsonl and .ndjson to short-circuit dataset files. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-16 06:49:09 +02:00 · 2026-05-06 07:52:48 +03:00
parent 18ce39e019
commit 79f555769d
6 changed files with 154 additions and 158 deletions
@@ -69,6 +69,8 @@ const config: Config = {
    "out",
    "sol",
    "in",
+    "jsonl",
+    "ndjson",
  ],
  STORAGE: "filesystem",
  STREAMER_ENTRYPOINT: null,
@@ -1,7 +1,8 @@
 import { basename } from "path";
 import { Transform, Readable } from "stream";
 import { StringDecoder } from "string_decoder";
-import { isText } from "istextorbinary";
+import { isBinaryFileSync } from "isbinaryfile";
+import { lookup as lookupMime } from "mime-types";

 import config from "../config";
 import {
@@ -22,47 +23,93 @@ export function streamToString(stream: Readable): Promise<string> {
  });
 }

-// Common conventional plaintext filenames that have no extension. The
-// istextorbinary package returns null (unknown) for these, which our
-// `=== true` check then treats as binary — so terms in LICENSE, COPYING,
-// etc. silently went through unchanged (#493).
-const KNOWN_TEXT_FILENAMES = new Set(
-  [
-    "license",
-    "licence",
-    "copying",
-    "copyright",
-    "authors",
-    "contributors",
-    "readme",
-    "changelog",
-    "changes",
-    "notice",
-    "install",
-    "todo",
-    "version",
-    "manifest",
-  ]
-);
+// Common conventional plaintext filenames that have no extension and no MIME
+// match. Without this whitelist a bare LICENSE / COPYING / etc. would fall
+// through to content sniffing, which is fine for non-empty files but breaks
+// on zero-byte ones — so we short-circuit them here (#493).
+const KNOWN_TEXT_FILENAMES = new Set([
+  "license",
+  "licence",
+  "copying",
+  "copyright",
+  "authors",
+  "contributors",
+  "readme",
+  "changelog",
+  "changes",
+  "notice",
+  "install",
+  "todo",
+  "version",
+  "manifest",
+]);

-export function isTextFile(filePath: string, content?: Buffer) {
-  const filename = basename(filePath);
-  const extensions = filename.split(".").reverse();
-  const extension = extensions[0].toLowerCase();
-  if (config.additionalExtensions.includes(extension)) {
-    return true;
-  }
-  if (KNOWN_TEXT_FILENAMES.has(filename.toLowerCase())) {
-    return true;
-  }
-  if (isText(filename)) {
-    return true;
-  }
-  return isText(filename, content);
+// Application/* MIME types that carry text payloads. text/* is always text,
+// application/* needs an allowlist (most are binary: zip, pdf, octet-stream).
+const TEXTUAL_APPLICATION_MIMES = new Set([
+  "application/json",
+  "application/ld+json",
+  "application/xml",
+  "application/javascript",
+  "application/ecmascript",
+  "application/typescript",
+  "application/toml",
+  "application/sql",
+  "application/x-sql",
+  "application/x-sh",
+  "application/x-csh",
+  "application/x-yaml",
+  "application/yaml",
+  "application/x-httpd-php",
+  "application/graphql",
+  "application/x-tex",
+  "application/x-latex",
+  "application/x-perl",
+  "application/x-ruby",
+  "application/x-python",
+]);
+
+function isTextualMime(mime: string): boolean {
+  if (mime.startsWith("text/")) return true;
+  if (TEXTUAL_APPLICATION_MIMES.has(mime)) return true;
+  // application/*+json, application/*+xml, application/*+yaml
+  return /\+(json|xml|yaml)$/.test(mime);
+}
+
+// Name-only classification: returns true (known text), false (known binary),
+// or null when the name alone is inconclusive. The streaming transformer
+// resolves null by sniffing the first chunk with isbinaryfile.
+function classifyByName(filePath: string): boolean | null {
+  const name = basename(filePath);
+  const extension = name.split(".").reverse()[0].toLowerCase();
+  if (config.additionalExtensions.includes(extension)) return true;
+  if (KNOWN_TEXT_FILENAMES.has(name.toLowerCase())) return true;
+  const mime = lookupMime(name);
+  if (mime === false) return null;
+  // mime-types treats `.ts` as video/mp2t; route.ts already special-cases it.
+  // Prefer text for the ambiguous extension since it matches our typical use.
+  if (extension === "ts") return true;
+  return isTextualMime(mime);
+}
+
+export function isTextFile(filePath: string, content?: Buffer): boolean {
+  const byName = classifyByName(filePath);
+  if (byName === true) return true;
+  if (byName === false) return false;
+  // Name was inconclusive — sniff the buffer if we have one. isbinaryfile
+  // checks for null bytes / non-printable ratio in the first 512 bytes
+  // and returns a decisive boolean.
+  if (content && content.length > 0) return !isBinaryFileSync(content);
+  return false;
 }

 export class AnonymizeTransformer extends Transform {
-  public isText: boolean;
+  // Set in the constructor for known extensions; left null until the first
+  // chunk arrives for unknown extensions, where it's resolved by sniffing.
+  // Consumers of the "transform" event always see a resolved boolean — we
+  // sniff before emitting.
+  public isText!: boolean;
+  private nameVerdict: boolean | null;
  anonimizer: ContentAnonimizer;
  private decoder = new StringDecoder("utf8");
  // Trailing decoded text held back between chunks so that terms, URLs, or
@@ -84,11 +131,13 @@ export class AnonymizeTransformer extends Transform {
    } & ConstructorParameters<typeof ContentAnonimizer>[0]
  ) {
    super();
-    // isTextFile may return null for unknown extensions; treat unknown as
-    // binary. Sniffing from chunk content is unsafe — split archives,
-    // compressed blobs, etc. can have an ASCII-looking first 64 KB and get
-    // misclassified as text, which then UTF-8-round-trips and corrupts them.
-    this.isText = isTextFile(this.opt.filePath) === true;
+    // Tri-state: name-based check returns true (known text), false (known
+    // binary), or null (name inconclusive). For null we defer to a content
+    // sniff on the first chunk in _transform — known binary extensions
+    // (archives, compressed blobs, images) are resolved here and never
+    // reach the sniff path (#493).
+    this.nameVerdict = classifyByName(this.opt.filePath);
+    if (this.nameVerdict !== null) this.isText = this.nameVerdict;
    this.anonimizer = new ContentAnonimizer(this.opt);
  }

@@ -105,6 +154,12 @@ export class AnonymizeTransformer extends Transform {
  }

  _transform(chunk: Buffer, encoding: string, callback: () => void) {
+    if (this.nameVerdict === null) {
+      // Name didn't decide. isbinaryfile inspects the first 512 bytes for
+      // null bytes and non-printable ratio and returns a decisive boolean.
+      this.isText = chunk.length === 0 ? true : !isBinaryFileSync(chunk);
+      this.nameVerdict = this.isText;
+    }
    if (!this.isText) {
      this.emit("transform", {
        isText: this.isText,
@@ -166,6 +221,12 @@ export class AnonymizeTransformer extends Transform {
  }

  _flush(callback: () => void) {
+    // Empty file with an unknown extension: no chunk arrived to trigger
+    // sniffing. Treat as text — there's nothing to corrupt.
+    if (this.nameVerdict === null) {
+      this.isText = true;
+      this.nameVerdict = true;
+    }
    if (this.isText) {
      this.pending += this.decoder.end();
      if (this.pending) {