mirror of
https://github.com/tdurieux/anonymous_github.git
synced 2026-05-15 14:38:03 +02:00
79f555769d
Files like .jsonl that mime-types doesn't know fell through to application/octet-stream and rendered as "Unsupported binary file" in the viewer. Replace istextorbinary with isbinaryfile for content-based detection, and use mime-types for name-based classification with a textual application/* allowlist. The streaming transformer now defers classification when the name is inconclusive and sniffs the first chunk before emitting "transform", so route.ts and AnonymizedFile.ts get a content-aware Content-Type. Whitelists .jsonl and .ndjson to short-circuit dataset files. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
54 lines
2.1 KiB
JavaScript
54 lines
2.1 KiB
JavaScript
const { expect } = require("chai");
|
|
require("ts-node/register/transpile-only");
|
|
const { isTextFile } = require("../src/core/anonymize-utils");
|
|
|
|
describe("isTextFile", function () {
  /**
   * Asserts that `isTextFile(name)` equals `expected` for every given name.
   *
   * Each assertion is tagged with the filename so that a failure reports which
   * case broke instead of a bare "expected false to equal true".
   *
   * @param {string[]} names - Filenames/paths passed to `isTextFile` without content.
   * @param {boolean} expected - The classification every name must produce.
   */
  function expectClassification(names, expected) {
    for (const name of names) {
      expect(isTextFile(name), name).to.equal(expected);
    }
  }

  // #493 — bare LICENSE / COPYING / etc. have no extension and no MIME, so
  // we whitelist the conventional plaintext filenames to short-circuit them
  // before falling through to content sniffing (which fails on empty files).
  it("recognizes conventional no-extension plaintext filenames", function () {
    expectClassification(
      [
        "LICENSE",
        "license", // lower-case spelling must be accepted as well
        "COPYING",
        "AUTHORS",
        "README",
        "CHANGELOG",
        "NOTICE",
        "path/to/LICENSE", // a leading directory must not defeat the match
      ],
      true
    );
  });

  it("still recognizes well-known text extensions", function () {
    expectClassification(["foo.txt", "foo.md", "foo.js"], true);
  });

  it("does not classify binary files as text", function () {
    expectClassification(["foo.png", "foo.zip"], false);
  });

  it("recognizes jsonl-family dataset extensions", function () {
    expectClassification(["data.jsonl", "data.ndjson"], true);
  });

  it("falls back to content sniffing for unknown extensions", function () {
    // The ".unknown" extension is inconclusive, so the buffer passed as the
    // second argument is what decides the outcome in each case below.
    const plainText = Buffer.from("hello world\nline two\n", "utf8");
    expect(isTextFile("foo.unknown", plainText), "plain utf8 text").to.equal(
      true
    );

    const shortBinary = Buffer.from([0x00, 0x01, 0x02, 0x03, 0x00, 0x05]);
    expect(
      isTextFile("foo.unknown", shortBinary),
      "short buffer containing NUL bytes"
    ).to.equal(false);

    // Deterministic (not random) 512-byte buffer: every byte is in 0x00–0x1F,
    // i.e. only control characters, including NUL.
    const controlBytes = Buffer.from(
      Array.from({ length: 512 }, (_, i) => (i * 31 + 7) % 32)
    );
    expect(
      isTextFile("foo.unknown", controlBytes),
      "512 control-character bytes"
    ).to.equal(false);
  });

  it("does not let content sniffing override a known binary extension", function () {
    // Text-looking content must not flip the verdict for a ".png" name.
    const asciiContent = Buffer.from("plain ascii pretending to be a png");
    expect(isTextFile("foo.png", asciiContent), "foo.png").to.equal(false);
  });
});
|