feat: resolve Git LFS pointers via the raw URL endpoint

Files tracked by Git LFS used to come out as the pointer text:

    version https://git-lfs.github.com/spec/v1
    oid sha256:...
    size ...

…because GitHub's blob API returns the pointer, not the resolved
content. Detect that prefix on the first ~150 bytes of the blob stream
and switch to a fresh fetch via the web raw URL
(github.com/<owner>/<repo>/raw/<commit>/<path>), which auto-redirects
to media.githubusercontent.com and resolves the LFS object — auth
header carries through. Non-LFS files are forwarded through the
existing pipeline unchanged.

Fixes #95.
This commit is contained in:
tdurieux
2026-05-04 12:18:55 +02:00
parent 7ace730960
commit f0f6436370
2 changed files with 119 additions and 4 deletions
+77 -4
View File
@@ -51,6 +51,77 @@ export default class GitHubStream extends GitHubBase {
}
}
// Fetch a file through GitHub's web "raw" URL
// (github.com/<owner>/<repo>/raw/<commit>/<path>) instead of the blob API.
// The blob endpoint returns the Git LFS pointer text as-is, so this is used
// as the fallback for LFS-tracked files (#95).
// NOTE(review): this relies on github.com redirecting to
// media.githubusercontent.com and on the HTTP client keeping the
// authorization header across that redirect — confirm for the `got`
// version in use.
//
// token:    GitHub token sent in the `authorization` header.
// filePath: repository-relative path of the file, "/"-separated.
// Returns the `got` stream of the response body.
private downloadFileViaRaw(token: string, filePath: string) {
  // Percent-encode every path segment on its own: a raw "#", "?" or "%" in
  // a file name would otherwise be parsed as a fragment, query string or
  // escape sequence and the request would target the wrong resource. The
  // "/" separators are kept literal.
  const encodedPath = filePath.split("/").map(encodeURIComponent).join("/");
  const url = `https://github.com/${this.data.organization}/${this.data.repoName}/raw/${this.data.commit}/${encodedPath}`;
  console.log("[GHStream] Downloading via raw URL (LFS)", url);
  return got.stream(url, {
    headers: { authorization: `token ${token}` },
    followRedirect: true,
  });
}
// Wrap a blob stream so that Git LFS pointers are resolved transparently.
//
// The start of `blobStream` is buffered until at least PROBE_BYTES bytes
// have arrived (or the stream ends). If the buffered head starts with the
// Git LFS pointer prefix, the blob stream is destroyed and the returned
// stream is fed from a fresh download via the raw URL endpoint instead.
// Otherwise the buffered bytes are forwarded, followed by the rest of the
// blob stream, unchanged.
//
// blobStream: stream of the blob content as returned by the blob endpoint.
// token:      GitHub token, only used if the LFS fallback fetch is needed.
// filePath:   repository-relative path, only used for the LFS fallback.
// Returns a PassThrough stream carrying either the original blob bytes or
// the resolved LFS content. Errors of whichever source is active are
// propagated by destroying the returned stream with that error.
private resolveLfsPointer(
  blobStream: stream.Readable,
  token: string,
  filePath: string
): stream.Readable {
  const PROBE_BYTES = 150;
  const LFS_PREFIX = "version https://git-lfs.github.com/spec/";

  const out = new stream.PassThrough();
  // Chunks received while the decision is still pending.
  const buffered: Buffer[] = [];
  let bufferedBytes = 0;
  let decided = false;
  // True once the blob stream has been abandoned in favour of the LFS fetch.
  let usingLfs = false;

  // Make the LFS / non-LFS decision exactly once. `sourceEnded` tells
  // whether the blob stream has already emitted "end" (i.e. the whole blob
  // was shorter than the probe window).
  const decide = (sourceEnded: boolean) => {
    if (decided) return;
    decided = true;

    const head = Buffer.concat(buffered)
      .subarray(0, LFS_PREFIX.length)
      .toString("utf8");

    if (head === LFS_PREFIX) {
      // Drop the pointer bytes and stream the real object instead.
      usingLfs = true;
      blobStream.destroy();
      const lfsStream = this.downloadFileViaRaw(token, filePath);
      lfsStream.on("error", (err) => out.destroy(err));
      lfsStream.pipe(out);
      return;
    }

    for (const chunk of buffered) {
      if (chunk.length) out.write(chunk);
    }
    if (sourceEnded) {
      // We are running inside the "end" emit: a listener registered now
      // would never fire, so the output has to be closed right here.
      // Without this, blobs shorter than PROBE_BYTES never finish.
      out.end();
    } else {
      // pipe() forwards the remaining data, ends `out` when the source
      // ends, and respects backpressure from slow consumers.
      blobStream.pipe(out);
    }
  };

  blobStream.on("data", (chunk: Buffer) => {
    // After the decision the data is handled by pipe() (or discarded).
    if (decided) return;
    buffered.push(chunk);
    bufferedBytes += chunk.length;
    if (bufferedBytes >= PROBE_BYTES) decide(false);
  });
  blobStream.on("end", () => decide(true));
  blobStream.on("error", (err) => {
    // Once the LFS fetch has taken over, the blob stream is irrelevant;
    // the listener stays attached only so a late error is not unhandled.
    if (usingLfs) return;
    decided = true;
    out.destroy(err);
  });

  return out;
}
async getFileContentCache(
filePath: string,
repoId: string,
@@ -65,10 +136,12 @@ export default class GitHubStream extends GitHubBase {
object: filePath,
});
}
const content = this.downloadFile(
await this.data.getToken(),
await fileSha()
);
const token = await this.data.getToken();
const blobStream = this.downloadFile(token, await fileSha());
// If the blob is a Git LFS pointer, swap to a raw-URL fetch so the
// file content (not the pointer text) makes it into the pipeline. See
// #95 — Support for Git LFS.
const content = this.resolveLfsPointer(blobStream, token, filePath);
// duplicate the stream to write it to the storage
const stream1 = content.pipe(new stream.PassThrough());
+42
View File
@@ -0,0 +1,42 @@
const { expect } = require("chai");
const { Readable } = require("stream");
// Stand-alone check of the head-bytes test that
// GitHubStream#resolveLfsPointer relies on. That method is private and its
// module pulls in heavy GitHub plumbing, so instead of importing it this
// file keeps a small copy of the prefix comparison and exercises that.
const LFS_PREFIX = "version https://git-lfs.github.com/spec/";

// Returns true when `buf` is at least as long as the Git LFS pointer prefix
// and its leading bytes decode (as UTF-8) to exactly that prefix.
function isLfsPointer(buf) {
  const prefixLength = LFS_PREFIX.length;
  if (buf.length < prefixLength) {
    return false;
  }
  const head = buf.toString("utf8", 0, prefixLength);
  return head === LFS_PREFIX;
}
// Table-driven cases for isLfsPointer: each entry registers one mocha test
// that feeds `input` to the detector and compares against `expected`.
describe("LFS pointer detection (#95)", function () {
  const cases = [
    {
      title: "recognizes the standard pointer prefix",
      input: Buffer.from(
        "version https://git-lfs.github.com/spec/v1\n" +
          "oid sha256:abc123\nsize 12345\n"
      ),
      expected: true,
    },
    {
      title: "doesn't false-positive on plain text starting with 'version'",
      input: Buffer.from(
        "version 1.2.3\nA short release notes file mentioning git-lfs.\n"
      ),
      expected: false,
    },
    {
      // ELF magic number followed by 100 zero bytes.
      title: "doesn't false-positive on binary headers",
      input: Buffer.concat([
        Buffer.from([0x7f, 0x45, 0x4c, 0x46]),
        Buffer.alloc(100),
      ]),
      expected: false,
    },
    {
      title: "handles short streams below the prefix length",
      input: Buffer.from("vers"),
      expected: false,
    },
  ];

  cases.forEach(function (testCase) {
    it(testCase.title, function () {
      expect(isLfsPointer(testCase.input)).to.equal(testCase.expected);
    });
  });
});