From f0f643637061db8418cbf0c50595b1ec3ab47f75 Mon Sep 17 00:00:00 2001
From: tdurieux
Date: Mon, 4 May 2026 12:18:55 +0200
Subject: [PATCH] feat: resolve Git LFS pointers via the raw URL endpoint
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Files tracked by Git LFS used to come out as the pointer text:

    version https://git-lfs.github.com/spec/v1
    oid sha256:...
    size ...

…because GitHub's blob API returns the pointer, not the resolved content.
Detect that prefix on the first ~150 bytes of the blob stream and switch
to a fresh fetch via the web raw URL
(github.com/{owner}/{repo}/raw/{commit}/{path}), which auto-redirects to
media.githubusercontent.com and resolves the LFS object — auth header
carries through.

Non-LFS files are forwarded through the existing pipeline unchanged.

Fixes #95.
---
 src/core/source/GitHubStream.ts | 81 +++++++++++++++++++++++++++++++--
 test/lfs-detect.test.js         | 42 +++++++++++++++++
 2 files changed, 119 insertions(+), 4 deletions(-)
 create mode 100644 test/lfs-detect.test.js

diff --git a/src/core/source/GitHubStream.ts b/src/core/source/GitHubStream.ts
index ca025eb..e048cd4 100644
--- a/src/core/source/GitHubStream.ts
+++ b/src/core/source/GitHubStream.ts
@@ -51,6 +51,77 @@ export default class GitHubStream extends GitHubBase {
     }
   }
 
+  // GitHub's web raw URL auto-resolves Git LFS pointers via redirect to
+  // media.githubusercontent.com, with the auth header carried through. The
+  // blob endpoint above returns the raw pointer text instead, so we use this
+  // as the fallback for LFS files (#95).
+  private downloadFileViaRaw(token: string, filePath: string) {
+    // Encode each segment so "#", "?" or "%" in a file name stay literal.
+    const encodedPath = filePath.split("/").map(encodeURIComponent).join("/");
+    const url = `https://github.com/${this.data.organization}/${this.data.repoName}/raw/${this.data.commit}/${encodedPath}`;
+    console.log("[GHStream] Downloading via raw URL (LFS)", url);
+    return got.stream(url, {
+      headers: { authorization: `token ${token}` },
+      followRedirect: true,
+    });
+  }
+
+  /**
+   * Wrap a blob stream so a Git LFS pointer is replaced by the real file.
+   * Chunks are buffered until PROBE_BYTES have arrived or the blob has ended;
+   * if they start with the LFS prefix the blob is dropped and a raw URL
+   * download is piped out instead, otherwise everything is forwarded unchanged.
+   * @param blobStream content stream obtained from the blob endpoint
+   * @param token GitHub token reused for the raw URL request
+   * @param filePath path of the file inside the repository
+   * @returns stream carrying the resolved file content
+   */
+  private resolveLfsPointer(
+    blobStream: stream.Readable,
+    token: string,
+    filePath: string
+  ): stream.Readable {
+    const out = new stream.PassThrough();
+    const PROBE_BYTES = 150;
+    const LFS_PREFIX = "version https://git-lfs.github.com/spec/";
+    const buffered: Buffer[] = [];
+    let bufferedBytes = 0;
+    let decided = false;
+    let usingLfs = false;
+
+    const decide = (blobEnded: boolean) => {
+      if (decided) return;
+      decided = true;
+      blobStream.removeListener("data", onData);
+      const head = Buffer.concat(buffered);
+      if (head.toString("utf8", 0, LFS_PREFIX.length) === LFS_PREFIX) {
+        usingLfs = true;
+        blobStream.destroy();
+        const lfsStream = this.downloadFileViaRaw(token, filePath);
+        lfsStream.on("error", (err) => out.destroy(err));
+        lfsStream.pipe(out);
+        return;
+      }
+      if (head.length) out.write(head);
+      // Listeners added while "end" is being emitted never run, so a short blob
+      // is closed here; otherwise pipe() forwards the rest with backpressure.
+      if (blobEnded) out.end();
+      else blobStream.pipe(out);
+    };
+    const onData = (chunk: Buffer) => {
+      buffered.push(chunk);
+      bufferedBytes += chunk.length;
+      if (bufferedBytes >= PROBE_BYTES) decide(false);
+    };
+    blobStream.on("data", onData);
+    blobStream.on("end", () => decide(true));
+    blobStream.on("error", (err) => {
+      // Errors from the discarded blob request no longer matter after the swap.
+      if (!usingLfs) out.destroy(err);
+    });
+    return out;
+  }
+
   async getFileContentCache(
     filePath: string,
     repoId: string,
@@ -65,10 +136,12 @@ export default class GitHubStream extends GitHubBase {
         object: filePath,
       });
     }
-    const content = this.downloadFile(
-      await this.data.getToken(),
-      await fileSha()
-    );
+    const token = await this.data.getToken();
+    const blobStream = this.downloadFile(token, await fileSha());
+    // If the blob is a Git LFS pointer, swap to a raw-URL fetch so the
+    // file content (not the pointer text) makes it into the pipeline. See
+    // #95 — Support for Git LFS.
+    const content = this.resolveLfsPointer(blobStream, token, filePath);
 
     // duplicate the stream to write it to the storage
     const stream1 = content.pipe(new stream.PassThrough());
diff --git a/test/lfs-detect.test.js b/test/lfs-detect.test.js
new file mode 100644
index 0000000..3d048f0
--- /dev/null
+++ b/test/lfs-detect.test.js
@@ -0,0 +1,42 @@
+const { expect } = require("chai");
+const { Readable } = require("stream");
+
+// Standalone test of the LFS-pointer detection shape used in
+// GitHubStream#resolveLfsPointer. We can't easily import that method (it's
+// private and the file pulls in heavy GitHub plumbing), so this mirrors the
+// detection logic to confirm the head-bytes check.
+
+const LFS_PREFIX = "version https://git-lfs.github.com/spec/";
+
+// True when `buf` is at least as long as the Git LFS pointer signature
+// and its leading bytes match that signature exactly.
+function isLfsPointer(buf) {
+  if (buf.length < LFS_PREFIX.length) return false;
+  const head = buf.subarray(0, LFS_PREFIX.length);
+  return head.equals(Buffer.from(LFS_PREFIX));
+}
+
+describe("LFS pointer detection (#95)", function () {
+  it("recognizes the standard pointer prefix", function () {
+    const text =
+      "version https://git-lfs.github.com/spec/v1\noid sha256:abc123\nsize 12345\n";
+    expect(isLfsPointer(Buffer.from(text))).to.be.true;
+  });
+
+  it("doesn't false-positive on plain text starting with 'version'", function () {
+    const text =
+      "version 1.2.3\nA short release notes file mentioning git-lfs.\n";
+    expect(isLfsPointer(Buffer.from(text))).to.be.false;
+  });
+
+  it("doesn't false-positive on binary headers", function () {
+    // ELF magic number followed by 100 zero bytes.
+    const elf = Buffer.concat([Buffer.from([0x7f, 0x45, 0x4c, 0x46]), Buffer.alloc(100)]);
+    expect(isLfsPointer(elf)).to.be.false;
+  });
+
+  it("handles short streams below the prefix length", function () {
+    const truncated = Buffer.from("vers");
+    expect(isLfsPointer(truncated)).to.be.false;
+  });
+});