import AnonymizedFile from "../AnonymizedFile"; import GitHubBase, { GitHubBaseData } from "./GitHubBase"; import storage from "../storage"; import * as path from "path"; import got from "got"; import { basename, dirname } from "path"; import * as stream from "stream"; import AnonymousError from "../AnonymousError"; import { FILE_TYPE } from "../storage/Storage"; import { octokit } from "../GitHubUtils"; import FileModel from "../model/files/files.model"; import { IFile } from "../model/files/files.types"; export default class GitHubStream extends GitHubBase { type: "GitHubDownload" | "GitHubStream" | "Zip" = "GitHubStream"; private _truncatedFolders: string[] = []; constructor(data: GitHubBaseData) { super(data); } get truncatedFolderList(): string[] { return this._truncatedFolders; } downloadFile(token: string, sha: string) { const oct = octokit(token); try { const { url } = oct.rest.git.getBlob.endpoint({ owner: this.data.organization, repo: this.data.repoName, file_sha: sha, }); console.log("[GHStream] Downloading file", url); return got.stream(url, { headers: { "X-GitHub-Api-Version": "2022-11-28", accept: "application/vnd.github.raw+json", authorization: `token ${token}`, }, }); } catch (error) { console.error(error); throw new AnonymousError("repo_not_accessible", { httpStatus: 404, object: this.data, cause: error as Error, }); } } // GitHub's web raw URL auto-resolves Git LFS pointers via redirect to // media.githubusercontent.com, with the auth header carried through. The // blob endpoint above returns the raw pointer text instead, so we use this // as the fallback for LFS files (#95). private downloadFileViaRaw(token: string, filePath: string) { const url = `https://github.com/${this.data.organization}/${this.data.repoName}/raw/${this.data.commit}/${filePath}`; console.log("[GHStream] Downloading via raw URL (LFS)", url); return got.stream(url, { headers: { authorization: `token ${token}` }, followRedirect: true, }); } // Wrap a blob stream so that if its first ~150 bytes look like a Git LFS // pointer, the bytes are dropped and replaced by a fresh fetch from the // raw URL endpoint (which resolves LFS automatically). Non-LFS files are // forwarded unchanged. private resolveLfsPointer( blobStream: stream.Readable, token: string, filePath: string ): stream.Readable { const out = new stream.PassThrough(); let probe = Buffer.alloc(0); let decided = false; const PROBE_BYTES = 150; const LFS_PREFIX = "version https://git-lfs.github.com/spec/"; const decide = (extra?: Buffer) => { if (decided) return; decided = true; const head = probe.toString( "utf8", 0, Math.min(probe.length, LFS_PREFIX.length) ); if (head === LFS_PREFIX) { blobStream.destroy(); const lfsStream = this.downloadFileViaRaw(token, filePath); lfsStream.on("error", (err) => out.destroy(err)); lfsStream.pipe(out); } else { out.write(probe); if (extra && extra.length) out.write(extra); blobStream.on("data", (c) => out.write(c)); blobStream.on("end", () => out.end()); blobStream.on("error", (err) => out.destroy(err)); } }; blobStream.on("data", (chunk: Buffer) => { if (decided) return; const remaining = PROBE_BYTES - probe.length; if (chunk.length <= remaining) { probe = Buffer.concat([probe, chunk]); if (probe.length >= PROBE_BYTES) decide(); } else { probe = Buffer.concat([probe, chunk.slice(0, remaining)]); decide(chunk.slice(remaining)); } }); blobStream.on("end", () => decide()); blobStream.on("error", (err) => { if (decided) return; decided = true; out.destroy(err); }); return out; } async getFileContentCache( filePath: string, repoId: string, fileMeta: () => | Promise<{ sha: string; size?: number }> | { sha: string; size?: number } | Promise | string ) { const meta = await fileMeta(); const expected: { sha: string; size?: number } = typeof meta === "string" ? { sha: meta } : meta; const fileInfo = await storage.exists(repoId, filePath); if (fileInfo == FILE_TYPE.FILE) { // If we know the upstream size, validate the cached entry. A cached // file smaller than the upstream size means a previous fetch was // truncated — likely a network error during the GitHub fetch left a // 0-byte or partial blob behind. Treat it as a miss and re-fetch. // Cached size >= expected is accepted: equal for normal files, and // larger for Git LFS files where FileModel.size is the pointer's // size but the cached bytes are the resolved LFS content. if (expected.size != null && expected.size > 0) { try { const stat = await storage.fileInfo(repoId, filePath); if (stat.size != null && stat.size < expected.size) { await storage.rm(repoId, filePath); } else { return storage.read(repoId, filePath); } } catch { // fall through and re-fetch } } else { return storage.read(repoId, filePath); } } else if (fileInfo == FILE_TYPE.FOLDER) { throw new AnonymousError("folder_not_supported", { httpStatus: 400, object: filePath, }); } const token = await this.data.getToken(); const blobStream = this.downloadFile(token, expected.sha); // If the blob is a Git LFS pointer, swap to a raw-URL fetch so the // file content (not the pointer text) makes it into the pipeline. See // #95 — Support for Git LFS. const content = this.resolveLfsPointer(blobStream, token, filePath); // duplicate the stream to write it to the storage const stream1 = content.pipe(new stream.PassThrough()); const stream2 = content.pipe(new stream.PassThrough()); content.on("error", (error) => { error = new AnonymousError("file_not_found", { httpStatus: (error as { status?: number; httpStatus?: number }).status || (error as { httpStatus?: number }).httpStatus, cause: error as Error, object: filePath, }); stream1.emit("error", error); stream2.emit("error", error); }); storage.write(repoId, filePath, stream1, this.type); return stream2; } async getFileContent(file: AnonymizedFile): Promise { try { void file.filePath; } catch { // compute the original path if ambiguous await file.originalPath(); } return this.getFileContentCache( file.filePath, file.repository.repoId, async () => { const fileSha = await file.sha(); if (!fileSha) { throw new AnonymousError("file_not_accessible", { httpStatus: 404, object: file, }); } return { sha: fileSha, size: await file.size() }; } ); } async getFiles(progress?: (status: string) => void) { this._truncatedFolders = []; return this.getTruncatedTree(this.data.commit, progress); } private async getGHTree( sha: string, count = { request: 0, file: 0 }, opt = { recursive: true, callback: () => {} } ) { const oct = octokit(await this.data.getToken()); const ghRes = await oct.git.getTree({ owner: this.data.organization, repo: this.data.repoName, tree_sha: sha, recursive: opt.recursive === true ? "1" : undefined, }); count.request++; count.file += ghRes.data.tree.length; if (opt.callback) { opt.callback(); } return ghRes.data; } private async getTruncatedTree( sha: string, progress?: (status: string) => void, parentPath: string = "" ) { const count = { request: 0, file: 0, }; const output: IFile[] = []; let data; try { data = await this.getGHTree(sha, count, { recursive: false, callback: () => { if (progress) { progress("List file: " + count.file); } }, }); if (data.truncated) { this._truncatedFolders.push(parentPath); } output.push(...this.tree2Tree(data.tree, parentPath)); } catch (error) { console.log(error); const status = (error as { status?: number }).status; if (status === 409) { throw new AnonymousError("repo_empty", { httpStatus: 409, object: this.data, cause: error as Error, }); } if (status === 404) { throw new AnonymousError("repo_not_found", { httpStatus: 404, object: this.data, cause: error as Error, }); } throw new AnonymousError("repo_not_found", { httpStatus: status || 500, object: this.data, cause: error as Error, }); } const promises: ReturnType[] = []; const parentPaths: string[] = []; for (const file of data.tree) { if (file.type == "tree" && file.path && file.sha) { const elementPath = path.join(parentPath, file.path); parentPaths.push(elementPath); promises.push( this.getGHTree(file.sha, count, { recursive: true, callback: () => { if (progress) { progress("List file: " + count.file); } }, }) ); } } (await Promise.all(promises)).forEach((data, i) => { if (data.truncated) { this._truncatedFolders.push(parentPaths[i]); } output.push(...this.tree2Tree(data.tree, parentPaths[i])); }); return output; } private tree2Tree( tree: { path?: string; mode?: string; type?: string; sha?: string; size?: number; url?: string; }[], parentPath: string = "" ) { return tree.map((elem) => { const fullPath = path.join(parentPath, elem.path || ""); let pathFile = dirname(fullPath); if (pathFile === ".") { pathFile = ""; } return new FileModel({ name: basename(fullPath), path: pathFile, repoId: this.data.repoId, size: elem.size, sha: elem.sha, }); }); } }