mirror of
https://github.com/tdurieux/anonymous_github.git
synced 2026-05-15 22:48:00 +02:00

import AnonymizedFile from "../AnonymizedFile";
import GitHubBase, {
  GitHubBaseData,
  classifyGitHubMissError,
} from "./GitHubBase";
import storage from "../storage";
import * as path from "path";
import got from "got";
import { basename, dirname } from "path";

import * as stream from "stream";
import AnonymousError from "../AnonymousError";
import { FILE_TYPE } from "../storage/Storage";
import { octokit, waitForTokenGate } from "../GitHubUtils";
import FileModel from "../model/files/files.model";
import { IFile } from "../model/files/files.types";
import { createLogger, serializeError } from "../logger";
import config from "../../config";

const logger = createLogger("gh-stream");

const GH_API_CONCURRENCY = 6;
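
// Bounded-concurrency mapper (a minimal stand-in for the p-map package).
// Illustrative usage, where `fetchTree` is a hypothetical helper:
//   const trees = await pMap(shas, (sha) => fetchTree(sha), GH_API_CONCURRENCY);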
async function pMap<T, R>(
  items: T[],
  fn: (item: T, index: number) => Promise<R>,
  concurrency: number
): Promise<R[]> {
  const results: R[] = new Array(items.length);
  let next = 0;
  async function worker() {
    while (next < items.length) {
      const i = next++;
      results[i] = await fn(items[i], i);
    }
  }
  await Promise.all(
    Array.from({ length: Math.min(concurrency, items.length) }, () => worker())
  );
  return results;
}
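
// Builds the public "raw" web URL for a file at a given commit, with every
// path segment percent-encoded. Illustrative result (hypothetical inputs):
//   githubRawFileUrl("octo-org", "repo", "abc123", "src/a b.ts")
//   // => "https://github.com/octo-org/repo/raw/abc123/src/a%20b.ts"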
export function githubRawFileUrl(
  owner: string,
  repo: string,
  commit: string,
  filePath: string
): string {
  const encodedPath = filePath
    .split("/")
    .map((segment) => encodeURIComponent(segment))
    .join("/");
  return `https://github.com/${encodeURIComponent(owner)}/${encodeURIComponent(
    repo
  )}/raw/${encodeURIComponent(commit)}/${encodedPath}`;
}

export default class GitHubStream extends GitHubBase {
  type: "GitHubDownload" | "GitHubStream" | "Zip" = "GitHubStream";

  private _truncatedFolders: string[] = [];

  constructor(data: GitHubBaseData) {
    super(data);
  }

  get truncatedFolderList(): string[] {
    return this._truncatedFolders;
  }

  downloadFile(token: string, sha: string) {
    const oct = octokit(token);
    try {
      const { url } = oct.rest.git.getBlob.endpoint({
        owner: this.data.organization,
        repo: this.data.repoName,
        file_sha: sha,
      });
      logger.debug("downloading file", { url });
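      // The `application/vnd.github.raw+json` accept header asks the blob
      // endpoint for the raw bytes instead of the default base64-encoded
      // JSON envelope, so the response body can be streamed through
      // unchanged.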
      return got.stream(url, {
        headers: {
          "X-GitHub-Api-Version": "2022-11-28",
          accept: "application/vnd.github.raw+json",
          authorization: `token ${token}`,
        },
      });
    } catch (error) {
      logger.error("downloadFile failed", serializeError(error));
      throw new AnonymousError("repo_not_accessible", {
        httpStatus: 404,
        object: this.data,
        cause: error as Error,
      });
    }
  }

  // GitHub's web raw URL auto-resolves Git LFS pointers via redirect to
  // media.githubusercontent.com, with the auth header carried through. The
  // blob endpoint above returns the raw pointer text instead, so we use this
  // as the fallback for LFS files (#95).
  private downloadFileViaRaw(token: string, filePath: string) {
    const url = githubRawFileUrl(
      this.data.organization,
      this.data.repoName,
      this.data.commit,
      filePath
    );
    logger.debug("downloading via raw URL (LFS)", { url });
    return got.stream(url, {
      headers: { authorization: `token ${token}` },
      followRedirect: true,
    });
  }

  // Try the blob API, then fall back to the raw URL on statuses where the
  // path-based endpoint can still succeed. 422 is the blob endpoint's size
  // cap; 404 can happen with stale/invalid blob SHAs while the path still
  // exists at the requested commit.
  private downloadWithFallback(
    token: string,
    sha: string,
    filePath: string
  ): Promise<stream.Readable> {
    return new Promise<stream.Readable>((resolve) => {
      const blobStream = this.downloadFile(token, sha);
      let settled = false;

      const fallbackStatuses = new Set([404, 422]);
      const fallbackToRaw = (statusCode?: number) => {
        settled = true;
        logger.info("blob API failed, falling back to raw URL", {
          filePath,
          statusCode,
        });
        resolve(this.downloadFileViaRaw(token, filePath));
      };

      blobStream.on("error", (err) => {
        if (settled) return;
        const statusCode = (err as { response?: { statusCode?: number } })
          ?.response?.statusCode;
        if (statusCode && fallbackStatuses.has(statusCode)) {
          fallbackToRaw(statusCode);
          return;
        }
        // Other errors: let the normal pipeline handle them.
        settled = true;
        const passthrough = new stream.PassThrough();
        passthrough.destroy(err);
        resolve(passthrough);
      });

      blobStream.on("response", (response) => {
        if (settled) return;
        if (fallbackStatuses.has(response.statusCode || 0)) {
          blobStream.destroy();
          fallbackToRaw(response.statusCode);
          return;
        }
        settled = true;
        resolve(this.resolveLfsPointer(blobStream, token, filePath));
      });
    });
  }

  // Wrap a blob stream so that if its first ~150 bytes look like a Git LFS
  // pointer, the bytes are dropped and replaced by a fresh fetch from the
  // raw URL endpoint (which resolves LFS automatically). Non-LFS files are
  // forwarded unchanged.
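  // For reference, an LFS pointer is a short text blob of the form below
  // (per the Git LFS spec; the digest and size are made-up examples), so a
  // 150-byte probe always covers the "version ..." first line:
  //   version https://git-lfs.github.com/spec/v1
  //   oid sha256:4d7a2146... (64 hex digits)
  //   size 12345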
  private resolveLfsPointer(
    blobStream: stream.Readable,
    token: string,
    filePath: string
  ): stream.Readable {
    const out = new stream.PassThrough();
    let probe = Buffer.alloc(0);
    let decided = false;
    const PROBE_BYTES = 150;
    const LFS_PREFIX = "version https://git-lfs.github.com/spec/";

    const decide = (extra?: Buffer, sourceEnded = false) => {
      if (decided) return;
      decided = true;
      const head = probe.toString(
        "utf8",
        0,
        Math.min(probe.length, LFS_PREFIX.length)
      );
      if (head === LFS_PREFIX) {
        blobStream.destroy();
        const lfsStream = this.downloadFileViaRaw(token, filePath);
        lfsStream.on("error", (err) => out.destroy(err));
        lfsStream.pipe(out);
        return;
      }
      out.write(probe);
      if (extra && extra.length) out.write(extra);
      if (sourceEnded) {
        out.end();
        return;
      }
      blobStream.on("data", (c) => out.write(c));
      blobStream.on("end", () => out.end());
      blobStream.on("error", (err) => out.destroy(err));
    };

    blobStream.on("data", (chunk: Buffer) => {
      if (decided) return;
      const remaining = PROBE_BYTES - probe.length;
      if (chunk.length <= remaining) {
        probe = Buffer.concat([probe, chunk]);
        if (probe.length >= PROBE_BYTES) decide();
      } else {
        probe = Buffer.concat([probe, chunk.subarray(0, remaining)]);
        decide(chunk.subarray(remaining));
      }
    });
    blobStream.on("end", () => decide(undefined, true));
    blobStream.on("error", (err) => {
      // Always propagate — pre-decision this is the only listener; once a
      // non-LFS decision is made, the inner branch attaches its own
      // listener that will also fire, but we shouldn't rely on that being
      // there if the code is later refactored.
      decided = true;
      out.destroy(err);
    });

    return out;
  }
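
  // fileMeta may return a bare sha or a { sha, size } pair, e.g. (values
  // hypothetical):
  //   () => "abc123"
  //   async () => ({ sha: "abc123", size: 2048 })
  // Supplying `size` enables the cache validation and the 100 MB pre-check
  // below.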
  async getFileContentCache(
    filePath: string,
    repoId: string,
    fileMeta: () =>
      | Promise<{ sha: string; size?: number }>
      | { sha: string; size?: number }
      | Promise<string>
      | string
  ) {
    const meta = await fileMeta();
    const expected: { sha: string; size?: number } =
      typeof meta === "string" ? { sha: meta } : meta;
    const fileInfo = await storage.exists(repoId, filePath);
    if (fileInfo === FILE_TYPE.FILE) {
      // If we know the upstream size, validate the cached entry. A cached
      // file smaller than the upstream size means a previous fetch was
      // truncated — likely a network error during the GitHub fetch left a
      // 0-byte or partial blob behind. Treat it as a miss and re-fetch.
      // Cached size >= expected is accepted: equal for normal files, and
      // larger for Git LFS files where FileModel.size is the pointer's
      // size but the cached bytes are the resolved LFS content.
      if (expected.size != null && expected.size > 0) {
        try {
          const stat = await storage.fileInfo(repoId, filePath);
          if (stat.size != null && stat.size < expected.size) {
            await storage.rm(repoId, filePath);
          } else {
            return storage.read(repoId, filePath);
          }
        } catch {
          // fall through and re-fetch
        }
      } else {
        return storage.read(repoId, filePath);
      }
    } else if (fileInfo === FILE_TYPE.FOLDER) {
      throw new AnonymousError("folder_not_supported", {
        httpStatus: 400,
        object: filePath,
      });
    }

    // GitHub's blob API rejects blobs larger than 100 MB with HTTP 422.
    // Skip the download entirely when the tree already tells us the file is
    // over the cap, so we surface a clean `file_too_big` instead of paying
    // the round-trip just to translate a 422.
    if (expected.size != null && expected.size > config.MAX_FILE_SIZE) {
      throw new AnonymousError("file_too_big", {
        httpStatus: 413,
        object: filePath,
      });
    }
    const token = await this.data.getToken();

    // Try the blob API first, but fall back to the raw URL on recoverable
    // blob misses/caps while still preserving LFS pointer handling.
    const content = await this.downloadWithFallback(
      token,
      expected.sha,
      filePath
    );

    // Duplicate the stream: one branch is written to storage, the other is
    // returned to the caller.
    const stream1 = content.pipe(new stream.PassThrough());
    const stream2 = content.pipe(new stream.PassThrough());

    // Safety net: guarantee an `error` listener exists on both branches
    // before any error can be emitted. storage.write attaches its listener
    // only after an `await mk(...)`, and the route handler attaches its
    // listener after awaiting this function — both leave a window where
    // an upstream error would have no listener and escalate to
    // uncaughtException, crashing the streamer.
    const noop = () => {};
    stream1.on("error", noop);
    stream2.on("error", noop);

    content.on("error", (error) => {
      const httpStatus =
        (error as { response?: { statusCode?: number } })?.response
          ?.statusCode ??
        (error as { status?: number })?.status ??
        (error as { httpStatus?: number })?.httpStatus;
      const errCode = (error as { code?: string })?.code;
      const isTransient =
        !httpStatus &&
        (errCode === "ECONNRESET" ||
          errCode === "ETIMEDOUT" ||
          errCode === "ERR_BODY_PARSE_FAILURE" ||
          error.name === "ReadError");
      const code =
        httpStatus === 422
          ? "file_too_big"
          : httpStatus === 403
          ? "file_not_accessible"
          : isTransient
          ? "upstream_error"
          : "file_not_found";
      const wrapped = new AnonymousError(code, {
        httpStatus: isTransient ? 502 : httpStatus,
        cause: error as Error,
        object: filePath,
      });
      stream1.destroy(wrapped);
      stream2.destroy(wrapped);
    });

    // Fire-and-forget: storage.write logs its own failures inside FileSystem
    // (`[fs] write failed`). Swallow the rejection here so an upstream error
    // (e.g. GitHub 422 on a too-big blob) doesn't surface as an unhandled
    // promise rejection and crash the streamer process.
    storage
      .write(repoId, filePath, stream1, this.type, expected.size)
      .catch(() => {});
    return stream2;
  }

  async getFileContent(file: AnonymizedFile): Promise<stream.Readable> {
    try {
      void file.filePath;
    } catch {
      // accessing filePath throws when the anonymized path is ambiguous;
      // compute the original path first
      await file.originalPath();
    }
    return this.getFileContentCache(
      file.filePath,
      file.repository.repoId,
      async () => {
        const fileSha = await file.sha();
        if (!fileSha) {
          throw new AnonymousError("file_not_accessible", {
            httpStatus: 404,
            object: file,
          });
        }
        return { sha: fileSha, size: await file.size() };
      }
    );
  }

  async getFiles(progress?: (status: string) => void) {
    this._truncatedFolders = [];
    return this.getTruncatedTree(this.data.commit, progress);
  }

  private async getGHTree(
    oct: ReturnType<typeof octokit>,
    token: string,
    sha: string,
    count = { request: 0, file: 0 },
    opt = { recursive: true, callback: () => {} }
  ) {
    await waitForTokenGate(token);
    const ghRes = await oct.git.getTree({
      owner: this.data.organization,
      repo: this.data.repoName,
      tree_sha: sha,
      recursive: opt.recursive === true ? "1" : undefined,
    });
    count.request++;
    count.file += ghRes.data.tree.length;
    if (opt.callback) {
      opt.callback();
    }
    return ghRes.data;
  }
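
  // List the root tree non-recursively, then fetch each top-level subtree
  // recursively with at most GH_API_CONCURRENCY requests in flight. Folders
  // whose listings GitHub reports as truncated are recorded in
  // _truncatedFolders so callers can warn about incomplete listings.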
  private async getTruncatedTree(
    sha: string,
    progress?: (status: string) => void,
    parentPath: string = ""
  ) {
    const token = await this.data.getToken();
    const oct = octokit(token);
    const count = {
      request: 0,
      file: 0,
    };
    const output: IFile[] = [];
    let data;
    try {
      data = await this.getGHTree(oct, token, sha, count, {
        recursive: false,
        callback: () => {
          if (progress) {
            progress("Listed files: " + count.file);
          }
        },
      });
      if (data.truncated) {
        this._truncatedFolders.push(parentPath);
      }
      output.push(...this.tree2Tree(data.tree, parentPath));
    } catch (error) {
      const status = (error as { status?: number }).status;
      if (status === 409) {
        logger.debug("getTree empty repo", serializeError(error));
        throw new AnonymousError("repo_empty", {
          httpStatus: 409,
          object: this.data,
          cause: error as Error,
        });
      }
      if (status === 404) {
        logger.debug("getTree miss", serializeError(error));
        const code = await classifyGitHubMissError(error, this.data);
        throw new AnonymousError(code, {
          httpStatus: 404,
          object: this.data,
          cause: error as Error,
        });
      }
      logger.warn("getTree failed", serializeError(error));
      throw new AnonymousError("repo_not_found", {
        httpStatus: status || 404,
        object: this.data,
        cause: error as Error,
      });
    }
    const subtrees: { sha: string; parentPath: string }[] = [];
    for (const file of data.tree) {
      if (file.type === "tree" && file.path && file.sha) {
        subtrees.push({
          sha: file.sha,
          parentPath: path.join(parentPath, file.path),
        });
      }
    }
    const results = await pMap(
      subtrees,
      async (entry) =>
        this.getGHTree(oct, token, entry.sha, count, {
          recursive: true,
          callback: () => {
            if (progress) {
              progress("Listed files: " + count.file);
            }
          },
        }),
      GH_API_CONCURRENCY
    );
    results.forEach((subtree, i) => {
      if (subtree.truncated) {
        this._truncatedFolders.push(subtrees[i].parentPath);
      }
      output.push(...this.tree2Tree(subtree.tree, subtrees[i].parentPath));
    });
    return output;
  }
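
  // Maps GitHub tree entries onto FileModel documents. Illustrative example
  // (hypothetical entry, shape per GitHub's tree API):
  //   { path: "src/index.ts", mode: "100644", type: "blob", sha: "...", size: 321 }
  //   => FileModel({ name: "index.ts", path: "src", repoId, size: 321, sha: "..." })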
  private tree2Tree(
    tree: {
      path?: string;
      mode?: string;
      type?: string;
      sha?: string;
      size?: number;
      url?: string;
    }[],
    parentPath: string = ""
  ) {
    return tree.map((elem) => {
      const fullPath = path.join(parentPath, elem.path || "");
      let pathFile = dirname(fullPath);
      if (pathFile === ".") {
        pathFile = "";
      }
      return new FileModel({
        name: basename(fullPath),
        path: pathFile,
        repoId: this.data.repoId,
        size: elem.size,
        sha: elem.sha,
      });
    });
  }
}