// Mirror of https://github.com/tdurieux/anonymous_github.git
// Synced 2026-05-20 00:14:43 +02:00 — 195 lines, 5.9 KiB, TypeScript.
import got from "got";
|
|
import { Parse } from "unzip-stream";
|
|
import archiver = require("archiver");
|
|
|
|
import GitHubDownload from "./source/GitHubDownload";
|
|
import { classifyGitHubMissError } from "./source/GitHubBase";
|
|
import AnonymousError from "./AnonymousError";
|
|
import {
|
|
AnonymizeTransformer,
|
|
anonymizePathCompiled,
|
|
compileTerms,
|
|
} from "./anonymize-utils";
|
|
import { createLogger, serializeError } from "./logger";
|
|
|
|
// Logger shared by everything in this module, created under the "zip-stream" name.
const logger = createLogger("zip-stream");
|
|
|
|
/**
 * Inputs accepted by `streamAnonymizedZip`.
 */
export interface StreamAnonymizedZipOptions {
  /** Repository identifier; handed to the GitHub download source and attached to reported errors. */
  repoId: string;

  /** First half of the `organization/repoName` full name. */
  organization: string;

  /** Second half of the `organization/repoName` full name. */
  repoName: string;

  /** Commit handed to the GitHub download source when resolving the zip URL. */
  commit: string;

  /** Supplies the token used by the GitHub download source; may resolve asynchronously. */
  getToken: () => string | Promise<string>;

  /**
   * Options forwarded to every per-entry `AnonymizeTransformer`. Its `terms`
   * are also compiled once and used to anonymize entry paths.
   */
  anonymizerOptions: ConstructorParameters<typeof AnonymizeTransformer>[0];

  /**
   * Per-repository content gates, mirroring Repository.options: `image: true`
   * keeps images and `pdf: true` keeps PDFs.
   *
   * The single-file `/file/...` endpoint applies these gates through
   * AnonymizedFile.isFileSupported. Without the same check here the ZIP would
   * ship a superset of what the per-file API exposes — a privacy concern when
   * a maintainer sets image=false to hide identifying screenshots.
   */
  contentOptions?: { image?: boolean; pdf?: boolean };
}
|
|
|
|
/** Lower-case file extensions (without the dot) that are treated as images. */
const IMAGE_EXTENSIONS = new Set([
  // raster formats
  "png", "jpg", "jpeg", "gif", "bmp", "tiff", "tif",
  // vector and icon formats
  "svg", "ico",
  // modern codecs
  "webp", "avif", "heif", "heic",
]);

/**
 * Decide whether a zip entry may be shipped under the per-repo content gates.
 *
 * The extension is whatever follows the last "." in `filename`, compared
 * case-insensitively; a name without any "." is used whole as its own
 * extension. Only an explicit `false` gate excludes an entry — a missing
 * options object or an unset flag lets everything through.
 *
 * @param filename - Entry path inside the archive.
 * @param contentOptions - Optional `image` / `pdf` gates.
 * @returns `false` when the entry is a PDF or image that has been gated off.
 */
function isEntryAllowed(
  filename: string,
  contentOptions?: { image?: boolean; pdf?: boolean }
): boolean {
  if (!contentOptions) {
    return true;
  }

  const extension = filename
    .slice(filename.lastIndexOf(".") + 1)
    .toLowerCase();

  const pdfBlocked = contentOptions.pdf === false && extension === "pdf";
  const imageBlocked =
    contentOptions.image === false && IMAGE_EXTENSIONS.has(extension);

  return !(pdfBlocked || imageBlocked);
}
|
|
|
|
/** Shape of the entries emitted by the unzip parser that this function relies on. */
type ZipEntry = NodeJS.ReadableStream & {
  type: string;
  path: string;
  autodrain: () => void;
};

/**
 * Stream the GitHub source zip for a repository, anonymize each entry on the
 * fly, and pipe the resulting archive into the provided writable response.
 *
 * No data is written to local storage — the zip flows GitHub → unzip → per
 * file anonymizer → archiver → response.
 *
 * The returned promise settles as soon as the pipeline is wired up; it does
 * not wait for the download to complete. Failures after that point are
 * reported by destroying (or, failing that, ending) `res`.
 *
 * @param opt - Repository coordinates, token provider, anonymizer options and
 *   optional content gates.
 * @param res - Writable destination for the anonymized archive.
 * @throws AnonymousError when the zip URL cannot be resolved. Upstream 5xx
 *   statuses and missing statuses are reported as 502; any other status is
 *   passed through.
 */
export async function streamAnonymizedZip(
  opt: StreamAnonymizedZipOptions,
  res: NodeJS.WritableStream & {
    on(event: string, listener: (...args: unknown[]) => void): unknown;
  }
): Promise<void> {
  const source = new GitHubDownload({
    repoId: opt.repoId,
    organization: opt.organization,
    repoName: opt.repoName,
    commit: opt.commit,
    getToken: opt.getToken,
  });

  let response;
  try {
    response = await source.getZipUrl();
  } catch (error) {
    const code = await classifyGitHubMissError(error, {
      organization: opt.organization,
      repoName: opt.repoName,
      repoId: opt.repoId,
      commit: opt.commit,
      getToken: opt.getToken,
    });
    const status = (error as { status?: number }).status;
    throw new AnonymousError(code, {
      // 5xx from upstream and unknown statuses become 502; others pass through.
      httpStatus: status && status >= 500 ? 502 : status || 502,
      cause: error as Error,
      object: {
        repoId: opt.repoId,
        fullName: `${opt.organization}/${opt.repoName}`,
      },
    });
  }

  // Compile the terms before any stream exists so a throw here leaks nothing.
  const compiledTerms = compileTerms(opt.anonymizerOptions.terms || []);
  const archive = archiver("zip", {});
  const downloadStream = got.stream(response.url);

  // Track whether the upstream zipball finished cleanly. If it didn't,
  // we must NOT finalize the archive — finalizing while bytes are still
  // flowing to the response produces a valid-looking ZIP that's missing
  // entries, which the client has no way to detect (status 200, archive
  // opens). Destroy the response instead so the client sees a connection
  // drop and knows the download failed. Same class of silent-truncation
  // bug as #694.
  let upstreamSucceeded = false;
  // Set once `fail` has run, so a late parser "finish" cannot finalize an
  // archive that has already been aborted.
  let failed = false;

  const fail = (error: Error) => {
    failed = true;
    logger.error("upstream zipball failed", serializeError(error));
    // Stop pulling bytes from GitHub even when `res` cannot be destroyed.
    downloadStream.destroy();
    archive.abort();
    const destroyable = res as unknown as {
      destroy?: (err?: Error) => void;
      end?: () => void;
    };
    if (typeof destroyable.destroy === "function") {
      destroyable.destroy(error);
    } else if (typeof destroyable.end === "function") {
      destroyable.end();
    }
  };

  res.on("error", (error) => {
    logger.error("response stream error", serializeError(error));
    downloadStream.destroy();
    // Nothing can be delivered any more; stop the archiver as well.
    archive.abort();
  });
  res.on("close", () => {
    downloadStream.destroy();
  });

  // The listener must sit on the archive itself: `archive.pipe(res)` returns
  // `res`, so chaining `.on("error")` after `pipe()` would leave the archive
  // without an "error" listener and turn any archiver failure into an
  // unhandled "error" event.
  archive.on("error", (error: Error) => {
    logger.error("archive pipe error", serializeError(error));
    if (!upstreamSucceeded) {
      // archive errored while we were still depending on upstream bytes:
      // treat as failure rather than truncating.
      fail(error);
      return;
    }
    (res as { end?: () => void }).end?.();
  });

  downloadStream
    .on("error", fail)
    .pipe(Parse())
    .on("entry", (entry: ZipEntry) => {
      if (entry.type !== "File") {
        entry.autodrain();
        return;
      }
      try {
        // Drop the leading path segment (everything up to the first "/")
        // before anonymizing the path.
        const fileName = anonymizePathCompiled(
          entry.path.substring(entry.path.indexOf("/") + 1),
          compiledTerms
        );
        if (!isEntryAllowed(fileName, opt.contentOptions)) {
          entry.autodrain();
          return;
        }
        // Pass filePath via the constructor — AnonymizeTransformer reads it
        // there to decide whether the entry is text (and therefore should be
        // anonymized) vs binary (passthrough). Assigning afterwards leaves
        // isText=false for every file, so the zip ships unanonymized.
        const anonymizer = new AnonymizeTransformer({
          ...opt.anonymizerOptions,
          filePath: fileName,
        });
        archive.append(entry.pipe(anonymizer), { name: fileName });
      } catch (error) {
        entry.autodrain();
        logger.error("entry transform failed", serializeError(error));
      }
    })
    .on("error", fail)
    .on("finish", () => {
      if (failed) {
        // The archive was aborted; finalizing it now would only raise.
        return;
      }
      upstreamSucceeded = true;
      try {
        // finalize() may return a promise; observe it so a rejection does not
        // surface as an unhandled rejection.
        Promise.resolve(archive.finalize()).catch((error: unknown) => {
          logger.error("archive finalize failed", serializeError(error));
        });
      } catch (error) {
        logger.error("archive finalize failed", serializeError(error));
      }
    });

  archive.pipe(res);
}
|