diff --git a/public/i18n/locale-en.json b/public/i18n/locale-en.json index c5a0755..2c57c60 100644 --- a/public/i18n/locale-en.json +++ b/public/i18n/locale-en.json @@ -44,6 +44,7 @@ "invalid_path": "The provided path is invalid or missing.", "path_not_specified": "A file path must be specified.", "path_not_defined": "The file path has not been resolved yet.", + "invalid_file_path": "The requested file path is not valid.", "no_file_selected": "Please select a file.", "file_not_found": "The requested file is not found.", "file_not_accessible": "The requested file is not accessible.", diff --git a/src/core/AnonymizedFile.ts b/src/core/AnonymizedFile.ts index 3225a35..c58d7de 100644 --- a/src/core/AnonymizedFile.ts +++ b/src/core/AnonymizedFile.ts @@ -264,7 +264,29 @@ export default class AnonymizedFile { res ); }); + // Forward Content-Type from the streamer's upstream response. + // got.stream(...).pipe(res) forwards body bytes only — without + // this, the parent response has no Content-Type and the browser + // guesses (text renders as download, images as octet-stream). + resStream.on("response", (upstream: { headers: Record<string, string | string[] | undefined> }) => { + if (res.headersSent) return; + const ct = upstream.headers["content-type"]; + if (typeof ct === "string") { + res.contentType(ct); + } else { + const fallback = lookup(this.anonymizedPath); + if (fallback) res.contentType(fallback); + else if (isTextFile(this.anonymizedPath)) res.contentType("text/plain"); + } + }); resStream.pipe(res); + // Resolve as soon as the response is fully written rather than + // waiting for the socket to close — keep-alive sockets stay open + // long after the body is delivered, and we don't want to delay + // post-send work like countView() that long. 
+ res.on("finish", () => { + resolve(); + }); res.on("close", () => { resolve(); }); @@ -311,6 +333,15 @@ export default class AnonymizedFile { .pipe(anonymizer) .pipe(res) .on("error", handleStreamError) + .on("finish", () => { + // resolve on body fully written rather than waiting for the + // socket to close — keep-alive can hold the socket open long + // after the response is delivered, delaying post-send work. + if (!content.closed && !content.destroyed) { + content.destroy(); + } + resolve(); + }) .on("close", () => { if (!content.closed && !content.destroyed) { content.destroy(); diff --git a/src/core/GitHubUtils.ts b/src/core/GitHubUtils.ts index 6dd9c1d..c7197e8 100644 --- a/src/core/GitHubUtils.ts +++ b/src/core/GitHubUtils.ts @@ -75,17 +75,36 @@ export async function getToken(repository: Repository) { ).toString("base64"), }, }); - const resBody = (await res.json()) as { token: string }; - repository.owner.model.accessTokens.github = resBody.token; - if (!repository.owner.model.accessTokenDates) { - repository.owner.model.accessTokenDates = { - github: new Date(), - }; - } else { - repository.owner.model.accessTokenDates.github = new Date(); + // Only persist a refreshed token if GitHub actually returned a + // valid one. Without this guard, a 4xx/5xx error body (revoked + // OAuth, rate limit, transient outage) silently overwrites the + // user's stored token with `undefined`, which then propagates as + // `Authorization: token undefined` to every subsequent API call — + // 401 even on public repos, and the config.GITHUB_TOKEN fallback + // below is unreachable because the token field is no longer falsy. + if (res.ok) { + const resBody = (await res.json().catch(() => null)) as + | { token?: unknown } + | null; + const refreshed = + resBody && typeof resBody.token === "string" && resBody.token.length > 0 + ? 
resBody.token + : null; + if (refreshed) { + repository.owner.model.accessTokens.github = refreshed; + if (!repository.owner.model.accessTokenDates) { + repository.owner.model.accessTokenDates = { github: new Date() }; + } else { + repository.owner.model.accessTokenDates.github = new Date(); + } + await repository.owner.model.save(); + return refreshed; + } } - await repository.owner.model.save(); - return resBody.token; + console.warn( + `[getToken] refresh failed for ${repository.owner.model.username} (status ${res.status}); falling back` + ); + // fall through to the checkToken path / config.GITHUB_TOKEN } const check = await checkToken(ownerAccessToken); if (check) { diff --git a/src/core/Repository.ts b/src/core/Repository.ts index c2bbbc3..3ca4f62 100644 --- a/src/core/Repository.ts +++ b/src/core/Repository.ts @@ -5,7 +5,7 @@ import * as sha1 from "crypto-js/sha1"; import User from "./User"; import GitHubStream from "./source/GitHubStream"; import Zip from "./source/Zip"; -import { anonymizePath } from "./anonymize-utils"; +import { anonymizePathCompiled, compileTerms } from "./anonymize-utils"; import UserModel from "./model/users/users.model"; import { IAnonymizedRepositoryDocument } from "./model/anonymizedRepositories/anonymizedRepositories.types"; import { AnonymizeTransformer } from "./anonymize-utils"; @@ -35,10 +35,11 @@ function anonymizeTreeRecursive( includeSha: false, } ): Partial[] { + const compiled = compileTerms(terms); return tree.map((file) => { return { - name: anonymizePath(file.name, terms), - path: anonymizePath(file.path, terms), + name: anonymizePathCompiled(file.name, compiled), + path: anonymizePathCompiled(file.path, compiled), size: file.size, sha: opt.includeSha ? file.sha @@ -293,8 +294,17 @@ export default class Repository { force: true, }); - // update the repository name if it has changed - this.model.source.repositoryName = ghRepo.fullName; + // update the repository name if it has changed. 
Persist it + // immediately — otherwise, when the commit is unchanged and the + // function returns early below, the renamed value lives in this + // in-memory model only and the next request reloads the stale + // name from MongoDB and re-runs the rename detection forever. + if (this.model.source.repositoryName !== ghRepo.fullName) { + this.model.source.repositoryName = ghRepo.fullName; + if (isConnected) { + await this._model.save(); + } + } const branches = await ghRepo.branches({ force: true, accessToken: token, @@ -359,20 +369,17 @@ export default class Repository { } this.model.increment(); await this.updateStatus(RepositoryStatus.DOWNLOAD); - const files = await this.files({ + await this.files({ force: false, progress, recursive: false, }); - if (files.length === 0) { - // create a dummy file when the repo is empty to avoid errors - await new FileModel({ - repoId: this.repoId, - path: "", - name: "", - size: 0, - }).save(); - } + // Previously inserted a dummy {path:"", name:"", size:0} FileModel + // here for empty repos "to avoid errors" — but that record collides + // with the special-case in AnonymizedFile.getFileInfo for the empty + // path, surfaces in unfiltered file listings, and breaks anything + // that assumes FileModel rows correspond to real files. Empty repos + // are handled by the route layer; nothing to materialise here. 
await this.updateStatus(RepositoryStatus.READY); await this.computeSize(); } diff --git a/src/core/zipStream.ts b/src/core/zipStream.ts index 26c4229..8887aef 100644 --- a/src/core/zipStream.ts +++ b/src/core/zipStream.ts @@ -3,7 +3,11 @@ import { Parse } from "unzip-stream"; import archiver = require("archiver"); import GitHubDownload from "./source/GitHubDownload"; -import { AnonymizeTransformer, anonymizePath } from "./anonymize-utils"; +import { + AnonymizeTransformer, + anonymizePathCompiled, + compileTerms, +} from "./anonymize-utils"; export interface StreamAnonymizedZipOptions { repoId: string; @@ -12,6 +16,45 @@ export interface StreamAnonymizedZipOptions { commit: string; getToken: () => string | Promise<string>; anonymizerOptions: ConstructorParameters<typeof AnonymizeTransformer>[0]; + /** + * Per-repo content gates. Matches Repository.options — `image: true` + * includes images, `pdf: true` includes PDFs. The single-file `/file/...` + * endpoint enforces these via AnonymizedFile.isFileSupported; without + * the same gate here, the ZIP shipped a superset of what the per-file + * API exposes, which is privacy-relevant when a maintainer toggles + * image=false to suppress identifying screenshots. + */ + contentOptions?: { + image?: boolean; + pdf?: boolean; + }; +} + +const IMAGE_EXTENSIONS = new Set([ + "png", + "jpg", + "jpeg", + "gif", + "svg", + "ico", + "bmp", + "tiff", + "tif", + "webp", + "avif", + "heif", + "heic", +]); + +function isEntryAllowed( + filename: string, + contentOptions?: { image?: boolean; pdf?: boolean } +): boolean { + if (!contentOptions) return true; + const ext = filename.split(".").pop()?.toLowerCase() ?? 
""; + if (contentOptions.pdf === false && ext === "pdf") return false; + if (contentOptions.image === false && IMAGE_EXTENSIONS.has(ext)) return false; + return true; } /** @@ -47,30 +90,44 @@ export async function streamAnonymizedZip( }); const archive = archiver("zip", {}); + const compiledTerms = compileTerms(opt.anonymizerOptions.terms || []); + + // Track whether the upstream zipball finished cleanly. If it didn't, + // we must NOT finalize the archive — finalizing while bytes are still + // flowing to the response produces a valid-looking ZIP that's missing + // entries, which the client has no way to detect (status 200, archive + // opens). Destroy the response instead so the client sees a connection + // drop and knows the download failed. Same class of silent-truncation + // bug as #694. + let upstreamSucceeded = false; + const fail = (error: Error) => { + console.error(error); + archive.abort(); + const destroyable = res as unknown as { + destroy?: (err?: Error) => void; + end?: () => void; + }; + if (typeof destroyable.destroy === "function") { + destroyable.destroy(error); + } else if (typeof destroyable.end === "function") { + destroyable.end(); + } + }; + downloadStream - .on("error", (error) => { - console.error(error); - try { - archive.finalize(); - } catch { - /* ignored */ - } - }) - .on("close", () => { - try { - archive.finalize(); - } catch { - /* ignored */ - } - }) + .on("error", fail) .pipe(Parse()) .on("entry", (entry: NodeJS.ReadableStream & { type: string; path: string; autodrain: () => void }) => { if (entry.type === "File") { try { - const fileName = anonymizePath( + const fileName = anonymizePathCompiled( entry.path.substring(entry.path.indexOf("/") + 1), - opt.anonymizerOptions.terms || [] + compiledTerms ); + if (!isEntryAllowed(fileName, opt.contentOptions)) { + entry.autodrain(); + return; + } // Pass filePath via the constructor — AnonymizeTransformer reads it // there to decide whether the entry is text (and therefore should be 
// anonymized) vs binary (passthrough). Assigning afterwards leaves @@ -89,15 +146,9 @@ export async function streamAnonymizedZip( entry.autodrain(); } }) - .on("error", (error: Error) => { - console.error(error); - try { - archive.finalize(); - } catch { - /* ignored */ - } - }) + .on("error", fail) .on("finish", () => { + upstreamSucceeded = true; try { archive.finalize(); } catch { @@ -107,6 +158,12 @@ export async function streamAnonymizedZip( archive.pipe(res).on("error", (error) => { console.error(error); + if (!upstreamSucceeded) { + // archive errored while we were still depending on upstream bytes: + // treat as failure rather than truncating. + fail(error); + return; + } (res as { end?: () => void }).end?.(); }); } diff --git a/src/server/routes/repository-public.ts b/src/server/routes/repository-public.ts index b2ab9ee..7849032 100644 --- a/src/server/routes/repository-public.ts +++ b/src/server/routes/repository-public.ts @@ -63,6 +63,10 @@ router.get( branch: repo.model.source.branch, repoId: repo.repoId, anonymizerOptions: anonymizer.opt, + contentOptions: { + image: repo.options.image, + pdf: repo.options.pdf, + }, }, }) .on("error", () => { @@ -104,6 +108,10 @@ router.get( commit: repo.model.source.commit || "HEAD", getToken: () => repo.getToken(), anonymizerOptions: anonymizer.opt, + contentOptions: { + image: repo.options.image, + pdf: repo.options.pdf, + }, }, res ); diff --git a/src/streamer/route.ts b/src/streamer/route.ts index ef0cfd8..ea9dfcc 100644 --- a/src/streamer/route.ts +++ b/src/streamer/route.ts @@ -18,6 +18,7 @@ router.post( const repoId = req.body.repoId; const commit = req.body.commit; const anonymizerOptions = req.body.anonymizerOptions; + const contentOptions = req.body.contentOptions; try { await streamAnonymizedZip( @@ -28,6 +29,7 @@ router.post( commit, getToken: () => token, anonymizerOptions, + contentOptions, }, res ); @@ -45,10 +47,25 @@ router.post("/", async (req: express.Request, res: express.Response) => { const 
fileSize: number | undefined = typeof req.body.size === "number" ? req.body.size : undefined; const commit = req.body.commit; - const filePath = req.body.filePath; + const filePath: string = req.body.filePath; const anonymizerOptions = req.body.anonymizerOptions; const anonymizer = new AnonymizeTransformer(anonymizerOptions); + // Defence in depth: the parent server validates filePath against + // FileModel before calling us, but the streamer joins this directly + // into the storage path on disk. Reject any segment that could escape + // the repo root, in case the streamer is ever exposed beyond the + // internal network or a buggy caller forwards an unvalidated path. + if ( + typeof filePath !== "string" || + filePath.length === 0 || + filePath + .split(/[\\/]/) + .some((segment) => segment === ".." || segment === "") + ) { + return res.status(400).json({ error: "invalid_file_path" }); + } + const source = new GitHubStream({ repoId, organization: repoFullName[0],