fix: silent-truncation, token-refresh, and content-type bugs across hot paths

Follow-up review pass after the cache fixes turned up several bugs in
the same family — silent failures that look like success to the client,
plus content-correctness issues in the ZIP and per-file delivery paths.

- zipStream: stop calling archive.finalize() on upstream/parser errors.
  That produced a valid-looking ZIP (200 OK, archive opens) silently
  missing entries — same class as #694, but worse because the user has
  no signal anything went wrong. Destroy the response on failure
  instead so the client sees a connection drop.
- zipStream: apply per-repo image/pdf gates inside the entry handler.
  The single-file /file/... endpoint refuses to serve those types
  via AnonymizedFile.isFileSupported when image=false / pdf=false, but
  the ZIP shipped them anyway — privacy-relevant for maintainers who
  toggle image=false to suppress identifying screenshots. Threaded
  contentOptions through both ZIP entry points (direct and streamer).
- GitHubUtils.getToken: validate the OAuth token-refresh response
  before persisting. On a non-2xx response or a body without a string
  token, we used to overwrite the stored token with `undefined`, which
  then propagated as `Authorization: token undefined` to every API
  call — 401 even on public repos, with the config.GITHUB_TOKEN
  fallback unreachable because the field was no longer falsy.
- AnonymizedFile.send (streamer branch): forward Content-Type from the
  upstream streamer response. got.stream(...).pipe(res) carries body
  bytes only, so the parent response had no Content-Type and browsers
  guessed (text rendered as download, etc.). Also resolve on
  res.on("finish") in addition to "close" — keep-alive sockets stay
  open long after the response is delivered, delaying countView().
- Repository.updateIfNeeded: persist a renamed source.repositoryName
  even when the commit hasn't changed. Previously the new value lived
  in memory only and was overwritten on the next reload, so the
  rename detection ran every request.
- Repository.anonymize: stop materialising a dummy {path:"",name:""}
  FileModel for empty repos. That row collided with the special case
  in AnonymizedFile.getFileInfo and surfaced in unfiltered listings.
- streamer/route POST /: reject filePath segments containing ".." or
  empty parts. Defence in depth — the parent server validates against
  FileModel before calling, but the streamer joins filePath straight
  into the storage path, so any future caller forwarding an
  unvalidated path could traverse out of the repo root.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
tdurieux
2026-05-05 09:19:05 +03:00
parent f413a30313
commit 5b72b630c4
7 changed files with 192 additions and 52 deletions
+1
View File
@@ -44,6 +44,7 @@
"invalid_path": "The provided path is invalid or missing.",
"path_not_specified": "A file path must be specified.",
"path_not_defined": "The file path has not been resolved yet.",
"invalid_file_path": "The requested file path is not valid.",
"no_file_selected": "Please select a file.",
"file_not_found": "The requested file is not found.",
"file_not_accessible": "The requested file is not accessible.",
+31
View File
@@ -264,7 +264,29 @@ export default class AnonymizedFile {
res
);
});
// Forward Content-Type from the streamer's upstream response.
// got.stream(...).pipe(res) forwards body bytes only — without
// this, the parent response has no Content-Type and the browser
// guesses (text renders as download, images as octet-stream).
resStream.on("response", (upstream: { headers: Record<string, string | string[] | undefined> }) => {
if (res.headersSent) return;
const ct = upstream.headers["content-type"];
if (typeof ct === "string") {
res.contentType(ct);
} else {
const fallback = lookup(this.anonymizedPath);
if (fallback) res.contentType(fallback);
else if (isTextFile(this.anonymizedPath)) res.contentType("text/plain");
}
});
resStream.pipe(res);
// Resolve as soon as the response is fully written rather than
// waiting for the socket to close — keep-alive sockets stay open
// long after the body is delivered, and we don't want to delay
// post-send work like countView() that long.
res.on("finish", () => {
resolve();
});
res.on("close", () => {
resolve();
});
@@ -311,6 +333,15 @@ export default class AnonymizedFile {
.pipe(anonymizer)
.pipe(res)
.on("error", handleStreamError)
.on("finish", () => {
// resolve on body fully written rather than waiting for the
// socket to close — keep-alive can hold the socket open long
// after the response is delivered, delaying post-send work.
if (!content.closed && !content.destroyed) {
content.destroy();
}
resolve();
})
.on("close", () => {
if (!content.closed && !content.destroyed) {
content.destroy();
+29 -10
View File
@@ -75,17 +75,36 @@ export async function getToken(repository: Repository) {
).toString("base64"),
},
});
const resBody = (await res.json()) as { token: string };
repository.owner.model.accessTokens.github = resBody.token;
if (!repository.owner.model.accessTokenDates) {
repository.owner.model.accessTokenDates = {
github: new Date(),
};
} else {
repository.owner.model.accessTokenDates.github = new Date();
// Only persist a refreshed token if GitHub actually returned a
// valid one. Without this guard, a 4xx/5xx error body (revoked
// OAuth, rate limit, transient outage) silently overwrites the
// user's stored token with `undefined`, which then propagates as
// `Authorization: token undefined` to every subsequent API call —
// 401 even on public repos, and the config.GITHUB_TOKEN fallback
// below is unreachable because the token field is no longer falsy.
if (res.ok) {
const resBody = (await res.json().catch(() => null)) as
| { token?: unknown }
| null;
const refreshed =
resBody && typeof resBody.token === "string" && resBody.token.length > 0
? resBody.token
: null;
if (refreshed) {
repository.owner.model.accessTokens.github = refreshed;
if (!repository.owner.model.accessTokenDates) {
repository.owner.model.accessTokenDates = { github: new Date() };
} else {
repository.owner.model.accessTokenDates.github = new Date();
}
await repository.owner.model.save();
return refreshed;
}
}
await repository.owner.model.save();
return resBody.token;
console.warn(
`[getToken] refresh failed for ${repository.owner.model.username} (status ${res.status}); falling back`
);
// fall through to the checkToken path / config.GITHUB_TOKEN
}
const check = await checkToken(ownerAccessToken);
if (check) {
+22 -15
View File
@@ -5,7 +5,7 @@ import * as sha1 from "crypto-js/sha1";
import User from "./User";
import GitHubStream from "./source/GitHubStream";
import Zip from "./source/Zip";
import { anonymizePath } from "./anonymize-utils";
import { anonymizePathCompiled, compileTerms } from "./anonymize-utils";
import UserModel from "./model/users/users.model";
import { IAnonymizedRepositoryDocument } from "./model/anonymizedRepositories/anonymizedRepositories.types";
import { AnonymizeTransformer } from "./anonymize-utils";
@@ -35,10 +35,11 @@ function anonymizeTreeRecursive(
includeSha: false,
}
): Partial<IFile>[] {
const compiled = compileTerms(terms);
return tree.map((file) => {
return {
name: anonymizePath(file.name, terms),
path: anonymizePath(file.path, terms),
name: anonymizePathCompiled(file.name, compiled),
path: anonymizePathCompiled(file.path, compiled),
size: file.size,
sha: opt.includeSha
? file.sha
@@ -293,8 +294,17 @@ export default class Repository {
force: true,
});
// update the repository name if it has changed
this.model.source.repositoryName = ghRepo.fullName;
// update the repository name if it has changed. Persist it
// immediately — otherwise, when the commit is unchanged and the
// function returns early below, the renamed value lives in this
// in-memory model only and the next request reloads the stale
// name from MongoDB and re-runs the rename detection forever.
if (this.model.source.repositoryName !== ghRepo.fullName) {
this.model.source.repositoryName = ghRepo.fullName;
if (isConnected) {
await this._model.save();
}
}
const branches = await ghRepo.branches({
force: true,
accessToken: token,
@@ -359,20 +369,17 @@ export default class Repository {
}
this.model.increment();
await this.updateStatus(RepositoryStatus.DOWNLOAD);
const files = await this.files({
await this.files({
force: false,
progress,
recursive: false,
});
if (files.length === 0) {
// create a dummy file when the repo is empty to avoid errors
await new FileModel({
repoId: this.repoId,
path: "",
name: "",
size: 0,
}).save();
}
// Previously inserted a dummy {path:"", name:"", size:0} FileModel
// here for empty repos "to avoid errors" — but that record collides
// with the special-case in AnonymizedFile.getFileInfo for the empty
// path, surfaces in unfiltered file listings, and breaks anything
// that assumes FileModel rows correspond to real files. Empty repos
// are handled by the route layer; nothing to materialise here.
await this.updateStatus(RepositoryStatus.READY);
await this.computeSize();
}
+83 -26
View File
@@ -3,7 +3,11 @@ import { Parse } from "unzip-stream";
import archiver = require("archiver");
import GitHubDownload from "./source/GitHubDownload";
import { AnonymizeTransformer, anonymizePath } from "./anonymize-utils";
import {
AnonymizeTransformer,
anonymizePathCompiled,
compileTerms,
} from "./anonymize-utils";
export interface StreamAnonymizedZipOptions {
repoId: string;
@@ -12,6 +16,45 @@ export interface StreamAnonymizedZipOptions {
commit: string;
getToken: () => string | Promise<string>;
anonymizerOptions: ConstructorParameters<typeof AnonymizeTransformer>[0];
/**
* Per-repo content gates. Matches Repository.options — `image: true`
* includes images, `pdf: true` includes PDFs. The single-file `/file/...`
* endpoint enforces these via AnonymizedFile.isFileSupported; without
* the same gate here, the ZIP shipped a superset of what the per-file
* API exposes, which is privacy-relevant when a maintainer toggles
* image=false to suppress identifying screenshots.
*/
contentOptions?: {
image?: boolean;
pdf?: boolean;
};
}
/**
 * Extensions treated as images for the per-repo `image` content gate.
 * Comparison is done on the lowercased extension, so mixed-case names
 * (e.g. `shot.PNG`) are covered too.
 */
const IMAGE_EXTENSIONS = new Set([
  "png",
  "jpg",
  "jpeg",
  "gif",
  "svg",
  "ico",
  "bmp",
  "tiff",
  "tif",
  "webp",
  "avif",
  "heif",
  "heic",
]);

/**
 * Decide whether a ZIP entry may be shipped under the per-repo content
 * gates. Omitted options mean "no gating" — every entry is allowed.
 *
 * @param filename - anonymized entry path inside the archive
 * @param contentOptions - per-repo gates; a flag explicitly `false`
 *   excludes the corresponding file type (`undefined`/`true` allow it)
 * @returns `true` when the entry may be included in the archive
 */
function isEntryAllowed(
  filename: string,
  contentOptions?: { image?: boolean; pdf?: boolean }
): boolean {
  if (!contentOptions) return true;
  // Derive the extension from the basename, and only when a dot is
  // actually present. The previous `split(".").pop()` returned the
  // whole name for extensionless files, so a file literally named
  // "pdf" or "png" was wrongly gated out.
  const base = filename.slice(filename.lastIndexOf("/") + 1);
  const dot = base.lastIndexOf(".");
  const ext = dot >= 0 ? base.slice(dot + 1).toLowerCase() : "";
  if (contentOptions.pdf === false && ext === "pdf") return false;
  if (contentOptions.image === false && IMAGE_EXTENSIONS.has(ext)) return false;
  return true;
}
/**
@@ -47,30 +90,44 @@ export async function streamAnonymizedZip(
});
const archive = archiver("zip", {});
const compiledTerms = compileTerms(opt.anonymizerOptions.terms || []);
// Track whether the upstream zipball finished cleanly. If it didn't,
// we must NOT finalize the archive — finalizing while bytes are still
// flowing to the response produces a valid-looking ZIP that's missing
// entries, which the client has no way to detect (status 200, archive
// opens). Destroy the response instead so the client sees a connection
// drop and knows the download failed. Same class of silent-truncation
// bug as #694.
let upstreamSucceeded = false;
// Shared failure path for every stage of the pipeline: log, abort the
// archive so no half-built ZIP is finalized, then tear down the client
// response. Destroying (rather than ending) the response makes the
// client see a connection drop instead of a plausible-looking 200.
const fail = (error: Error) => {
  console.error(error);
  archive.abort();
  // `res` may be a raw socket-backed ServerResponse (has destroy) or a
  // thinner writable that only exposes end(); handle both shapes.
  const sink = res as unknown as {
    destroy?: (err?: Error) => void;
    end?: () => void;
  };
  if (typeof sink.destroy === "function") {
    sink.destroy(error);
    return;
  }
  if (typeof sink.end === "function") {
    sink.end();
  }
};
downloadStream
.on("error", (error) => {
console.error(error);
try {
archive.finalize();
} catch {
/* ignored */
}
})
.on("close", () => {
try {
archive.finalize();
} catch {
/* ignored */
}
})
.on("error", fail)
.pipe(Parse())
.on("entry", (entry: NodeJS.ReadableStream & { type: string; path: string; autodrain: () => void }) => {
if (entry.type === "File") {
try {
const fileName = anonymizePath(
const fileName = anonymizePathCompiled(
entry.path.substring(entry.path.indexOf("/") + 1),
opt.anonymizerOptions.terms || []
compiledTerms
);
if (!isEntryAllowed(fileName, opt.contentOptions)) {
entry.autodrain();
return;
}
// Pass filePath via the constructor — AnonymizeTransformer reads it
// there to decide whether the entry is text (and therefore should be
// anonymized) vs binary (passthrough). Assigning afterwards leaves
@@ -89,15 +146,9 @@ export async function streamAnonymizedZip(
entry.autodrain();
}
})
.on("error", (error: Error) => {
console.error(error);
try {
archive.finalize();
} catch {
/* ignored */
}
})
.on("error", fail)
.on("finish", () => {
upstreamSucceeded = true;
try {
archive.finalize();
} catch {
@@ -107,6 +158,12 @@ export async function streamAnonymizedZip(
archive.pipe(res).on("error", (error) => {
console.error(error);
if (!upstreamSucceeded) {
// archive errored while we were still depending on upstream bytes:
// treat as failure rather than truncating.
fail(error);
return;
}
(res as { end?: () => void }).end?.();
});
}
+8
View File
@@ -63,6 +63,10 @@ router.get(
branch: repo.model.source.branch,
repoId: repo.repoId,
anonymizerOptions: anonymizer.opt,
contentOptions: {
image: repo.options.image,
pdf: repo.options.pdf,
},
},
})
.on("error", () => {
@@ -104,6 +108,10 @@ router.get(
commit: repo.model.source.commit || "HEAD",
getToken: () => repo.getToken(),
anonymizerOptions: anonymizer.opt,
contentOptions: {
image: repo.options.image,
pdf: repo.options.pdf,
},
},
res
);
+18 -1
View File
@@ -18,6 +18,7 @@ router.post(
const repoId = req.body.repoId;
const commit = req.body.commit;
const anonymizerOptions = req.body.anonymizerOptions;
const contentOptions = req.body.contentOptions;
try {
await streamAnonymizedZip(
@@ -28,6 +29,7 @@ router.post(
commit,
getToken: () => token,
anonymizerOptions,
contentOptions,
},
res
);
@@ -45,10 +47,25 @@ router.post("/", async (req: express.Request, res: express.Response) => {
const fileSize: number | undefined =
typeof req.body.size === "number" ? req.body.size : undefined;
const commit = req.body.commit;
const filePath = req.body.filePath;
const filePath: string = req.body.filePath;
const anonymizerOptions = req.body.anonymizerOptions;
const anonymizer = new AnonymizeTransformer(anonymizerOptions);
// Defence in depth: the parent server validates filePath against
// FileModel before calling us, but the streamer joins this directly
// into the storage path on disk. Reject any segment that could escape
// the repo root, in case the streamer is ever exposed beyond the
// internal network or a buggy caller forwards an unvalidated path.
if (
typeof filePath !== "string" ||
filePath.length === 0 ||
filePath
.split(/[\\/]/)
.some((segment) => segment === ".." || segment === "")
) {
return res.status(400).json({ error: "invalid_file_path" });
}
const source = new GitHubStream({
repoId,
organization: repoFullName[0],