fix(cache): make Zip-source caches atomic and robust to partial state

Follow-up to the GitHubStream cache fixes. The same poisoned-cache class existed in the GitHubDownload path and a few related spots: - GitHubDownload.download: wipe pre-existing state before extracting and write a .anon-complete marker only after a successful extract. On error, rm the partial cache so a retry starts clean. getFileContent and getFiles now gate on the marker instead of "any file/folder exists," so a half-extracted tree can never be served as canonical. - GitHubDownload.getFileContent: validate cached file size against the upstream FileModel size (via the new AnonymizedFile.size()), same guard as GitHubStream. getFiles filters the marker from the listing. - FileSystem.listFiles: drop the bogus stats.ino.toString() as sha. An inode isn't a content hash; anything comparing it to a Git blob sha would silently disagree. Leave undefined. - S3.write: remove the fire-and-forget data.on("error") -> this.rm(...). Multipart Upload doesn't commit partial objects, so there was nothing to clean up, and the handler raced retries and could delete a previously-good object on a transient source-stream hiccup. The size-validated read path recovers from any other undersized objects. - GitHubStream.resolveLfsPointer: drop the post-decision early-return in blobStream.on("error"). Currently redundant with the inner listener, but removes the future-refactor footgun. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-15 22:48:00 +02:00 · 2026-05-05 08:54:42 +03:00
parent 9adff11e74
commit f413a30313
4 changed files with 74 additions and 13 deletions
@@ -36,6 +36,10 @@ export default class GitHubDownload extends GitHubBase {
        cause: error as Error,
      });
    }
+    // Wipe any partial state from a previous failed extraction. Without
+    // this, half-extracted trees would survive and later be served as the
+    // canonical listing.
+    await storage.rm(this.data.repoId);
    await storage.mk(this.data.repoId);
    try {
      const downloadStream = got.stream(response.url);
@@ -53,7 +57,15 @@ export default class GitHubDownload extends GitHubBase {
        downloadStream,
        this.type
      );
+      // Write the completion marker last. Its presence — and only its
+      // presence — means "the cache for this repo is fully extracted and
+      // safe to serve." Any code path that finds files but no marker must
+      // treat the cache as incomplete and re-download.
+      await storage.write(this.data.repoId, COMPLETE_MARKER, "ok", this.type);
    } catch (error) {
+      // Best-effort cleanup of the partial extraction so the next call
+      // doesn't trip over half-written files.
+      await storage.rm(this.data.repoId).catch(() => undefined);
      throw new AnonymousError("unable_to_download", {
        httpStatus: 500,
        cause: error as Error,
@@ -62,12 +74,48 @@ export default class GitHubDownload extends GitHubBase {
    }
  }

+  private async isCacheComplete(): Promise<boolean> {
+    return (
+      (await storage.exists(this.data.repoId, COMPLETE_MARKER)) ===
+      FILE_TYPE.FILE
+    );
+  }
+
  async getFileContent(
    file: AnonymizedFile,
    progress?: (status: string) => void
  ): Promise<Readable> {
+    if (!(await this.isCacheComplete())) {
+      // will throw an error if the file is not in the repository
+      await file.originalPath();
+      await this.download(progress);
+      return storage.read(this.data.repoId, file.filePath);
+    }
    const exists = await storage.exists(this.data.repoId, file.filePath);
    if (exists === FILE_TYPE.FILE) {
+      // Validate the cached file size against the upstream tree size when
+      // it's known — guards against the same poisoned-cache class as in
+      // GitHubStream. cached.size >= expected is accepted (equal for
+      // normal files, larger for LFS-resolved blobs).
+      let expectedSize: number | undefined;
+      try {
+        expectedSize = await file.size();
+      } catch {
+        // not all callers populate a FileModel; fall back to reading
+      }
+      if (expectedSize != null && expectedSize > 0) {
+        try {
+          const stat = await storage.fileInfo(this.data.repoId, file.filePath);
+          if (stat.size != null && stat.size < expectedSize) {
+            await storage.rm(this.data.repoId, file.filePath);
+            await file.originalPath();
+            await this.download(progress);
+            return storage.read(this.data.repoId, file.filePath);
+          }
+        } catch {
+          // fall through to read; if read fails the caller surfaces the error
+        }
+      }
      return storage.read(this.data.repoId, file.filePath);
    } else if (exists === FILE_TYPE.FOLDER) {
      throw new AnonymousError("folder_not_supported", {
@@ -75,20 +123,19 @@ export default class GitHubDownload extends GitHubBase {
        object: file,
      });
    }
-    // will throw an error if the file is not in the repository
+    // marker present but file missing — shouldn't happen, but redownload
+    // rather than 404 on a stale partial cache.
    await file.originalPath();
-
-    // the cache is not ready, we need to download the repository
    await this.download(progress);
    return storage.read(this.data.repoId, file.filePath);
  }

  async getFiles(progress?: (status: string) => void) {
-    if ((await storage.exists(this.data.repoId)) === FILE_TYPE.NOT_FOUND) {
+    if (!(await this.isCacheComplete())) {
      await this.download(progress);
    }
    let nbFiles = 0;
-    return storage.listFiles(this.data.repoId, "", {
+    const all = await storage.listFiles(this.data.repoId, "", {
      onEntry: () => {
        if (progress) {
          nbFiles++;
@@ -96,9 +143,15 @@ export default class GitHubDownload extends GitHubBase {
        }
      },
    });
+    return all.filter((f) => f.name !== COMPLETE_MARKER);
  }
 }

+// Sentinel object/file written at the root of an extracted repo cache
+// once the extraction has finished successfully. Its presence means the
+// cache is complete. Hidden so it doesn't show up in user listings.
+const COMPLETE_MARKER = ".anon-complete";
+
 function humanFileSize(bytes: number, si = false, dp = 1) {
  const thresh = si ? 1000 : 1024;

@@ -114,7 +114,10 @@ export default class GitHubStream extends GitHubBase {
    });
    blobStream.on("end", () => decide());
    blobStream.on("error", (err) => {
-      if (decided) return;
+      // Always propagate — pre-decision this is the only listener; once a
+      // non-LFS decision is made, the inner branch attaches its own
+      // listener that will also fire, but we shouldn't rely on that being
+      // there if the code is later refactored.
      decided = true;
      out.destroy(err);
    });
@@ -153,13 +153,17 @@ export default class FileSystem extends StorageBase {
              size: stats.size,
            });
          }
+          // Don't synthesise a sha here. The previous value (stats.ino)
+          // wasn't a content hash — just an inode number — and any code
+          // that compared it to an upstream Git blob sha would silently
+          // disagree. Leave it undefined so callers either look up the
+          // real sha from FileModel/GitHub or skip sha-keyed paths.
          output2.push(
            new FileModel({
              name: file,
              path: dir,
              repoId: repoId,
              size: stats.size,
-              sha: stats.ino.toString(),
            })
          );
        }
@@ -175,12 +175,13 @@ export default class S3Storage extends StorageBase {
  ): Promise<void> {
    if (!config.S3_BUCKET) throw new Error("S3_BUCKET not set");

-    if (data instanceof Readable) {
-      data.on("error", (err) => {
-        console.error(`[ERROR] S3 write ${path}`, err);
-        this.rm(repoId, path);
-      });
-    }
+    // No fire-and-forget rm on stream error: Upload uses multipart and
+    // does not commit a partially-uploaded object, so there's nothing to
+    // clean up. The previous handler raced with retries and could delete
+    // a previously-good object on a transient source-stream hiccup. The
+    // size-validated read path in GitHubStream.getFileContentCache
+    // recovers from any object that does end up undersized for any
+    // reason.

    const params: PutObjectCommandInput = {
      Bucket: config.S3_BUCKET,