fix(cache): atomic file writes and size-validated cache reads

A failed/interrupted GitHub fetch could leave a 0-byte or truncated
file in the local cache. Subsequent reads happily streamed the empty
or partial content as the file's body — visible to users as an
"Empty file" with HTTP 200. Reproduced on artifact-70B6/Lethe/configs.py (#694).

- FileSystem.write: stream into a sibling .tmp and rename into place
  only on finish. Stream errors discard the tmp and leave any prior
  cached file untouched. Drop the utf-8 encoding that was silently
  corrupting binary blobs.
- GitHubStream.getFileContentCache: accept an expected size and treat
  a cached entry smaller than the expected size as a poisoned cache
  (truncated fetch) → rm and re-fetch. A cached size >= the expected
  size is accepted, which keeps Git LFS-resolved files (whose
  FileModel.size is the pointer size) working.
- AnonymizedFile: expose size() and pass it through to the streamer
  alongside sha so the cache check has the upstream size.

Existing poisoned entries self-heal on next access.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
tdurieux
2026-05-05 08:47:41 +03:00
parent 53959f677c
commit 9adff11e74
4 changed files with 76 additions and 11 deletions
+10 -1
View File
@@ -39,6 +39,12 @@ export default class AnonymizedFile {
return this._file.sha?.replace(/"/g, "");
}
async size(): Promise<number | undefined> {
if (this._file) return this._file.size;
this._file = await this.getFileInfo();
return this._file.size;
}
async getFileInfo(): Promise<IFile> {
if (this._file) return this._file;
let fileDir = dirname(this.anonymizedPath);
@@ -200,6 +206,7 @@ export default class AnonymizedFile {
repoId: this.repository.repoId,
filePath: this.filePath,
sha: await this.sha(),
size: await this.size(),
anonymizerOptions: anonymizer.opt,
},
});
@@ -228,8 +235,9 @@ export default class AnonymizedFile {
try {
if (config.STREAMER_ENTRYPOINT) {
// use the streamer service
const [sha, token] = await Promise.all([
const [sha, size, token] = await Promise.all([
this.sha(),
this.size(),
this.repository.getToken(),
]);
const resStream = got
@@ -237,6 +245,7 @@ export default class AnonymizedFile {
method: "POST",
json: {
sha,
size,
token,
repoFullName: this.repository.model.source.repositoryName,
commit: this.repository.model.source.commit,
+31 -4
View File
@@ -125,11 +125,38 @@ export default class GitHubStream extends GitHubBase {
async getFileContentCache(
filePath: string,
repoId: string,
fileSha: () => Promise<string> | string
fileMeta: () =>
| Promise<{ sha: string; size?: number }>
| { sha: string; size?: number }
| Promise<string>
| string
) {
const meta = await fileMeta();
const expected: { sha: string; size?: number } =
typeof meta === "string" ? { sha: meta } : meta;
const fileInfo = await storage.exists(repoId, filePath);
if (fileInfo == FILE_TYPE.FILE) {
return storage.read(repoId, filePath);
// If we know the upstream size, validate the cached entry. A cached
// file smaller than the upstream size means a previous fetch was
// truncated — likely a network error during the GitHub fetch left a
// 0-byte or partial blob behind. Treat it as a miss and re-fetch.
// Cached size >= expected is accepted: equal for normal files, and
// larger for Git LFS files where FileModel.size is the pointer's
// size but the cached bytes are the resolved LFS content.
if (expected.size != null && expected.size > 0) {
try {
const stat = await storage.fileInfo(repoId, filePath);
if (stat.size != null && stat.size < expected.size) {
await storage.rm(repoId, filePath);
} else {
return storage.read(repoId, filePath);
}
} catch {
// fall through and re-fetch
}
} else {
return storage.read(repoId, filePath);
}
} else if (fileInfo == FILE_TYPE.FOLDER) {
throw new AnonymousError("folder_not_supported", {
httpStatus: 400,
@@ -137,7 +164,7 @@ export default class GitHubStream extends GitHubBase {
});
}
const token = await this.data.getToken();
const blobStream = this.downloadFile(token, await fileSha());
const blobStream = this.downloadFile(token, expected.sha);
// If the blob is a Git LFS pointer, swap to a raw-URL fetch so the
// file content (not the pointer text) makes it into the pipeline. See
// #95 — Support for Git LFS.
@@ -179,7 +206,7 @@ export default class GitHubStream extends GitHubBase {
object: file,
});
}
return fileSha;
return { sha: fileSha, size: await file.size() };
}
);
}
+32 -5
View File
@@ -62,16 +62,43 @@ export default class FileSystem extends StorageBase {
data: string | Readable
): Promise<void> {
const fullPath = join(config.FOLDER, this.repoPath(repoId), p);
// Atomic write: stream into a sibling .tmp and only rename into place
// when the source stream finishes successfully. If the source errors
// mid-flight (transient GitHub 5xx, socket reset, etc.), we drop the
// tmp and leave any pre-existing cached file untouched. Without this,
// a partial fetch would commit a 0-byte or truncated cache entry that
// future reads would happily serve as the file's content.
await this.mk(repoId, dirname(p));
const tmpPath = `${fullPath}.tmp.${process.pid}.${Date.now()}.${Math.random()
.toString(36)
.slice(2, 8)}`;
try {
await this.mk(repoId, dirname(p));
if (data instanceof Readable) {
data.on("error", (_err) => {
this.rm(repoId, p);
if (typeof data === "string") {
await fs.promises.writeFile(tmpPath, data);
} else {
await new Promise<void>((resolve, reject) => {
const ws = fs.createWriteStream(tmpPath);
let settled = false;
const finish = (err?: Error) => {
if (settled) return;
settled = true;
if (err) {
ws.destroy();
reject(err);
} else {
resolve();
}
};
data.on("error", finish);
ws.on("error", finish);
ws.on("finish", () => finish());
data.pipe(ws);
});
}
return await fs.promises.writeFile(fullPath, data, "utf-8");
await fs.promises.rename(tmpPath, fullPath);
} catch (err) {
console.error("[ERROR] FileSystem.write failed:", err);
await fs.promises.rm(tmpPath, { force: true }).catch(() => undefined);
throw err;
}
}
+3 -1
View File
@@ -42,6 +42,8 @@ router.post("/", async (req: express.Request, res: express.Response) => {
const repoFullName = req.body.repoFullName.split("/");
const repoId = req.body.repoId;
const fileSha = req.body.sha;
const fileSize: number | undefined =
typeof req.body.size === "number" ? req.body.size : undefined;
const commit = req.body.commit;
const filePath = req.body.filePath;
const anonymizerOptions = req.body.anonymizerOptions;
@@ -58,7 +60,7 @@ router.post("/", async (req: express.Request, res: express.Response) => {
const content = await source.getFileContentCache(
filePath,
repoId,
() => fileSha
() => ({ sha: fileSha, size: fileSize })
);
const mime = lookup(filePath);
if (mime && !filePath.endsWith(".ts")) {