fix(cache): atomic file writes and size-validated cache reads

A failed/interrupted GitHub fetch could leave a 0-byte or truncated
file in the local cache. Subsequent reads happily streamed the empty
content as the file's body — visible to users as an "Empty file" with
HTTP 200. Reproduced on artifact-70B6/Lethe/configs.py (#694).

- FileSystem.write: stream into a sibling .tmp and rename into place
  only on finish. Stream errors discard the tmp and leave any prior
  cached file untouched. Drop the utf-8 encoding that was silently
  corrupting binary blobs. (A sketch of this write path follows the
  list.)
- GitHubStream.getFileContentCache: accept an expected size and treat
  cached.size < expected as a poisoned cache (truncated fetch): remove
  the entry and re-fetch. cached.size >= expected is accepted, which
  keeps Git LFS-resolved files (whose FileModel.size is the pointer
  size) working.
- AnonymizedFile: expose size() and pass it through to the streamer
  alongside sha so the cache check has the upstream size.
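
For illustration, a minimal sketch of that write path in TypeScript,
assuming a Node Readable source and the node:fs/promises and
node:stream/promises APIs; FileSystem itself lives in one of the changed
files not shown on this page, so the shape below is indicative, not the
committed code:

import { createWriteStream } from "node:fs";
import { rename, rm } from "node:fs/promises";
import { pipeline } from "node:stream/promises";
import type { Readable } from "node:stream";

// Stream into <path>.tmp and rename over <path> only once the pipeline
// finishes. rename() is atomic on POSIX filesystems, so readers see the
// old file or the new one, never a partial write.
async function write(path: string, content: Readable): Promise<void> {
  const tmp = path + ".tmp";
  try {
    // No encoding option: bytes pass through raw, so binary blobs are
    // no longer mangled by an implicit utf-8 re-encode.
    await pipeline(content, createWriteStream(tmp));
    await rename(tmp, path);
  } catch (err) {
    // Discard the partial tmp; any previously cached file stays intact.
    await rm(tmp, { force: true });
    throw err;
  }
}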

Existing poisoned entries self-heal on next access.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
Author: tdurieux
Date:   2026-05-05 08:47:41 +03:00
Parent: 53959f677c
Commit: 9adff11e74

4 changed files with 76 additions and 11 deletions
@@ -125,11 +125,38 @@ export default class GitHubStream extends GitHubBase {
   async getFileContentCache(
     filePath: string,
     repoId: string,
-    fileSha: () => Promise<string> | string
+    fileMeta: () =>
+      | Promise<{ sha: string; size?: number }>
+      | { sha: string; size?: number }
+      | Promise<string>
+      | string
   ) {
+    const meta = await fileMeta();
+    const expected: { sha: string; size?: number } =
+      typeof meta === "string" ? { sha: meta } : meta;
     const fileInfo = await storage.exists(repoId, filePath);
     if (fileInfo == FILE_TYPE.FILE) {
-      return storage.read(repoId, filePath);
+      // If we know the upstream size, validate the cached entry. A cached
+      // file smaller than the upstream size means a previous fetch was
+      // truncated — likely a network error during the GitHub fetch left a
+      // 0-byte or partial blob behind. Treat it as a miss and re-fetch.
+      // Cached size >= expected is accepted: equal for normal files, and
+      // larger for Git LFS files where FileModel.size is the pointer's
+      // size but the cached bytes are the resolved LFS content.
+      if (expected.size != null && expected.size > 0) {
+        try {
+          const stat = await storage.fileInfo(repoId, filePath);
+          if (stat.size != null && stat.size < expected.size) {
+            await storage.rm(repoId, filePath);
+          } else {
+            return storage.read(repoId, filePath);
+          }
+        } catch {
+          // fall through and re-fetch
+        }
+      } else {
+        return storage.read(repoId, filePath);
+      }
     } else if (fileInfo == FILE_TYPE.FOLDER) {
       throw new AnonymousError("folder_not_supported", {
         httpStatus: 400,
@@ -137,7 +164,7 @@ export default class GitHubStream extends GitHubBase {
       });
     }
     const token = await this.data.getToken();
-    const blobStream = this.downloadFile(token, await fileSha());
+    const blobStream = this.downloadFile(token, expected.sha);
     // If the blob is a Git LFS pointer, swap to a raw-URL fetch so the
     // file content (not the pointer text) makes it into the pipeline. See
     // #95 — Support for Git LFS.
@@ -179,7 +206,7 @@ export default class GitHubStream extends GitHubBase {
             object: file,
           });
         }
-        return fileSha;
+        return { sha: fileSha, size: await file.size() };
       }
     );