fix(cache): make Zip-source caches atomic and robust to partial state

Follow-up to the GitHubStream cache fixes. The same class of poisoned-cache
bug existed in the GitHubDownload path and in a few related spots:

- GitHubDownload.download: wipe pre-existing state before extracting
  and write a .anon-complete marker only after a successful extract.
  On error, rm the partial cache so a retry starts clean. getFileContent
  and getFiles now gate on the marker instead of "any file/folder
  exists," so a half-extracted tree can never be served as canonical.
- GitHubDownload.getFileContent: validate cached file size against the
  upstream FileModel size (via the new AnonymizedFile.size()), same
  guard as GitHubStream. getFiles filters the marker from the listing.
- FileSystem.listFiles: drop the bogus stats.ino.toString() as sha.
  An inode isn't a content hash; anything comparing it to a Git blob
  sha would silently disagree. Leave it undefined.
- S3.write: remove the fire-and-forget data.on("error") -> this.rm(...).
  Multipart Upload doesn't commit partial objects, so there was nothing
  to clean up, and the handler raced with retries and could delete a
  previously-good object on a transient source-stream hiccup. The
  size-validated read path recovers from any other undersized objects.
- GitHubStream.resolveLfsPointer: drop the post-decision early-return
  in blobStream.on("error"). The change is behavior-neutral today (once a
  non-LFS decision is made, the inner branch's own listener also fires),
  but it removes a future-refactor footgun.
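
For reviewers, the new invariant distills to a few lines. The following is
a minimal sketch against plain node:fs with invented names (fillCache, and
an isCacheComplete that takes a directory path); the project's storage
abstraction and error types differ:

import { promises as fs } from "fs";
import path from "path";

const COMPLETE_MARKER = ".anon-complete";

// Fill a cache directory so readers can never observe a half-written tree
// as complete: wipe any leftovers, fill, then write the marker last.
async function fillCache(
  dir: string,
  fill: (dir: string) => Promise<void>
): Promise<void> {
  await fs.rm(dir, { recursive: true, force: true }); // wipe partial state
  await fs.mkdir(dir, { recursive: true });
  try {
    await fill(dir);
    // Written only after fill() fully succeeds, so the marker's presence
    // implies a complete cache.
    await fs.writeFile(path.join(dir, COMPLETE_MARKER), "ok");
  } catch (err) {
    // Best-effort cleanup so the next attempt starts clean.
    await fs.rm(dir, { recursive: true, force: true }).catch(() => undefined);
    throw err;
  }
}

// Readers gate on the marker, never on "some files exist".
async function isCacheComplete(dir: string): Promise<boolean> {
  return fs
    .access(path.join(dir, COMPLETE_MARKER))
    .then(() => true)
    .catch(() => false);
}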

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
tdurieux
2026-05-05 08:54:42 +03:00
parent 9adff11e74
commit f413a30313
4 changed files with 74 additions and 13 deletions
+58 -5
@@ -36,6 +36,10 @@ export default class GitHubDownload extends GitHubBase {
        cause: error as Error,
      });
    }
    // Wipe any partial state from a previous failed extraction. Without
    // this, half-extracted trees would survive and later be served as the
    // canonical listing.
    await storage.rm(this.data.repoId);
    await storage.mk(this.data.repoId);
    try {
      const downloadStream = got.stream(response.url);
@@ -53,7 +57,15 @@ export default class GitHubDownload extends GitHubBase {
        downloadStream,
        this.type
      );
      // Write the completion marker last. Its presence, and only its
      // presence, means "the cache for this repo is fully extracted and
      // safe to serve." Any code path that finds files but no marker must
      // treat the cache as incomplete and re-download.
      await storage.write(this.data.repoId, COMPLETE_MARKER, "ok", this.type);
    } catch (error) {
      // Best-effort cleanup of the partial extraction so the next call
      // doesn't trip over half-written files.
      await storage.rm(this.data.repoId).catch(() => undefined);
      throw new AnonymousError("unable_to_download", {
        httpStatus: 500,
        cause: error as Error,
@@ -62,12 +74,48 @@ export default class GitHubDownload extends GitHubBase {
    }
  }
  private async isCacheComplete(): Promise<boolean> {
    return (
      (await storage.exists(this.data.repoId, COMPLETE_MARKER)) ===
      FILE_TYPE.FILE
    );
  }
  async getFileContent(
    file: AnonymizedFile,
    progress?: (status: string) => void
  ): Promise<Readable> {
    if (!(await this.isCacheComplete())) {
      // will throw an error if the file is not in the repository
      await file.originalPath();
      await this.download(progress);
      return storage.read(this.data.repoId, file.filePath);
    }
    const exists = await storage.exists(this.data.repoId, file.filePath);
    if (exists === FILE_TYPE.FILE) {
      // Validate the cached file size against the upstream tree size when
      // it's known; guards against the same poisoned-cache class as in
      // GitHubStream. cached.size >= expected is accepted (equal for
      // normal files, larger for LFS-resolved blobs).
      let expectedSize: number | undefined;
      try {
        expectedSize = await file.size();
      } catch {
        // not all callers populate a FileModel; fall back to reading
      }
      if (expectedSize != null && expectedSize > 0) {
        try {
          const stat = await storage.fileInfo(this.data.repoId, file.filePath);
          if (stat.size != null && stat.size < expectedSize) {
            await storage.rm(this.data.repoId, file.filePath);
            await file.originalPath();
            await this.download(progress);
            return storage.read(this.data.repoId, file.filePath);
          }
        } catch {
          // fall through to read; if read fails the caller surfaces the error
        }
      }
      return storage.read(this.data.repoId, file.filePath);
    } else if (exists === FILE_TYPE.FOLDER) {
      throw new AnonymousError("folder_not_supported", {
@@ -75,20 +123,19 @@ export default class GitHubDownload extends GitHubBase {
        object: file,
      });
    }
    // will throw an error if the file is not in the repository
    // marker present but file missing; shouldn't happen, but redownload
    // rather than 404 on a stale partial cache.
    await file.originalPath();
    // the cache is not ready, we need to download the repository
    await this.download(progress);
    return storage.read(this.data.repoId, file.filePath);
  }
  async getFiles(progress?: (status: string) => void) {
    if ((await storage.exists(this.data.repoId)) === FILE_TYPE.NOT_FOUND) {
    if (!(await this.isCacheComplete())) {
      await this.download(progress);
    }
    let nbFiles = 0;
    return storage.listFiles(this.data.repoId, "", {
    const all = await storage.listFiles(this.data.repoId, "", {
      onEntry: () => {
        if (progress) {
          nbFiles++;
@@ -96,9 +143,15 @@ export default class GitHubDownload extends GitHubBase {
        }
      },
    });
    return all.filter((f) => f.name !== COMPLETE_MARKER);
  }
}
// Sentinel object/file written at the root of an extracted repo cache
// once the extraction has finished successfully. Its presence means the
// cache is complete. Hidden so it doesn't show up in user listings.
const COMPLETE_MARKER = ".anon-complete";
function humanFileSize(bytes: number, si = false, dp = 1) {
  const thresh = si ? 1000 : 1024;
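
The hunks above call file.size() but the method itself lands outside this
diff. A hedged sketch of its plausible shape; the class and field names
here (FileModelLike, fileModel) are invented for illustration, and throwing
on a missing model matches the try/catch at the call site:

interface FileModelLike {
  size?: number;
}

class AnonymizedFileSketch {
  constructor(
    private fileModel: FileModelLike | undefined,
    public filePath: string
  ) {}

  // Upstream size recorded when the repository tree was listed. Throws
  // when no FileModel backs this path, so the caller falls back to a
  // plain read instead of guessing a size.
  async size(): Promise<number> {
    if (this.fileModel?.size == null) {
      throw new Error(`no recorded upstream size for ${this.filePath}`);
    }
    return this.fileModel.size;
  }
}
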
+4 -1
@@ -114,7 +114,10 @@ export default class GitHubStream extends GitHubBase {
    });
    blobStream.on("end", () => decide());
    blobStream.on("error", (err) => {
      if (decided) return;
      // Always propagate: pre-decision this is the only listener; once a
      // non-LFS decision is made, the inner branch attaches its own
      // listener that will also fire, but we shouldn't rely on that being
      // there if the code is later refactored.
      decided = true;
      out.destroy(err);
    });
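
The guard above is an instance of a settle-once pattern: every terminal
event funnels through one check so exactly one outcome wins and errors are
propagated rather than swallowed. A standalone sketch with generic names
(guardedPipe is not the project's code):

import { PassThrough, Readable } from "stream";

// Forward a source stream into a fresh output while guaranteeing exactly
// one terminal outcome: whichever of "end" or "error" fires first wins,
// and the guard turns later events into no-ops instead of lost errors.
function guardedPipe(src: Readable): Readable {
  const out = new PassThrough();
  let settled = false;
  src.pipe(out, { end: false });
  src.on("end", () => {
    if (settled) return;
    settled = true;
    out.end();
  });
  src.on("error", (err) => {
    if (settled) return;
    settled = true;
    out.destroy(err); // always propagate to the consumer
  });
  return out;
}
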
+5 -1
@@ -153,13 +153,17 @@ export default class FileSystem extends StorageBase {
          size: stats.size,
        });
      }
      // Don't synthesise a sha here. The previous value (stats.ino)
      // wasn't a content hash, just an inode number, and any code
      // that compared it to an upstream Git blob sha would silently
      // disagree. Leave it undefined so callers either look up the
      // real sha from FileModel/GitHub or skip sha-keyed paths.
      output2.push(
        new FileModel({
          name: file,
          path: dir,
          repoId: repoId,
          size: stats.size,
          sha: stats.ino.toString(),
        })
      );
    }
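
For context on why the inode could never stand in for the sha: a Git blob
sha is content-addressed, the SHA-1 of "blob <byte length>\0" followed by
the content. A self-contained check using node:crypto:

import { createHash } from "crypto";

// Compute the Git blob sha for a buffer: SHA-1 over the "blob <len>\0"
// header plus the content, hex-encoded.
function gitBlobSha(content: Buffer): string {
  return createHash("sha1")
    .update(`blob ${content.length}\0`)
    .update(content)
    .digest("hex");
}

// gitBlobSha(Buffer.from("hello\n")) yields
// "ce013625030ba8dba906f756967f9e9ca394464a", the same value as
// `printf 'hello\n' | git hash-object --stdin`; an inode number can only
// ever agree with this by accident.
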
+7 -6
@@ -175,12 +175,13 @@ export default class S3Storage extends StorageBase {
  ): Promise<void> {
    if (!config.S3_BUCKET) throw new Error("S3_BUCKET not set");
    if (data instanceof Readable) {
      data.on("error", (err) => {
        console.error(`[ERROR] S3 write ${path}`, err);
        this.rm(repoId, path);
      });
    }
    // No fire-and-forget rm on stream error: Upload uses multipart and
    // does not commit a partially-uploaded object, so there's nothing to
    // clean up. The previous handler raced with retries and could delete
    // a previously-good object on a transient source-stream hiccup. The
    // size-validated read path in GitHubStream.getFileContentCache
    // recovers from any object that does end up undersized for any
    // reason.
    const params: PutObjectCommandInput = {
      Bucket: config.S3_BUCKET,
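
A sketch of the multipart property the removed handler misunderstood, using
@aws-sdk/lib-storage (bucket and key are illustrative, and this assumes the
default leavePartsOnError: false, under which a failed upload is aborted):

import { S3Client } from "@aws-sdk/client-s3";
import { Upload } from "@aws-sdk/lib-storage";
import { Readable } from "stream";

async function writeStream(client: S3Client, body: Readable): Promise<void> {
  const upload = new Upload({
    client,
    params: { Bucket: "my-bucket", Key: "repo/cache/file", Body: body },
  });
  // done() resolves only once CompleteMultipartUpload has committed the
  // object. If it rejects, the key either keeps its previous value or
  // stays absent, so an rm-on-error handler can only ever destroy a
  // previously-good object, never clean up a partial one.
  await upload.done();
}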