mirror of
https://github.com/tdurieux/anonymous_github.git
synced 2026-05-15 22:48:00 +02:00
fix(cache): make Zip-source caches atomic and robust to partial state
Follow-up to the GitHubStream cache fixes. The same poisoned-cache
class existed in the GitHubDownload path and a few related spots:
- GitHubDownload.download: wipe pre-existing state before extracting
and write a .anon-complete marker only after a successful extract.
On error, rm the partial cache so a retry starts clean. getFileContent
and getFiles now gate on the marker instead of "any file/folder
exists," so a half-extracted tree can never be served as canonical.
- GitHubDownload.getFileContent: validate cached file size against the
upstream FileModel size (via the new AnonymizedFile.size()), same
guard as GitHubStream. getFiles filters the marker from the listing.
- FileSystem.listFiles: drop the bogus stats.ino.toString() as sha.
An inode isn't a content hash; anything comparing it to a Git blob
sha would silently disagree. Leave undefined.
- S3.write: remove the fire-and-forget data.on("error") -> this.rm(...).
Multipart Upload doesn't commit partial objects, so there was nothing
to clean up, and the handler raced retries and could delete a
previously-good object on a transient source-stream hiccup. The
size-validated read path recovers from any other undersized objects.
- GitHubStream.resolveLfsPointer: drop the post-decision early-return
in blobStream.on("error"). Currently redundant with the inner
listener, but removes the future-refactor footgun.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -36,6 +36,10 @@ export default class GitHubDownload extends GitHubBase {
|
||||
cause: error as Error,
|
||||
});
|
||||
}
|
||||
// Wipe any partial state from a previous failed extraction. Without
|
||||
// this, half-extracted trees would survive and later be served as the
|
||||
// canonical listing.
|
||||
await storage.rm(this.data.repoId);
|
||||
await storage.mk(this.data.repoId);
|
||||
try {
|
||||
const downloadStream = got.stream(response.url);
|
||||
@@ -53,7 +57,15 @@ export default class GitHubDownload extends GitHubBase {
|
||||
downloadStream,
|
||||
this.type
|
||||
);
|
||||
// Write the completion marker last. Its presence — and only its
|
||||
// presence — means "the cache for this repo is fully extracted and
|
||||
// safe to serve." Any code path that finds files but no marker must
|
||||
// treat the cache as incomplete and re-download.
|
||||
await storage.write(this.data.repoId, COMPLETE_MARKER, "ok", this.type);
|
||||
} catch (error) {
|
||||
// Best-effort cleanup of the partial extraction so the next call
|
||||
// doesn't trip over half-written files.
|
||||
await storage.rm(this.data.repoId).catch(() => undefined);
|
||||
throw new AnonymousError("unable_to_download", {
|
||||
httpStatus: 500,
|
||||
cause: error as Error,
|
||||
@@ -62,12 +74,48 @@ export default class GitHubDownload extends GitHubBase {
|
||||
}
|
||||
}
|
||||
|
||||
private async isCacheComplete(): Promise<boolean> {
|
||||
return (
|
||||
(await storage.exists(this.data.repoId, COMPLETE_MARKER)) ===
|
||||
FILE_TYPE.FILE
|
||||
);
|
||||
}
|
||||
|
||||
async getFileContent(
|
||||
file: AnonymizedFile,
|
||||
progress?: (status: string) => void
|
||||
): Promise<Readable> {
|
||||
if (!(await this.isCacheComplete())) {
|
||||
// will throw an error if the file is not in the repository
|
||||
await file.originalPath();
|
||||
await this.download(progress);
|
||||
return storage.read(this.data.repoId, file.filePath);
|
||||
}
|
||||
const exists = await storage.exists(this.data.repoId, file.filePath);
|
||||
if (exists === FILE_TYPE.FILE) {
|
||||
// Validate the cached file size against the upstream tree size when
|
||||
// it's known — guards against the same poisoned-cache class as in
|
||||
// GitHubStream. cached.size >= expected is accepted (equal for
|
||||
// normal files, larger for LFS-resolved blobs).
|
||||
let expectedSize: number | undefined;
|
||||
try {
|
||||
expectedSize = await file.size();
|
||||
} catch {
|
||||
// not all callers populate a FileModel; fall back to reading
|
||||
}
|
||||
if (expectedSize != null && expectedSize > 0) {
|
||||
try {
|
||||
const stat = await storage.fileInfo(this.data.repoId, file.filePath);
|
||||
if (stat.size != null && stat.size < expectedSize) {
|
||||
await storage.rm(this.data.repoId, file.filePath);
|
||||
await file.originalPath();
|
||||
await this.download(progress);
|
||||
return storage.read(this.data.repoId, file.filePath);
|
||||
}
|
||||
} catch {
|
||||
// fall through to read; if read fails the caller surfaces the error
|
||||
}
|
||||
}
|
||||
return storage.read(this.data.repoId, file.filePath);
|
||||
} else if (exists === FILE_TYPE.FOLDER) {
|
||||
throw new AnonymousError("folder_not_supported", {
|
||||
@@ -75,20 +123,19 @@ export default class GitHubDownload extends GitHubBase {
|
||||
object: file,
|
||||
});
|
||||
}
|
||||
// will throw an error if the file is not in the repository
|
||||
// marker present but file missing — shouldn't happen, but redownload
|
||||
// rather than 404 on a stale partial cache.
|
||||
await file.originalPath();
|
||||
|
||||
// the cache is not ready, we need to download the repository
|
||||
await this.download(progress);
|
||||
return storage.read(this.data.repoId, file.filePath);
|
||||
}
|
||||
|
||||
async getFiles(progress?: (status: string) => void) {
|
||||
if ((await storage.exists(this.data.repoId)) === FILE_TYPE.NOT_FOUND) {
|
||||
if (!(await this.isCacheComplete())) {
|
||||
await this.download(progress);
|
||||
}
|
||||
let nbFiles = 0;
|
||||
return storage.listFiles(this.data.repoId, "", {
|
||||
const all = await storage.listFiles(this.data.repoId, "", {
|
||||
onEntry: () => {
|
||||
if (progress) {
|
||||
nbFiles++;
|
||||
@@ -96,9 +143,15 @@ export default class GitHubDownload extends GitHubBase {
|
||||
}
|
||||
},
|
||||
});
|
||||
return all.filter((f) => f.name !== COMPLETE_MARKER);
|
||||
}
|
||||
}
|
||||
|
||||
// Sentinel object/file written at the root of an extracted repo cache
|
||||
// once the extraction has finished successfully. Its presence means the
|
||||
// cache is complete. Hidden so it doesn't show up in user listings.
|
||||
const COMPLETE_MARKER = ".anon-complete";
|
||||
|
||||
function humanFileSize(bytes: number, si = false, dp = 1) {
|
||||
const thresh = si ? 1000 : 1024;
|
||||
|
||||
|
||||
@@ -114,7 +114,10 @@ export default class GitHubStream extends GitHubBase {
|
||||
});
|
||||
blobStream.on("end", () => decide());
|
||||
blobStream.on("error", (err) => {
|
||||
if (decided) return;
|
||||
// Always propagate — pre-decision this is the only listener; once a
|
||||
// non-LFS decision is made, the inner branch attaches its own
|
||||
// listener that will also fire, but we shouldn't rely on that being
|
||||
// there if the code is later refactored.
|
||||
decided = true;
|
||||
out.destroy(err);
|
||||
});
|
||||
|
||||
@@ -153,13 +153,17 @@ export default class FileSystem extends StorageBase {
|
||||
size: stats.size,
|
||||
});
|
||||
}
|
||||
// Don't synthesise a sha here. The previous value (stats.ino)
|
||||
// wasn't a content hash — just an inode number — and any code
|
||||
// that compared it to an upstream Git blob sha would silently
|
||||
// disagree. Leave it undefined so callers either look up the
|
||||
// real sha from FileModel/GitHub or skip sha-keyed paths.
|
||||
output2.push(
|
||||
new FileModel({
|
||||
name: file,
|
||||
path: dir,
|
||||
repoId: repoId,
|
||||
size: stats.size,
|
||||
sha: stats.ino.toString(),
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
@@ -175,12 +175,13 @@ export default class S3Storage extends StorageBase {
|
||||
): Promise<void> {
|
||||
if (!config.S3_BUCKET) throw new Error("S3_BUCKET not set");
|
||||
|
||||
if (data instanceof Readable) {
|
||||
data.on("error", (err) => {
|
||||
console.error(`[ERROR] S3 write ${path}`, err);
|
||||
this.rm(repoId, path);
|
||||
});
|
||||
}
|
||||
// No fire-and-forget rm on stream error: Upload uses multipart and
|
||||
// does not commit a partially-uploaded object, so there's nothing to
|
||||
// clean up. The previous handler raced with retries and could delete
|
||||
// a previously-good object on a transient source-stream hiccup. The
|
||||
// size-validated read path in GitHubStream.getFileContentCache
|
||||
// recovers from any object that does end up undersized for any
|
||||
// reason.
|
||||
|
||||
const params: PutObjectCommandInput = {
|
||||
Bucket: config.S3_BUCKET,
|
||||
|
||||
Reference in New Issue
Block a user