Validate cached blobs against the upstream size and make cache writes atomic.

What this patch does:
  * AnonymizedFile: adds size(), which returns the size recorded on the file
    info (loading it through getFileInfo() when it is not cached yet), and
    sends that size in the two request payloads that already carry the sha.
  * GitHubStream.getFileContentCache: the metadata callback may return either
    a bare sha string or { sha, size }. When a positive size is known and the
    cached entry is smaller, the entry is removed and the blob is downloaded
    again; an equal or larger cached entry is served as before.
  * FileSystem.write: data is written to a sibling tmp file that is renamed
    into place only after the write succeeds; on failure the tmp file is
    removed and the error is rethrown.
  * streamer route: forwards `size` from the request body when it is a number.

Review notes:
  * FileSystem.write now destroys the source stream as well as the write
    stream when either side fails. pipe() only unpipes the source on a
    destination error, which would leave the source open and unconsumed.
    The hunk header for FileSystem.ts was recomputed for the added lines, and
    the post-image blob id on its index line predates this change.
  * This copy was rebuilt from a whitespace-collapsed rendering. Hunk line
    counts were re-checked against the @@ headers; indentation is inferred
    (2 spaces) and must be confirmed against the tree before applying.
  * Type arguments that were missing from the collapsed copy are filled in
    where the visible code fixes them (Promise<void>, Promise<string>,
    Promise<number | undefined>). The return type of getFileInfo() could not
    be recovered and is left as found; confirm it against the tree.

diff --git a/src/core/AnonymizedFile.ts b/src/core/AnonymizedFile.ts
index 01c4b19..3225a35 100644
--- a/src/core/AnonymizedFile.ts
+++ b/src/core/AnonymizedFile.ts
@@ -39,6 +39,12 @@ export default class AnonymizedFile {
     return this._file.sha?.replace(/"/g, "");
   }
 
+  async size(): Promise<number | undefined> {
+    if (this._file) return this._file.size;
+    this._file = await this.getFileInfo();
+    return this._file.size;
+  }
+
   async getFileInfo(): Promise {
     if (this._file) return this._file;
     let fileDir = dirname(this.anonymizedPath);
@@ -200,6 +206,7 @@ export default class AnonymizedFile {
             repoId: this.repository.repoId,
             filePath: this.filePath,
             sha: await this.sha(),
+            size: await this.size(),
             anonymizerOptions: anonymizer.opt,
           },
         });
@@ -228,8 +235,9 @@ export default class AnonymizedFile {
     try {
       if (config.STREAMER_ENTRYPOINT) {
         // use the streamer service
-        const [sha, token] = await Promise.all([
+        const [sha, size, token] = await Promise.all([
           this.sha(),
+          this.size(),
           this.repository.getToken(),
         ]);
         const resStream = got
@@ -237,6 +245,7 @@ export default class AnonymizedFile {
             method: "POST",
             json: {
               sha,
+              size,
               token,
               repoFullName: this.repository.model.source.repositoryName,
               commit: this.repository.model.source.commit,
diff --git a/src/core/source/GitHubStream.ts b/src/core/source/GitHubStream.ts
index e048cd4..368fd28 100644
--- a/src/core/source/GitHubStream.ts
+++ b/src/core/source/GitHubStream.ts
@@ -125,11 +125,38 @@ export default class GitHubStream extends GitHubBase {
   async getFileContentCache(
     filePath: string,
     repoId: string,
-    fileSha: () => Promise<string> | string
+    fileMeta: () =>
+      | Promise<{ sha: string; size?: number }>
+      | { sha: string; size?: number }
+      | Promise<string>
+      | string
   ) {
+    const meta = await fileMeta();
+    const expected: { sha: string; size?: number } =
+      typeof meta === "string" ? { sha: meta } : meta;
     const fileInfo = await storage.exists(repoId, filePath);
     if (fileInfo == FILE_TYPE.FILE) {
-      return storage.read(repoId, filePath);
+      // If we know the upstream size, validate the cached entry. A cached
+      // file smaller than the upstream size means a previous fetch was
+      // truncated — likely a network error during the GitHub fetch left a
+      // 0-byte or partial blob behind. Treat it as a miss and re-fetch.
+      // Cached size >= expected is accepted: equal for normal files, and
+      // larger for Git LFS files where FileModel.size is the pointer's
+      // size but the cached bytes are the resolved LFS content.
+      if (expected.size != null && expected.size > 0) {
+        try {
+          const stat = await storage.fileInfo(repoId, filePath);
+          if (stat.size != null && stat.size < expected.size) {
+            await storage.rm(repoId, filePath);
+          } else {
+            return storage.read(repoId, filePath);
+          }
+        } catch {
+          // fall through and re-fetch
+        }
+      } else {
+        return storage.read(repoId, filePath);
+      }
     } else if (fileInfo == FILE_TYPE.FOLDER) {
       throw new AnonymousError("folder_not_supported", {
         httpStatus: 400,
@@ -137,7 +164,7 @@ export default class GitHubStream extends GitHubBase {
       });
     }
     const token = await this.data.getToken();
-    const blobStream = this.downloadFile(token, await fileSha());
+    const blobStream = this.downloadFile(token, expected.sha);
     // If the blob is a Git LFS pointer, swap to a raw-URL fetch so the
     // file content (not the pointer text) makes it into the pipeline. See
     // #95 — Support for Git LFS.
@@ -179,7 +206,7 @@ export default class GitHubStream extends GitHubBase {
             object: file,
           });
         }
-        return fileSha;
+        return { sha: fileSha, size: await file.size() };
       }
     );
   }
diff --git a/src/core/storage/FileSystem.ts b/src/core/storage/FileSystem.ts
index 507ec65..1b9aad1 100644
--- a/src/core/storage/FileSystem.ts
+++ b/src/core/storage/FileSystem.ts
@@ -62,16 +62,46 @@ export default class FileSystem extends StorageBase {
     data: string | Readable
   ): Promise<void> {
     const fullPath = join(config.FOLDER, this.repoPath(repoId), p);
+    // Atomic write: stream into a sibling .tmp and only rename into place
+    // when the source stream finishes successfully. If the source errors
+    // mid-flight (transient GitHub 5xx, socket reset, etc.), we drop the
+    // tmp and leave any pre-existing cached file untouched. Without this,
+    // a partial fetch would commit a 0-byte or truncated cache entry that
+    // future reads would happily serve as the file's content.
+    await this.mk(repoId, dirname(p));
+    const tmpPath = `${fullPath}.tmp.${process.pid}.${Date.now()}.${Math.random()
+      .toString(36)
+      .slice(2, 8)}`;
     try {
-      await this.mk(repoId, dirname(p));
-      if (data instanceof Readable) {
-        data.on("error", (_err) => {
-          this.rm(repoId, p);
+      if (typeof data === "string") {
+        await fs.promises.writeFile(tmpPath, data);
+      } else {
+        await new Promise<void>((resolve, reject) => {
+          const ws = fs.createWriteStream(tmpPath);
+          let settled = false;
+          const finish = (err?: Error) => {
+            if (settled) return;
+            settled = true;
+            if (err) {
+              // pipe() only unpipes the source when the destination fails;
+              // destroy both ends so neither stream is left open.
+              data.destroy();
+              ws.destroy();
+              reject(err);
+            } else {
+              resolve();
+            }
+          };
+          data.on("error", finish);
+          ws.on("error", finish);
+          ws.on("finish", () => finish());
+          data.pipe(ws);
         });
       }
-      return await fs.promises.writeFile(fullPath, data, "utf-8");
+      await fs.promises.rename(tmpPath, fullPath);
     } catch (err) {
       console.error("[ERROR] FileSystem.write failed:", err);
+      await fs.promises.rm(tmpPath, { force: true }).catch(() => undefined);
       throw err;
     }
   }
diff --git a/src/streamer/route.ts b/src/streamer/route.ts
index 1266bde..ef0cfd8 100644
--- a/src/streamer/route.ts
+++ b/src/streamer/route.ts
@@ -42,6 +42,8 @@ router.post("/", async (req: express.Request, res: express.Response) => {
   const repoFullName = req.body.repoFullName.split("/");
   const repoId = req.body.repoId;
   const fileSha = req.body.sha;
+  const fileSize: number | undefined =
+    typeof req.body.size === "number" ? req.body.size : undefined;
   const commit = req.body.commit;
   const filePath = req.body.filePath;
   const anonymizerOptions = req.body.anonymizerOptions;
@@ -58,7 +60,7 @@ router.post("/", async (req: express.Request, res: express.Response) => {
     const content = await source.getFileContentCache(
       filePath,
       repoId,
-      () => fileSha
+      () => ({ sha: fileSha, size: fileSize })
     );
     const mime = lookup(filePath);
     if (mime && !filePath.endsWith(".ts")) {