mirror of
https://github.com/tdurieux/anonymous_github.git
synced 2026-05-15 14:38:03 +02:00
fix(cache): atomic file writes and size-validated cache reads
A failed/interrupted GitHub fetch could leave a 0-byte or truncated file in the local cache. Subsequent reads happily streamed the empty content as the file's body — visible to users as an "Empty file" with HTTP 200. Reproduced on artifact-70B6/Lethe/configs.py (#694). - FileSystem.write: stream into a sibling .tmp and rename into place only on finish. Stream errors discard the tmp and leave any prior cached file untouched. Drop the utf-8 encoding that was silently corrupting binary blobs. - GitHubStream.getFileContentCache: accept an expected size and treat cached.size < expected as a poisoned cache (truncated fetch) → rm and re-fetch. cached.size >= expected is accepted, which keeps Git LFS-resolved files (whose FileModel.size is the pointer size) working. - AnonymizedFile: expose size() and pass it through to the streamer alongside sha so the cache check has the upstream size. Existing poisoned entries self-heal on next access. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -39,6 +39,12 @@ export default class AnonymizedFile {
|
||||
return this._file.sha?.replace(/"/g, "");
|
||||
}
|
||||
|
||||
async size(): Promise<number | undefined> {
|
||||
if (this._file) return this._file.size;
|
||||
this._file = await this.getFileInfo();
|
||||
return this._file.size;
|
||||
}
|
||||
|
||||
async getFileInfo(): Promise<IFile> {
|
||||
if (this._file) return this._file;
|
||||
let fileDir = dirname(this.anonymizedPath);
|
||||
@@ -200,6 +206,7 @@ export default class AnonymizedFile {
|
||||
repoId: this.repository.repoId,
|
||||
filePath: this.filePath,
|
||||
sha: await this.sha(),
|
||||
size: await this.size(),
|
||||
anonymizerOptions: anonymizer.opt,
|
||||
},
|
||||
});
|
||||
@@ -228,8 +235,9 @@ export default class AnonymizedFile {
|
||||
try {
|
||||
if (config.STREAMER_ENTRYPOINT) {
|
||||
// use the streamer service
|
||||
const [sha, token] = await Promise.all([
|
||||
const [sha, size, token] = await Promise.all([
|
||||
this.sha(),
|
||||
this.size(),
|
||||
this.repository.getToken(),
|
||||
]);
|
||||
const resStream = got
|
||||
@@ -237,6 +245,7 @@ export default class AnonymizedFile {
|
||||
method: "POST",
|
||||
json: {
|
||||
sha,
|
||||
size,
|
||||
token,
|
||||
repoFullName: this.repository.model.source.repositoryName,
|
||||
commit: this.repository.model.source.commit,
|
||||
|
||||
@@ -125,11 +125,38 @@ export default class GitHubStream extends GitHubBase {
|
||||
async getFileContentCache(
|
||||
filePath: string,
|
||||
repoId: string,
|
||||
fileSha: () => Promise<string> | string
|
||||
fileMeta: () =>
|
||||
| Promise<{ sha: string; size?: number }>
|
||||
| { sha: string; size?: number }
|
||||
| Promise<string>
|
||||
| string
|
||||
) {
|
||||
const meta = await fileMeta();
|
||||
const expected: { sha: string; size?: number } =
|
||||
typeof meta === "string" ? { sha: meta } : meta;
|
||||
const fileInfo = await storage.exists(repoId, filePath);
|
||||
if (fileInfo == FILE_TYPE.FILE) {
|
||||
return storage.read(repoId, filePath);
|
||||
// If we know the upstream size, validate the cached entry. A cached
|
||||
// file smaller than the upstream size means a previous fetch was
|
||||
// truncated — likely a network error during the GitHub fetch left a
|
||||
// 0-byte or partial blob behind. Treat it as a miss and re-fetch.
|
||||
// Cached size >= expected is accepted: equal for normal files, and
|
||||
// larger for Git LFS files where FileModel.size is the pointer's
|
||||
// size but the cached bytes are the resolved LFS content.
|
||||
if (expected.size != null && expected.size > 0) {
|
||||
try {
|
||||
const stat = await storage.fileInfo(repoId, filePath);
|
||||
if (stat.size != null && stat.size < expected.size) {
|
||||
await storage.rm(repoId, filePath);
|
||||
} else {
|
||||
return storage.read(repoId, filePath);
|
||||
}
|
||||
} catch {
|
||||
// fall through and re-fetch
|
||||
}
|
||||
} else {
|
||||
return storage.read(repoId, filePath);
|
||||
}
|
||||
} else if (fileInfo == FILE_TYPE.FOLDER) {
|
||||
throw new AnonymousError("folder_not_supported", {
|
||||
httpStatus: 400,
|
||||
@@ -137,7 +164,7 @@ export default class GitHubStream extends GitHubBase {
|
||||
});
|
||||
}
|
||||
const token = await this.data.getToken();
|
||||
const blobStream = this.downloadFile(token, await fileSha());
|
||||
const blobStream = this.downloadFile(token, expected.sha);
|
||||
// If the blob is a Git LFS pointer, swap to a raw-URL fetch so the
|
||||
// file content (not the pointer text) makes it into the pipeline. See
|
||||
// #95 — Support for Git LFS.
|
||||
@@ -179,7 +206,7 @@ export default class GitHubStream extends GitHubBase {
|
||||
object: file,
|
||||
});
|
||||
}
|
||||
return fileSha;
|
||||
return { sha: fileSha, size: await file.size() };
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
@@ -62,16 +62,43 @@ export default class FileSystem extends StorageBase {
|
||||
data: string | Readable
|
||||
): Promise<void> {
|
||||
const fullPath = join(config.FOLDER, this.repoPath(repoId), p);
|
||||
// Atomic write: stream into a sibling .tmp and only rename into place
|
||||
// when the source stream finishes successfully. If the source errors
|
||||
// mid-flight (transient GitHub 5xx, socket reset, etc.), we drop the
|
||||
// tmp and leave any pre-existing cached file untouched. Without this,
|
||||
// a partial fetch would commit a 0-byte or truncated cache entry that
|
||||
// future reads would happily serve as the file's content.
|
||||
await this.mk(repoId, dirname(p));
|
||||
const tmpPath = `${fullPath}.tmp.${process.pid}.${Date.now()}.${Math.random()
|
||||
.toString(36)
|
||||
.slice(2, 8)}`;
|
||||
try {
|
||||
await this.mk(repoId, dirname(p));
|
||||
if (data instanceof Readable) {
|
||||
data.on("error", (_err) => {
|
||||
this.rm(repoId, p);
|
||||
if (typeof data === "string") {
|
||||
await fs.promises.writeFile(tmpPath, data);
|
||||
} else {
|
||||
await new Promise<void>((resolve, reject) => {
|
||||
const ws = fs.createWriteStream(tmpPath);
|
||||
let settled = false;
|
||||
const finish = (err?: Error) => {
|
||||
if (settled) return;
|
||||
settled = true;
|
||||
if (err) {
|
||||
ws.destroy();
|
||||
reject(err);
|
||||
} else {
|
||||
resolve();
|
||||
}
|
||||
};
|
||||
data.on("error", finish);
|
||||
ws.on("error", finish);
|
||||
ws.on("finish", () => finish());
|
||||
data.pipe(ws);
|
||||
});
|
||||
}
|
||||
return await fs.promises.writeFile(fullPath, data, "utf-8");
|
||||
await fs.promises.rename(tmpPath, fullPath);
|
||||
} catch (err) {
|
||||
console.error("[ERROR] FileSystem.write failed:", err);
|
||||
await fs.promises.rm(tmpPath, { force: true }).catch(() => undefined);
|
||||
throw err;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -42,6 +42,8 @@ router.post("/", async (req: express.Request, res: express.Response) => {
|
||||
const repoFullName = req.body.repoFullName.split("/");
|
||||
const repoId = req.body.repoId;
|
||||
const fileSha = req.body.sha;
|
||||
const fileSize: number | undefined =
|
||||
typeof req.body.size === "number" ? req.body.size : undefined;
|
||||
const commit = req.body.commit;
|
||||
const filePath = req.body.filePath;
|
||||
const anonymizerOptions = req.body.anonymizerOptions;
|
||||
@@ -58,7 +60,7 @@ router.post("/", async (req: express.Request, res: express.Response) => {
|
||||
const content = await source.getFileContentCache(
|
||||
filePath,
|
||||
repoId,
|
||||
() => fileSha
|
||||
() => ({ sha: fileSha, size: fileSize })
|
||||
);
|
||||
const mime = lookup(filePath);
|
||||
if (mime && !filePath.endsWith(".ts")) {
|
||||
|
||||
Reference in New Issue
Block a user