Files
anonymous_github/src/core/storage/FileSystem.ts
T
tdurieux f413a30313 fix(cache): make Zip-source caches atomic and robust to partial state
Follow-up to the GitHubStream cache fixes. The same poisoned-cache
class existed in the GitHubDownload path and a few related spots:

- GitHubDownload.download: wipe pre-existing state before extracting
  and write a .anon-complete marker only after a successful extract.
  On error, rm the partial cache so a retry starts clean. getFileContent
  and getFiles now gate on the marker instead of "any file/folder
  exists," so a half-extracted tree can never be served as canonical.
- GitHubDownload.getFileContent: validate cached file size against the
  upstream FileModel size (via the new AnonymizedFile.size()), same
  guard as GitHubStream. getFiles filters the marker from the listing.
- FileSystem.listFiles: drop the bogus stats.ino.toString() as sha.
  An inode isn't a content hash; anything comparing it to a Git blob
  sha would silently disagree. Leave undefined.
- S3.write: remove the fire-and-forget data.on("error") -> this.rm(...).
  Multipart Upload doesn't commit partial objects, so there was nothing
  to clean up, and the handler raced retries and could delete a
  previously-good object on a transient source-stream hiccup. The
  size-validated read path recovers from any other undersized objects.
- GitHubStream.resolveLfsPointer: drop the post-decision early-return
  in blobStream.on("error"). Currently redundant with the inner
  listener, but removes the future-refactor footgun.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-05 08:54:42 +03:00

227 lines
6.9 KiB
TypeScript

import config from "../../config";
import * as fs from "fs";
import { Extract } from "unzip-stream";
import { join, basename, dirname } from "path";
import { Response } from "express";
import { Readable, pipeline, Transform } from "stream";
import * as archiver from "archiver";
import { promisify } from "util";
import { lookup } from "mime-types";
import StorageBase, { FILE_TYPE } from "./Storage";
import FileModel from "../model/files/files.model";
import { IFile } from "../model/files/files.types";
/**
 * Storage backend that keeps repository data on the local file system.
 *
 * Every method resolves its target as
 * `join(config.FOLDER, this.repoPath(repoId), <relative path>)`.
 */
export default class FileSystem extends StorageBase {
  type = "FileSystem";

  constructor() {
    super();
  }

  /**
   * Reports what exists at `p` inside the repository folder.
   *
   * @override
   * @param repoId repository identifier
   * @param p path relative to the repository folder (defaults to its root)
   * @returns `FILE_TYPE.FOLDER` for a directory, `FILE_TYPE.FILE` for a
   *   regular file, and `FILE_TYPE.NOT_FOUND` otherwise — including when
   *   `stat` fails for any reason.
   */
  async exists(repoId: string, p: string = ""): Promise<FILE_TYPE> {
    const fullPath = join(config.FOLDER, this.repoPath(repoId), p);
    try {
      const stat = await fs.promises.stat(fullPath);
      if (stat.isDirectory()) return FILE_TYPE.FOLDER;
      if (stat.isFile()) return FILE_TYPE.FILE;
    } catch {
      // ignore file not found or not downloaded
    }
    return FILE_TYPE.NOT_FOUND;
  }

  /**
   * Sends the stored file at `p` as the HTTP response, dotfiles included.
   *
   * @override
   * @param repoId repository identifier
   * @param p path relative to the repository folder
   * @param res Express response the file is written to
   */
  async send(repoId: string, p: string, res: Response) {
    const fullPath = join(config.FOLDER, this.repoPath(repoId), p);
    res.sendFile(fullPath, { dotfiles: "allow" });
  }

  /**
   * Opens a read stream on the stored file at `p`.
   *
   * The stream is created without checking that the file exists.
   *
   * @override
   * @param repoId repository identifier
   * @param p path relative to the repository folder
   * @returns a readable stream over the file content
   */
  async read(repoId: string, p: string): Promise<Readable> {
    const fullPath = join(config.FOLDER, this.repoPath(repoId), p);
    return fs.createReadStream(fullPath);
  }

  /**
   * Returns size, modification time and content type of a stored entry.
   *
   * @param repoId repository identifier
   * @param path path relative to the repository folder
   * @returns `size` and `lastModified` from `stat`, and a `contentType` of
   *   `"application/x-directory"` for directories or the mime-types lookup
   *   of the path otherwise
   * @throws when `stat` fails (e.g. the entry does not exist)
   */
  async fileInfo(repoId: string, path: string) {
    const fullPath = join(config.FOLDER, this.repoPath(repoId), path);
    const info = await fs.promises.stat(fullPath);
    return {
      size: info.size,
      lastModified: info.mtime,
      // NOTE(review): `lookup` is typed `string | false`; the cast hides the
      // `false` it yields for unknown extensions. Kept as-is so callers see
      // the same runtime value — confirm whether a default type is wanted.
      contentType: info.isDirectory()
        ? "application/x-directory"
        : (lookup(fullPath) as string),
    };
  }

  /**
   * Writes `data` to `p` atomically.
   *
   * The content is first written to a uniquely named sibling `.tmp` file and
   * only renamed onto the final path once the write completed successfully.
   * If the source fails mid-flight (transient upstream error, socket reset,
   * ...), the tmp file is removed and any pre-existing file at `p` is left
   * untouched, so a partial fetch can never be committed as a truncated or
   * empty cache entry.
   *
   * @override
   * @param repoId repository identifier
   * @param p destination path relative to the repository folder
   * @param data content to store, as a string or a readable stream
   * @throws the underlying error, after logging it and removing the tmp file
   */
  async write(
    repoId: string,
    p: string,
    data: string | Readable
  ): Promise<void> {
    const fullPath = join(config.FOLDER, this.repoPath(repoId), p);
    await this.mk(repoId, dirname(p));
    const suffix = Math.random().toString(36).slice(2, 8);
    const tmpPath = `${fullPath}.tmp.${process.pid}.${Date.now()}.${suffix}`;
    try {
      if (typeof data === "string") {
        await fs.promises.writeFile(tmpPath, data);
      } else {
        // `pipeline` settles on every termination path: source error,
        // destination error, and a source that is closed without emitting
        // "end" or "error" (which a bare `pipe` + listeners would wait on
        // forever). It also destroys both streams on failure.
        await promisify(pipeline)(data, fs.createWriteStream(tmpPath));
      }
      await fs.promises.rename(tmpPath, fullPath);
    } catch (err) {
      console.error("[ERROR] FileSystem.write failed:", err);
      // Best-effort cleanup; the original error is the one worth reporting.
      await fs.promises.rm(tmpPath, { force: true }).catch(() => undefined);
      throw err;
    }
  }

  /**
   * Recursively removes `dir` (file or folder) from the repository folder.
   * A path that does not exist is not an error (`force: true`).
   *
   * @override
   * @param repoId repository identifier
   * @param dir path relative to the repository folder (defaults to its root)
   */
  async rm(repoId: string, dir: string = ""): Promise<void> {
    const fullPath = join(config.FOLDER, this.repoPath(repoId), dir);
    await fs.promises.rm(fullPath, {
      force: true,
      recursive: true,
    });
  }

  /**
   * Creates `dir` and any missing parent folders.
   *
   * @override
   * @param repoId repository identifier
   * @param dir path relative to the repository folder (defaults to its root)
   * @throws any `mkdir` error other than `EEXIST`
   */
  async mk(repoId: string, dir: string = ""): Promise<void> {
    const fullPath = join(config.FOLDER, this.repoPath(repoId), dir);
    try {
      await fs.promises.mkdir(fullPath, {
        recursive: true,
      });
    } catch (err: unknown) {
      if (
        err instanceof Error &&
        (err as NodeJS.ErrnoException).code !== "EEXIST"
      ) {
        throw err;
      }
    }
  }

  /**
   * Recursively lists everything under `dir`.
   *
   * Directories produce a `FileModel` without a size, immediately followed by
   * their own content; regular files produce a `FileModel` carrying the size
   * reported by `stat`. No `sha` is set: a local file has no Git blob sha,
   * so callers must obtain it elsewhere or skip sha-keyed logic.
   *
   * @override
   * @param repoId repository identifier
   * @param dir folder to list, relative to the repository folder
   * @param opt.onEntry called synchronously for every regular file with its
   *   path (relative to the repository folder) and size; its return value is
   *   not awaited
   * @returns the flattened list of folders and files
   * @throws when `dir` itself cannot be read
   */
  async listFiles(
    repoId: string,
    dir: string = "",
    opt: {
      onEntry?: (file: { path: string; size: number }) => void;
    } = {}
  ): Promise<IFile[]> {
    const fullPath = join(config.FOLDER, this.repoPath(repoId), dir);
    const names = await fs.promises.readdir(fullPath);
    const entries: IFile[] = [];
    for (const name of names) {
      try {
        const stats = await fs.promises.stat(join(fullPath, name));
        if (stats.isDirectory()) {
          entries.push(new FileModel({ name, path: dir, repoId }));
          entries.push(
            ...(await this.listFiles(repoId, join(dir, name), opt))
          );
        } else if (stats.isFile()) {
          opt.onEntry?.({ path: join(dir, name), size: stats.size });
          entries.push(
            new FileModel({
              name,
              path: dir,
              repoId,
              size: stats.size,
            })
          );
        }
      } catch {
        // Best-effort listing: an entry that cannot be processed is skipped.
        // Note that this also covers failures while listing a sub-folder and
        // exceptions thrown synchronously by `onEntry`.
      }
    }
    return entries;
  }

  /**
   * Extracts a zip stream into `p`, dropping the first path component of
   * every entry (presumably the archive's single top-level folder — verify
   * against the archives callers provide).
   *
   * Entries whose name is empty after stripping are redirected to a
   * placeholder called `___IGNORE___`, which is deleted once extraction ends.
   *
   * @override
   * @param repoId repository identifier
   * @param p destination folder relative to the repository folder
   * @param data zip archive as a readable stream
   * @throws when the stream or the extraction fails
   */
  async extractZip(repoId: string, p: string, data: Readable): Promise<void> {
    const pipe = promisify(pipeline);
    const fullPath = join(config.FOLDER, this.repoPath(repoId), p);
    const extractor = Extract({
      path: fullPath,
      decodeString: (buf) => {
        const name = buf.toString();
        // Keep what follows the first "/"; a name without "/" is unchanged
        // because indexOf() yields -1 and the slice then starts at 0.
        const newName = name.slice(name.indexOf("/") + 1);
        if (newName === "") {
          return "___IGNORE___";
        }
        return newName;
      },
    });
    await pipe(data, extractor);
    await this.rm(repoId, join(p, "___IGNORE___"));
  }

  /**
   * Builds an archive containing every regular file found under `dir`.
   *
   * The folder is listed first; each file is then appended in listing order
   * and the archive is finalized. The returned archive is a stream the caller
   * is expected to consume; finalization is intentionally not awaited here.
   *
   * @override
   * @param repoId repository identifier
   * @param dir folder to archive, relative to the repository folder
   * @param opt.format archive format, `"zip"` when omitted
   * @param opt.fileTransformer optional factory returning a transform stream
   *   that each file's content is piped through before being archived
   * @returns the archiver instance
   * @throws when listing `dir` fails or `fileTransformer` throws
   */
  async archive(
    repoId: string,
    dir: string,
    opt?: {
      format?: "zip" | "tar";
      fileTransformer?: (path: string) => Transform;
    }
  ) {
    const archive = archiver(opt?.format ?? "zip", {});
    const fullPath = join(config.FOLDER, this.repoPath(repoId), dir);

    // `onEntry` is invoked synchronously and its result is never awaited, so
    // only record the paths here and do the (async) appends afterwards. This
    // keeps failures inside this method's promise instead of turning them
    // into unhandled rejections, and guarantees every append precedes
    // finalize() without relying on microtask ordering.
    const filePaths: string[] = [];
    await this.listFiles(repoId, dir, {
      onEntry: (file) => {
        filePaths.push(file.path);
      },
    });

    // NOTE(review): one read stream is created per file before the archive is
    // consumed; for very large folders this may hold many file descriptors
    // open at once — confirm whether that matters for expected repo sizes.
    for (const filePath of filePaths) {
      let rs = await this.read(repoId, filePath);
      if (opt?.fileTransformer) {
        // apply transformation on the stream
        rs = rs.pipe(opt.fileTransformer(filePath));
      }
      // Paths reported by listFiles are relative to the repository folder;
      // stripping `fullPath` only has an effect if an absolute path shows up.
      const f = filePath.replace(fullPath, "");
      archive.append(rs, {
        name: basename(f),
        prefix: dirname(f),
      });
    }

    // Not awaited: the promise settles only once the archive has been fully
    // produced, which requires the caller to be reading it. Attach a handler
    // so a failure is logged rather than left as an unhandled rejection.
    archive.finalize().catch((err: unknown) => {
      console.error("[ERROR] FileSystem.archive finalize failed:", err);
    });
    return archive;
  }
}