fix old github download repos

This commit is contained in:
tdurieux
2026-05-06 19:37:16 +03:00
parent da78708b7b
commit 67cb2538b1
6 changed files with 154 additions and 22 deletions
+15 -1
View File
@@ -141,7 +141,21 @@ export default class Repository {
force: false,
}
): Promise<IFile[]> {
const hasFile = await FileModel.exists({ repoId: this.repoId }).exec();
let hasFile = await FileModel.exists({ repoId: this.repoId }).exec();
// Files created by GitHubDownload don't carry a valid 40-char GitHub
// blob SHA. When the source type later switches to GitHubStream the
// stale entries cause blob-API 404s. Detect this by sampling a file
// with a sha and checking its length; force a re-fetch if it doesn't
// look like a GitHub SHA.
if (hasFile && this.source instanceof GitHubStream) {
const sample = await FileModel.findOne(
{ repoId: this.repoId, sha: { $exists: true, $ne: null } },
{ sha: 1 }
).exec();
if (sample?.sha && sample.sha.length !== 40) {
hasFile = null;
}
}
if (!hasFile || opt.force) {
await FileModel.deleteMany({ repoId: this.repoId }).exec();
const files = await this.source.getFiles(opt.progress);
+81 -6
View File
@@ -20,6 +20,21 @@ import config from "../../config";
const logger = createLogger("gh-stream");
/**
 * Build the `https://github.com/<owner>/<repo>/raw/<commit>/<path>` URL for a
 * file at a given commit.
 *
 * Every component is percent-encoded with `encodeURIComponent`. `filePath` is
 * encoded one segment at a time so that its `/` separators are kept as-is.
 *
 * @param owner    Repository owner (user or organization).
 * @param repo     Repository name.
 * @param commit   Commit identifier placed after `/raw/`.
 * @param filePath Slash-separated path of the file inside the repository.
 * @returns The fully encoded raw-file URL.
 */
export function githubRawFileUrl(
  owner: string,
  repo: string,
  commit: string,
  filePath: string
): string {
  // "raw" contains only unreserved characters, so encoding leaves it untouched.
  const urlParts: string[] = [owner, repo, "raw", commit];
  for (const pathPart of filePath.split("/")) {
    urlParts.push(pathPart);
  }
  const encodedParts = urlParts.map((part) => encodeURIComponent(part));
  return "https://github.com/" + encodedParts.join("/");
}
export default class GitHubStream extends GitHubBase {
type: "GitHubDownload" | "GitHubStream" | "Zip" = "GitHubStream";
@@ -64,7 +79,12 @@ export default class GitHubStream extends GitHubBase {
// blob endpoint above returns the raw pointer text instead, so we use this
// as the fallback for LFS files (#95).
private downloadFileViaRaw(token: string, filePath: string) {
const url = `https://github.com/${this.data.organization}/${this.data.repoName}/raw/${this.data.commit}/${filePath}`;
const url = githubRawFileUrl(
this.data.organization,
this.data.repoName,
this.data.commit,
filePath
);
logger.debug("downloading via raw URL (LFS)", { url });
return got.stream(url, {
headers: { authorization: `token ${token}` },
@@ -72,6 +92,58 @@ export default class GitHubStream extends GitHubBase {
});
}
/**
 * Download a blob through the blob API, falling back to the path-based raw
 * URL on statuses where that endpoint can still succeed: 422 is the blob
 * endpoint's size cap; 404 can happen with stale/invalid blob SHAs while the
 * path still exists at the requested commit.
 *
 * The returned promise never rejects. It resolves with one of:
 *  - the LFS-aware wrapper around the blob stream, once a response with a
 *    non-fallback status arrives;
 *  - the raw-URL stream, when the blob request reports 404 or 422;
 *  - a stream that is destroyed with the original error, for any other
 *    failure, so the caller's stream pipeline reports it.
 *
 * @param token    GitHub token forwarded to both download helpers.
 * @param sha      Blob SHA requested from the blob API.
 * @param filePath Repository path used for the raw-URL fallback.
 * @returns A promise resolving to the readable stream to consume.
 */
private downloadWithFallback(
  token: string,
  sha: string,
  filePath: string
): Promise<stream.Readable> {
  return new Promise<stream.Readable>((resolve) => {
    const blobStream = this.downloadFile(token, sha);
    // Both "error" and "response" may report the same failure; whichever
    // handler acts first sets this flag so the promise is settled only once.
    let settled = false;
    const fallbackStatuses = new Set([404, 422]);

    const fallbackToRaw = (statusCode?: number) => {
      settled = true;
      logger.info("blob API failed, falling back to raw URL", {
        filePath,
        statusCode,
      });
      resolve(this.downloadFileViaRaw(token, filePath));
    };

    blobStream.on("error", (err) => {
      if (settled) return;
      const statusCode = (
        err as { response?: { statusCode?: number } }
      )?.response?.statusCode;
      if (statusCode && fallbackStatuses.has(statusCode)) {
        fallbackToRaw(statusCode);
        return;
      }
      // Other errors: let the normal pipeline handle them by handing the
      // caller a stream that fails with the original error.
      settled = true;
      const passthrough = new stream.PassThrough();
      resolve(passthrough);
      // Destroying the stream before the caller has resumed would emit
      // "error" on the next tick, ahead of the awaiting continuation, i.e.
      // with no listener attached yet (an uncaught exception, and the caller
      // would never observe the failure). Deferring to setImmediate lets the
      // caller receive the stream and attach its handlers first.
      setImmediate(() => passthrough.destroy(err));
    });

    blobStream.on("response", (response) => {
      if (settled) return;
      if (fallbackStatuses.has(response.statusCode || 0)) {
        // Drop the failed blob request before switching to the raw URL.
        blobStream.destroy();
        fallbackToRaw(response.statusCode);
        return;
      }
      settled = true;
      resolve(this.resolveLfsPointer(blobStream, token, filePath));
    });
  });
}
// Wrap a blob stream so that if its first ~150 bytes look like a Git LFS
// pointer, the bytes are dropped and replaced by a fresh fetch from the
// raw URL endpoint (which resolves LFS automatically). Non-LFS files are
@@ -190,11 +262,14 @@ export default class GitHubStream extends GitHubBase {
});
}
const token = await this.data.getToken();
const blobStream = this.downloadFile(token, expected.sha);
// If the blob is a Git LFS pointer, swap to a raw-URL fetch so the
// file content (not the pointer text) makes it into the pipeline. See
// #95 — Support for Git LFS.
const content = this.resolveLfsPointer(blobStream, token, filePath);
// Try the blob API first, but fall back to the raw URL on recoverable
// blob misses/caps while still preserving LFS pointer handling.
const content = await this.downloadWithFallback(
token,
expected.sha,
filePath
);
// duplicate the stream to write it to the storage
const stream1 = content.pipe(new stream.PassThrough());
+31 -5
View File
@@ -6,14 +6,40 @@ import { fileETag } from "./file-etag";
export const router = express.Router();
/**
 * Percent-decode a single URL path segment without ever throwing.
 *
 * Decoders are tried in order: `decodeURIComponent` first, then `decodeURI`.
 * If both reject the input, the segment is returned unchanged.
 *
 * @param segment One encoded path segment (no `/` separators expected).
 * @returns The decoded segment, or the original text when it cannot be decoded.
 */
function decodePathSegment(segment: string): string {
  const decoders = [decodeURIComponent, decodeURI];
  for (const decode of decoders) {
    try {
      return decode(segment);
    } catch {
      // This decoder rejected the segment; try the next one.
    }
  }
  return segment;
}
/**
 * Extract the decoded file path from a `/<repoId>/file/<path>` request URL.
 *
 * The URL is parsed against `<protocol>://<hostname>` and only its pathname is
 * used. The `/<repoId>/file/` prefix is stripped, matching the
 * percent-encoded repo id at the start of the pathname first and otherwise
 * removing the first occurrence of the unencoded form. Each remaining segment
 * is then decoded on its own with `decodePathSegment`.
 *
 * @param reqUrl   Request URL (may be relative, e.g. `req.url`).
 * @param protocol Protocol used to build the base for URL parsing.
 * @param hostname Host name used to build the base for URL parsing.
 * @param repoId   Repository id appearing in the route prefix.
 * @returns The slash-separated, decoded file path.
 */
export function filePathFromRequestUrl(
  reqUrl: string,
  protocol: string,
  hostname: string,
  repoId: string
): string {
  const { pathname } = new URL(reqUrl, `${protocol}://${hostname}`);
  const encodedPrefix = `/${encodeURIComponent(repoId)}/file/`;

  let encodedPath: string;
  if (pathname.startsWith(encodedPrefix)) {
    encodedPath = pathname.slice(encodedPrefix.length);
  } else {
    // Fallback for pathnames that carry the repo id unencoded.
    encodedPath = pathname.replace(`/${repoId}/file/`, "");
  }

  const decodedSegments: string[] = [];
  for (const encodedSegment of encodedPath.split("/")) {
    decodedSegments.push(decodePathSegment(encodedSegment));
  }
  return decodedSegments.join("/");
}
router.get(
"/:repoId/file/:path*",
async (req: express.Request, res: express.Response) => {
const anonymizedPath = decodeURI(
new URL(req.url, `${req.protocol}://${req.hostname}`).pathname.replace(
`/${req.params.repoId}/file/`,
""
)
const anonymizedPath = filePathFromRequestUrl(
req.url,
req.protocol,
req.hostname,
req.params.repoId
);
if (anonymizedPath.endsWith("/")) {
return handleError(