multiple fixes

This commit is contained in:
tdurieux
2026-05-05 10:32:31 +03:00
parent 5b72b630c4
commit f8c91ca0af
23 changed files with 1675 additions and 661 deletions
+12 -2
View File
@@ -22,6 +22,7 @@ import {
import { getToken } from "./GitHubUtils";
import config from "../config";
import FileModel from "./model/files/files.model";
import AnonymizedRepositoryModel from "./model/anonymizedRepositories/anonymizedRepositories.model";
import { IFile } from "./model/files/files.types";
import AnonymizedFile from "./AnonymizedFile";
import { FilterQuery } from "mongoose";
@@ -351,7 +352,7 @@ export default class Repository {
);
await this.resetSate(RepositoryStatus.PREPARING);
await downloadQueue.add(this.repoId, this, {
await downloadQueue.add(this.repoId, { repoId: this.repoId }, {
jobId: this.repoId,
attempts: 3,
});
@@ -405,7 +406,16 @@ export default class Repository {
this._model.statusDate = new Date();
this._model.statusMessage = statusMessage;
if (!isConnected) return this.model;
await this._model.save();
await AnonymizedRepositoryModel.updateOne(
{ _id: this._model._id },
{
$set: {
status,
statusDate: this._model.statusDate,
statusMessage,
},
}
).exec();
}
/**
+30 -23
View File
@@ -80,37 +80,44 @@ export default class User {
});
});
// find the repositories that are already in the database
const finds = (
await RepositoryModel.find({
externalId: {
$in: repositories.map((repo) => repo.externalId),
},
}).select("externalId")
).map((m) => m.externalId);
// save all the new repositories
await Promise.all(
repositories
.filter((r) => finds.indexOf(r.externalId) == -1)
.map((r) => r.save())
// find the repositories that are already in the database — fetch both
// externalId and id so we can both detect duplicates and reuse the
// ids of existing rows without re-querying.
const externalIds = repositories.map((repo) => repo.externalId);
const existing = await RepositoryModel.find({
externalId: { $in: externalIds },
}).select("id externalId");
const existingByExternalId = new Map(
existing.map((m) => [m.externalId, m.id])
);
// save only the ids of the repositories in the user model
this._model.repositories = (
await RepositoryModel.find({
externalId: {
$in: repositories.map((repo) => repo.externalId),
},
}).select("id")
).map((m) => m.id);
// save all the new repositories
const newRepos = repositories.filter(
(r) => !existingByExternalId.has(r.externalId)
);
const saved = await Promise.all(newRepos.map((r) => r.save()));
for (const m of saved) {
existingByExternalId.set(m.externalId, m.id);
}
// collect ids in the order of the upstream repositories list
this._model.repositories = externalIds
.map((eid) => existingByExternalId.get(eid))
.filter((id) => !!id) as unknown as typeof this._model.repositories;
// save the model
await this._model.save();
return repositories.map((r) => new GitHubRepository(r));
} else {
// Only the fields read by GitHubRepository.toJSON() (and the immediate
// callers in user routes). Branches/readme are loaded on demand by
// GitHubRepository methods, which issue their own queries.
const out = (
await RepositoryModel.find({ _id: { $in: this._model.repositories } })
await RepositoryModel.find({
_id: { $in: this._model.repositories },
}).select(
"externalId name url size hasPage pageSource defaultBranch"
)
).map((i) => new GitHubRepository(i));
return out;
}
+106 -99
View File
@@ -192,8 +192,62 @@ export class AnonymizeTransformer extends Transform {
}
}
// Markdown image pattern hoisted out of removeImage() so we don't recompile
// it on every chunk of every file streamed through the anonymizer.
// It matches `![alt](target "optional title")`; the named groups capture the
// target (`filename`) and the optional quoted title (`optionalpart`).
// The regex is global and shared at module level; in the visible code it is
// only passed to String.prototype.replace, which resets lastIndex itself.
const markdownImageRegex =
/!\[[^\]]*\]\((?<filename>.*?)(?="|\))(?<optionalpart>".*")?\)/g;
/**
 * Pre-compiled regexes and mask for one variant of one anonymization term,
 * as produced by compileTerms().
 */
interface CompiledTermVariant {
/** Global regex used to replace matches in content (and paths). */
replaceRegex: RegExp;
/**
 * Non-global twin used inside the URL callback to test() without
 * mutating shared lastIndex state.
 */
testRegex: RegExp;
/** Text substituted for every match of this variant. */
mask: string;
}
/**
 * Turn the user's term list into ready-to-use regex/mask entries.
 *
 * Blank entries are skipped. The default mask number is the entry's
 * 1-based position in `terms`, so skipped entries still consume a number.
 *
 * @param terms raw term specs, possibly undefined or empty
 * @returns one entry per variant yielded by termVariants() for each term
 */
function compileTerms(terms: string[] | undefined): CompiledTermVariant[] {
  const out: CompiledTermVariant[] = [];
  if (!terms || terms.length === 0) return out;

  for (const [index, spec] of terms.entries()) {
    if (spec.trim() === "") continue;

    // #285 — a "term=>replacement" entry supplies its own mask; otherwise
    // fall back to the default "<ANONYMIZATION_MASK>-<position>" token.
    const { term: rawTerm, replacement } = parseTermSpec(spec);
    const mask =
      replacement !== null
        ? replacement
        : `${config.ANONYMIZATION_MASK}-${index + 1}`;

    // If the term does not compile as a regex, treat it as literal text.
    let term = rawTerm;
    try {
      new RegExp(term, "gi");
    } catch {
      term = term.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&");
    }

    for (const { pattern, sniff, unicode } of termVariants(term)) {
      const source = withWordBoundaries(pattern, {
        sniffSource: sniff,
        unicode,
      });
      const flags = unicode ? "iu" : "i";
      out.push({
        replaceRegex: new RegExp(source, `g${flags}`),
        testRegex: new RegExp(source, flags),
        mask,
      });
    }
  }
  return out;
}
export class ContentAnonimizer {
public wasAnonymized = false;
// Compiled once per instance and reused for every anonymize() call.
// Streamed files invoke anonymize() many times per file (one per chunk),
// so caching here avoids rebuilding regexes on every chunk.
private compiledTerms: CompiledTermVariant[];
private selfLinkRegexes: RegExp[] | null = null;
constructor(
readonly opt: {
@@ -204,26 +258,33 @@ export class ContentAnonimizer {
branchName?: string;
repoId?: string;
}
) {}
) {
this.compiledTerms = compileTerms(opt.terms);
if (opt.repoName && opt.branchName) {
const r = opt.repoName;
const b = opt.branchName;
this.selfLinkRegexes = [
new RegExp(`https://raw.githubusercontent.com/${r}/${b}\\b`, "gi"),
new RegExp(`https://github.com/${r}/blob/${b}\\b`, "gi"),
new RegExp(`https://github.com/${r}/tree/${b}\\b`, "gi"),
new RegExp(`https://github.com/${r}`, "gi"),
];
}
}
private removeImage(content: string): string {
if (this.opt.image !== false) {
return content;
}
// remove image in markdown
return content.replace(
/!\[[^\]]*\]\((?<filename>.*?)(?="|\))(?<optionalpart>".*")?\)/g,
() => {
this.wasAnonymized = true;
return config.ANONYMIZATION_MASK;
}
);
return content.replace(markdownImageRegex, () => {
this.wasAnonymized = true;
return config.ANONYMIZATION_MASK;
});
}
private removeLink(content: string): string {
if (this.opt.link !== false) {
return content;
}
// remove links
return content.replace(urlRegex, () => {
this.wasAnonymized = true;
return config.ANONYMIZATION_MASK;
@@ -231,83 +292,33 @@ export class ContentAnonimizer {
}
private replaceGitHubSelfLinks(content: string): string {
if (!this.opt.repoName || !this.opt.branchName) {
return content;
}
const repoName = this.opt.repoName;
const branchName = this.opt.branchName;
const replaceCallback = () => {
if (!this.selfLinkRegexes) return content;
const replacement = `https://${config.APP_HOSTNAME}/r/${this.opt.repoId}`;
const cb = () => {
this.wasAnonymized = true;
return `https://${config.APP_HOSTNAME}/r/${this.opt.repoId}`;
return replacement;
};
content = content.replace(
new RegExp(
`https://raw.githubusercontent.com/${repoName}/${branchName}\\b`,
"gi"
),
replaceCallback
);
content = content.replace(
new RegExp(`https://github.com/${repoName}/blob/${branchName}\\b`, "gi"),
replaceCallback
);
content = content.replace(
new RegExp(`https://github.com/${repoName}/tree/${branchName}\\b`, "gi"),
replaceCallback
);
return content.replace(
new RegExp(`https://github.com/${repoName}`, "gi"),
replaceCallback
);
for (const re of this.selfLinkRegexes) {
content = content.replace(re, cb);
}
return content;
}
private replaceTerms(content: string): string {
const terms = this.opt.terms || [];
for (let i = 0; i < terms.length; i++) {
const spec = terms[i];
if (spec.trim() == "") {
continue;
}
// #285 — entries of the form "term=>replacement" override the default
// XXXX-N mask so users can scrub with their preferred token (e.g.
// "ABC", "XYZ"), keeping anonymized identifiers valid in source code.
const parsed = parseTermSpec(spec);
let term = parsed.term;
const mask =
parsed.replacement !== null
? parsed.replacement
: config.ANONYMIZATION_MASK + "-" + (i + 1);
try {
new RegExp(term, "gi");
} catch {
// escape regex characters
term = term.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&");
}
// Try the term verbatim first, then a diacritic-insensitive expansion
// so "Davo" anonymizes "Davó" (and vice versa). See term-matching.ts.
for (const variant of termVariants(term)) {
const bounded = withWordBoundaries(variant.pattern, {
sniffSource: variant.sniff,
unicode: variant.unicode,
});
const flags = variant.unicode ? "giu" : "gi";
// remove whole url if it contains the term
content = content.replace(urlRegex, (match) => {
if (new RegExp(bounded, flags).test(match)) {
this.wasAnonymized = true;
return mask;
}
return match;
});
// remove the term in the text
content = content.replace(new RegExp(bounded, flags), () => {
for (const c of this.compiledTerms) {
// remove whole url if it contains the term
content = content.replace(urlRegex, (match) => {
if (c.testRegex.test(match)) {
this.wasAnonymized = true;
return mask;
});
}
return c.mask;
}
return match;
});
// remove the term in the text
content = content.replace(c.replaceRegex, () => {
this.wasAnonymized = true;
return c.mask;
});
}
return content;
}
@@ -322,24 +333,20 @@ export class ContentAnonimizer {
}
export function anonymizePath(path: string, terms: string[]) {
for (let i = 0; i < terms.length; i++) {
const spec = terms[i];
if (spec.trim() == "") {
continue;
}
const parsed = parseTermSpec(spec);
let term = parsed.term;
const mask =
parsed.replacement !== null
? parsed.replacement
: config.ANONYMIZATION_MASK + "-" + (i + 1);
try {
new RegExp(term, "gi");
} catch {
// escape regex characters
term = term.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&");
}
path = path.replace(new RegExp(term, "gi"), mask);
return anonymizePathCompiled(path, compileTerms(terms));
}
/**
 * Anonymize a path with pre-compiled term regexes. Call sites that anonymize
 * many paths in a row (tree traversal) should compile once and reuse.
 *
 * Each entry's `replaceRegex` is applied in order and every match is
 * replaced by that entry's `mask`.
 *
 * @param path the path to anonymize
 * @param compiled entries produced by compileTerms()
 * @returns the path with every match replaced
 */
export function anonymizePathCompiled(
  path: string,
  compiled: CompiledTermVariant[]
) {
  for (const c of compiled) {
    // Use a function replacer so the mask is inserted verbatim. A plain
    // string replacement would expand "$&", "$1", "$$" … inside a
    // user-supplied mask, unlike the callback-based content replacement.
    path = path.replace(c.replaceRegex, () => c.mask);
  }
  return path;
}
export { compileTerms };
export type { CompiledTermVariant };
+90 -12
View File
@@ -1,11 +1,76 @@
import { Queue, Worker } from "bullmq";
import config from "../config";
import Repository from "../core/Repository";
import AnonymizedRepositoryModel from "../core/model/anonymizedRepositories/anonymizedRepositories.model";
import { RepositoryStatus } from "../core/types";
import * as path from "path";
export let cacheQueue: Queue<Repository>;
export let removeQueue: Queue<Repository>;
export let downloadQueue: Queue<Repository>;
/**
 * Minimal payload for queue jobs. Workers re-fetch the Repository from the
 * database via getRepository(repoId), so passing the full Mongoose-backed
 * Repository instance through msgpackr is unnecessary — and triggers
 * ERR_BUFFER_OUT_OF_BOUNDS on long term lists / large nested fields.
 */
export interface RepoJobData {
/** Identifier the worker passes to getRepository() to reload the repository. */
repoId: string;
}
// Statuses that mean "anonymization is still in progress". Used as the
// status filter by markErrorIfInFlight() and recoverStuckPreparing().
const IN_FLIGHT_STATUSES: RepositoryStatus[] = [
RepositoryStatus.PREPARING,
RepositoryStatus.QUEUE,
RepositoryStatus.DOWNLOAD,
];
/**
 * Set a repository to ERROR, but only if its stored status is still one of
 * IN_FLIGHT_STATUSES. The status condition is part of the update filter, so
 * a repository that is no longer in flight is left untouched.
 *
 * Best-effort: database errors are logged and swallowed.
 *
 * @param repoId repository to update
 * @param message status message to store; an empty value becomes
 *   "preparation_failed"
 */
async function markErrorIfInFlight(repoId: string, message: string) {
  const stillInFlight = { repoId, status: { $in: IN_FLIGHT_STATUSES } };
  try {
    const errorState = {
      status: RepositoryStatus.ERROR,
      statusDate: new Date(),
      statusMessage: message || "preparation_failed",
    };
    await AnonymizedRepositoryModel.updateOne(stillInFlight, {
      $set: errorState,
    }).exec();
  } catch (e) {
    console.log("[QUEUE] markErrorIfInFlight error", repoId, e);
  }
}
/**
 * Recover repositories left in an in-flight status (preparing/queue/download)
 * with no live BullMQ job — typically after a worker crash or a server
 * restart during anonymization. Such repositories are marked as ERROR so
 * they do not look stuck forever. (The original note says the public route
 * can re-queue them on the next visit; that route is not visible here.)
 *
 * A job counts as live when its state is "active", "waiting" or "delayed".
 * Does nothing when downloadQueue has not been created yet. Failures are
 * logged, never thrown.
 */
export async function recoverStuckPreparing() {
  if (!downloadQueue) return;

  const liveStates = new Set<string>(["active", "waiting", "delayed"]);
  try {
    const stuck = await AnonymizedRepositoryModel.find(
      { status: { $in: IN_FLIGHT_STATUSES } },
      { repoId: 1 }
    ).lean();

    for (const doc of stuck) {
      try {
        const job = await downloadQueue.getJob(doc.repoId);
        const state = job ? await job.getState() : null;
        // A live job will still make progress; leave the repository alone.
        if (state !== null && liveStates.has(state)) {
          continue;
        }
        await markErrorIfInFlight(doc.repoId, "preparation_interrupted");
        console.log("[QUEUE] recovered stuck repo", doc.repoId);
      } catch (e) {
        // One failing repository must not stop recovery of the others.
        console.log("[QUEUE] recover error for", doc.repoId, e);
      }
    }
  } catch (e) {
    console.log("[QUEUE] recoverStuckPreparing failed", e);
  }
}
export let cacheQueue: Queue<RepoJobData>;
export let removeQueue: Queue<RepoJobData>;
export let downloadQueue: Queue<RepoJobData>;
// avoid loading the queues outside the main server
export function startWorker() {
@@ -14,28 +79,31 @@ export function startWorker() {
port: config.REDIS_PORT,
};
cacheQueue = new Queue<Repository>("cache removal", {
cacheQueue = new Queue<RepoJobData>("cache removal", {
connection,
defaultJobOptions: {
removeOnComplete: true,
removeOnFail: true,
},
});
removeQueue = new Queue<Repository>("repository removal", {
removeQueue = new Queue<RepoJobData>("repository removal", {
connection: {
host: config.REDIS_HOSTNAME,
port: config.REDIS_PORT,
},
defaultJobOptions: {
removeOnComplete: true,
removeOnFail: true,
},
});
downloadQueue = new Queue<Repository>("repository download", {
downloadQueue = new Queue<RepoJobData>("repository download", {
connection,
defaultJobOptions: {
removeOnComplete: true,
removeOnFail: true,
},
});
const cacheWorker = new Worker<Repository>(
const cacheWorker = new Worker<RepoJobData>(
cacheQueue.name,
path.resolve("build/queue/processes/removeCache.js"),
{
@@ -47,7 +115,7 @@ export function startWorker() {
cacheWorker.on("completed", async (job) => {
await job.remove();
});
const removeWorker = new Worker<Repository>(
const removeWorker = new Worker<RepoJobData>(
removeQueue.name,
path.resolve("build/queue/processes/removeRepository.js"),
{
@@ -60,7 +128,7 @@ export function startWorker() {
await job.remove();
});
const downloadWorker = new Worker<Repository>(
const downloadWorker = new Worker<RepoJobData>(
downloadQueue.name,
path.resolve("build/queue/processes/downloadRepository.js"),
{
@@ -77,7 +145,17 @@ export function startWorker() {
downloadWorker.on("completed", async (job) => {
console.log("[QUEUE] download repository completed", job.data.repoId);
});
downloadWorker.on("failed", async (job) => {
console.log("download repository failed", job.data.repoId);
downloadWorker.on("failed", async (job, err) => {
const repoId = job?.data?.repoId;
console.log(
"[QUEUE] download repository failed",
repoId,
err?.message || err
);
if (!repoId) return;
if (job && typeof job.attemptsMade === "number" && job.opts?.attempts) {
if (job.attemptsMade < job.opts.attempts) return;
}
await markErrorIfInFlight(repoId, err?.message || "preparation_failed");
});
}
+48 -27
View File
@@ -1,11 +1,11 @@
import { SandboxedJob } from "bullmq";
import { config } from "dotenv";
config();
import Repository from "../../core/Repository";
import { getRepository as getRepositoryImport } from "../../server/database";
import { RepositoryStatus } from "../../core/types";
import { RepoJobData } from "../index";
export default async function (job: SandboxedJob<Repository, void>) {
export default async function (job: SandboxedJob<RepoJobData, void>) {
const {
connect,
getRepository,
@@ -18,29 +18,36 @@ export default async function (job: SandboxedJob<Repository, void>) {
let statusInterval: any = null;
await connect();
const repo = await getRepository(job.data.repoId);
let tickPromise: Promise<void> | null = null;
try {
let progress: { status: string } | null = null;
statusInterval = setInterval(async () => {
try {
if (
repo.status == RepositoryStatus.READY ||
repo.status == RepositoryStatus.ERROR
) {
return clearInterval(statusInterval);
statusInterval = setInterval(() => {
if (tickPromise) return;
tickPromise = (async () => {
try {
if (
repo.status == RepositoryStatus.READY ||
repo.status == RepositoryStatus.ERROR
) {
clearInterval(statusInterval);
return;
}
if (
progress &&
repo.status &&
repo.model.statusMessage !== progress?.status
) {
console.log(
`[QUEUE] Progress: ${job.data.repoId} ${progress.status}`
);
await repo.updateStatus(repo.status, progress?.status || "");
}
} catch {
// ignore error
} finally {
tickPromise = null;
}
if (
progress &&
repo.status &&
repo.model.statusMessage !== progress?.status
) {
console.log(
`[QUEUE] Progress: ${job.data.repoId} ${progress.status}`
);
await repo.updateStatus(repo.status, progress?.status || "");
}
} catch {
// ignore error
}
})();
}, 1000);
function updateProgress(obj: { status: string } | string) {
const o = typeof obj === "string" ? { status: obj } : obj;
@@ -51,9 +58,12 @@ export default async function (job: SandboxedJob<Repository, void>) {
await repo.resetSate(RepositoryStatus.PREPARING, "");
await repo.anonymize(updateProgress);
clearInterval(statusInterval);
if (tickPromise) await tickPromise;
await repo.updateStatus(RepositoryStatus.READY, "");
console.log(`[QUEUE] ${job.data.repoId} is downloaded`);
} catch (error) {
clearInterval(statusInterval);
if (tickPromise) await tickPromise;
updateProgress({ status: "error" });
if (error instanceof Error) {
await repo.updateStatus(RepositoryStatus.ERROR, error.message);
@@ -64,13 +74,24 @@ export default async function (job: SandboxedJob<Repository, void>) {
}
} catch (error: unknown) {
clearInterval(statusInterval);
console.log(`[QUEUE] ${job.data.repoId} is finished with an error`, error);
setTimeout(async () => {
// delay to avoid double saving
if (tickPromise) {
try {
await repo.updateStatus(RepositoryStatus.ERROR, (error as Error).message);
await tickPromise;
} catch { /* ignored */ }
}, 400);
}
console.log(`[QUEUE] ${job.data.repoId} is finished with an error`, error);
try {
await repo.updateStatus(
RepositoryStatus.ERROR,
error instanceof Error ? error.message : String(error)
);
} catch (persistError) {
console.log(
`[QUEUE] failed to persist ERROR status for ${job.data.repoId}`,
persistError
);
}
throw error;
} finally {
clearInterval(statusInterval);
}
+2 -2
View File
@@ -1,8 +1,8 @@
import { SandboxedJob } from "bullmq";
import Repository from "../../core/Repository";
import { getRepository as getRepositoryImport } from "../../server/database";
import { RepoJobData } from "../index";
export default async function (job: SandboxedJob<Repository, void>) {
export default async function (job: SandboxedJob<RepoJobData, void>) {
const {
connect,
getRepository,
+2 -2
View File
@@ -1,9 +1,9 @@
import { SandboxedJob } from "bullmq";
import Repository from "../../core/Repository";
import { getRepository as getRepositoryImport } from "../../server/database";
import { RepositoryStatus } from "../../core/types";
import { RepoJobData } from "../index";
export default async function (job: SandboxedJob<Repository, void>) {
export default async function (job: SandboxedJob<RepoJobData, void>) {
const {
connect,
getRepository,
+22 -6
View File
@@ -16,7 +16,7 @@ import { bearerTokenAuth } from "./routes/token-auth";
import router from "./routes";
import AnonymizedRepositoryModel from "../core/model/anonymizedRepositories/anonymizedRepositories.model";
import { conferenceStatusCheck, repositoryStatusCheck } from "./schedule";
import { startWorker } from "../queue";
import { startWorker, recoverStuckPreparing } from "../queue";
import AnonymizedPullRequestModel from "../core/model/anonymizedPullRequests/anonymizedPullRequests.model";
import { getUser } from "./routes/route-utils";
import config from "../config";
@@ -165,9 +165,17 @@ export default async function start() {
apiRouter.use("/gist", speedLimiter, router.gistPrivate);
apiRouter.use("/anonymize-preview", speedLimiter, router.anonymizePreview);
// Cache message.txt presence so /api/message doesn't hit the filesystem
// synchronously on every request. Re-checked on a 60s interval — the file
// is admin-managed and doesn't need real-time freshness.
const messagePath = resolve("message.txt");
let messageExists = existsSync(messagePath);
setInterval(() => {
messageExists = existsSync(messagePath);
}, 60 * 1000).unref();
apiRouter.get("/message", async (_, res) => {
if (existsSync("./message.txt")) {
return res.sendFile(resolve("message.txt"));
if (messageExists) {
return res.sendFile(messagePath);
}
res.sendStatus(404);
});
@@ -186,10 +194,17 @@ export default async function start() {
res.json(stat);
return;
}
const [nbRepositories, users, nbPageViews, nbPullRequests] =
const [nbRepositories, nbUsersAgg, nbPageViews, nbPullRequests] =
await Promise.all([
AnonymizedRepositoryModel.estimatedDocumentCount(),
AnonymizedRepositoryModel.distinct("owner"),
// Count distinct owners server-side instead of materializing the full
// list of ObjectIds with `.distinct("owner")` only to take its length.
AnonymizedRepositoryModel.collection
.aggregate([
{ $group: { _id: "$owner" } },
{ $count: "n" },
])
.toArray(),
AnonymizedRepositoryModel.collection
.aggregate([
{
@@ -202,7 +217,7 @@ export default async function start() {
stat = {
nbRepositories,
nbUsers: users.length,
nbUsers: (nbUsersAgg[0] as { n?: number } | undefined)?.n || 0,
nbPageViews: nbPageViews[0]?.total || 0,
nbPullRequests,
};
@@ -235,6 +250,7 @@ export default async function start() {
repositoryStatusCheck();
await connect();
await recoverStuckPreparing();
app.listen(config.PORT);
console.log("Database connected and Server started on port: " + config.PORT);
}
+428 -107
View File
@@ -10,6 +10,7 @@ import User from "../../core/User";
import { ensureAuthenticated } from "./connection";
import { handleError, getUser, isOwnerOrAdmin, getRepo } from "./route-utils";
import adminTokensRouter from "./admin-tokens";
import { octokit, getToken } from "../../core/GitHubUtils";
const router = express.Router();
@@ -34,17 +35,69 @@ router.use(
router.use("/tokens", adminTokensRouter);
router.post("/queue/:name/:repo_id", async (req, res) => {
let queue: Queue<Repository, void>;
if (req.params.name == "download") {
queue = downloadQueue;
} else if (req.params.name == "cache") {
queue = cacheQueue;
} else if (req.params.name == "remove") {
queue = removeQueue;
} else {
return res.status(404).json({ error: "queue_not_found" });
// Job states the admin queue endpoints list and count. `as const` keeps the
// literal types so the array can be passed to the BullMQ job getters.
const QUEUE_STATES = [
"waiting",
"active",
"completed",
"failed",
"delayed",
] as const;
/**
 * Map the `:name` route parameter to the matching queue.
 *
 * @param name "download", "cache" or "remove"
 * @returns the corresponding queue, or null for any other name
 */
function pickQueue(name: string): Queue | null {
  switch (name) {
    case "download":
      return downloadQueue;
    case "cache":
      return cacheQueue;
    case "remove":
      return removeQueue;
    default:
      return null;
  }
}
/**
 * Backslash-escape regex metacharacters (and whitespace) so the input can
 * be embedded in a regular expression and matched literally.
 *
 * @param s raw text, e.g. a user-supplied search string
 * @returns the text with every special character prefixed by a backslash
 */
function escapeRegex(s: string): string {
  const specialChars = /[-[\]{}()*+?.,\\^$|#\s]/g;
  return s.replace(specialChars, (ch) => `\\${ch}`);
}
// Sort keys must be plain dot-separated field paths ("_id", "size.storage").
const SORT_FIELD_PATTERN = /^[A-Za-z0-9_]+(\.[A-Za-z0-9_]+)*$/;

/**
 * Build a single-field sort spec from the `sort` and `direction` query
 * parameters.
 *
 * `direction=asc` sorts ascending; anything else sorts descending.
 * `sort` is used only when it is a string that looks like a field path.
 * A missing, empty, repeated (array/object) or operator-like value such as
 * "$where" falls back to `fallbackField` instead of being coerced into a
 * sort key.
 *
 * @param req incoming request; only `req.query` is read
 * @param fallbackField field used when no valid `sort` is supplied
 * @returns an object mapping the chosen field to 1 or -1
 */
function parseSort(req: express.Request, fallbackField = "_id"): Record<string, 1 | -1> {
  const direction = req.query.direction === "asc" ? 1 : -1;
  const requested = req.query.sort;
  const field =
    typeof requested === "string" && SORT_FIELD_PATTERN.test(requested)
      ? requested
      : fallbackField;
  return { [field]: direction };
}
/**
 * Build a date-range filter on `field` from the `dateFrom` / `dateTo` query
 * parameters.
 *
 * `dateFrom` becomes the `$gte` bound and `dateTo` the `$lte` bound. A
 * missing parameter, or one that does not parse to a valid Date, adds no
 * bound.
 *
 * @param req incoming request; only `req.query` is read
 * @param field document field the range applies to
 * @returns `{ [field]: { $gte?, $lte? } }`, or null when no bound was set
 */
function parseDateRange(req: express.Request, field: string) {
  const range: Record<string, Date> = {};
  const bounds: Array<[string, string]> = [
    ["dateFrom", "$gte"],
    ["dateTo", "$lte"],
  ];
  for (const [param, operator] of bounds) {
    const raw = req.query[param];
    if (!raw) continue;
    const parsed = new Date(raw as string);
    if (!Number.isNaN(parsed.getTime())) {
      range[operator] = parsed;
    }
  }
  return Object.keys(range).length > 0 ? { [field]: range } : null;
}
/**
 * Render one value as a CSV field.
 *
 * null/undefined become an empty field, objects are serialized with
 * JSON.stringify, and everything else goes through String(). A field that
 * contains a double quote, comma, CR or LF is wrapped in double quotes with
 * embedded quotes doubled.
 *
 * @param v the cell value
 * @returns the CSV-safe text for that cell
 */
function csvEscape(v: unknown): string {
  if (v === null || v === undefined) return "";
  const text = typeof v === "object" ? JSON.stringify(v) : String(v);
  const needsQuoting = /[",\n\r]/.test(text);
  return needsQuoting ? '"' + text.replace(/"/g, '""') + '"' : text;
}
/**
 * Send `rows` to the client as a downloadable CSV attachment.
 *
 * The first line is the column names joined by commas. Each row then
 * contributes one line whose cells are looked up by column name and passed
 * through csvEscape(). Lines are separated by "\n".
 *
 * @param res response to write to
 * @param filename name placed in the Content-Disposition header
 * @param columns column keys, in output order
 * @param rows records to export
 */
function sendCsv(
  res: express.Response,
  filename: string,
  columns: string[],
  rows: Array<Record<string, unknown>>
) {
  const records: string[] = [columns.join(",")];
  for (const row of rows) {
    const cells = columns.map((col) => csvEscape(row[col]));
    records.push(cells.join(","));
  }
  const payload = records.join("\n");

  res.setHeader("Content-Type", "text/csv; charset=utf-8");
  res.setHeader("Content-Disposition", `attachment; filename="${filename}"`);
  res.send(payload);
}
router.post("/queue/:name/:repo_id", async (req, res) => {
const queue = pickQueue(req.params.name);
if (!queue) return res.status(404).json({ error: "queue_not_found" });
let job;
try {
job = await queue.getJob(req.params.repo_id);
@@ -68,16 +121,8 @@ router.post("/queue/:name/:repo_id", async (req, res) => {
});
router.delete("/queue/:name/:repo_id", async (req, res) => {
let queue: Queue;
if (req.params.name == "download") {
queue = downloadQueue;
} else if (req.params.name == "cache") {
queue = cacheQueue;
} else if (req.params.name == "remove") {
queue = removeQueue;
} else {
return res.status(404).json({ error: "queue_not_found" });
}
const queue = pickQueue(req.params.name);
if (!queue) return res.status(404).json({ error: "queue_not_found" });
try {
const job = await queue.getJob(req.params.repo_id);
if (!job) {
@@ -90,58 +135,153 @@ router.delete("/queue/:name/:repo_id", async (req, res) => {
}
});
// Bulk retry: re-run every job currently in the "failed" state of a queue.
// Responds with how many retries succeeded and how many failed jobs were
// found. A job whose retry() rejects is skipped, not treated as a failure.
// NOTE(review): POST "/queue/:name/:repo_id" is registered earlier in this
// file and also matches ".../retry-failed" (repo_id = "retry-failed"). Since
// Express tries routes in registration order, this handler may never be
// reached unless that one calls next() — confirm and reorder if needed.
router.post("/queue/:name/retry-failed", async (req, res) => {
  const queue = pickQueue(req.params.name);
  if (!queue) return res.status(404).json({ error: "queue_not_found" });

  // Resolves to whether the retry was accepted; never rejects.
  const tryRetry = async (job: { retry(): Promise<unknown> }) => {
    try {
      await job.retry();
      return true;
    } catch {
      // ignore single job failures
      return false;
    }
  };

  try {
    const failedJobs = await queue.getJobs(["failed"]);
    let retried = 0;
    // Retried one at a time, in the order returned by the queue.
    for (const failedJob of failedJobs) {
      if (await tryRetry(failedJob)) {
        retried += 1;
      }
    }
    res.json({ retried, total: failedJobs.length });
  } catch (error) {
    handleError(error, res, req);
  }
});
// Bulk drain: remove the waiting jobs of a queue, and the delayed ones too
// (the `true` argument to drain()).
// NOTE(review): POST "/queue/:name/:repo_id" is registered earlier in this
// file and also matches ".../drain" (repo_id = "drain"). Since Express tries
// routes in registration order, this handler may never be reached unless
// that one calls next() — confirm and reorder if needed.
router.post("/queue/:name/drain", async (req, res) => {
  const target = pickQueue(req.params.name);
  if (target === null) {
    return res.status(404).json({ error: "queue_not_found" });
  }
  try {
    const includeDelayed = true;
    await target.drain(includeDelayed);
    res.json({ ok: true });
  } catch (error) {
    handleError(error, res, req);
  }
});
router.get("/queues", async (req, res) => {
const out = await Promise.all([
downloadQueue.getJobs([
"waiting",
"active",
"completed",
"failed",
"delayed",
]),
removeQueue.getJobs([
"waiting",
"active",
"completed",
"failed",
"delayed",
]),
cacheQueue.getJobs(["waiting", "active", "completed", "failed", "delayed"]),
const search = req.query.search ? String(req.query.search).toLowerCase() : "";
const stateFilter = req.query.state ? String(req.query.state) : null;
const states = stateFilter && (QUEUE_STATES as readonly string[]).includes(stateFilter)
? [stateFilter]
: (QUEUE_STATES as readonly string[]);
const [download, remove, cache, dCounts, rCounts, cCounts] = await Promise.all([
downloadQueue.getJobs(states),
removeQueue.getJobs(states),
cacheQueue.getJobs(states),
downloadQueue.getJobCounts(...QUEUE_STATES),
removeQueue.getJobCounts(...QUEUE_STATES),
cacheQueue.getJobCounts(...QUEUE_STATES),
]);
const matches = (job: { id?: string | undefined; name?: string }) => {
if (!search) return true;
return (
(job.id || "").toLowerCase().includes(search) ||
(job.name || "").toLowerCase().includes(search)
);
};
res.json({
downloadQueue: out[0],
removeQueue: out[1],
cacheQueue: out[2],
downloadQueue: download.filter(matches),
removeQueue: remove.filter(matches),
cacheQueue: cache.filter(matches),
counts: {
download: dCounts,
remove: rCounts,
cache: cCounts,
},
});
});
// Global stats endpoint. Responds with:
//  - statusBreakdown: per-status repository count and summed size.storage
//  - totalStorage: size.storage summed over all repositories (0 if none)
//  - recentErrors24h: repositories in "error" whose statusDate is within the
//    last 24 hours
//  - totalUsers / totalConferences: estimated collection sizes
router.get("/stats", async (req, res) => {
  try {
    const oneDayAgo = new Date(Date.now() - 1000 * 60 * 60 * 24);

    const perStatus = AnonymizedRepositoryModel.aggregate([
      {
        $group: {
          _id: "$status",
          count: { $sum: 1 },
          storage: { $sum: "$size.storage" },
        },
      },
    ]);
    const overallStorage = AnonymizedRepositoryModel.aggregate([
      { $group: { _id: null, total: { $sum: "$size.storage" } } },
    ]);
    const errorsLastDay = AnonymizedRepositoryModel.countDocuments({
      status: "error",
      statusDate: { $gte: oneDayAgo },
    });
    const userCount = UserModel.estimatedDocumentCount();
    const conferenceCount = ConferenceModel.estimatedDocumentCount();

    // All five lookups are awaited together.
    const [statusBreakdown, totalSize, recentErrors, totalUsers, totalConferences] =
      await Promise.all([
        perStatus,
        overallStorage,
        errorsLastDay,
        userCount,
        conferenceCount,
      ]);

    res.json({
      statusBreakdown,
      totalStorage: totalSize[0]?.total || 0,
      recentErrors24h: recentErrors,
      totalUsers,
      totalConferences,
    });
  } catch (error) {
    handleError(error, res, req);
  }
});
router.get("/repos", async (req, res) => {
const page = parseInt(req.query.page as string) || 1;
const limit = parseInt(req.query.limit as string) || 10;
const limit = Math.min(parseInt(req.query.limit as string) || 10, 1000);
const ready = req.query.ready == "true";
const error = req.query.error == "true";
const preparing = req.query.preparing == "true";
const remove = req.query.removed == "true";
const expired = req.query.expired == "true";
// eslint-disable-next-line @typescript-eslint/no-explicit-any
let sort: any = { _id: 1 };
if (req.query.sort) {
sort = {};
sort[req.query.sort as string] = -1;
}
const query = [];
const sort = parseSort(req);
const query: Record<string, unknown>[] = [];
// multi-field search: repoId, source.repositoryName, statusMessage, conference
if (req.query.search) {
const escaped = (req.query.search as string).replace(/[-[\]{}()*+?.,\\^$|#\s]/g, "\\$&");
query.push({ repoId: { $regex: escaped } });
const escaped = escapeRegex(req.query.search as string);
const re = { $regex: escaped, $options: "i" };
query.push({
$or: [
{ repoId: re },
{ "source.repositoryName": re },
{ statusMessage: re },
{ conference: re },
],
});
}
// filter by owner username
if (req.query.owner) {
const ownerUsername = req.query.owner as string;
const ownerDoc = await UserModel.findOne({ username: ownerUsername }, { _id: 1 });
if (!ownerDoc) {
return res.json({ query: { $and: query }, page, total: 0, sort, results: [], statusCounts: [], totalSize: 0 });
}
query.push({ owner: ownerDoc._id });
}
// filter by conference
if (req.query.conference) {
query.push({ conference: req.query.conference });
}
// date range filter on anonymizeDate
const dateFilter = parseDateRange(req, "anonymizeDate");
if (dateFilter) query.push(dateFilter);
const status: { status: string }[] = [];
if (ready) {
status.push({ status: "ready" });
}
if (error) {
status.push({ status: "error" });
}
if (ready) status.push({ status: "ready" });
if (error) status.push({ status: "error" });
if (expired) {
status.push({ status: "expiring" });
status.push({ status: "expired" });
@@ -157,23 +297,59 @@ router.get("/repos", async (req, res) => {
if (status.length > 0) {
query.push({ $or: status });
}
const filter = query.length ? { $and: query } : {};
const skipIndex = (page - 1) * limit;
const [total, results] = await Promise.all([
AnonymizedRepositoryModel.find({
$and: query,
}).countDocuments(),
AnonymizedRepositoryModel.find({ $and: query })
// CSV export branch
if (req.query.format === "csv") {
const all = await AnonymizedRepositoryModel.find(filter).sort(sort).limit(50000).lean();
const rows = all.map((r) => ({
repoId: r.repoId,
status: r.status,
statusMessage: r.statusMessage || "",
anonymizeDate: r.anonymizeDate ? new Date(r.anonymizeDate).toISOString() : "",
lastView: r.lastView ? new Date(r.lastView).toISOString() : "",
pageView: r.pageView || 0,
sourceRepository: r.source?.repositoryName || "",
sourceBranch: r.source?.branch || "",
sourceCommit: r.source?.commit || "",
conference: r.conference || "",
storage: r.size?.storage || 0,
terms: (r.options?.terms || []).length,
}));
return sendCsv(
res,
`repositories-${new Date().toISOString().slice(0, 10)}.csv`,
Object.keys(rows[0] || { repoId: 1 }),
rows
);
}
const [total, results, statusCounts, sizeAgg] = await Promise.all([
AnonymizedRepositoryModel.find(filter).countDocuments(),
AnonymizedRepositoryModel.find(filter)
.skip(skipIndex)
.sort(sort)
.limit(limit)
.exec(),
AnonymizedRepositoryModel.aggregate([
{ $match: filter },
{ $group: { _id: "$status", count: { $sum: 1 }, storage: { $sum: "$size.storage" } } },
]),
AnonymizedRepositoryModel.aggregate([
{ $match: filter },
{ $group: { _id: null, total: { $sum: "$size.storage" } } },
]),
]);
res.json({
query: { $and: query },
query: filter,
page,
total,
sort,
results,
statusCounts,
totalSize: sizeAgg[0]?.total || 0,
});
});
@@ -184,7 +360,7 @@ router.delete(
const repo = await getRepo(req, res, { nocheck: true });
if (!repo) return;
try {
await cacheQueue.add(repo.repoId, repo, { jobId: repo.repoId });
await cacheQueue.add(repo.repoId, { repoId: repo.repoId }, { jobId: repo.repoId });
return res.json({ status: repo.status });
} catch (error) {
handleError(error, res, req);
@@ -192,33 +368,163 @@ router.delete(
}
);
// Admin diagnostic: live GitHub information about the source of a repository.
//
// The response always carries a `source` section (owner, repo, branch, commit
// as stored on the repository model). Each GitHub lookup then contributes its
// own section -- `repository`, `branch`, `commit`, `rateLimit` -- and a failed
// lookup never aborts the request: its message is reported under
// `repositoryError`, `branchError` or `commitError` instead. A failed
// rate-limit lookup is dropped silently.
//
// Replies 400 `invalid_source_repository` when the stored repository name does
// not split into an owner and a name.
router.get(
  "/repos/:repoId/github",
  async (req: express.Request, res: express.Response) => {
    // Text reported for a failed GitHub call: the error message when there is
    // one, otherwise the stringified thrown value.
    const describeFailure = (err: unknown): string =>
      (err as Error)?.message || String(err);

    try {
      const repo = await getRepo(req, res, { nocheck: true });
      if (!repo) return;

      // A missing token is not fatal: fall back to an empty token string.
      let token: string | undefined;
      try {
        token = await getToken(repo);
      } catch {
        token = undefined;
      }
      const github = octokit(token || "");

      const [owner, repoName] = (
        repo.model.source?.repositoryName || ""
      ).split("/");
      if (!owner || !repoName) {
        return res.status(400).json({ error: "invalid_source_repository" });
      }

      const report: Record<string, unknown> = {
        source: {
          owner,
          repo: repoName,
          branch: repo.model.source?.branch,
          commit: repo.model.source?.commit,
        },
      };

      // Repository metadata.
      try {
        const { data } = await github.repos.get({ owner, repo: repoName });
        report.repository = {
          fullName: data.full_name,
          private: data.private,
          archived: data.archived,
          disabled: data.disabled,
          defaultBranch: data.default_branch,
          description: data.description,
          stargazers: data.stargazers_count,
          watchers: data.watchers_count,
          forks: data.forks_count,
          openIssues: data.open_issues_count,
          size: data.size,
          language: data.language,
          license: data.license?.spdx_id,
          createdAt: data.created_at,
          updatedAt: data.updated_at,
          pushedAt: data.pushed_at,
          htmlUrl: data.html_url,
          topics: data.topics,
        };
      } catch (err) {
        report.repositoryError = describeFailure(err);
      }

      // Branch section, only when a branch is recorded on the model.
      try {
        const branch = repo.model.source?.branch;
        if (branch) {
          const { data } = await github.repos.getBranch({
            owner,
            repo: repoName,
            branch,
          });
          report.branch = {
            name: data.name,
            protected: data.protected,
            commitSha: data.commit?.sha,
          };
        }
      } catch (err) {
        report.branchError = describeFailure(err);
      }

      // Commit section, only when a commit is recorded on the model.
      try {
        const ref = repo.model.source?.commit;
        if (ref) {
          const { data } = await github.repos.getCommit({
            owner,
            repo: repoName,
            ref,
          });
          report.commit = {
            sha: data.sha,
            message: data.commit?.message,
            author: data.commit?.author,
            committer: data.commit?.committer,
            htmlUrl: data.html_url,
            stats: data.stats,
            filesChanged: data.files?.length,
          };
        }
      } catch (err) {
        report.commitError = describeFailure(err);
      }

      // Rate-limit snapshot; `reset` is multiplied by 1000 before being
      // turned into an ISO timestamp.
      try {
        const { rate } = (await github.rateLimit.get()).data;
        report.rateLimit = {
          remaining: rate.remaining,
          limit: rate.limit,
          reset: new Date(rate.reset * 1000).toISOString(),
        };
      } catch {
        // ignore
      }

      res.json(report);
    } catch (error) {
      handleError(error, res, req);
    }
  }
);
// Admin listing of users: paginated JSON by default, CSV export when
// `?format=csv` is given.
//
// Query parameters read here:
//   page, limit  - 1-based page and page size (limit capped at 1000)
//   search       - case-insensitive match on username or e-mail
//   status       - exact match on the user's status
//   role=admin   - restrict to administrators
//   format=csv   - export up to 50000 matching users as CSV
// Sorting comes from `parseSort(req)`; `parseDateRange(req, "dateOfEntry")`
// may contribute extra filter conditions (NOTE(review): presumably a date
// range on `dateOfEntry` -- confirm against the helper).
//
// The JSON response contains the filter, page, total, sort, the page of
// results (with a computed `repoCount`, and with `accessTokens` / `apiTokens`
// projected out) and per-status counts.
router.get("/users", async (req, res) => {
  try {
    // Clamp both values to >= 1: a negative page or limit would otherwise
    // become a negative $skip / non-positive $limit, which MongoDB rejects.
    const page = Math.max(parseInt(req.query.page as string, 10) || 1, 1);
    const limit = Math.min(
      Math.max(parseInt(req.query.limit as string, 10) || 10, 1),
      1000
    );
    const skipIndex = (page - 1) * limit;
    const sort = parseSort(req);

    const filter: Record<string, unknown> = {};
    if (req.query.search) {
      const escaped = escapeRegex(req.query.search as string);
      filter.$or = [
        { username: { $regex: escaped, $options: "i" } },
        { "emails.email": { $regex: escaped, $options: "i" } },
      ];
    }
    // Only accept a plain string: a parsed object such as `?status[$ne]=x`
    // would otherwise be injected into the filter as a query operator.
    if (typeof req.query.status === "string" && req.query.status) {
      filter.status = req.query.status;
    }
    if (req.query.role === "admin") {
      filter.isAdmin = true;
    }
    const dateFilter = parseDateRange(req, "dateOfEntry");
    if (dateFilter) Object.assign(filter, dateFilter);

    // CSV export
    if (req.query.format === "csv") {
      const all = await UserModel.find(filter).sort(sort).limit(50000).lean();
      const rows = all.map((u) => ({
        username: u.username,
        email: u.emails?.[0]?.email || "",
        status: u.status,
        isAdmin: !!u.isAdmin,
        repoCount: (u.repositories || []).length,
        dateOfEntry: u.dateOfEntry
          ? new Date(u.dateOfEntry).toISOString()
          : "",
      }));
      return sendCsv(
        res,
        `users-${new Date().toISOString().slice(0, 10)}.csv`,
        ["username", "email", "status", "isAdmin", "repoCount", "dateOfEntry"],
        rows
      );
    }

    const [total, results, statusCounts] = await Promise.all([
      UserModel.find(filter).countDocuments(),
      UserModel.aggregate([
        { $match: filter },
        { $sort: sort },
        { $skip: skipIndex },
        { $limit: limit },
        {
          $addFields: {
            repoCount: { $size: { $ifNull: ["$repositories", []] } },
          },
        },
        // Never send stored tokens to the client.
        { $project: { accessTokens: 0, apiTokens: 0 } },
      ]),
      UserModel.aggregate([
        { $match: filter },
        { $group: { _id: "$status", count: { $sum: 1 } } },
      ]),
    ]);
    res.json({ query: filter, page, total, sort, results, statusCounts });
  } catch (error) {
    // Without this, a rejected query leaves the request hanging.
    handleError(error, res, req);
  }
});
router.get(
"/users/:username",
@@ -266,35 +572,50 @@ router.get(
);
// Admin listing of conferences: paginated JSON by default, CSV export when
// `?format=csv` is given.
//
// Query parameters read here:
//   page, limit  - 1-based page and page size (limit capped at 1000)
//   search       - case-insensitive match on name or conferenceID
//   status       - exact match on the conference status
//   format=csv   - export up to 50000 matching conferences as CSV
// Sorting comes from `parseSort(req)`; `parseDateRange(req, "startDate")`
// may contribute extra filter conditions (NOTE(review): presumably a date
// range on `startDate` -- confirm against the helper).
//
// The JSON response contains the filter, page, total, sort, the page of
// results and per-status counts.
router.get("/conferences", async (req, res) => {
  try {
    // Clamp both values to >= 1: a negative page would otherwise become a
    // negative skip, which MongoDB rejects.
    const page = Math.max(parseInt(req.query.page as string, 10) || 1, 1);
    const limit = Math.min(
      Math.max(parseInt(req.query.limit as string, 10) || 10, 1),
      1000
    );
    const skipIndex = (page - 1) * limit;
    const sort = parseSort(req);

    const filter: Record<string, unknown> = {};
    if (req.query.search) {
      const escaped = escapeRegex(req.query.search as string);
      filter.$or = [
        { name: { $regex: escaped, $options: "i" } },
        { conferenceID: { $regex: escaped, $options: "i" } },
      ];
    }
    // Only accept a plain string: a parsed object such as `?status[$ne]=x`
    // would otherwise be injected into the filter as a query operator.
    if (typeof req.query.status === "string" && req.query.status) {
      filter.status = req.query.status;
    }
    const dateFilter = parseDateRange(req, "startDate");
    if (dateFilter) Object.assign(filter, dateFilter);

    // CSV export
    if (req.query.format === "csv") {
      const all = await ConferenceModel.find(filter)
        .sort(sort)
        .limit(50000)
        .lean();
      const rows = all.map((c: Record<string, unknown>) => ({
        conferenceID: c.conferenceID,
        name: c.name,
        status: c.status,
        price: c.price || 0,
        repoCount: ((c.repositories as unknown[]) || []).length,
        startDate: c.startDate
          ? new Date(c.startDate as Date).toISOString()
          : "",
        endDate: c.endDate ? new Date(c.endDate as Date).toISOString() : "",
      }));
      return sendCsv(
        res,
        `conferences-${new Date().toISOString().slice(0, 10)}.csv`,
        [
          "conferenceID",
          "name",
          "status",
          "price",
          "repoCount",
          "startDate",
          "endDate",
        ],
        rows
      );
    }

    const [total, results, statusCounts] = await Promise.all([
      // countDocuments applies the filter, so `total` matches the result set.
      ConferenceModel.find(filter).countDocuments(),
      ConferenceModel.find(filter).sort(sort).limit(limit).skip(skipIndex),
      ConferenceModel.aggregate([
        { $match: filter },
        { $group: { _id: "$status", count: { $sum: 1 } } },
      ]),
    ]);
    res.json({ query: filter, page, total, sort, results, statusCounts });
  } catch (error) {
    // Without this, a rejected query leaves the request hanging.
    handleError(error, res, req);
  }
});
export default router;
+3 -3
View File
@@ -175,7 +175,7 @@ router.delete(
const user = await getUser(req);
isOwnerOrAdmin([repo.owner.id], user);
await repo.updateStatus(RepositoryStatus.REMOVING);
await removeQueue.add(repo.repoId, repo, { jobId: repo.repoId });
await removeQueue.add(repo.repoId, { repoId: repo.repoId }, { jobId: repo.repoId });
return res.json({ status: repo.status });
} catch (error) {
handleError(error, res, req);
@@ -470,7 +470,7 @@ router.post(
repo.model.conference = repoUpdate.conference;
await repo.updateStatus(RepositoryStatus.PREPARING);
res.json({ status: repo.status });
await downloadQueue.add(repo.repoId, repo, { jobId: repo.repoId });
await downloadQueue.add(repo.repoId, { repoId: repo.repoId }, { jobId: repo.repoId });
} catch (error) {
return handleError(error, res, req);
}
@@ -559,7 +559,7 @@ router.post("/", async (req: express.Request, res: express.Response) => {
}
res.send({ status: repo.status });
downloadQueue.add(repo.repoId, new Repository(repo), {
downloadQueue.add(repo.repoId, { repoId: repo.repoId }, {
jobId: repo.repoId,
attempts: 3,
});
+1 -1
View File
@@ -178,7 +178,7 @@ router.get(
// && repo.status != "preparing"
) {
await repo.updateStatus(RepositoryStatus.PREPARING);
await downloadQueue.add(repo.repoId, repo, {
await downloadQueue.add(repo.repoId, { repoId: repo.repoId }, {
jobId: repo.repoId,
attempts: 3,
});
+3 -2
View File
@@ -61,6 +61,7 @@ router.get("/quota", async (req: express.Request, res: express.Response) => {
}
if (uncachedIds.length) {
const uncachedSet = new Set(uncachedIds);
const agg = await FileModel.aggregate([
{ $match: { repoId: { $in: uncachedIds } } },
{
@@ -76,7 +77,7 @@ router.get("/quota", async (req: express.Request, res: express.Response) => {
byId.set(row._id, { storage: row.storage || 0, file: row.file || 0 });
}
for (const r of ready) {
if (!uncachedIds.includes(r.repoId)) continue;
if (!uncachedSet.has(r.repoId)) continue;
const size = byId.get(r.repoId) || { storage: 0, file: 0 };
totalStorage += size.storage;
totalFiles += size.file;
@@ -85,7 +86,7 @@ router.get("/quota", async (req: express.Request, res: express.Response) => {
if (isConnected) {
await Promise.all(
ready
.filter((r) => uncachedIds.includes(r.repoId))
.filter((r) => uncachedSet.has(r.repoId))
.map((r) => r.model.save())
);
}