import storage from "./storage";
import { RepositoryStatus } from "./types";
import { Readable } from "stream";
import * as sha1 from "crypto-js/sha1";
import User from "./User";
import GitHubStream from "./source/GitHubStream";
import Zip from "./source/Zip";
import { anonymizePathCompiled, compileTerms } from "./anonymize-utils";
import UserModel from "./model/users/users.model";
import { IAnonymizedRepositoryDocument } from "./model/anonymizedRepositories/anonymizedRepositories.types";
import { AnonymizeTransformer } from "./anonymize-utils";
import GitHubBase from "./source/GitHubBase";
import Conference from "./Conference";
import ConferenceModel from "./model/conference/conferences.model";
import AnonymousError from "./AnonymousError";
import { downloadQueue } from "../queue";
import { isConnected } from "../server/database";
import {
  getRepositoryFromGitHub,
  GitHubRepository,
} from "./source/GitHubRepository";
import { getToken } from "./GitHubUtils";
import config from "../config";
import FileModel from "./model/files/files.model";
import AnonymizedRepositoryModel from "./model/anonymizedRepositories/anonymizedRepositories.model";
import { createLogger, serializeError } from "./logger";

const logger = createLogger("repository");

import { IFile } from "./model/files/files.types";
import AnonymizedFile from "./AnonymizedFile";
import { FilterQuery } from "mongoose";

/**
 * Anonymize a list of file entries.
 *
 * Despite its name this function does not recurse: it maps over `tree` once
 * and rewrites each entry's `name` and `path` with the compiled terms.
 *
 * @param tree the file entries to anonymize
 * @param terms the terms to mask in file names and paths
 * @param opt `includeSha: true` exposes the stored sha as-is; otherwise a
 * file with a truthy `size` gets the first 8 hex characters of the SHA-1 of
 * its sha, and any other entry gets `undefined`
 * @returns one partial file entry (`name`, `path`, `size`, `sha`) per input
 */
function anonymizeTreeRecursive(
  tree: IFile[],
  terms: string[],
  opt: {
    /** Include the file sha in the response */
    includeSha: boolean;
  } = {
    includeSha: false,
  }
): Partial<IFile>[] {
  // Compile once so the per-file calls below reuse the same compiled terms.
  const compiled = compileTerms(terms);
  return tree.map((file) => {
    return {
      name: anonymizePathCompiled(file.name, compiled),
      path: anonymizePathCompiled(file.path, compiled),
      size: file.size,
      sha: opt.includeSha
        ? file.sha
        : file.size
        ? sha1(file.sha || "")
            .toString()
            .substring(0, 8)
        : undefined,
    };
  });
}

/**
 * Wrapper around an anonymized-repository document: resolves its source,
 * lists and anonymizes its files, and drives its status life cycle.
 *
 * Persistence calls are skipped when `isConnected` is falsy; in that case
 * only the in-memory model is updated.
 */
export default class Repository {
  private _model: IAnonymizedRepositoryDocument;
  owner: User;

  constructor(data: IAnonymizedRepositoryDocument) {
    this._model = data;
    this.owner = new User(new UserModel({ _id: data.owner }));
    // The owner model is built from an id only; flag it as not new.
    this.owner.model.isNew = false;
  }

  /** True once `getToken()` has resolved a token for this instance. */
  private checkedToken: boolean = false;

  /**
   * Resolve the access token of the repository source.
   *
   * The token is resolved through `getToken(this)` once per instance; when
   * it differs from the stored one, the model (and the database, when
   * connected) is updated. Later calls return the stored token directly.
   *
   * @returns the access token
   */
  async getToken() {
    if (this.checkedToken) return this._model.source.accessToken as string;
    const originalToken = this._model.source.accessToken;
    const token = await getToken(this);
    // Loose comparison kept on purpose: `null` and `undefined` count as equal.
    if (originalToken != token) {
      this._model.source.accessToken = token;
      if (isConnected) {
        await AnonymizedRepositoryModel.updateOne(
          { _id: this._model._id },
          { $set: { "source.accessToken": token } }
        ).exec();
      }
    }
    this.checkedToken = true;
    return token;
  }

  /**
   * Build the source used to read the repository content.
   *
   * NOTE: a new source object is created on every access. Callers that need
   * state kept on the source across several calls must store the returned
   * object in a local variable.
   *
   * @returns a `Zip` source when the source type is "Zip", a `GitHubStream`
   * otherwise
   */
  get source() {
    const ghRepo = new GitHubRepository({
      name: this.model.source.repositoryName,
    });

    if (this.model.source.type === "Zip") {
      return new Zip(this.model.source, this.repoId);
    }
    return new GitHubStream({
      repoId: this.repoId,
      commit: this.model.source.commit || "HEAD",
      organization: ghRepo.owner,
      repoName: ghRepo.repo,
      getToken: () => this.getToken(),
    });
  }

  /**
   * Get the anonymized file tree
   *
   * @param opt forwarded to `files()`; `includeSha` controls whether the
   * stored sha is exposed
   * @returns The anonymized file tree
   */
  async anonymizedFiles(
    opt: {
      /** Force to refresh the file tree */
      force?: boolean;
      /** Include the file sha in the response */
      includeSha: boolean;
      recursive?: boolean;
      path?: string;
    } = {
      force: false,
      includeSha: false,
      recursive: true,
    }
  ): Promise<Partial<IFile>[]> {
    const terms = this._model.options.terms || [];
    return anonymizeTreeRecursive(await this.files(opt), terms, opt);
  }

  /**
   * Get the file tree
   *
   * The stored file list is rebuilt from the source when no file is stored
   * for the repository, when `opt.force` is set, or when the stored entries
   * do not look like GitHub blob SHAs while the source is a `GitHubStream`.
   *
   * @param opt `force` refreshes the list, `path` restricts the result to
   * paths starting with it (or equal to it when `recursive` is `false`),
   * `progress` is forwarded to the source while fetching
   * @returns The file tree
   */
  async files(
    opt: {
      recursive?: boolean;
      path?: string;
      force?: boolean;
      progress?: (status: string) => void;
    } = {
      recursive: true,
      force: false,
    }
  ): Promise<IFile[]> {
    let hasFile = await FileModel.exists({ repoId: this.repoId }).exec();

    // `this.source` creates a new object on every access. Resolve it once so
    // the object that fetches the files is the same one that is inspected
    // for `truncatedFolderList` afterwards.
    const source = this.source;

    // Files created by GitHubDownload don't carry a valid 40-char GitHub
    // blob SHA. When the source type later switches to GitHubStream the
    // stale entries cause blob-API 404s. Detect this by sampling a file
    // with a sha and checking its length; force a re-fetch if it doesn't
    // look like a GitHub SHA.
    if (hasFile && source instanceof GitHubStream) {
      const sample = await FileModel.findOne(
        { repoId: this.repoId, sha: { $exists: true, $ne: null } },
        { sha: 1 }
      ).exec();
      if (sample?.sha && sample.sha.length !== 40) {
        hasFile = null;
      }
    }

    if (!hasFile || opt.force) {
      await FileModel.deleteMany({ repoId: this.repoId }).exec();
      const files = await source.getFiles(opt.progress);
      files.forEach((f) => (f.repoId = this.repoId));
      await FileModel.insertMany(files);

      // NOTE(review): `truncatedFolderList` is read through a structural
      // cast, presumably because not every source type declares it — confirm
      // against the source classes.
      const sourceWithTruncation = source as unknown as {
        truncatedFolderList?: string[];
      };
      if (Array.isArray(sourceWithTruncation.truncatedFolderList)) {
        this._model.truncatedFolders = sourceWithTruncation.truncatedFolderList;
      }

      // Clear the cached size so `computeSize()` does not return a stale
      // value for the new file list.
      this._model.size = { storage: 0, file: 0 };
      await this.computeSize();
      if (isConnected) {
        await AnonymizedRepositoryModel.updateOne(
          { _id: this._model._id },
          {
            $set: {
              truncatedFolders: this._model.truncatedFolders,
              size: this._model.size,
            },
          }
        ).exec();
      }
    }

    // Work on a local copy: the caller's option object is left untouched.
    let path = opt.path;
    if (path?.includes(config.ANONYMIZATION_MASK)) {
      // The requested path contains the anonymization mask: translate it
      // back to the original path before querying the stored files.
      const f = new AnonymizedFile({
        repository: this,
        anonymizedPath: path,
      });
      path = await f.originalPath();
    }

    // Escape regex metacharacters so the path is matched literally.
    const escapedPath = path
      ? path.replace(/[-[\]{}()*+?.,\\^$|#\s]/g, "\\$&")
      : undefined;
    let pathQuery: string | RegExp | undefined = escapedPath
      ? new RegExp(`^${escapedPath}`)
      : undefined;
    if (opt.recursive === false) {
      // Non-recursive: exact path match, or the empty path when none is given.
      pathQuery = escapedPath ? new RegExp(`^${escapedPath}$`) : "";
    }

    const query: FilterQuery<IFile> = {
      repoId: this.repoId,
    };
    if (pathQuery !== undefined) {
      query.path = pathQuery;
    }
    return await FileModel.find(query).exec();
  }

  /**
   * Check the status of the repository
   *
   * Expires the repository first when it is READY and its expiration date
   * has passed.
   *
   * @throws AnonymousError `repository_expired` (410) when the repository is
   * expired, expiring, removing or removed
   * @throws AnonymousError `repository_not_ready` (425) when the repository
   * is preparing, or downloading with a status date less than five minutes
   * old
   */
  async check() {
    if (
      this._model.options.expirationMode !== "never" &&
      this.status === RepositoryStatus.READY &&
      this._model.options.expirationDate
    ) {
      if (this._model.options.expirationDate <= new Date()) {
        await this.expire();
      }
    }
    if (
      this.status === RepositoryStatus.EXPIRED ||
      this.status === RepositoryStatus.EXPIRING ||
      this.status === RepositoryStatus.REMOVING ||
      this.status === RepositoryStatus.REMOVED
    ) {
      throw new AnonymousError("repository_expired", {
        object: this,
        httpStatus: 410,
      });
    }

    const fiveMinuteAgo = new Date();
    fiveMinuteAgo.setMinutes(fiveMinuteAgo.getMinutes() - 5);

    if (
      this.status === RepositoryStatus.PREPARING ||
      (this.status === RepositoryStatus.DOWNLOAD &&
        this._model.statusDate > fiveMinuteAgo)
    ) {
      throw new AnonymousError("repository_not_ready", {
        object: this,
        httpStatus: 425,
      });
    }
  }

  /**
   * Compress and anonymize the repository
   *
   * @returns A stream of anonymized repository compressed
   */
  zip(): Promise<Readable> {
    return storage.archive(this.repoId, "", {
      format: "zip",
      fileTransformer: (filename: string) =>
        this.generateAnonymizeTransformer(filename),
    });
  }

  /**
   * Create the transformer that anonymizes the content of one file.
   *
   * @param filePath path of the file the transformer is created for
   * @returns an `AnonymizeTransformer` configured with the repository
   * options; the branch name falls back to "main" when none is stored
   */
  generateAnonymizeTransformer(filePath: string) {
    return new AnonymizeTransformer({
      filePath: filePath,
      terms: this.options.terms,
      image: this.options.image,
      link: this.options.link,
      repoId: this.repoId,
      repoName: this.model.source.repositoryName,
      branchName: this.model.source.branch || "main",
    });
  }

  /**
   * Tell whether the repository can be served.
   *
   * A READY repository without any stored file is flagged as PREPARING and
   * a forced update is triggered.
   *
   * @returns `true` when the status is READY and at least one file is stored
   */
  async isReady() {
    if (this.status !== RepositoryStatus.READY) return false;
    if (!(await FileModel.exists({ repoId: this.repoId }).exec())) {
      this.model.status = RepositoryStatus.PREPARING;
      await this.updateIfNeeded({ force: true });
      return false;
    }
    return true;
  }

  /**
   * Update the repository if a new commit exists
   *
   * The check runs when `opt.force` is set, or when the `update` option is
   * enabled and the last view is more than one day old.
   *
   * @param opt `force` bypasses the last-view condition
   * @returns void
   * @throws AnonymousError `repository_expired` (410) when the expiration
   * date has passed
   * @throws AnonymousError `branch_not_found` (404) when the tracked branch
   * is not in the branch list returned for the repository
   */
  async updateIfNeeded(opt?: { force: boolean }): Promise<void> {
    if (
      this._model.options.expirationMode !== "never" &&
      this.status !== RepositoryStatus.EXPIRED &&
      this._model.options.expirationDate
    ) {
      if (this._model.options.expirationDate <= new Date()) {
        this._model.status = RepositoryStatus.EXPIRED;
        await this.expire();
        throw new AnonymousError("repository_expired", {
          object: this,
          httpStatus: 410,
        });
      }
    }

    const yesterday = new Date();
    yesterday.setDate(yesterday.getDate() - 1);

    const shouldCheck =
      opt?.force ||
      (this._model.options.update && this._model.lastView < yesterday);
    if (!shouldCheck) return;

    // Only GitHubBase can be updated for the moment.
    // `this.source` creates a new object on each access; resolve it once.
    const source = this.source;
    if (!(source instanceof GitHubBase)) return;

    const token = await this.getToken();
    const ghRepo = await getRepositoryFromGitHub({
      accessToken: token,
      owner: source.data.organization,
      repo: source.data.repoName,
      repositoryID: this.model.source.repositoryId,
      force: true,
    });

    // update the repository name if it has changed. Persist it
    // immediately — otherwise, when the commit is unchanged and the
    // function returns early below, the renamed value lives in this
    // in-memory model only and the next request reloads the stale
    // name from MongoDB and re-runs the rename detection forever.
    if (this.model.source.repositoryName !== ghRepo.fullName) {
      this.model.source.repositoryName = ghRepo.fullName;
      if (isConnected) {
        await AnonymizedRepositoryModel.updateOne(
          { _id: this._model._id },
          { $set: { "source.repositoryName": ghRepo.fullName } }
        ).exec();
      }
    }

    const branches = await ghRepo.branches({
      force: true,
      accessToken: token,
    });
    const branchName = this.model.source.branch || ghRepo.model.defaultBranch;
    const newCommit = branches.find((f) => f.name === branchName)?.commit;

    if (!newCommit) {
      logger.error("branch not found", {
        code: "branch_not_found",
        httpStatus: 404,
        repoId: this.repoId,
        branch: branchName,
        repo: this.model.source.repositoryName,
      });
      await this.updateStatus(RepositoryStatus.ERROR, "branch_not_found");
      await this.resetSate();
      throw new AnonymousError("branch_not_found", {
        object: this,
        httpStatus: 404,
      });
    }

    if (
      this.model.source.commit === newCommit &&
      this.status === RepositoryStatus.READY
    ) {
      logger.info("up to date", { repoId: this._model.repoId });
      return;
    }

    this._model.source.commit = newCommit;
    const commitInfo = await ghRepo.getCommitInfo(newCommit, {
      accessToken: token,
    });
    // Prefer the author date and fall back to the committer date.
    const commitDate =
      commitInfo.commit?.author?.date || commitInfo.commit?.committer?.date;
    if (commitDate) {
      this._model.source.commitDate = new Date(commitDate as string);
    }
    this._model.anonymizeDate = new Date();
    logger.info("update queued", {
      repoId: this._model.repoId,
      commit: newCommit,
    });

    if (isConnected) {
      await AnonymizedRepositoryModel.updateOne(
        { _id: this._model._id },
        {
          $set: {
            "source.commit": newCommit,
            "source.commitDate": this._model.source.commitDate,
            anonymizeDate: this._model.anonymizeDate,
          },
        }
      ).exec();
    }

    // Drop the stored state and queue a new download for the new commit.
    await this.resetSate(RepositoryStatus.PREPARING);
    await downloadQueue.add(
      this.repoId,
      { repoId: this.repoId },
      {
        jobId: `repo-${this.repoId}`,
        attempts: 3,
      }
    );
  }

  /**
   * Download the required state for the repository to work
   *
   * Does nothing when the repository is already READY.
   *
   * @param progress forwarded to `files()` to report the fetch progress
   * @returns void
   */
  async anonymize(progress?: (status: string) => void) {
    if (this.status === RepositoryStatus.READY) {
      return;
    }
    this.model.increment();
    await this.updateStatus(RepositoryStatus.DOWNLOAD);
    await this.files({
      force: false,
      progress,
      recursive: false,
    });
    // Previously inserted a dummy {path:"", name:"", size:0} FileModel
    // here for empty repos "to avoid errors" — but that record collides
    // with the special-case in AnonymizedFile.getFileInfo for the empty
    // path, surfaces in unfiltered file listings, and breaks anything
    // that assumes FileModel rows correspond to real files. Empty repos
    // are handled by the route layer; nothing to materialise here.
    await this.updateStatus(RepositoryStatus.READY);
    // computeSize() returns zeros unless the status is READY, so it has to
    // run after the status update.
    await this.computeSize();
  }

  /**
   * Update the last view and view count
   *
   * @returns the model when the database is not connected, nothing otherwise
   */
  async countView() {
    this._model.lastView = new Date();
    this._model.pageView = (this._model.pageView || 0) + 1;
    if (!isConnected) return this.model;
    await AnonymizedRepositoryModel.updateOne(
      { _id: this._model._id },
      {
        $set: { lastView: this._model.lastView },
        $inc: { pageView: 1 },
      }
    ).exec();
  }

  /**
   * Update the status of the repository
   *
   * @param status the new status; a falsy value leaves the model untouched
   * @param statusMessage a potential error message to display
   * @returns the model when `status` is falsy or the database is not
   * connected, nothing otherwise
   */
  async updateStatus(status: RepositoryStatus, statusMessage?: string) {
    if (!status) return this.model;
    this._model.status = status;
    this._model.statusDate = new Date();
    this._model.statusMessage = statusMessage;
    if (!isConnected) return this.model;
    await AnonymizedRepositoryModel.updateOne(
      { _id: this._model._id },
      {
        $set: {
          status,
          statusDate: this._model.statusDate,
          statusMessage,
        },
      }
    ).exec();
  }

  /**
   * Expire the repository
   */
  async expire() {
    await this.updateStatus(RepositoryStatus.EXPIRING);
    await this.resetSate();
    await this.updateStatus(RepositoryStatus.EXPIRED);
  }

  /**
   * Remove the repository
   */
  async remove() {
    await this.updateStatus(RepositoryStatus.REMOVING);
    await this.resetSate();
    await this.updateStatus(RepositoryStatus.REMOVED);
  }

  /**
   * Reset/delete the state of the repository
   *
   * The method name (sic) is kept as-is because callers reference it.
   *
   * @param status optional status to set before the state is deleted
   * @param statusMessage optional message stored with the status
   */
  async resetSate(status?: RepositoryStatus, statusMessage?: string) {
    // remove attribute
    this._model.size = { storage: 0, file: 0 };
    if (status) {
      await this.updateStatus(status, statusMessage);
    }
    // remove cache
    await Promise.all([
      FileModel.deleteMany({ repoId: this.repoId }).exec(),
      this.removeCache(),
    ]);
    logger.info("reset", { repoId: this._model.repoId });
  }

  /**
   * Remove the cached files
   *
   * Also flags the model as reset and clears its size. A failure to persist
   * these two fields is logged and not rethrown.
   *
   * @returns void
   */
  async removeCache() {
    await storage.rm(this.repoId);
    this.model.isReseted = true;
    this.model.size = { storage: 0, file: 0 };
    if (isConnected) {
      try {
        await AnonymizedRepositoryModel.updateOne(
          { _id: this._model._id },
          { $set: { isReseted: true, size: this._model.size } }
        ).exec();
      } catch (error) {
        logger.error("removeCache save failed", serializeError(error));
      }
    }
  }

  /**
   * Compute the size of the repository in terms of storage and number of
   * files.
   *
   * Returns zeros when the repository is not READY, and the cached value
   * when the cached file count is non-zero.
   *
   * @returns The size of the repository
   */
  async computeSize(): Promise<{
    /**
     * Sum of the `size` field of the stored files (presumably bytes)
     */
    storage: number;
    /**
     * The number of files
     */
    file: number;
  }> {
    if (this.status !== RepositoryStatus.READY)
      return { storage: 0, file: 0 };
    if (this._model.size.file) return this._model.size;
    const res = await FileModel.aggregate([
      {
        $match: {
          repoId: this.repoId,
        },
      },
      {
        $group: {
          _id: "$repoId",
          storage: { $sum: "$size" },
          file: { $sum: 1 },
        },
      },
    ]);
    // No stored file yields an empty aggregation result: fall back to zeros.
    this._model.size = {
      storage: res[0]?.storage || 0,
      file: res[0]?.file || 0,
    };
    if (isConnected) {
      await AnonymizedRepositoryModel.updateOne(
        { _id: this._model._id },
        { $set: { size: this._model.size } }
      ).exec();
    }
    return this._model.size;
  }

  /**
   * Returns the conference of the repository
   *
   * @returns conference of the repository, or `null` when the repository has
   * no conference or the conference is not found
   */
  async conference(): Promise<Conference | null> {
    if (!this._model.conference) {
      return null;
    }
    const conference = await ConferenceModel.findOne({
      conferenceID: this._model.conference,
    });
    if (conference) return new Conference(conference);
    return null;
  }

  /***** Getters ********/

  /** Identifier of the anonymized repository. */
  get repoId() {
    return this._model.repoId;
  }

  /** Options stored on the repository model. */
  get options() {
    return this._model.options;
  }

  /** Coauthors of the repository, or an empty list when none are stored. */
  get coauthors() {
    return this._model.coauthors || [];
  }

  /** The underlying repository document. */
  get model() {
    return this._model;
  }

  /** Current status of the repository. */
  get status() {
    return this._model.status;
  }

  /** Stored size of the repository; zeros unless the status is READY. */
  get size() {
    if (this.status !== RepositoryStatus.READY)
      return { storage: 0, file: 0 };
    return this._model.size;
  }

  /**
   * Serialize the repository.
   *
   * The source is reduced to its id, name, commit, branch and type; the
   * access token is not part of the output.
   */
  toJSON() {
    return {
      repoId: this._model.repoId,
      options: this._model.options,
      coauthors: (this._model.coauthors || []).map((c) => ({
        username: c.username,
        githubId: c.githubId,
        photo: c.photo,
      })),
      conference: this._model.conference,
      anonymizeDate: this._model.anonymizeDate,
      status: this.status,
      statusMessage: this._model.statusMessage,
      lastView: this._model.lastView,
      pageView: this._model.pageView,
      size: this.size,
      source: {
        repositoryID: this.model.source.repositoryId,
        fullName: this.model.source.repositoryName,
        commit: this.model.source.commit,
        branch: this.model.source.branch,
        type: this.model.source.type,
      },
    };
  }
}