// Mirror of https://github.com/tdurieux/anonymous_github.git
// Synced 2026-05-15 14:38:03 +02:00 (643 lines, 18 KiB, TypeScript)
import storage from "./storage";
|
|
import { RepositoryStatus } from "./types";
|
|
import { Readable } from "stream";
|
|
import * as sha1 from "crypto-js/sha1";
|
|
import User from "./User";
|
|
import GitHubStream from "./source/GitHubStream";
|
|
import Zip from "./source/Zip";
|
|
import { anonymizePathCompiled, compileTerms } from "./anonymize-utils";
|
|
import UserModel from "./model/users/users.model";
|
|
import { IAnonymizedRepositoryDocument } from "./model/anonymizedRepositories/anonymizedRepositories.types";
|
|
import { AnonymizeTransformer } from "./anonymize-utils";
|
|
import GitHubBase from "./source/GitHubBase";
|
|
import Conference from "./Conference";
|
|
import ConferenceModel from "./model/conference/conferences.model";
|
|
import AnonymousError from "./AnonymousError";
|
|
import { downloadQueue } from "../queue";
|
|
import { isConnected } from "../server/database";
|
|
import {
|
|
getRepositoryFromGitHub,
|
|
GitHubRepository,
|
|
} from "./source/GitHubRepository";
|
|
import { getToken } from "./GitHubUtils";
|
|
import config from "../config";
|
|
import FileModel from "./model/files/files.model";
|
|
import AnonymizedRepositoryModel from "./model/anonymizedRepositories/anonymizedRepositories.model";
|
|
import { createLogger, serializeError } from "./logger";
|
|
|
|
// Module-level logger shared by everything in this file, created with the "repository" label.
const logger = createLogger("repository");
|
|
import { IFile } from "./model/files/files.types";
|
|
import AnonymizedFile from "./AnonymizedFile";
|
|
import { FilterQuery } from "mongoose";
|
|
/**
 * Produce the anonymized view of a list of file entries.
 *
 * Each entry's `name` and `path` go through `anonymizePathCompiled` with the
 * compiled `terms`; `size` is copied unchanged. Despite its name, the function
 * makes a single flat pass over `tree`.
 *
 * @param tree file entries to anonymize
 * @param terms terms handed to `compileTerms`
 * @param opt `includeSha` exposes the stored sha as-is instead of a shortened hash
 * @returns one partial file entry per input entry, in input order
 */
function anonymizeTreeRecursive(
  tree: IFile[],
  terms: string[],
  opt: {
    /** Include the file sha in the response */
    includeSha: boolean;
  } = {
    includeSha: false,
  }
): Partial<IFile>[] {
  const compiledTerms = compileTerms(terms);

  // Decide which sha (if any) is exposed for one entry.
  const exposedSha = (entry: IFile) => {
    if (opt.includeSha) {
      return entry.sha;
    }
    if (!entry.size) {
      // Entries with a falsy size are returned without any sha.
      return undefined;
    }
    // First 8 characters of the stringified SHA-1 of the stored sha.
    return sha1(entry.sha || "")
      .toString()
      .substring(0, 8);
  };

  return tree.map((entry) => ({
    name: anonymizePathCompiled(entry.name, compiledTerms),
    path: anonymizePathCompiled(entry.path, compiledTerms),
    size: entry.size,
    sha: exposedSha(entry),
  }));
}
|
|
|
|
export default class Repository {
|
|
/** Database document backing this repository. */
private _model: IAnonymizedRepositoryDocument;

/** Owner of the repository, wrapping a user document that only carries the owner id. */
owner: User;

/** Becomes true once getToken() has resolved a token for this instance. */
private checkedToken: boolean = false;

/**
 * Wrap an anonymized repository document.
 *
 * @param data the repository document to wrap
 */
constructor(data: IAnonymizedRepositoryDocument) {
  this._model = data;

  const ownerDocument = new UserModel({ _id: data.owner });
  this.owner = new User(ownerDocument);
  // Flag the freshly built owner document as not new
  // (presumably so it is treated as an existing record; confirm against User).
  this.owner.model.isNew = false;
}
|
|
|
|
/**
 * Resolve the access token for this repository.
 *
 * The first call delegates to `getToken(this)`. When the returned token
 * differs from the stored one, the in-memory model is updated and, if the
 * database is connected, so is the stored record. Later calls on the same
 * instance return the token held by the model.
 *
 * @returns the access token
 */
async getToken() {
  if (this.checkedToken) {
    return this._model.source.accessToken as string;
  }

  const previousToken = this._model.source.accessToken;
  const resolvedToken = await getToken(this);

  const tokenChanged = previousToken != resolvedToken;
  if (tokenChanged) {
    this._model.source.accessToken = resolvedToken;
    if (isConnected) {
      await AnonymizedRepositoryModel.updateOne(
        { _id: this._model._id },
        { $set: { "source.accessToken": resolvedToken } }
      ).exec();
    }
  }

  this.checkedToken = true;
  return resolvedToken;
}
|
|
|
|
/**
 * Build the source handler for this repository.
 *
 * A new handler is constructed on every access: a `Zip` when the stored
 * source type is "Zip", otherwise a `GitHubStream` pinned to the stored
 * commit (or "HEAD" when no commit is stored).
 */
get source() {
  const sourceModel = this.model.source;
  const upstream = new GitHubRepository({
    name: sourceModel.repositoryName,
  });

  if (sourceModel.type === "Zip") {
    return new Zip(sourceModel, this.repoId);
  }

  const repoId = this.repoId;
  const commit = sourceModel.commit || "HEAD";
  const organization = upstream.owner;
  const repoName = upstream.repo;
  return new GitHubStream({
    repoId,
    commit,
    organization,
    repoName,
    getToken: () => this.getToken(),
  });
}
|
|
|
|
/**
 * List the repository files with their names and paths anonymized.
 *
 * @param opt listing options, forwarded as-is to `files()`; `includeSha`
 * additionally controls whether the stored sha is exposed in the result
 * @returns the anonymized file entries
 */
async anonymizedFiles(
  opt: {
    /** Force to refresh the file tree */
    force?: boolean;
    /** Include the file sha in the response */
    includeSha: boolean;
    recursive?: boolean;
    path?: string;
  } = {
    force: false,
    includeSha: false,
    recursive: true,
  }
): Promise<Partial<IFile>[]> {
  const anonymizationTerms = this._model.options.terms || [];
  const rawFiles = await this.files(opt);
  return anonymizeTreeRecursive(rawFiles, anonymizationTerms, opt);
}
|
|
|
|
/**
 * Get the file tree.
 *
 * When no file entry is stored for this repository, when the stored entries
 * look stale (see the SHA check below), or when `opt.force` is set, the
 * stored entries are deleted and re-fetched from the source before the
 * listing query runs.
 *
 * `opt` is not modified: an anonymized `opt.path` is resolved into a local
 * variable.
 *
 * @param opt.recursive when `false`, only entries whose path equals
 * `opt.path` exactly (or the empty path when no path is given) are returned;
 * otherwise every entry whose path starts with `opt.path`
 * @param opt.path path to restrict the listing to; a path containing the
 * anonymization mask is resolved to its original path first
 * @param opt.force force to get an updated list of files
 * @param opt.progress progress callback handed to the source's `getFiles`
 * @returns The file tree
 */
async files(
  opt: {
    recursive?: boolean;
    path?: string;
    force?: boolean;
    progress?: (status: string) => void;
  } = {
    recursive: true,
    force: false,
  }
): Promise<IFile[]> {
  let hasFile = await FileModel.exists({ repoId: this.repoId }).exec();

  // The `source` getter builds a new handler on every access, so resolve it
  // once and use that single instance for the whole call.
  const source = this.source;

  // Files created by GitHubDownload don't carry a valid 40-char GitHub
  // blob SHA. When the source type later switches to GitHubStream the
  // stale entries cause blob-API 404s. Detect this by sampling a file
  // with a sha and checking its length; force a re-fetch if it doesn't
  // look like a GitHub SHA.
  if (hasFile && source instanceof GitHubStream) {
    const sample = await FileModel.findOne(
      { repoId: this.repoId, sha: { $exists: true, $ne: null } },
      { sha: 1 }
    ).exec();
    if (sample?.sha && sample.sha.length !== 40) {
      hasFile = null;
    }
  }

  if (!hasFile || opt.force) {
    await FileModel.deleteMany({ repoId: this.repoId }).exec();
    const files = await source.getFiles(opt.progress);
    for (const file of files) {
      file.repoId = this.repoId;
    }
    await FileModel.insertMany(files);

    // Read the truncation list from the very instance that ran getFiles();
    // a handler obtained from a second `this.source` access would be a
    // different object that never performed the fetch.
    const sourceWithTruncation = source as unknown as {
      truncatedFolderList?: string[];
    };
    if (Array.isArray(sourceWithTruncation.truncatedFolderList)) {
      this._model.truncatedFolders = sourceWithTruncation.truncatedFolderList;
    }

    // Drop the cached size so computeSize() does not short-circuit on it.
    this._model.size = { storage: 0, file: 0 };
    await this.computeSize();
    if (isConnected) {
      await AnonymizedRepositoryModel.updateOne(
        { _id: this._model._id },
        {
          $set: {
            truncatedFolders: this._model.truncatedFolders,
            size: this._model.size,
          },
        }
      ).exec();
    }
  }

  // Work on a local copy so the caller's options object is left untouched.
  let path = opt.path;
  if (path?.includes(config.ANONYMIZATION_MASK)) {
    const anonymizedFile = new AnonymizedFile({
      repository: this,
      anonymizedPath: path,
    });
    path = await anonymizedFile.originalPath();
  }

  // Escape regex metacharacters so the path is matched literally.
  const escapedPath = path
    ? path.replace(/[-[\]{}()*+?.,\\^$|#\s]/g, "\\$&")
    : undefined;

  let pathQuery: string | RegExp | undefined;
  if (opt.recursive === false) {
    // Non-recursive: exact path match, or the empty path when none is given.
    pathQuery = escapedPath ? new RegExp(`^${escapedPath}$`) : "";
  } else {
    // Recursive: prefix match, or no path filter at all.
    pathQuery = escapedPath ? new RegExp(`^${escapedPath}`) : undefined;
  }

  const query: FilterQuery<IFile> = {
    repoId: this.repoId,
  };
  if (pathQuery !== undefined) {
    query.path = pathQuery;
  }
  return await FileModel.find(query).exec();
}
|
|
|
|
/**
 * Verify that the repository can currently be served.
 *
 * A READY repository whose expiration date has passed (and whose expiration
 * mode is not "never") is expired first.
 *
 * @throws AnonymousError "repository_expired" (410) when the status is
 * EXPIRED, EXPIRING, REMOVING or REMOVED
 * @throws AnonymousError "repository_not_ready" (425) when the status is
 * PREPARING, or DOWNLOAD with a status date within the last five minutes
 */
async check() {
  const { expirationMode, expirationDate } = this._model.options;
  if (
    expirationMode !== "never" &&
    this.status == RepositoryStatus.READY &&
    expirationDate &&
    expirationDate <= new Date()
  ) {
    await this.expire();
  }

  switch (this.status) {
    case RepositoryStatus.EXPIRED:
    case RepositoryStatus.EXPIRING:
    case RepositoryStatus.REMOVING:
    case RepositoryStatus.REMOVED:
      throw new AnonymousError("repository_expired", {
        object: this,
        httpStatus: 410,
      });
  }

  const downloadGraceStart = new Date();
  downloadGraceStart.setMinutes(downloadGraceStart.getMinutes() - 5);

  const isPreparing = this.status == RepositoryStatus.PREPARING;
  const isRecentDownload =
    this.status == RepositoryStatus.DOWNLOAD &&
    this._model.statusDate > downloadGraceStart;
  if (isPreparing || isRecentDownload) {
    throw new AnonymousError("repository_not_ready", {
      object: this,
      httpStatus: 425,
    });
  }
}
|
|
|
|
/**
 * Archive the stored repository as a zip, passing every file through an
 * anonymizing transformer.
 *
 * @returns a promise of the readable stream produced by `storage.archive`
 */
zip(): Promise<Readable> {
  const anonymizeEntry = (filename: string) =>
    this.generateAnonymizeTransformer(filename);

  return storage.archive(this.repoId, "", {
    format: "zip",
    fileTransformer: anonymizeEntry,
  });
}
|
|
|
|
/**
 * Create the anonymizing transformer for one file, configured from this
 * repository's options and source information.
 *
 * @param filePath path of the file the transformer will process
 * @returns a new `AnonymizeTransformer`
 */
generateAnonymizeTransformer(filePath: string) {
  const { terms, image, link } = this.options;
  const repoId = this.repoId;
  const { repositoryName, branch } = this.model.source;

  return new AnonymizeTransformer({
    filePath,
    terms,
    image,
    link,
    repoId,
    repoName: repositoryName,
    // Fall back to "main" when no branch is stored.
    branchName: branch || "main",
  });
}
|
|
|
|
/**
 * Tell whether the repository is READY and has at least one stored file entry.
 *
 * A READY repository without any stored file entry is switched to PREPARING
 * in memory and a forced update is triggered; the call then answers `false`.
 *
 * @returns `true` only when the status is READY and file entries exist
 */
async isReady() {
  if (this.status !== RepositoryStatus.READY) {
    return false;
  }

  const storedFile = await FileModel.exists({ repoId: this.repoId }).exec();
  if (storedFile) {
    return true;
  }

  this.model.status = RepositoryStatus.PREPARING;
  await this.updateIfNeeded({ force: true });
  return false;
}
|
|
|
|
/**
 * Update the repository if a new commit exists.
 *
 * An update is attempted when `opt.force` is set, or when the `update`
 * option is enabled and the last view is more than a day old. Only sources
 * that are instances of `GitHubBase` are updated; for any other source the
 * call is a no-op.
 *
 * When the tracked branch points to a new commit (or the repository is not
 * READY), the new commit is recorded, the stored state is reset to PREPARING
 * and a download job is queued.
 *
 * @param opt.force attempt the update regardless of the last view date
 * @returns void
 * @throws AnonymousError "repository_expired" (410) when the expiration date
 * has passed
 * @throws AnonymousError "branch_not_found" (404) when the tracked branch no
 * longer exists upstream
 */
async updateIfNeeded(opt?: { force: boolean }): Promise<void> {
  const { expirationMode, expirationDate } = this._model.options;
  if (
    expirationMode !== "never" &&
    this.status != RepositoryStatus.EXPIRED &&
    expirationDate &&
    expirationDate <= new Date()
  ) {
    this._model.status = RepositoryStatus.EXPIRED;
    await this.expire();
    throw new AnonymousError("repository_expired", {
      object: this,
      httpStatus: 410,
    });
  }

  const yesterday = new Date();
  yesterday.setDate(yesterday.getDate() - 1);
  const updateDue =
    opt?.force ||
    (this._model.options.update && this._model.lastView < yesterday);
  if (!updateDue) {
    return;
  }

  // Only GitHubBase can be updated for the moment. The `source` getter builds
  // a new handler on every access, so resolve it a single time.
  const source = this.source;
  if (!(source instanceof GitHubBase)) {
    return;
  }

  const token = await this.getToken();
  const ghRepo = await getRepositoryFromGitHub({
    accessToken: token,
    owner: source.data.organization,
    repo: source.data.repoName,
    repositoryID: this.model.source.repositoryId,
    force: true,
  });

  // Update the repository name if it has changed. Persist it immediately:
  // otherwise, when the commit is unchanged and the function returns early
  // below, the renamed value lives in this in-memory model only and the next
  // request reloads the stale name from MongoDB and re-runs the rename
  // detection forever.
  if (this.model.source.repositoryName !== ghRepo.fullName) {
    this.model.source.repositoryName = ghRepo.fullName;
    if (isConnected) {
      await AnonymizedRepositoryModel.updateOne(
        { _id: this._model._id },
        { $set: { "source.repositoryName": ghRepo.fullName } }
      ).exec();
    }
  }

  const branches = await ghRepo.branches({
    force: true,
    accessToken: token,
  });
  const branchName = this.model.source.branch || ghRepo.model.defaultBranch;
  const newCommit = branches.find((b) => b.name == branchName)?.commit;

  if (!newCommit) {
    logger.error("branch not found", {
      code: "branch_not_found",
      httpStatus: 404,
      repoId: this.repoId,
      branch: branchName,
      repo: this.model.source.repositoryName,
    });
    await this.updateStatus(RepositoryStatus.ERROR, "branch_not_found");
    await this.resetSate();
    throw new AnonymousError("branch_not_found", {
      object: this,
      httpStatus: 404,
    });
  }

  if (
    this.model.source.commit == newCommit &&
    this.status == RepositoryStatus.READY
  ) {
    logger.info("up to date", { repoId: this._model.repoId });
    return;
  }

  this._model.source.commit = newCommit;
  const commitInfo = await ghRepo.getCommitInfo(newCommit, {
    accessToken: token,
  });
  // Prefer the author date, fall back to the committer date.
  const commitDate =
    commitInfo.commit?.author?.date || commitInfo.commit?.committer?.date;
  if (commitDate) {
    this._model.source.commitDate = new Date(commitDate);
  }
  this._model.anonymizeDate = new Date();
  logger.info("update queued", {
    repoId: this._model.repoId,
    commit: newCommit,
  });

  if (isConnected) {
    await AnonymizedRepositoryModel.updateOne(
      { _id: this._model._id },
      {
        $set: {
          "source.commit": newCommit,
          "source.commitDate": this._model.source.commitDate,
          anonymizeDate: this._model.anonymizeDate,
        },
      }
    ).exec();
  }

  await this.resetSate(RepositoryStatus.PREPARING);
  await downloadQueue.add(
    this.repoId,
    { repoId: this.repoId },
    {
      jobId: `repo-${this.repoId}`,
      attempts: 3,
    }
  );
}
|
|
/**
 * Bring the repository to the READY state by fetching its file list.
 *
 * Does nothing when the repository is already READY. Otherwise the status
 * goes to DOWNLOAD, the file list is loaded (without forcing a refresh), the
 * status goes to READY and the size is computed.
 *
 * @param progress optional callback forwarded to `files()`
 * @returns void
 */
async anonymize(progress?: (status: string) => void) {
  const alreadyReady = this.status === RepositoryStatus.READY;
  if (alreadyReady) {
    return;
  }

  this.model.increment();
  await this.updateStatus(RepositoryStatus.DOWNLOAD);
  await this.files({ force: false, progress, recursive: false });

  // No placeholder FileModel row ({path:"", name:"", size:0}) is written for
  // empty repositories any more: such a row clashes with the empty-path
  // special case in AnonymizedFile.getFileInfo, shows up in unfiltered file
  // listings, and breaks code assuming every FileModel row is a real file.
  // Empty repositories are dealt with by the route layer instead.
  await this.updateStatus(RepositoryStatus.READY);
  await this.computeSize();
}
|
|
|
|
/**
 * Record a page view: set the last view date to now and increase the view
 * counter by one, in memory and (when connected) in the database.
 *
 * @returns the model when the database is not connected, nothing otherwise
 */
async countView() {
  const previousViews = this._model.pageView || 0;
  this._model.lastView = new Date();
  this._model.pageView = previousViews + 1;

  if (isConnected) {
    await AnonymizedRepositoryModel.updateOne(
      { _id: this._model._id },
      {
        $set: { lastView: this._model.lastView },
        $inc: { pageView: 1 },
      }
    ).exec();
    return;
  }
  return this.model;
}
|
|
|
|
/**
 * Set the repository status, stamping the status date with the current time,
 * in memory and (when connected) in the database.
 *
 * @param status the new status; a falsy value leaves everything untouched
 * @param statusMessage an optional message stored alongside the status
 * @returns the model when `status` is falsy or the database is not
 * connected, nothing otherwise
 */
async updateStatus(status: RepositoryStatus, statusMessage?: string) {
  if (!status) {
    return this.model;
  }

  const statusDate = new Date();
  this._model.status = status;
  this._model.statusDate = statusDate;
  this._model.statusMessage = statusMessage;

  if (!isConnected) {
    return this.model;
  }

  await AnonymizedRepositoryModel.updateOne(
    { _id: this._model._id },
    {
      $set: {
        status,
        statusDate: this._model.statusDate,
        statusMessage,
      },
    }
  ).exec();
}
|
|
|
|
/**
 * Expire the repository: mark it EXPIRING, wipe its stored state, then mark
 * it EXPIRED.
 */
async expire() {
  const { EXPIRING, EXPIRED } = RepositoryStatus;
  await this.updateStatus(EXPIRING);
  await this.resetSate();
  await this.updateStatus(EXPIRED);
}
|
|
|
|
/**
 * Remove the repository: mark it REMOVING, wipe its stored state, then mark
 * it REMOVED.
 */
async remove() {
  const { REMOVING, REMOVED } = RepositoryStatus;
  await this.updateStatus(REMOVING);
  await this.resetSate();
  await this.updateStatus(REMOVED);
}
|
|
|
|
/**
 * Wipe the stored state of the repository: zero the size, optionally switch
 * to a new status, then delete the stored file entries and the cache.
 *
 * @param status optional status to apply before the cleanup
 * @param statusMessage optional message stored with that status
 */
async resetSate(status?: RepositoryStatus, statusMessage?: string) {
  this._model.size = { storage: 0, file: 0 };

  if (status) {
    await this.updateStatus(status, statusMessage);
  }

  // Both cleanups are started before either one is awaited.
  const fileEntriesDeleted = FileModel.deleteMany({
    repoId: this.repoId,
  }).exec();
  const cacheRemoved = this.removeCache();
  await Promise.all([fileEntriesDeleted, cacheRemoved]);

  logger.info("reset", { repoId: this._model.repoId });
}
|
|
|
|
/**
 * Delete the files kept in storage for this repository and flag the model as
 * reset with a zero size.
 *
 * A failure to persist the flag is logged and not rethrown.
 */
async removeCache() {
  await storage.rm(this.repoId);

  this.model.isReseted = true;
  this.model.size = { storage: 0, file: 0 };

  if (!isConnected) {
    return;
  }

  try {
    await AnonymizedRepositoryModel.updateOne(
      { _id: this._model._id },
      { $set: { isReseted: true, size: this._model.size } }
    ).exec();
  } catch (error) {
    // Best effort: the storage is already removed at this point.
    logger.error("removeCache save failed", serializeError(error));
  }
}
|
|
|
|
/**
 * Compute the size of the repository from its stored file entries.
 *
 * Returns zeros while the repository is not READY, and the size already held
 * by the model when its file count is non-zero. Otherwise the stored file
 * entries are aggregated and the result is saved on the model (and in the
 * database when connected).
 *
 * @returns the total of the entries' `size` field and the number of entries
 */
async computeSize(): Promise<{
  /**
   * Sum of the `size` field over the stored file entries
   * (presumably bytes; confirm against the file model)
   */
  storage: number;
  /**
   * Number of stored file entries
   */
  file: number;
}> {
  if (this.status !== RepositoryStatus.READY) {
    return { storage: 0, file: 0 };
  }

  const knownSize = this._model.size;
  if (knownSize.file) {
    return knownSize;
  }

  const matchRepository = { $match: { repoId: this.repoId } };
  const sumPerRepository = {
    $group: {
      _id: "$repoId",
      storage: { $sum: "$size" },
      file: { $sum: 1 },
    },
  };
  const totals = await FileModel.aggregate([matchRepository, sumPerRepository]);

  // No stored entry yields an empty aggregation result, hence the zero defaults.
  this._model.size = {
    storage: totals[0]?.storage || 0,
    file: totals[0]?.file || 0,
  };

  if (isConnected) {
    await AnonymizedRepositoryModel.updateOne(
      { _id: this._model._id },
      { $set: { size: this._model.size } }
    ).exec();
  }
  return this._model.size;
}
|
|
|
|
/**
 * Look up the conference attached to this repository.
 *
 * @returns the conference, or `null` when none is attached or no matching
 * conference record is found
 */
async conference(): Promise<Conference | null> {
  const conferenceID = this._model.conference;
  if (!conferenceID) {
    return null;
  }

  const record = await ConferenceModel.findOne({ conferenceID });
  return record ? new Conference(record) : null;
}
|
|
|
|
/***** Getters ********/

/** Identifier of the anonymized repository, as stored on the model. */
get repoId() {
  const { repoId } = this._model;
  return repoId;
}

/** Options stored on the model. */
get options() {
  const { options } = this._model;
  return options;
}

/** Coauthors stored on the model, or an empty array when there are none. */
get coauthors() {
  const { coauthors } = this._model;
  return coauthors || [];
}

/** The underlying repository document. */
get model() {
  const document = this._model;
  return document;
}

/** Current status stored on the model. */
get status() {
  const { status } = this._model;
  return status;
}

/** Size stored on the model, reported as zeros while the repository is not READY. */
get size() {
  return this.status != RepositoryStatus.READY
    ? { storage: 0, file: 0 }
    : this._model.size;
}
|
|
|
|
/**
 * Plain-object representation of the repository.
 *
 * Only a fixed set of fields is copied: coauthors are reduced to username,
 * GitHub id and photo, and the source to its id, full name, commit, branch
 * and type.
 */
toJSON() {
  const model = this._model;
  const sourceModel = this.model.source;

  const coauthors = (model.coauthors || []).map(
    ({ username, githubId, photo }) => ({ username, githubId, photo })
  );
  const source = {
    repositoryID: sourceModel.repositoryId,
    fullName: sourceModel.repositoryName,
    commit: sourceModel.commit,
    branch: sourceModel.branch,
    type: sourceModel.type,
  };

  return {
    repoId: model.repoId,
    options: model.options,
    coauthors,
    conference: model.conference,
    anonymizeDate: model.anonymizeDate,
    status: this.status,
    statusMessage: model.statusMessage,
    lastView: model.lastView,
    pageView: model.pageView,
    size: this.size,
    source,
  };
}
|
|
}
|