feat: introduce streamers that handle the stream and anonymization from github

This commit is contained in:
tdurieux
2024-04-03 11:13:01 +01:00
parent 73019c1b44
commit 4d12641c7e
64 changed files with 419 additions and 257 deletions

348
src/core/AnonymizedFile.ts Normal file
View File

@@ -0,0 +1,348 @@
import { join, basename } from "path";
import { Response } from "express";
import { Readable } from "stream";
import { trace } from "@opentelemetry/api";
import { lookup } from "mime-types";
import Repository from "./Repository";
import { RepositoryStatus, Tree, TreeElement, TreeFile } from "./types";
import config from "../config";
import { anonymizePath, isTextFile } from "./anonymize-utils";
import AnonymousError from "./AnonymousError";
import { handleError } from "../server/routes/route-utils";
import got from "got";
/**
* Represents a file in an anonymized repository
*/
export default class AnonymizedFile {
private _originalPath: string | undefined;
private fileSize?: number;
repository: Repository;
anonymizedPath: string;
_sha?: string;
/**
 * Bind the file to its repository and remember the anonymized path.
 *
 * @param data the owning repository and the anonymized path of the file
 * @throws AnonymousError `terms_not_specified` (400) when the repository
 * options carry no anonymization terms
 */
constructor(data: { repository: Repository; anonymizedPath: string }) {
  const { repository, anonymizedPath } = data;
  // the repository is assigned first so that it is part of the error context
  this.repository = repository;
  if (!repository.options.terms) {
    throw new AnonymousError("terms_not_specified", {
      object: this,
      httpStatus: 400,
    });
  }
  this.anonymizedPath = anonymizedPath;
}
/**
 * Get the sha of the file with any double quotes stripped.
 *
 * The sha is filled in by `originalPath()`, which is only invoked when no
 * sha has been recorded yet.
 *
 * @returns the sha without double quotes, or undefined when none is known
 */
async sha() {
  const tracer = trace.getTracer("ano-file");
  return tracer.startActiveSpan("sha", async (span) => {
    try {
      span.setAttribute("anonymizedPath", this.anonymizedPath);
      if (!this._sha) {
        await this.originalPath();
      }
      return this._sha?.replace(/"/g, "");
    } finally {
      span.end();
    }
  });
}
/**
 * De-anonymize the path.
 *
 * Walks the original file tree segment by segment. When a segment of the
 * anonymized path is not present as-is, every entry of the current folder is
 * anonymized and compared against it. On success the original path, the file
 * size and the sha are cached on the instance.
 *
 * @returns the original relative path of the file
 * @throws AnonymousError `path_not_specified` (400) when no path is set,
 * `file_not_found` (404) when a segment matches nothing, and
 * `folder_not_supported` when the path does not resolve to a file
 */
async originalPath(): Promise<string> {
  return trace
    .getTracer("ano-file")
    .startActiveSpan("originalPath", async (span) => {
      try {
        span.setAttribute("anonymizedPath", this.anonymizedPath);
        if (this._originalPath) return this._originalPath;
        if (!this.anonymizedPath) {
          throw new AnonymousError("path_not_specified", {
            object: this,
            httpStatus: 400,
          });
        }

        let currentOriginal = (await this.repository.files({
          force: false,
        })) as TreeElement;
        const paths = this.anonymizedPath.trim().split("/");
        let currentOriginalPath = "";

        for (let i = 0; i < paths.length; i++) {
          const fileName = paths[i];
          if (fileName == "") {
            // empty segments come from leading, trailing or doubled slashes
            continue;
          }
          const currentTree = currentOriginal as Tree;

          if (currentTree[fileName]) {
            // the segment is not anonymized: follow it directly
            currentOriginalPath = join(currentOriginalPath, fileName);
            currentOriginal = currentTree[fileName];
            continue;
          }

          // anonymize all the files in the folder and keep the ones that
          // match the current filename
          const options: string[] = [];
          for (const originalFileName in currentOriginal) {
            if (
              anonymizePath(originalFileName, this.repository.options.terms) ==
              fileName
            ) {
              options.push(originalFileName);
            }
          }
          if (options.length == 0) {
            throw new AnonymousError("file_not_found", {
              object: this,
              httpStatus: 404,
            });
          }

          // default to the first candidate; it is the only one when the
          // match is unambiguous
          let selected = options[0];
          let selectedTree: TreeElement = currentTree[selected];

          const nextName = paths[i + 1];
          if (options.length > 1 && nextName) {
            // several candidates: use the next segment to disambiguate.
            // Without a next segment there is nothing to look ahead to, so
            // the first candidate is kept and the look-ahead is skipped
            // (previously it ran against the already-replaced node).
            for (const option of options) {
              const optionTree = currentTree[option] as Tree | undefined;
              // NOTE(review): the look-ahead inspects a `child` entry of the
              // candidate, as the original code did — confirm this matches
              // the actual Tree layout.
              const optionTreeChild = optionTree?.child;
              if (optionTreeChild && (optionTreeChild as Tree)[nextName]) {
                selected = option;
                selectedTree = optionTreeChild;
                break;
              }
            }
          }

          currentOriginalPath = join(currentOriginalPath, selected);
          currentOriginal = selectedTree;
        }

        if (
          currentOriginal.sha === undefined ||
          currentOriginal.size === undefined
        ) {
          // only files carry both a sha and a size
          throw new AnonymousError("folder_not_supported", { object: this });
        }

        const file = currentOriginal as TreeFile;
        this.fileSize = file.size;
        this._sha = file.sha;
        this._originalPath = currentOriginalPath;
        return this._originalPath;
      } finally {
        span.end();
      }
    });
}
/**
 * Get the lower-cased text after the last dot of the file name.
 * A name without any dot is returned whole (lower-cased).
 */
extension() {
  const parts = basename(this.anonymizedPath).split(".");
  return parts[parts.length - 1].toLowerCase();
}
/**
 * Tell whether the extension of the file is one of the known image formats.
 */
isImage() {
  const imageExtensions = new Set([
    "png",
    "jpg",
    "jpeg",
    "gif",
    "svg",
    "ico",
    "bmp",
    "tiff",
    "tif",
    "webp",
    "avif",
    "heif",
    "heic",
  ]);
  return imageExtensions.has(this.extension());
}
/**
 * Tell whether the file may be served given the repository options:
 * PDFs require the `pdf` option and images require the `image` option.
 */
isFileSupported() {
  const options = this.repository.options;
  const pdfDisabled = !options.pdf;
  if (pdfDisabled && this.extension() == "pdf") {
    return false;
  }
  const imageDisabled = !options.image;
  return !(imageDisabled && this.isImage());
}
/**
 * Get the raw (not anonymized) content of the file from the repository
 * source.
 *
 * @returns a stream of the original file content
 * @throws AnonymousError `file_too_big` (403) when the known file size
 * exceeds `config.MAX_FILE_SIZE`
 */
async content(): Promise<Readable> {
  const tracer = trace.getTracer("ano-file");
  return tracer.startActiveSpan("content", async (span) => {
    try {
      // a masked path has to be resolved before `filePath` can be read
      const isMasked = this.anonymizedPath.includes(config.ANONYMIZATION_MASK);
      if (isMasked) {
        await this.originalPath();
      }
      span.addEvent("filePath", { originalPath: this.filePath });

      if (this.fileSize && this.fileSize > config.MAX_FILE_SIZE) {
        throw new AnonymousError("file_too_big", {
          object: this,
          httpStatus: 403,
        });
      }

      const stream = await this.repository.source?.getFileContent(this);

      // NOTE(review): this fires whenever `isReseted` is false, which looks
      // inverted (one would expect it after a reset) — confirm the intent.
      const model = this.repository.model;
      const notReady = this.repository.status != RepositoryStatus.READY;
      if (!model.isReseted || notReady) {
        this.repository.model.isReseted = false;
        await this.repository.updateStatus(RepositoryStatus.READY);
      }
      return stream;
    } finally {
      span.end();
    }
  });
}
/**
 * Get the anonymized content of the file as a stream.
 *
 * Without `config.STREAMER_ENTRYPOINT` the content is fetched locally and
 * piped through the anonymizer; otherwise the request is delegated to the
 * streamer service.
 *
 * The span is ended when the returned stream closes, or immediately when
 * the setup fails, so it is no longer leaked in the streamer branch.
 *
 * @returns a stream of the anonymized content
 */
async anonymizedContent() {
  // the span used to be named "Repository.conference" (copy/paste leftover)
  const span = trace
    .getTracer("ano-file")
    .startSpan("AnonymizedFile.anonymizedContent");
  span.setAttribute("anonymizedPath", this.anonymizedPath);
  try {
    const anonymizer = this.repository.generateAnonymizeTransformer(
      this.anonymizedPath
    );
    if (!config.STREAMER_ENTRYPOINT) {
      // collect the content locally
      const content = await this.content();
      return content.pipe(anonymizer).on("close", () => {
        span.end();
      });
    }
    // use the streamer service
    const remote = got.stream(join(config.STREAMER_ENTRYPOINT, "api"), {
      method: "POST",
      json: {
        token: await this.repository.getToken(),
        repoFullName: this.repository.model.source.repositoryName,
        commit: this.repository.model.source.commit,
        branch: this.repository.model.source.branch,
        repoId: this.repository.repoId,
        filePath: this.filePath,
        sha: await this.sha(),
        anonymizerOptions: anonymizer.opt,
      },
    });
    remote.once("close", () => {
      span.end();
    });
    return remote;
  } catch (error) {
    // nothing will emit "close" when the setup itself fails
    span.recordException(error as Error);
    span.end();
    throw error;
  }
}
/**
 * Path of the file in the original repository.
 *
 * Falls back to the anonymized path when nothing has been resolved yet and
 * the path contains no anonymization mask.
 *
 * @throws AnonymousError `path_not_defined` (400) when the path is masked
 * and has not been resolved by `originalPath()`
 */
get filePath() {
  if (this._originalPath) {
    return this._originalPath;
  }
  if (this.anonymizedPath.includes(config.ANONYMIZATION_MASK)) {
    throw new AnonymousError("path_not_defined", {
      object: this,
      httpStatus: 400,
    });
  }
  return this.anonymizedPath;
}
/**
 * Stream the anonymized file to an HTTP response.
 *
 * With `config.STREAMER_ENTRYPOINT` set, the streamer service produces the
 * anonymized content; otherwise the content is fetched and anonymized
 * locally.
 *
 * The returned promise always settles: it resolves when the response is
 * closed or after a setup error has been reported through `handleError`
 * (previously it stayed pending forever in that case), and it rejects when
 * the local stream fails. The span is ended exactly once.
 *
 * @param res the express response to write to
 */
async send(res: Response): Promise<void> {
  const anonymizer = this.repository.generateAnonymizeTransformer(
    this.anonymizedPath
  );
  return trace
    .getTracer("ano-file")
    .startActiveSpan("AnonymizedFile.send", async (span) => {
      span.setAttribute("repoId", this.repository.repoId);
      span.setAttribute("anonymizedPath", this.anonymizedPath);
      return new Promise<void>((resolve, reject) => {
        let settled = false;
        // end the span and settle the promise once, whichever event is first
        const settle = (error?: Error) => {
          if (settled) return;
          settled = true;
          if (error) {
            span.recordException(error);
          }
          span.end();
          if (error) {
            reject(error);
          } else {
            resolve();
          }
        };

        const stream = async () => {
          if (config.STREAMER_ENTRYPOINT) {
            // use the streamer service
            got
              .stream(join(config.STREAMER_ENTRYPOINT, "api"), {
                method: "POST",
                json: {
                  token: await this.repository.getToken(),
                  repoFullName: this.repository.model.source.repositoryName,
                  commit: this.repository.model.source.commit,
                  branch: this.repository.model.source.branch,
                  repoId: this.repository.repoId,
                  filePath: this.filePath,
                  sha: await this.sha(),
                  anonymizerOptions: anonymizer.opt,
                },
              })
              .on("error", () => {
                handleError(
                  new AnonymousError("file_not_found", {
                    object: this,
                    httpStatus: 404,
                  }),
                  res
                );
              })
              .pipe(res)
              .on("close", () => settle());
            return;
          }

          const mime = lookup(this.anonymizedPath);
          if (mime && this.extension() != "ts") {
            res.contentType(mime);
          } else if (isTextFile(this.anonymizedPath)) {
            res.contentType("text/plain");
          }
          res.header("Accept-Ranges", "none");
          anonymizer.once("transform", (data) => {
            if (!mime && data.isText) {
              res.contentType("text/plain");
            }
            if (!data.wasAnonimized && this.fileSize) {
              // the text files may be anonymized and therefore the size may be different
              res.header("Content-Length", this.fileSize.toString());
            }
          });

          const content = await this.content();
          const destroyContent = () => {
            if (!content.closed && !content.destroyed) {
              content.destroy();
            }
          };
          const handleStreamError = (error: Error) => {
            destroyContent();
            settle(error);
          };
          content
            .on("error", handleStreamError)
            .pipe(anonymizer)
            .pipe(res)
            .on("error", handleStreamError)
            .on("close", () => {
              destroyContent();
              settle();
            });
        };

        stream().catch((error) => {
          // setup failures are reported to the client, then the promise is
          // settled so the caller does not hang
          handleError(error, res);
          settle();
        });
      });
    });
}
}

View File

@@ -0,0 +1,53 @@
import { CustomError } from "ts-custom-error";
import AnonymizedFile from "./AnonymizedFile";
import Repository from "./Repository";
import GitHubBase from "./source/GitHubBase";
import { GitHubRepository } from "./source/GitHubRepository";
import User from "./User";
/**
 * Custom error carrying an optional HTTP status, an optional cause and the
 * object the error relates to.
 */
export default class AnonymousError extends CustomError {
  /** Object the error relates to (repository, file, user, ...) */
  value?: any;
  /** HTTP status associated with the error */
  httpStatus?: number;
  /** Underlying error, when this error wraps another one */
  cause?: Error;

  /**
   * @param message the error identifier
   * @param opt optional HTTP status, cause and related object
   */
  constructor(
    message: string,
    opt?: {
      httpStatus?: number;
      cause?: Error;
      object?: any;
    }
  ) {
    super(message);
    this.value = opt?.object;
    this.httpStatus = opt?.httpStatus;
    this.cause = opt?.cause;
  }

  /**
   * Build a log-friendly description: the message, a short identifier of
   * the related object and, when present, the cause with its stack.
   *
   * Known project objects are summarised by their identifier. Any other
   * value is serialised with JSON.stringify only as a last resort, and a
   * serialisation failure (e.g. a circular structure) no longer makes
   * toString() itself throw.
   */
  toString(): string {
    let detail: string | null = null;
    if (this.value instanceof Repository) {
      detail = this.value.repoId;
    } else if (this.value instanceof AnonymizedFile) {
      detail = `/r/${this.value.repository.repoId}/${this.value.anonymizedPath}`;
    } else if (this.value instanceof GitHubRepository) {
      detail = `${this.value.fullName}`;
    } else if (this.value instanceof User) {
      detail = `${this.value.username}`;
    } else if (this.value instanceof GitHubBase) {
      detail = `GHDownload ${this.value.data.repoId}`;
    } else if (this.value) {
      try {
        detail = JSON.stringify(this.value) ?? null;
      } catch {
        detail = String(this.value);
      }
    }

    let out = this.message;
    if (detail) {
      out += `: ${detail}`;
    }
    if (this.cause) {
      out += `\n\tCause by ${this.cause}\n${this.cause.stack}`;
    }
    return out;
  }
}

138
src/core/Conference.ts Normal file
View File

@@ -0,0 +1,138 @@
import AnonymizedRepositoryModel from "./model/anonymizedRepositories/anonymizedRepositories.model";
import { IConferenceDocument } from "./model/conference/conferences.types";
import Repository from "./Repository";
import { ConferenceStatus } from "./types";
/**
 * Wrapper around a conference document: status management, access to the
 * repositories attached to the conference and billing summary.
 */
export default class Conference {
  private _data: IConferenceDocument;
  // left undefined until loaded: an empty array is truthy and used to be
  // mistaken for an already-loaded cache, so repositories were never fetched
  private _repositories?: Repository[];

  constructor(data: IConferenceDocument) {
    this._data = data;
  }

  /**
   * Update the status of the conference and save the document
   * @param status the new status
   * @param errorMessage a potential error message to display (currently
   * not stored)
   */
  async updateStatus(status: ConferenceStatus, errorMessage?: string) {
    this._data.status = status;
    await this._data.save();
    return;
  }

  /**
   * Check if the conference is expired, i.e. its end date is in the past
   */
  isExpired() {
    return this._data.endDate < new Date();
  }

  /**
   * Expire the conference and all its repositories
   */
  async expire() {
    await this.updateStatus("expired");
    const repositories = await this.repositories();
    await Promise.all(repositories.map((repo) => repo.expire()));
  }

  /**
   * Remove the conference and all its repositories
   */
  async remove() {
    await this.updateStatus("removed");
    const repositories = await this.repositories();
    await Promise.all(repositories.map((repo) => repo.remove()));
  }

  /**
   * Returns the list of repositories of this conference. The list is
   * loaded once and cached; repositories with a removal date are skipped.
   *
   * @returns the list of repositories of this conference
   */
  async repositories(): Promise<Repository[]> {
    if (this._repositories) return this._repositories;
    const repoIds = this._data.repositories
      .filter((r) => !r.removeDate)
      .map((r) => r.id)
      .filter((f) => f);
    this._repositories = (
      await AnonymizedRepositoryModel.find({
        _id: { $in: repoIds },
      })
    ).map((r) => new Repository(r));
    return this._repositories;
  }

  get ownerIDs() {
    return this._data?.owners;
  }
  get quota() {
    return this._data.plan.quota;
  }
  get status() {
    return this._data.status;
  }
  get conferenceID() {
    return this._data.conferenceID;
  }
  get name() {
    return this._data.name;
  }
  get startDate() {
    return this._data.startDate;
  }
  get endDate() {
    return this._data.endDate;
  }
  get url() {
    return this._data.url;
  }
  get options() {
    return this._data.options;
  }

  /**
   * Serialise the conference together with the price accumulated so far.
   *
   * Each repository is billed per day between its add date and its removal
   * date, capped at the earlier of now and the conference end date. The
   * daily price is `plan.pricePerRepository / 30`.
   *
   * @param opt currently unused
   */
  toJSON(opt?: { billing: boolean }): any {
    // the duration below is expressed in days, so this is a price per day
    const pricePerDayPerRepo = this._data.plan.pricePerRepository / 30;
    const msPerDay = 1000 * 60 * 60 * 24;
    let price = 0;
    const today =
      new Date() > this._data.endDate ? this._data.endDate : new Date();
    this._data.repositories.forEach((r) => {
      const removeDate =
        r.removeDate && r.removeDate < today ? r.removeDate : today;
      const billedMs = Math.max(removeDate.getTime() - r.addDate.getTime(), 0);
      price += (billedMs / msPerDay) * pricePerDayPerRepo;
    });
    return {
      conferenceID: this._data.conferenceID,
      name: this._data.name,
      url: this._data.url,
      startDate: this._data.startDate,
      endDate: this._data.endDate,
      status: this._data.status,
      billing: this._data.billing,
      options: this._data.options,
      plan: this._data.plan,
      price,
      nbRepositories: this._data.repositories.filter((r) => !r.removeDate)
        .length,
    };
  }
}

58
src/core/GitHubUtils.ts Normal file
View File

@@ -0,0 +1,58 @@
import { trace } from "@opentelemetry/api";
import { Octokit } from "@octokit/rest";
import Repository from "./Repository";
import UserModel from "./model/users/users.model";
import config from "../config";
/**
 * Create an Octokit client authenticated with the given token and using the
 * global `fetch` for its requests.
 *
 * @param token the GitHub access token
 */
export function octokit(token: string) {
  const request = { fetch: fetch };
  return new Octokit({ auth: token, request });
}
/**
 * Check a GitHub token by requesting the authenticated user.
 *
 * @param token the GitHub access token to check
 * @returns true when the request succeeds, false when it throws
 */
export async function checkToken(token: string) {
  const client = octokit(token);
  let valid = true;
  try {
    await client.users.getAuthenticated();
  } catch (error) {
    valid = false;
  }
  return valid;
}
/**
 * Pick a working GitHub token for a repository.
 *
 * Candidates are tried in order: the token stored with the repository
 * source, the GitHub token of the owner (loaded from the database when it is
 * not in memory), and finally `config.GITHUB_TOKEN`. The first two are only
 * returned if `checkToken` accepts them.
 *
 * @param repository the repository that needs a token
 */
export async function getToken(repository: Repository) {
  const span = trace.getTracer("ano-file").startSpan("GHUtils.getToken");
  span.setAttribute("repoId", repository.repoId);
  try {
    const sourceToken = repository.model.source.accessToken;
    if (sourceToken && (await checkToken(sourceToken))) {
      return sourceToken;
    }

    const ownerModel = repository.owner.model;
    if (!ownerModel.accessTokens?.github) {
      // the owner tokens are not loaded: fetch only that field
      const stored = await UserModel.findById(repository.owner.id, {
        accessTokens: 1,
      });
      const accessTokens = stored?.accessTokens;
      if (accessTokens) {
        ownerModel.accessTokens = accessTokens;
      }
    }

    const ownerToken = ownerModel.accessTokens?.github;
    if (ownerToken && (await checkToken(ownerToken))) {
      return ownerToken;
    }
    return config.GITHUB_TOKEN;
  } finally {
    span.end();
  }
}

329
src/core/PullRequest.ts Normal file
View File

@@ -0,0 +1,329 @@
import { RepositoryStatus } from "./types";
import User from "./User";
import UserModel from "./model/users/users.model";
import Conference from "./Conference";
import ConferenceModel from "./model/conference/conferences.model";
import AnonymousError from "./AnonymousError";
import { IAnonymizedPullRequestDocument } from "./model/anonymizedPullRequests/anonymizedPullRequests.types";
import config from "../config";
import got from "got";
import { octokit } from "./GitHubUtils";
import { ContentAnonimizer } from "./anonymize-utils";
export default class PullRequest {
private _model: IAnonymizedPullRequestDocument;
owner: User;
/**
 * Wrap a pull request document. The owner is represented by a user model
 * that only carries the owner id and is flagged as not new.
 */
constructor(data: IAnonymizedPullRequestDocument) {
  this._model = data;
  const ownerModel = new UserModel({ _id: data.owner });
  this.owner = new User(ownerModel);
  this.owner.model.isNew = false;
}
/**
 * Get the GitHub token to use for this pull request.
 *
 * The owner's GitHub token is preferred (the owner is reloaded from the
 * database when the token is not in memory) and is mirrored into the source
 * of the pull request. Otherwise the token stored with the source is used,
 * and finally `config.GITHUB_TOKEN`.
 *
 * @returns the access token
 */
async getToken() {
  let owner = this.owner.model;
  // accessTokens may be missing on the in-memory owner, hence the `?.`
  if (owner && !owner.accessTokens?.github) {
    const temp = await UserModel.findById(owner._id);
    if (temp) {
      owner = temp;
    }
  }
  const ownerToken = owner?.accessTokens?.github;
  if (ownerToken) {
    if (ownerToken != this._model.source.accessToken) {
      this._model.source.accessToken = ownerToken;
    }
    return ownerToken;
  }
  if (this._model.source.accessToken) {
    return this._model.source.accessToken;
  }
  return config.GITHUB_TOKEN;
}
/**
 * Download the pull request information, its comments and its diff from
 * GitHub and store them on the model (the model is not saved here).
 */
async download() {
  console.debug(
    "[INFO] Downloading pull request",
    this._model.source.pullRequestId
  );
  const client = octokit(await this.getToken());
  const [owner, repo] = this._model.source.repositoryFullName.split("/");
  const pull_number = this._model.source.pullRequestId;

  // the three requests are independent and run concurrently
  const infoRequest = client.rest.pulls.get({ owner, repo, pull_number });
  const commentsRequest = client.paginate(
    "GET /repos/{owner}/{repo}/issues/{issue_number}/comments",
    {
      owner: owner,
      repo: repo,
      issue_number: pull_number,
      per_page: 100,
    }
  );
  const diffRequest = got(
    `https://github.com/${owner}/${repo}/pull/${pull_number}.diff`
  );
  const [prInfo, comments, diff] = await Promise.all([
    infoRequest,
    commentsRequest,
    diffRequest,
  ]);

  const info = prInfo.data;
  this._model.pullRequest = {
    diff: diff.body,
    title: info.title,
    body: info.body || "",
    creationDate: new Date(info.created_at),
    updatedDate: new Date(info.updated_at),
    draft: info.draft,
    merged: info.merged,
    mergedDate: info.merged_at ? new Date(info.merged_at) : undefined,
    state: info.state,
    baseRepositoryFullName: info.base.repo.full_name,
    headRepositoryFullName: info.head.repo?.full_name,
    comments: comments.map((comment) => {
      return {
        body: comment.body || "",
        creationDate: new Date(comment.created_at),
        updatedDate: new Date(comment.updated_at),
        author: comment.user?.login || "",
      };
    }),
  };
}
/**
 * Check the status of the pullRequest
 *
 * Triggers the expiration when the expiration date has passed, then throws
 * if the pull request cannot be served.
 *
 * @throws AnonymousError `pullRequest_expired` (410) when the pull request
 * is expired/expiring/removing/removed, `pullRequest_not_ready` when it is
 * preparing or has been downloading for less than five minutes
 */
check() {
  if (
    this._model.options.expirationMode !== "never" &&
    this.status == "ready" &&
    this._model.options.expirationDate
  ) {
    if (this._model.options.expirationDate <= new Date()) {
      // expire() switches the status before its first await, so the check
      // below already sees it; the rest runs in the background and its
      // failure must not turn into an unhandled rejection
      this.expire().catch((error) => {
        console.error(
          "[ERROR] Unable to expire pull request",
          this._model.source.pullRequestId,
          error
        );
      });
    }
  }
  if (
    this.status == "expired" ||
    this.status == "expiring" ||
    this.status == "removing" ||
    this.status == "removed"
  ) {
    throw new AnonymousError("pullRequest_expired", {
      object: this,
      httpStatus: 410,
    });
  }
  const fiveMinuteAgo = new Date();
  fiveMinuteAgo.setMinutes(fiveMinuteAgo.getMinutes() - 5);
  if (
    this.status == "preparing" ||
    (this.status == "download" && this._model.statusDate > fiveMinuteAgo)
  ) {
    throw new AnonymousError("pullRequest_not_ready", {
      object: this,
    });
  }
}
/**
 * Download the pull request again when forced, or when automatic updates
 * are enabled and the last anonymization is more than one day old.
 *
 * @param opt `force` bypasses the freshness check
 * @returns void
 */
async updateIfNeeded(opt?: { force: boolean }): Promise<void> {
  const oneDayAgo = new Date();
  oneDayAgo.setDate(oneDayAgo.getDate() - 1);

  const needsUpdate =
    opt?.force ||
    (this._model.options.update && this._model.anonymizeDate < oneDayAgo);
  if (!needsUpdate) {
    return;
  }

  await this.updateStatus(RepositoryStatus.DOWNLOAD);
  await this.download();
  this._model.anonymizeDate = new Date();
  await this.updateStatus(RepositoryStatus.READY);
  await this._model.save();
}
/**
 * Download the state required for the pullRequest to work. Nothing is done
 * when the pull request is already ready.
 *
 * @returns void
 */
async anonymize() {
  const alreadyReady = this.status === RepositoryStatus.READY;
  if (alreadyReady) {
    return;
  }
  await this.updateStatus(RepositoryStatus.PREPARING);
  await this.updateIfNeeded({ force: true });
  await this.updateStatus(RepositoryStatus.READY);
}
/**
 * Record a view: set the last view date to now, increment the view counter
 * and save the model.
 */
async countView() {
  const previousViews = this._model.pageView || 0;
  this._model.lastView = new Date();
  this._model.pageView = previousViews + 1;
  await this._model.save();
}
/**
 * Update the status of the pullRequest and save the model
 * @param status the new status
 * @param statusMessage a potential message to store with the status
 */
async updateStatus(status: RepositoryStatus, statusMessage?: string) {
  const model = this._model;
  model.status = status;
  model.statusDate = new Date();
  model.statusMessage = statusMessage;
  await model.save();
}
/**
 * Expire the pullRequest: mark it as expiring, wipe its cached state, then
 * mark it as expired.
 */
async expire() {
  const { EXPIRING, EXPIRED } = RepositoryStatus;
  await this.updateStatus(EXPIRING);
  await this.resetSate();
  await this.updateStatus(EXPIRED);
}
/**
 * Remove the pullRequest: mark it as removing, wipe its cached state, then
 * mark it as removed.
 */
async remove() {
  const { REMOVING, REMOVED } = RepositoryStatus;
  await this.updateStatus(REMOVING);
  await this.resetSate();
  await this.updateStatus(REMOVED);
}
/**
 * Reset/delete the state of the pullRequest: every cached field of the
 * pull request is blanked and the model is saved.
 *
 * @param status optional status to set before saving
 * @param statusMessage optional status message to set before saving
 */
async resetSate(status?: RepositoryStatus, statusMessage?: string) {
  const model = this._model;
  if (status) {
    model.status = status;
  }
  if (statusMessage) {
    model.statusMessage = statusMessage;
  }
  // remove cache
  const cached = model.pullRequest;
  cached.comments = [];
  cached.body = "";
  cached.title = "";
  cached.diff = "";
  cached.baseRepositoryFullName = "";
  cached.headRepositoryFullName = "";
  cached.merged = false;
  cached.mergedDate = undefined;
  cached.state = "closed";
  cached.draft = false;
  await model.save();
}
/**
 * Returns the conference of the pullRequest
 *
 * @returns the conference, or null when the pull request has none or the
 * conference cannot be found
 */
async conference(): Promise<Conference | null> {
  const conferenceID = this._model.conference;
  if (!conferenceID) {
    return null;
  }
  const found = await ConferenceModel.findOne({
    conferenceID: this._model.conference,
  });
  return found ? new Conference(found) : null;
}
/**
 * Build the anonymized view of the pull request. Title, body, comments,
 * diff, origin and dates are only included when the matching option is
 * enabled; textual fields go through the content anonymizer.
 */
content() {
  const options = this.options;
  const pullRequest = this._model.pullRequest;
  const output: any = {
    anonymizeDate: this._model.anonymizeDate,
    merged: pullRequest.merged,
    mergedDate: pullRequest.mergedDate,
    state: pullRequest.state,
    draft: pullRequest.draft,
  };
  const anonymizer = new ContentAnonimizer({
    ...options,
    repoId: this.pullRequestId,
  });

  if (options.title) {
    output.title = anonymizer.anonymize(pullRequest.title);
  }
  if (options.body) {
    output.body = anonymizer.anonymize(pullRequest.body);
  }
  if (options.comments) {
    output.comments = pullRequest.comments?.map((comment) => {
      const anonymized: any = {};
      // comment bodies follow the `body` option
      if (options.body) {
        anonymized.body = anonymizer.anonymize(comment.body);
      }
      if (options.username) {
        anonymized.author = anonymizer.anonymize(comment.author);
      }
      if (options.date) {
        anonymized.updatedDate = comment.updatedDate;
        anonymized.creationDate = comment.creationDate;
      }
      return anonymized;
    });
  }
  if (options.diff) {
    output.diff = anonymizer.anonymize(pullRequest.diff);
  }
  if (options.origin) {
    output.baseRepositoryFullName = pullRequest.baseRepositoryFullName;
  }
  if (options.date) {
    output.updatedDate = pullRequest.updatedDate;
    output.creationDate = pullRequest.creationDate;
  }
  return output;
}
/***** Getters ********/
/** Identifier of the anonymized pull request, read from the model */
get pullRequestId() {
return this._model.pullRequestId;
}
/** Options stored on the model */
get options() {
return this._model.options;
}
/** Source description stored on the model */
get source() {
return this._model.source;
}
/** The underlying pull request document */
get model() {
return this._model;
}
/** Current status stored on the model */
get status() {
return this._model.status;
}
/**
 * Serialise the pull request. Only the pull request id and the repository
 * full name of the source are included in `source`.
 */
toJSON() {
  const model = this._model;
  const { pullRequestId, repositoryFullName } = model.source;
  return {
    pullRequestId: model.pullRequestId,
    options: model.options,
    conference: model.conference,
    anonymizeDate: model.anonymizeDate,
    status: model.status,
    state: model.pullRequest.state,
    merged: model.pullRequest.merged,
    mergedDate: model.pullRequest.mergedDate,
    statusMessage: model.statusMessage,
    source: {
      pullRequestId: pullRequestId,
      repositoryFullName: repositoryFullName,
    },
    pullRequest: model.pullRequest,
    lastView: model.lastView,
    pageView: model.pageView,
  };
}
}

530
src/core/Repository.ts Normal file
View File

@@ -0,0 +1,530 @@
import storage from "./storage";
import { RepositoryStatus, Tree, TreeElement, TreeFile } from "./types";
import { Readable } from "stream";
import User from "./User";
import GitHubStream from "./source/GitHubStream";
import GitHubDownload from "./source/GitHubDownload";
import Zip from "./source/Zip";
import { anonymizePath } from "./anonymize-utils";
import UserModel from "./model/users/users.model";
import { IAnonymizedRepositoryDocument } from "./model/anonymizedRepositories/anonymizedRepositories.types";
import { AnonymizeTransformer } from "./anonymize-utils";
import GitHubBase from "./source/GitHubBase";
import Conference from "./Conference";
import ConferenceModel from "./model/conference/conferences.model";
import AnonymousError from "./AnonymousError";
import { downloadQueue } from "../queue";
import { isConnected } from "../server/database";
import AnonymizedRepositoryModel from "./model/anonymizedRepositories/anonymizedRepositories.model";
import { GitHubRepository } from "./source/GitHubRepository";
import { trace } from "@opentelemetry/api";
import { getToken } from "./GitHubUtils";
/**
 * Recursively anonymize the names of a file tree.
 *
 * A node is treated as a file when it has a `sha` and its `size` is not an
 * object. Files are reduced to their size, unless `includeSha` is set, in
 * which case the node is returned untouched. Folders are rebuilt with every
 * entry name passed through `anonymizePath`.
 *
 * @param tree the tree (or file) to anonymize
 * @param terms the terms to anonymize in the names
 * @param opt `includeSha` keeps the sha of the files in the result
 */
function anonymizeTreeRecursive(
  tree: TreeElement,
  terms: string[],
  opt: {
    /** Include the file sha in the response */
    includeSha: boolean;
  } = {
    includeSha: false,
  }
): TreeElement {
  const isFile = typeof tree.size !== "object" && tree.sha !== undefined;
  if (isFile) {
    return opt?.includeSha
      ? (tree as TreeFile)
      : ({ size: tree.size } as TreeFile);
  }

  const anonymized: Tree = {};
  for (const name of Object.getOwnPropertyNames(tree)) {
    anonymized[anonymizePath(name, terms)] = anonymizeTreeRecursive(
      (tree as Tree)[name],
      terms,
      opt
    );
  }
  return anonymized;
}
export default class Repository {
private _model: IAnonymizedRepositoryDocument;
owner: User;
/**
 * Wrap a repository document. The owner is represented by a user model that
 * only carries the owner id and is flagged as not new.
 */
constructor(data: IAnonymizedRepositoryDocument) {
  this._model = data;
  // the owner used to be built twice; once is enough
  this.owner = new User(new UserModel({ _id: data.owner }));
  this.owner.model.isNew = false;
}
/** Set once the token has been resolved for this instance */
private checkedToken: boolean = false;
/**
 * Get the GitHub token of the repository.
 *
 * The token is resolved through `getToken` from GitHubUtils on the first
 * call; when it differs from the stored one, the model is updated and
 * saved. Later calls return the stored token directly.
 */
async getToken() {
  if (this.checkedToken) {
    return this._model.source.accessToken as string;
  }
  const previousToken = this._model.source.accessToken;
  const resolvedToken = await getToken(this);
  const changed = previousToken != resolvedToken;
  if (changed) {
    this._model.source.accessToken = resolvedToken;
    await this._model.save();
  }
  this.checkedToken = true;
  return resolvedToken;
}
/**
 * Build the source object matching the type stored on the model.
 * A new instance is created on every access.
 *
 * @throws AnonymousError `unsupported_source` (400) for an unknown type
 */
get source() {
  const ghRepo = new GitHubRepository({
    name: this.model.source.repositoryName,
  });
  // options shared by the two GitHub sources, built only when needed
  const githubOptions = () => ({
    repoId: this.repoId,
    commit: this.model.source.commit || "HEAD",
    organization: ghRepo.owner,
    repoName: ghRepo.repo,
    getToken: () => this.getToken(),
  });

  const sourceType = this.model.source.type;
  if (sourceType === "GitHubDownload") {
    return new GitHubDownload(githubOptions());
  }
  if (sourceType === "GitHubStream") {
    return new GitHubStream(githubOptions());
  }
  if (sourceType === "Zip") {
    return new Zip(this.model.source, this.repoId);
  }
  throw new AnonymousError("unsupported_source", {
    object: this,
    httpStatus: 400,
  });
}
/**
 * Get the anonymized file tree
 *
 * @param opt `force` refreshes the list of files, `includeSha` keeps the
 * sha of the files in the result
 * @returns The anonymized file tree
 */
async anonymizedFiles(
  opt: {
    /** Force to refresh the file tree */
    force?: boolean;
    /** Include the file sha in the response */
    includeSha: boolean;
  } = {
    force: false,
    includeSha: false,
  }
): Promise<Tree> {
  const terms = this._model.options.terms || [];
  const originalTree = await this.files(opt);
  const anonymizedTree = anonymizeTreeRecursive(originalTree, terms, opt);
  return anonymizedTree as Tree;
}
/**
 * Get the file tree
 *
 * Unless forced, the tree stored on the model is used (and loaded from the
 * database when it is not in memory). An empty or missing tree, or a forced
 * call, fetches the files from the source and resets the size before
 * computing it again.
 *
 * @param opt force to get an updated list of files
 * @returns The file tree
 * @throws AnonymousError `repository_not_found` when the repository cannot
 * be loaded from the database
 */
async files(opt: { force?: boolean } = { force: false }): Promise<Tree> {
  const span = trace.getTracer("ano-file").startSpan("Repository.files");
  span.setAttribute("repoId", this.repoId);
  try {
    const treeNotLoaded = !this._model.originalFiles;
    if (treeNotLoaded && !opt.force) {
      const stored = await AnonymizedRepositoryModel.findById(
        this._model._id,
        {
          originalFiles: 1,
        }
      );
      if (!stored) throw new AnonymousError("repository_not_found");
      this.model.originalFiles = stored.originalFiles;
    }

    if (
      this._model.originalFiles &&
      Object.getOwnPropertyNames(this._model.originalFiles).length !== 0 &&
      !opt.force
    ) {
      return this._model.originalFiles;
    }

    const fetched = await this.source.getFiles();
    this._model.originalFiles = fetched;
    this._model.size = { storage: 0, file: 0 };
    await this.computeSize();
    return fetched;
  } finally {
    span.end();
  }
}
/**
 * Check the status of the repository
 *
 * Triggers the expiration when the expiration date has passed, then throws
 * if the repository cannot be served.
 *
 * @throws AnonymousError `repository_expired` (410) when the repository is
 * expired/expiring/removing/removed, `repository_not_ready` when it is
 * preparing or has been downloading for less than five minutes
 */
check() {
  if (
    this._model.options.expirationMode !== "never" &&
    this.status == RepositoryStatus.READY &&
    this._model.options.expirationDate
  ) {
    if (this._model.options.expirationDate <= new Date()) {
      // expire() runs in the background; its failure must not turn into an
      // unhandled rejection
      this.expire().catch((error) => {
        console.error(
          `[ERROR] Unable to expire ${this._model.repoId}`,
          error
        );
      });
    }
  }
  if (
    this.status == RepositoryStatus.EXPIRED ||
    this.status == RepositoryStatus.EXPIRING ||
    this.status == RepositoryStatus.REMOVING ||
    this.status == RepositoryStatus.REMOVED
  ) {
    throw new AnonymousError("repository_expired", {
      object: this,
      httpStatus: 410,
    });
  }
  const fiveMinuteAgo = new Date();
  fiveMinuteAgo.setMinutes(fiveMinuteAgo.getMinutes() - 5);
  if (
    this.status == RepositoryStatus.PREPARING ||
    (this.status == RepositoryStatus.DOWNLOAD &&
      this._model.statusDate > fiveMinuteAgo)
  ) {
    throw new AnonymousError("repository_not_ready", {
      object: this,
    });
  }
}
/**
 * Compress and anonymize the repository
 *
 * Every file of the archive goes through the anonymize transformer built
 * for its name.
 *
 * @returns A stream of anonymized repository compressed
 */
zip(): Promise<Readable> {
  const fileTransformer = (filename: string) => {
    return this.generateAnonymizeTransformer(filename);
  };
  return storage.archive(this.repoId, "", {
    format: "zip",
    fileTransformer,
  });
}
/**
 * Create the transformer that anonymizes the content of a file, configured
 * from the options and the source of the repository. The branch name
 * defaults to "main".
 *
 * @param filePath path of the file to anonymize
 */
generateAnonymizeTransformer(filePath: string) {
  const { terms, image, link } = this.options;
  const source = this.model.source;
  return new AnonymizeTransformer({
    filePath,
    terms,
    image,
    link,
    repoId: this.repoId,
    repoName: source.repositoryName,
    branchName: source.branch || "main",
  });
}
/**
 * Update the repository if a new commit exists
 *
 * The check runs when forced, or when automatic updates are enabled and the
 * last view is more than one day old. Only GitHub sources can be updated.
 * When the branch head differs from the stored commit (or the repository is
 * not ready), the state is reset and a download job is queued.
 *
 * The missing-branch check now happens before the commit lookup, so a
 * deleted branch reports `branch_not_found` instead of failing inside the
 * commit request, and the stored commit is left untouched in that case. The
 * span is ended on every exit path.
 *
 * @param opt `force` bypasses the freshness check
 * @returns void
 * @throws AnonymousError `branch_not_found` when the branch has no commit
 */
async updateIfNeeded(opt?: { force: boolean }): Promise<void> {
  const span = trace
    .getTracer("ano-file")
    .startSpan("Repository.updateIfNeeded");
  span.setAttribute("repoId", this.repoId);
  try {
    const yesterday = new Date();
    yesterday.setDate(yesterday.getDate() - 1);
    const shouldCheck =
      opt?.force ||
      (this._model.options.update && this._model.lastView < yesterday);
    if (!shouldCheck) {
      return;
    }
    // Only GitHubBase can be update for the moment
    if (!(this.source instanceof GitHubBase)) {
      return;
    }

    const token = await this.getToken();
    const ghRepo = new GitHubRepository({
      name: this.model.source.repositoryName,
    });
    const branches = await ghRepo.branches({
      force: true,
      accessToken: token,
    });
    const branchName = this.model.source.branch || "main";
    const newCommit = branches.filter((f) => f.name == branchName)[0]?.commit;

    if (
      this.model.source.commit == newCommit &&
      this.status == RepositoryStatus.READY
    ) {
      console.log(`[UPDATE] ${this._model.repoId} is up to date`);
      span.setAttribute("status", "up_to_date");
      return;
    }

    if (!newCommit) {
      console.error(
        `${branchName} for ${this.model.source.repositoryName} is not found`
      );
      await this.updateStatus(RepositoryStatus.ERROR, "branch_not_found");
      await this.resetSate();
      span.setAttribute("status", "branch_not_found");
      throw new AnonymousError("branch_not_found", {
        object: this,
      });
    }

    this._model.source.commit = newCommit;
    const commitInfo = await ghRepo.getCommitInfo(newCommit, {
      accessToken: token,
    });
    const commitDate =
      commitInfo.commit?.author?.date || commitInfo.commit?.committer?.date;
    if (commitDate) {
      this._model.source.commitDate = new Date(commitDate as string);
    }
    this._model.anonymizeDate = new Date();
    console.log(
      `[UPDATE] ${this._model.repoId} will be updated to ${newCommit}`
    );
    await this.resetSate(RepositoryStatus.PREPARING);
    await downloadQueue.add(this.repoId, this, {
      jobId: this.repoId,
      attempts: 3,
    });
  } finally {
    span.end();
  }
}
/**
 * Download the state required for the repository to work. Nothing is done
 * when the repository is already ready.
 *
 * The span is ended even when loading the files fails.
 *
 * @returns void
 */
async anonymize() {
  const span = trace.getTracer("ano-file").startSpan("Repository.anonymize");
  span.setAttribute("repoId", this.repoId);
  try {
    if (this.status === RepositoryStatus.READY) {
      return;
    }
    await this.updateStatus(RepositoryStatus.PREPARING);
    await this.files();
    await this.updateStatus(RepositoryStatus.READY);
  } finally {
    span.end();
  }
}
/**
 * Record a view: set the last view date to now and increment the view
 * counter. The model is only saved when the database is connected;
 * otherwise the in-memory model is returned.
 */
async countView() {
  const span = trace.getTracer("ano-file").startSpan("Repository.countView");
  span.setAttribute("repoId", this.repoId);
  try {
    const previousViews = this._model.pageView || 0;
    this._model.lastView = new Date();
    this._model.pageView = previousViews + 1;
    if (!isConnected) {
      return this.model;
    }
    await this._model.save();
  } finally {
    span.end();
  }
}
/**
 * Update the status of the repository
 *
 * A falsy status leaves the model untouched. The model is only saved when
 * the database is connected; in both early-exit cases the in-memory model
 * is returned.
 *
 * @param status the new status
 * @param statusMessage a potential message to store with the status
 */
async updateStatus(status: RepositoryStatus, statusMessage?: string) {
  const tracer = trace.getTracer("ano-file");
  const span = tracer.startSpan("Repository.updateStatus");
  span.setAttribute("repoId", this.repoId);
  span.setAttribute("status", status);
  span.setAttribute("statusMessage", statusMessage || "");
  try {
    if (!status) {
      return this.model;
    }
    this._model.status = status;
    this._model.statusDate = new Date();
    this._model.statusMessage = statusMessage;
    if (!isConnected) {
      return this.model;
    }
    await this._model.save();
  } finally {
    span.end();
  }
}
/**
 * Expire the repository: mark it as expiring, reset its state, then mark it
 * as expired. The span is ended even when one of the steps fails.
 */
async expire() {
  const span = trace.getTracer("ano-file").startSpan("Repository.expire");
  span.setAttribute("repoId", this.repoId);
  try {
    await this.updateStatus(RepositoryStatus.EXPIRING);
    await this.resetSate();
    await this.updateStatus(RepositoryStatus.EXPIRED);
  } finally {
    span.end();
  }
}
/**
 * Remove the repository.
 *
 * The status goes through REMOVING while the state is reset and ends on
 * REMOVED.
 */
async remove() {
  const span = trace.getTracer("ano-file").startSpan("Repository.remove");
  span.setAttribute("repoId", this.repoId);
  try {
    await this.updateStatus(RepositoryStatus.REMOVING);
    await this.resetSate();
    await this.updateStatus(RepositoryStatus.REMOVED);
  } finally {
    // close the span even when one of the steps above rejects
    span.end();
  }
}
/**
 * Reset/delete the state of the repository.
 *
 * The size and the original file list are cleared on the model, the optional
 * new status is stored, and the cache is removed.
 *
 * @param status optional status to store before the cache is removed
 * @param statusMessage optional message stored with the status
 */
async resetSate(status?: RepositoryStatus, statusMessage?: string) {
  const span = trace.getTracer("ano-file").startSpan("Repository.resetState");
  span.setAttribute("repoId", this.repoId);
  try {
    // remove attribute
    this._model.size = { storage: 0, file: 0 };
    this._model.originalFiles = undefined;
    if (status) {
      await this.updateStatus(status, statusMessage);
    }
    // remove cache
    await this.removeCache();
    console.log(`[RESET] ${this._model.repoId} has been reset`);
  } finally {
    // close the span even when the status update or cache removal rejects
    span.end();
  }
}
/**
 * Remove the cached files of the repository from the storage.
 *
 * Whether or not the removal succeeds, the model is flagged as reset
 * (`isReseted`) and saved afterwards.
 *
 * @returns the result of `storage.rm`
 */
async removeCache() {
  const span = trace
    .getTracer("ano-file")
    .startSpan("Repository.removeCache");
  span.setAttribute("repoId", this.repoId);
  try {
    // await here so that the model is flagged and the span is closed only
    // after the removal has settled
    return await storage.rm(this.repoId);
  } finally {
    try {
      this.model.isReseted = true;
      await this.model.save();
    } finally {
      // close the span even when saving the model rejects
      span.end();
    }
  }
}
/**
 * Compute the size of the repository: cumulated file size and number of files.
 *
 * A repository that is not READY reports zero for both values. A previously
 * computed result (non-zero file count on the model) is returned as is;
 * otherwise the file tree is walked, and the result is stored on the model
 * and saved.
 *
 * @returns the storage size (presumably in bytes — TODO confirm the unit of
 * the `size` values in the tree) and the number of files
 */
async computeSize(): Promise<{
  /**
   * Cumulated `size` of the files of the repository
   */
  storage: number;
  /**
   * The number of files
   */
  file: number;
}> {
  const span = trace
    .getTracer("ano-file")
    .startSpan("Repository.computeSize");
  span.setAttribute("repoId", this.repoId);

  // Walk the tree: an entry with a numeric-looking `size` is counted as a
  // file, any other object entry is treated as a sub-tree.
  // NOTE(review): a `size` of 0 fails the truthiness check below, so empty
  // files are not counted — confirm whether this is intended.
  const recursiveCount = (
    files: Tree
  ): { storage: number; file: number } => {
    const out = { storage: 0, file: 0 };
    for (const name in files) {
      const file = files[name];
      if (file.size && parseInt(file.size.toString()) == file.size) {
        out.storage += file.size as number;
        out.file++;
      } else if (typeof file == "object") {
        const r = recursiveCount(file as Tree);
        out.storage += r.storage;
        out.file += r.file;
      }
    }
    return out;
  };

  try {
    if (this.status !== RepositoryStatus.READY)
      return { storage: 0, file: 0 };
    // reuse the stored value when it has already been computed
    if (this._model.size.file) return this._model.size;
    const files = await this.files();
    this._model.size = recursiveCount(files);
    await this._model.save();
    return this._model.size;
  } finally {
    span.end();
  }
}
/**
 * Look up the conference attached to the repository.
 *
 * @returns the conference, or null when the repository has no conference id
 * or when no conference with that id is found
 */
async conference(): Promise<Conference | null> {
  const tracer = trace.getTracer("ano-file");
  const span = tracer.startSpan("Repository.conference");
  span.setAttribute("repoId", this.repoId);
  try {
    const conferenceID = this._model.conference;
    if (!conferenceID) {
      return null;
    }
    const found = await ConferenceModel.findOne({ conferenceID });
    return found ? new Conference(found) : null;
  } finally {
    span.end();
  }
}
/***** Getters ********/
/** Identifier of the anonymized repository, read from the model. */
get repoId() {
return this._model.repoId;
}
/** Anonymization options stored on the model. */
get options() {
return this._model.options;
}
/** The underlying model document. */
get model() {
return this._model;
}
/** Current status stored on the model. */
get status() {
return this._model.status;
}
/**
 * Size stored on the model; zero for both values while the repository is not
 * READY.
 */
get size() {
if (this.status != RepositoryStatus.READY) return { storage: 0, file: 0 };
return this._model.size;
}
/**
 * Build the plain object used to serialize the repository.
 *
 * Only a selection of the model fields is exposed; `status` and `size` go
 * through the getters of this class.
 */
toJSON() {
  const document = this._model;
  const { repositoryName, commit, branch, type } = this.model.source;
  return {
    repoId: document.repoId,
    options: document.options,
    conference: document.conference,
    anonymizeDate: document.anonymizeDate,
    status: this.status,
    statusMessage: document.statusMessage,
    lastView: document.lastView,
    pageView: document.pageView,
    size: this.size,
    source: {
      fullName: repositoryName,
      commit,
      branch,
      type,
    },
  };
}
}

194
src/core/User.ts Normal file
View File

@@ -0,0 +1,194 @@
import { trace } from "@opentelemetry/api";
import AnonymizedRepositoryModel from "./model/anonymizedRepositories/anonymizedRepositories.model";
import RepositoryModel from "./model/repositories/repositories.model";
import { IUserDocument } from "./model/users/users.types";
import Repository from "./Repository";
import { GitHubRepository } from "./source/GitHubRepository";
import PullRequest from "./PullRequest";
import AnonymizedPullRequestModel from "./model/anonymizedPullRequests/anonymizedPullRequests.model";
import { octokit } from "./GitHubUtils";
/**
 * Wrapper around a user document.
 *
 * Exposes the main fields of the model and the lists of GitHub repositories,
 * anonymized repositories and anonymized pull requests owned by the user.
 */
export default class User {
  private _model: IUserDocument;

  constructor(model: IUserDocument) {
    this._model = model;
  }

  /** Identifier of the user document. */
  get id(): string {
    return this._model.id;
  }

  get username(): string {
    return this._model.username;
  }

  /** True only when the model flags the user as administrator. */
  get isAdmin(): boolean {
    return !!this._model.isAdmin;
  }

  /** GitHub access token stored on the model. */
  get accessToken(): string {
    return this._model.accessTokens.github;
  }

  get photo(): string | undefined {
    return this._model.photo;
  }

  /** Default anonymization settings of the user. */
  get default() {
    return this._model.default;
  }

  set default(d) {
    this._model.default = d;
  }

  /**
   * Get the GitHub repositories of the user.
   *
   * The list is fetched from GitHub when the model holds no repository or when
   * `opt.force` is true; otherwise the repositories referenced by the model
   * are loaded from the database.
   *
   * @param opt options
   * @returns the list of github repositories
   */
  async getGitHubRepositories(opt?: {
    /**
     * Get the repository from GitHub
     */
    force: boolean;
  }): Promise<GitHubRepository[]> {
    const span = trace
      .getTracer("ano-file")
      .startSpan("User.getGitHubRepositories");
    span.setAttribute("username", this.username);
    try {
      if (
        !this._model.repositories ||
        this._model.repositories.length == 0 ||
        opt?.force === true
      ) {
        // get the list of repo from github
        const oct = octokit(this.accessToken);
        const repositories = (
          await oct.paginate("GET /user/repos", {
            visibility: "all",
            sort: "pushed",
            per_page: 100,
          })
        ).map((r) => {
          return new RepositoryModel({
            externalId: "gh_" + r.id,
            name: r.full_name,
            url: r.html_url,
            size: r.size,
            defaultBranch: r.default_branch,
          });
        });
        const externalIds = repositories.map((repo) => repo.externalId);

        // find the repositories that are already in the database
        const alreadyStored = new Set(
          (
            await RepositoryModel.find({
              externalId: { $in: externalIds },
            }).select("externalId")
          ).map((m) => m.externalId)
        );

        // save all the new repositories
        await Promise.all(
          repositories
            .filter((r) => !alreadyStored.has(r.externalId))
            .map((r) => r.save())
        );

        // keep only the ids of the repositories in the user model
        this._model.repositories = (
          await RepositoryModel.find({
            externalId: { $in: externalIds },
          }).select("id")
        ).map((m) => m.id);

        // save the model
        await this._model.save();
        return repositories.map((r) => new GitHubRepository(r));
      } else {
        return (
          await RepositoryModel.find({ _id: { $in: this._model.repositories } })
        ).map((i) => new GitHubRepository(i));
      }
    } finally {
      // close the span even when GitHub or the database rejects
      span.end();
    }
  }

  /**
   * Get the list of anonymized repositories owned by the user.
   *
   * Repositories that are ready and whose expiration date has passed are
   * expired before the list is returned.
   *
   * @returns the list of anonymized repositories
   */
  async getRepositories() {
    const span = trace.getTracer("ano-file").startSpan("User.getRepositories");
    span.setAttribute("username", this.username);
    try {
      const repositories = (
        await AnonymizedRepositoryModel.find(
          {
            owner: this.id,
          },
          {
            // the file list is excluded from the query result
            originalFiles: 0,
          }
        ).exec()
      ).map((d) => new Repository(d));

      const now = new Date();
      await Promise.all(
        repositories
          .filter(
            (repo) =>
              repo.status == "ready" &&
              repo.options.expirationMode != "never" &&
              repo.options.expirationDate != null &&
              repo.options.expirationDate < now
          )
          // expire the repository
          .map((repo) => repo.expire())
      );
      return repositories;
    } finally {
      span.end();
    }
  }

  /**
   * Get the list of anonymized pull requests owned by the user.
   *
   * Pull requests that are ready and whose expiration date has passed are
   * expired before the list is returned.
   *
   * @returns the list of anonymized pull requests
   */
  async getPullRequests() {
    const span = trace.getTracer("ano-file").startSpan("User.getPullRequests");
    span.setAttribute("username", this.username);
    try {
      const pullRequests = (
        await AnonymizedPullRequestModel.find({
          owner: this.id,
        }).exec()
      ).map((d) => new PullRequest(d));

      const now = new Date();
      await Promise.all(
        pullRequests
          .filter(
            (pullRequest) =>
              pullRequest.status == "ready" &&
              pullRequest.options.expirationMode != "never" &&
              pullRequest.options.expirationDate != null &&
              pullRequest.options.expirationDate < now
          )
          // expire the pull request
          .map((pullRequest) => pullRequest.expire())
      );
      return pullRequests;
    } finally {
      span.end();
    }
  }

  /** The underlying user document. */
  get model() {
    return this._model;
  }

  toJSON() {
    return this._model.toJSON();
  }
}

226
src/core/anonymize-utils.ts Normal file
View File

@@ -0,0 +1,226 @@
import { basename } from "path";
import { Transform, Readable } from "stream";
import { isText } from "istextorbinary";
import { trace } from "@opentelemetry/api";
import config from "../config";
// Matches http(s), ftp and file URLs, optionally wrapped in angle brackets.
// The pattern carries the global flag; in the visible code it is only passed
// to `String.prototype.replace`, so its `lastIndex` state is never relied on.
const urlRegex =
/<?\b((https?|ftp|file):\/\/)[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]\b\/?>?/g;
/**
 * Read a stream until its end and decode the whole content as UTF-8.
 *
 * The chunks are concatenated before decoding, so a multi-byte character
 * split across two chunks is decoded correctly.
 *
 * @param stream the stream to consume
 * @returns the content of the stream; rejects with the stream error
 */
export function streamToString(stream: Readable): Promise<string> {
  const received: Buffer[] = [];
  return new Promise<string>((resolve, reject) => {
    const collect = (piece: any) => {
      received.push(Buffer.from(piece));
    };
    const finish = () => {
      const whole = Buffer.concat(received);
      resolve(whole.toString("utf8"));
    };
    stream
      .on("data", collect)
      .on("error", (failure) => reject(failure))
      .on("end", finish);
  });
}
/**
 * Tell whether a file should be handled as text.
 *
 * The check is done in three steps: the extension is looked up in
 * `config.additionalExtensions`, then the file name alone is given to
 * `isText`, and finally the file name together with the optional content.
 *
 * @param filePath path of the file; only its base name is used
 * @param content optional content handed to `isText` as a last resort
 * @returns true for a text file, otherwise the result of the last `isText` call
 */
export function isTextFile(filePath: string, content?: Buffer) {
  const filename = basename(filePath);
  // text after the last dot, or the whole name when there is no dot
  const extension = filename
    .slice(filename.lastIndexOf(".") + 1)
    .toLowerCase();
  return (
    config.additionalExtensions.includes(extension) ||
    isText(filename) ||
    isText(filename, content)
  );
}
/**
 * Transform stream that anonymizes the chunks of a text file.
 *
 * Each chunk is anonymized on its own with a shared `ContentAnonimizer`;
 * chunks of files that are not detected as text pass through unchanged.
 * A "transform" event is emitted for every chunk with the text flag, the
 * anonymization flag and the chunk that is pushed.
 *
 * NOTE(review): since chunks are processed independently, a term or a
 * multi-byte character split across two chunks is presumably not handled —
 * confirm whether upstream guarantees whole-file chunks.
 */
export class AnonymizeTransformer extends Transform {
  public isText: boolean | null = null;
  anonimizer: ContentAnonimizer;

  constructor(
    readonly opt: {
      filePath: string;
    } & ConstructorParameters<typeof ContentAnonimizer>[0]
  ) {
    super();
    this.isText = isTextFile(this.opt.filePath);
    this.anonimizer = new ContentAnonimizer(this.opt);
  }

  /** True once the anonymizer has replaced something in any chunk. */
  get wasAnonimized() {
    return this.anonimizer.wasAnonymized;
  }

  /**
   * Anonymize one chunk and push it downstream.
   *
   * @param chunk the data to transform
   * @param encoding unused
   * @param callback called once the chunk is handled, with the error if the
   * transformation failed
   */
  _transform(
    chunk: Buffer,
    encoding: string,
    callback: (error?: Error | null) => void
  ) {
    trace
      .getTracer("ano-file")
      .startActiveSpan("AnonymizeTransformer.transform", (span) => {
        let failure: Error | null = null;
        try {
          span.setAttribute("path", this.opt.filePath);
          if (this.isText === null) {
            // the file name was not enough, decide from the content
            this.isText = isTextFile(this.opt.filePath, chunk);
          }
          if (this.isText) {
            const content = this.anonimizer.anonymize(chunk.toString());
            // the original bytes are kept as long as nothing was replaced
            if (this.anonimizer.wasAnonymized) {
              chunk = Buffer.from(content);
            }
          }
          this.emit("transform", {
            isText: this.isText,
            wasAnonimized: this.wasAnonimized,
            chunk,
          });
          this.push(chunk);
        } catch (error) {
          failure = error instanceof Error ? error : new Error(String(error));
          span.recordException(failure);
        } finally {
          span.end();
        }
        // always signal completion, otherwise the stream would hang on failure
        callback(failure);
      });
  }
}
/**
 * Anonymize text content according to the given options.
 *
 * `wasAnonymized` is switched to true as soon as one replacement is made and
 * is never reset by this class.
 */
export class ContentAnonimizer {
  public wasAnonymized = false;

  constructor(
    readonly opt: {
      image?: boolean;
      link?: boolean;
      terms?: string[];
      repoName?: string;
      branchName?: string;
      repoId?: string;
    }
  ) {}

  /** Escape the regular expression metacharacters of `text`. */
  private static escapeRegExp(text: string): string {
    return text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  }

  /** Mask markdown images; active only when the `image` option is false. */
  private removeImage(content: string): string {
    if (this.opt.image !== false) {
      return content;
    }
    // remove image in markdown
    return content.replace(
      /!\[[^\]]*\]\((?<filename>.*?)(?=\"|\))(?<optionalpart>\".*\")?\)/g,
      () => {
        this.wasAnonymized = true;
        return config.ANONYMIZATION_MASK;
      }
    );
  }

  /** Mask every URL; active only when the `link` option is false. */
  private removeLink(content: string): string {
    if (this.opt.link !== false) {
      return content;
    }
    // remove the links
    return content.replace(urlRegex, () => {
      this.wasAnonymized = true;
      return config.ANONYMIZATION_MASK;
    });
  }

  /**
   * Replace the GitHub links that point to the repository itself by a link to
   * the anonymized repository. Requires both `repoName` and `branchName`.
   */
  private replaceGitHubSelfLinks(content: string): string {
    if (!this.opt.repoName || !this.opt.branchName) {
      return content;
    }
    // the names are matched literally: a "." in a repository name must not
    // match any character and a special character must not break the pattern
    const repoName = ContentAnonimizer.escapeRegExp(this.opt.repoName);
    const branchName = ContentAnonimizer.escapeRegExp(this.opt.branchName);

    const replaceCallback = () => {
      this.wasAnonymized = true;
      return `https://${config.APP_HOSTNAME}/r/${this.opt.repoId}`;
    };

    // the branch specific links come first, the bare repository link last
    const patterns = [
      `https://raw\\.githubusercontent\\.com/${repoName}/${branchName}\\b`,
      `https://github\\.com/${repoName}/blob/${branchName}\\b`,
      `https://github\\.com/${repoName}/tree/${branchName}\\b`,
      `https://github\\.com/${repoName}`,
    ];
    for (const pattern of patterns) {
      content = content.replace(new RegExp(pattern, "gi"), replaceCallback);
    }
    return content;
  }

  /**
   * Replace every term by the mask followed by the 1-based term position.
   *
   * A term that is a valid regular expression is used as such; otherwise its
   * special characters are escaped. A URL containing a term is masked as a
   * whole.
   */
  private replaceTerms(content: string): string {
    const terms = this.opt.terms || [];
    for (let i = 0; i < terms.length; i++) {
      let term = terms[i];
      if (term.trim() == "") {
        continue;
      }
      const mask = config.ANONYMIZATION_MASK + "-" + (i + 1);
      try {
        new RegExp(term, "gi");
      } catch {
        // escape regex characters
        term = term.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&");
      }
      const wholeTerm = `\\b${term}\\b`;
      // no global flag here: `test` must not keep state between URLs
      const termInUrl = new RegExp(wholeTerm, "i");
      const termInText = new RegExp(wholeTerm, "gi");

      // remove whole url if it contains the term
      content = content.replace(urlRegex, (match) => {
        if (termInUrl.test(match)) {
          this.wasAnonymized = true;
          return mask;
        }
        return match;
      });

      // remove the term in the text
      content = content.replace(termInText, () => {
        this.wasAnonymized = true;
        return mask;
      });
    }
    return content;
  }

  /**
   * Apply all the anonymization steps: images, links, self links, terms.
   *
   * @param content the text to anonymize
   * @returns the anonymized text
   */
  anonymize(content: string) {
    const span = trace
      .getTracer("ano-file")
      .startSpan("ContentAnonimizer.anonymize");
    try {
      content = this.removeImage(content);
      span.addEvent("removeImage");
      content = this.removeLink(content);
      span.addEvent("removeLink");
      content = this.replaceGitHubSelfLinks(content);
      span.addEvent("replaceGitHubSelfLinks");
      content = this.replaceTerms(content);
      span.addEvent("replaceTerms");
      return content;
    } finally {
      span.end();
    }
  }
}
/**
 * Anonymize a path by replacing every term by the mask followed by the
 * 1-based position of the term.
 *
 * A term that is a valid regular expression is used as such; otherwise its
 * special characters are escaped. Blank terms are ignored.
 *
 * @param path the path to anonymize
 * @param terms the terms to hide
 * @returns the anonymized path
 */
export function anonymizePath(path: string, terms: string[]) {
  // keep the term when it compiles as a regex, escape it otherwise
  const toPattern = (rawTerm: string): string => {
    try {
      new RegExp(rawTerm, "gi");
      return rawTerm;
    } catch {
      // escape regex characters
      return rawTerm.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&");
    }
  };

  const tracer = trace.getTracer("ano-file");
  return tracer.startActiveSpan("utils.anonymizePath", (span) => {
    span.setAttribute("path", path);
    terms.forEach((rawTerm, index) => {
      if (rawTerm.trim() == "") {
        return;
      }
      const matcher = new RegExp(toPattern(rawTerm), "gi");
      path = path.replace(
        matcher,
        config.ANONYMIZATION_MASK + "-" + (index + 1)
      );
    });
    span.setAttribute("return", path);
    span.end();
    return path;
  });
}

View File

@@ -0,0 +1,14 @@
import { model } from "mongoose";
import AnonymizedPullRequestSchema from "./anonymizedPullRequests.schema";
import {
IAnonymizedPullRequestDocument,
IAnonymizedPullRequestModel,
} from "./anonymizedPullRequests.types";
// Mongoose model registered under the name "AnonymizedPullRequest" and built
// from AnonymizedPullRequestSchema.
const AnonymizedPullRequestModel = model<IAnonymizedPullRequestDocument>(
"AnonymizedPullRequest",
AnonymizedPullRequestSchema
) as IAnonymizedPullRequestModel;
export default AnonymizedPullRequestModel;

View File

@@ -0,0 +1,66 @@
import { Schema } from "mongoose";
/**
 * Mongoose schema of an anonymized pull request.
 */
const AnonymizedPullRequestSchema = new Schema({
  pullRequestId: {
    type: String,
    index: { unique: true },
  },
  status: {
    type: String,
    default: "preparing",
  },
  statusDate: Date,
  statusMessage: String,
  anonymizeDate: Date,
  lastView: Date,
  pageView: Number,
  owner: Schema.Types.ObjectId,
  conference: String,
  source: {
    pullRequestId: Number,
    repositoryFullName: String,
    accessToken: String,
  },
  options: {
    terms: [String],
    expirationMode: { type: String },
    expirationDate: Date,
    update: Boolean,
    image: Boolean,
    link: Boolean,
    title: Boolean,
    body: Boolean,
    comments: Boolean,
    diff: Boolean,
    origin: Boolean,
    username: Boolean,
    date: Boolean,
  },
  dateOfEntry: {
    type: Date,
    // a function default is evaluated for every new document; `new Date()`
    // would freeze the date at the time this module is loaded
    default: Date.now,
  },
  pullRequest: {
    diff: String,
    title: String,
    body: String,
    creationDate: Date,
    updatedDate: Date,
    draft: Boolean,
    merged: Boolean,
    mergedDate: Date,
    state: String,
    baseRepositoryFullName: String,
    headRepositoryFullName: String,
    comments: [
      {
        body: String,
        creationDate: Date,
        updatedDate: Date,
        author: String,
      },
    ],
  },
});
export default AnonymizedPullRequestSchema;

View File

@@ -0,0 +1,61 @@
import { Document, Model } from "mongoose";
import { RepositoryStatus } from "../../types";
/**
 * Fields of an anonymized pull request.
 */
export interface IAnonymizedPullRequest {
pullRequestId: string;
status?: RepositoryStatus;
statusMessage?: string;
statusDate: Date;
anonymizeDate: Date;
// Location of the original pull request.
source: {
pullRequestId: number;
repositoryFullName: string;
accessToken?: string;
};
owner: string;
conference: string;
// Anonymization settings.
// NOTE(review): the boolean flags presumably select which parts are shown or
// anonymized — confirm their exact meaning against the code that reads them.
options: {
terms: string[];
expirationMode: "never" | "redirect" | "remove";
expirationDate?: Date;
update: boolean;
image: boolean;
link: boolean;
title: boolean;
body: boolean;
comments: boolean;
diff: boolean;
origin: boolean;
username: boolean;
date: boolean;
};
pageView: number;
lastView: Date;
// Content of the pull request.
pullRequest: {
diff: string;
title: string;
body: string;
creationDate: Date;
updatedDate: Date;
draft?: boolean;
merged?: boolean;
mergedDate?: Date;
state?: string;
baseRepositoryFullName?: string;
headRepositoryFullName?: string;
comments?: {
body: string;
creationDate: Date;
updatedDate: Date;
author: string;
}[];
};
}
/**
 * Mongoose document carrying the anonymized pull request fields.
 */
export interface IAnonymizedPullRequestDocument
extends IAnonymizedPullRequest,
Document {
// NOTE(review): no implementation of this method is visible here — confirm
// that the schema actually defines it.
setLastUpdated: (this: IAnonymizedPullRequestDocument) => Promise<void>;
}
/** Mongoose model type of the anonymized pull request documents. */
export interface IAnonymizedPullRequestModel
extends Model<IAnonymizedPullRequestDocument> {}

View File

@@ -0,0 +1,14 @@
import { model } from "mongoose";
import {
IAnonymizedRepositoryDocument,
IAnonymizedRepositoryModel,
} from "./anonymizedRepositories.types";
import AnonymizedRepositorySchema from "./anonymizedRepositories.schema";
// Mongoose model registered under the name "AnonymizedRepository" and built
// from AnonymizedRepositorySchema.
const AnonymizedRepositoryModel = model<IAnonymizedRepositoryDocument>(
"AnonymizedRepository",
AnonymizedRepositorySchema
) as IAnonymizedRepositoryModel;
export default AnonymizedRepositoryModel;

View File

@@ -0,0 +1,73 @@
import { Schema } from "mongoose";
/**
 * Mongoose schema of an anonymized repository.
 */
const AnonymizedRepositorySchema = new Schema({
  repoId: {
    type: String,
    // unique index with a strength-2 "en" collation
    index: { unique: true, collation: { locale: "en", strength: 2 } },
  },
  status: {
    type: String,
    default: "preparing",
  },
  statusDate: Date,
  statusMessage: String,
  anonymizeDate: Date,
  lastView: Date,
  pageView: Number,
  accessToken: String,
  owner: {
    type: Schema.Types.ObjectId,
    ref: "user",
    index: true,
  },
  conference: String,
  source: {
    type: { type: String },
    branch: String,
    commit: String,
    commitDate: Date,
    repositoryId: String,
    repositoryName: String,
    accessToken: String,
  },
  truckedFileList: {
    type: Boolean,
    default: false,
  },
  originalFiles: Schema.Types.Mixed,
  options: {
    terms: [String],
    expirationMode: { type: String },
    expirationDate: Date,
    update: Boolean,
    image: Boolean,
    pdf: Boolean,
    notebook: Boolean,
    link: Boolean,
    page: Boolean,
    pageSource: {
      branch: String,
      path: String,
    },
  },
  dateOfEntry: {
    type: Date,
    // a function default is evaluated for every new document; `new Date()`
    // would freeze the date at the time this module is loaded
    default: Date.now,
  },
  size: {
    storage: {
      type: Number,
      default: 0,
    },
    file: {
      type: Number,
      default: 0,
    },
  },
  isReseted: {
    type: Boolean,
    default: false,
  },
});
export default AnonymizedRepositorySchema;

View File

@@ -0,0 +1,53 @@
import { Document, Model } from "mongoose";
import { RepositoryStatus, Tree } from "../../types";
/**
 * Fields of an anonymized repository.
 */
export interface IAnonymizedRepository {
repoId: string;
status?: RepositoryStatus;
statusMessage?: string;
statusDate: Date;
anonymizeDate: Date;
// Origin of the repository content.
source: {
type: "GitHubDownload" | "GitHubStream" | "Zip";
branch?: string;
commit?: string;
commitDate?: Date,
repositoryId?: string;
repositoryName?: string;
accessToken?: string;
};
owner: string;
// NOTE(review): presumably flags a truncated file list — confirm.
truckedFileList: boolean;
// File tree of the original repository.
originalFiles?: Tree;
conference: string;
// Anonymization settings.
options: {
terms: string[];
expirationMode: "never" | "redirect" | "remove";
expirationDate?: Date;
update: boolean;
image: boolean;
pdf: boolean;
notebook: boolean;
link: boolean;
page: boolean;
pageSource?: {
branch: string;
path: string;
};
};
pageView: number;
lastView: Date;
// Cumulated file size and number of files.
size: {
storage: number;
file: number;
};
isReseted: boolean;
}
/**
 * Mongoose document carrying the anonymized repository fields.
 */
export interface IAnonymizedRepositoryDocument
extends IAnonymizedRepository,
Document {
// NOTE(review): no implementation of this method is visible here — confirm
// that the schema actually defines it.
setLastUpdated: (this: IAnonymizedRepositoryDocument) => Promise<void>;
}
/** Mongoose model type of the anonymized repository documents. */
export interface IAnonymizedRepositoryModel
extends Model<IAnonymizedRepositoryDocument> {}

View File

@@ -0,0 +1,11 @@
import { model } from "mongoose";
import { IConferenceDocument, IConferenceModel } from "./conferences.types";
import ConferenceSchema from "./conferences.schema";
// Mongoose model registered under the name "Conference" and built from
// ConferenceSchema.
const ConferenceModel = model<IConferenceDocument>(
"Conference",
ConferenceSchema
) as IConferenceModel;
export default ConferenceModel;

View File

@@ -0,0 +1,58 @@
import { Schema } from "mongoose";
/**
 * Mongoose schema of a conference.
 */
const ConferenceSchema = new Schema({
  name: String,
  conferenceID: {
    type: String,
    index: { unique: true },
  },
  url: String,
  startDate: Date,
  endDate: Date,
  status: String,
  owners: { type: [Schema.Types.ObjectId] },
  repositories: {
    type: [
      {
        id: { type: Schema.Types.ObjectId },
        addDate: { type: Date },
        removeDate: { type: Date },
      },
    ],
  },
  options: {
    expirationMode: String,
    expirationDate: Date,
    update: Boolean,
    image: Boolean,
    pdf: Boolean,
    notebook: Boolean,
    link: Boolean,
    page: Boolean,
  },
  dateOfEntry: {
    type: Date,
    // a function default is evaluated for every new document; `new Date()`
    // would freeze the date at the time this module is loaded
    default: Date.now,
  },
  plan: {
    planID: String,
    pricePerRepository: Number,
    quota: {
      repository: Number,
      size: Number,
      file: Number,
    },
  },
  billing: {
    name: String,
    email: String,
    address: String,
    address2: String,
    city: String,
    zip: String,
    country: String,
    vat: String,
  },
});
export default ConferenceSchema;

View File

@@ -0,0 +1,49 @@
import { Document, Model } from "mongoose";
import { ConferenceStatus } from "../../types";
/**
 * Fields of a conference.
 */
export interface IConference {
name: string;
conferenceID: string;
startDate: Date;
endDate: Date;
url: string;
status: ConferenceStatus;
owners: string[];
// Repositories attached to the conference, with the dates at which they were
// added and, optionally, removed.
repositories: {
id: string;
addDate: Date;
removeDate?: Date;
}[];
// Anonymization settings.
// NOTE(review): presumably applied to the repositories of the conference —
// confirm against the code that reads them.
options: {
expirationMode: "never" | "redirect" | "remove";
expirationDate?: Date;
update: boolean;
image: boolean;
pdf: boolean;
notebook: boolean;
link: boolean;
page: boolean;
};
// Subscribed plan and its quota.
plan: {
planID: string;
pricePerRepository: number;
quota: {
repository: number;
size: number;
file: number;
};
};
// Optional billing details.
billing?: {
name: string;
email: string;
address: string;
address2?: string;
city: string;
zip: string;
country: string;
vat?: string;
};
}
/** Mongoose document carrying the conference fields. */
export interface IConferenceDocument extends IConference, Document {}
/** Mongoose model type of the conference documents. */
export interface IConferenceModel extends Model<IConferenceDocument> {}

View File

@@ -0,0 +1,11 @@
import { model } from "mongoose";
import { IRepositoryDocument, IRepositoryModel } from "./repositories.types";
import RepositorySchema from "./repositories.schema";
// Mongoose model registered under the name "Repository" and built from
// RepositorySchema.
const RepositoryModel = model<IRepositoryDocument>(
"Repository",
RepositorySchema
) as IRepositoryModel;
export default RepositoryModel;

View File

@@ -0,0 +1,41 @@
import { Schema } from "mongoose";
/**
 * Mongoose schema of a source repository.
 */
const RepositorySchema = new Schema({
  externalId: {
    type: String,
    index: { unique: true },
  },
  name: {
    type: String,
    index: true,
  },
  url: String,
  source: {
    type: String,
    default: "github",
  },
  hasPage: { type: Boolean, default: false },
  pageSource: {
    branch: { type: String },
    path: String,
  },
  branches: [
    {
      name: { type: String },
      commit: String,
      readme: String,
    },
  ],
  defaultBranch: String,
  size: Number,
  status: {
    type: String,
    default: "ready",
  },
  dateOfEntry: {
    type: Date,
    // a function default is evaluated for every new document; `new Date()`
    // would freeze the date at the time this module is loaded
    default: Date.now,
  },
});
export default RepositorySchema;

View File

@@ -0,0 +1,25 @@
import { Document, Model } from "mongoose";
/**
 * Fields of a source repository.
 */
export interface IRepository {
externalId: string;
name: string;
url?: string;
source: "github";
size?: number;
defaultBranch?: string;
hasPage: boolean;
pageSource?: {
branch: string;
path: string;
};
// Known branches with their commit and, when available, the readme content.
branches?: {
name: string;
commit: string;
readme?: string;
}[];
}
/** Mongoose document carrying the repository fields. */
export interface IRepositoryDocument extends IRepository, Document {
// NOTE(review): no implementation of this method is visible here — confirm
// that the schema actually defines it.
setLastUpdated: (this: IRepositoryDocument) => Promise<void>;
}
/** Mongoose model type of the repository documents. */
export interface IRepositoryModel extends Model<IRepositoryDocument> {}

View File

@@ -0,0 +1,8 @@
import { model } from "mongoose";
import { IUserDocument, IUserModel } from "./users.types";
import UserSchema from "./users.schema";
// Mongoose model registered under the name "user" and built from UserSchema.
const UserModel = model<IUserDocument>("user", UserSchema) as IUserModel;
export default UserModel;

View File

@@ -0,0 +1,50 @@
import { Schema } from "mongoose";
/**
 * Mongoose schema of a user.
 */
const UserSchema = new Schema({
  accessTokens: {
    github: { type: String },
  },
  externalIDs: {
    github: { type: String, index: true },
  },
  username: {
    type: String,
    index: { unique: true },
  },
  emails: [
    {
      email: { type: String },
      default: Boolean,
    },
  ],
  isAdmin: { type: Boolean, default: false },
  photo: String,
  repositories: [
    {
      type: String,
      ref: "Repository",
    },
  ],
  default: {
    terms: [String],
    options: {
      expirationMode: { type: String },
      update: Boolean,
      image: Boolean,
      pdf: Boolean,
      notebook: Boolean,
      link: Boolean,
      page: { type: String },
    },
  },
  status: {
    type: String,
    default: "active",
  },
  dateOfEntry: {
    type: Date,
    // a function default is evaluated for every new document; `new Date()`
    // would freeze the date at the time this module is loaded
    default: Date.now,
  },
});
export default UserSchema;

View File

@@ -0,0 +1,39 @@
import { Document, Model } from "mongoose";
/**
 * Fields of a user.
 */
export interface IUser {
accessTokens: {
github: string;
};
externalIDs: {
github: string;
};
username: string;
isAdmin: boolean;
emails: {
email: string;
default: boolean;
}[];
photo?: string;
// NOTE(review): the user schema declares these elements as String with a
// reference to "Repository" — confirm whether `number[]` is the intended type.
repositories?: number[];
// Default anonymization settings of the user.
default?: {
terms: string[];
options: {
expirationMode: "never" | "redirect" | "";
update: boolean;
image: boolean;
pdf: boolean;
notebook: boolean;
link: boolean;
page: string | null;
};
};
status?: "active" | "removed";
dateOfEntry?: Date;
lastUpdated?: Date;
}
/** Mongoose document carrying the user fields. */
export interface IUserDocument extends IUser, Document {
// NOTE(review): no implementation of this method is visible here — confirm
// that the schema actually defines it.
setLastUpdated: (this: IUserDocument) => Promise<void>;
}
/** Mongoose model type of the user documents. */
export interface IUserModel extends Model<IUserDocument> {}

View File

@@ -0,0 +1,27 @@
import { Readable } from "stream";
import AnonymizedFile from "../AnonymizedFile";
import { Tree } from "../types";
import { SourceBase } from "./Source";
/**
 * Data needed by a GitHub source to reach a repository.
 */
export interface GitHubBaseData {
// Provides the GitHub access token, synchronously or asynchronously.
getToken: () => string | Promise<string>;
// Identifier of the anonymized repository.
repoId: string;
// Owner of the GitHub repository.
organization: string;
repoName: string;
commit: string;
}
/**
 * Base class of the sources that read a repository from GitHub.
 *
 * Subclasses provide the source type, the content of a single file and the
 * file tree of the repository.
 */
export default abstract class GitHubBase implements SourceBase {
abstract type: "GitHubDownload" | "GitHubStream" | "Zip";
accessToken: string | undefined;
constructor(readonly data: GitHubBaseData) {}
/**
 * Get the content of a file.
 * @param file the file to read
 * @param progress optional callback receiving status updates
 * @returns a readable stream of the file content
 */
abstract getFileContent(
file: AnonymizedFile,
progress?: (status: string) => void
): Promise<Readable>;
/**
 * Get the file tree of the repository.
 * @param progress optional callback receiving status updates
 */
abstract getFiles(progress?: (status: string) => void): Promise<Tree>;
}

View File

@@ -0,0 +1,121 @@
import got from "got";
import { Readable } from "stream";
import { OctokitResponse } from "@octokit/types";
import storage from "../storage";
import GitHubBase, { GitHubBaseData } from "./GitHubBase";
import { trace } from "@opentelemetry/api";
import { FILE_TYPE } from "../storage/Storage";
import { octokit } from "../GitHubUtils";
import AnonymousError from "../AnonymousError";
import AnonymizedFile from "../AnonymizedFile";
/**
 * Source that downloads the zip archive of a GitHub repository into the
 * storage and serves the files from there.
 */
export default class GitHubDownload extends GitHubBase {
  type: "GitHubDownload" | "GitHubStream" | "Zip" = "GitHubDownload";

  constructor(data: GitHubBaseData) {
    super(data);
  }

  /**
   * Ask GitHub for the zip archive of the configured commit (HEAD when no
   * commit is set) with a HEAD request; the url of the response is the
   * location of the archive.
   */
  private async _getZipUrl(): Promise<OctokitResponse<unknown, 302>> {
    const oct = octokit(await this.data.getToken());
    return oct.rest.repos.downloadZipballArchive({
      owner: this.data.organization,
      repo: this.data.repoName,
      ref: this.data.commit || "HEAD",
      method: "HEAD",
    });
  }

  /**
   * Download the repository archive and extract it into the storage.
   *
   * @param progress optional callback receiving the number of transferred
   * bytes (as a string, empty when unknown) every 1.5 seconds
   * @throws AnonymousError "repo_not_accessible" when the archive url cannot
   * be obtained, "unable_to_download" when the download or extraction fails
   */
  async download(progress?: (status: string) => void) {
    const span = trace.getTracer("ano-file").startSpan("GHDownload.download");
    span.setAttribute("repoId", this.data.repoId);
    try {
      let response: OctokitResponse<unknown, number>;
      try {
        response = await this._getZipUrl();
      } catch (error) {
        span.recordException(error as Error);
        throw new AnonymousError("repo_not_accessible", {
          httpStatus: 404,
          object: this.data,
          cause: error as Error,
        });
      }
      await storage.mk(this.data.repoId);
      let downloadProgress: { transferred: number } | undefined = undefined;
      let progressTimeout: ReturnType<typeof setTimeout> | undefined;
      let inDownload = true;

      // report the progress, then re-arm the timer while the download runs
      async function updateProgress() {
        if (inDownload) {
          if (progress) {
            progress(downloadProgress?.transferred?.toString() || "");
          }
          progressTimeout = setTimeout(updateProgress, 1500);
        }
      }
      updateProgress();

      try {
        const downloadStream = got.stream(response.url);
        downloadStream.addListener("downloadProgress", async (p) => {
          downloadProgress = p;
        });
        await storage.extractZip(
          this.data.repoId,
          "",
          downloadStream,
          this.type
        );
      } catch (error) {
        span.recordException(error as Error);
        throw new AnonymousError("unable_to_download", {
          httpStatus: 500,
          cause: error as Error,
          object: this.data,
        });
      } finally {
        // stop the progress reporting
        inDownload = false;
        clearTimeout(progressTimeout);
      }
    } finally {
      span.end();
    }
  }

  /**
   * Get the content of a file, downloading the repository first when the file
   * is not in the storage yet.
   *
   * @param file the file to read
   * @param progress optional callback forwarded to the download
   * @returns a readable stream of the file content
   * @throws AnonymousError "folder_not_supported" when the path is a folder
   */
  async getFileContent(
    file: AnonymizedFile,
    progress?: (status: string) => void
  ): Promise<Readable> {
    const span = trace
      .getTracer("ano-file")
      .startSpan("GHDownload.getFileContent");
    span.setAttribute("repoId", file.repository.repoId);
    try {
      // look the file up inside the repository, with the same
      // (repoId, path) addressing as the `storage.read` calls below
      const exists = await storage.exists(this.data.repoId, file.filePath);
      if (exists === FILE_TYPE.FILE) {
        return storage.read(this.data.repoId, file.filePath);
      } else if (exists === FILE_TYPE.FOLDER) {
        throw new AnonymousError("folder_not_supported", {
          httpStatus: 400,
          object: file,
        });
      }
      // will throw an error if the file is not in the repository
      await file.originalPath();

      // the cache is not ready, we need to download the repository
      await this.download(progress);
      return storage.read(this.data.repoId, file.filePath);
    } finally {
      span.end();
    }
  }

  /**
   * List the files of the repository, downloading it first when it is not in
   * the storage.
   *
   * @param progress optional callback forwarded to the download
   */
  async getFiles(progress?: (status: string) => void) {
    if ((await storage.exists(this.data.repoId)) === FILE_TYPE.NOT_FOUND) {
      await this.download(progress);
    }
    return storage.listFiles(this.data.repoId);
  }
}

View File

@@ -0,0 +1,299 @@
import { Branch } from "../types";
import * as gh from "parse-github-url";
import { RestEndpointMethodTypes } from "@octokit/rest";
import { trace } from "@opentelemetry/api";
import AnonymousError from "../AnonymousError";
import { isConnected } from "../../server/database";
import { octokit } from "../GitHubUtils";
import { IRepositoryDocument } from "../model/repositories/repositories.types";
import RepositoryModel from "../model/repositories/repositories.model";
export class GitHubRepository {
private _data: Partial<{
[P in keyof IRepositoryDocument]: IRepositoryDocument[P];
}>;
constructor(
data: Partial<{ [P in keyof IRepositoryDocument]: IRepositoryDocument[P] }>
) {
this._data = data;
}
toJSON() {
return {
repo: this.repo,
owner: this.owner,
hasPage: this._data.hasPage,
pageSource: this._data.pageSource,
fullName: this.fullName,
defaultBranch: this._data.defaultBranch,
size: this.size,
};
}
get model() {
return this._data;
}
public get fullName(): string | undefined {
return this._data.name;
}
public get id(): string | undefined {
return this._data.externalId;
}
public get size(): number | undefined {
return this._data.size;
}
async getCommitInfo(
sha: string,
opt: {
accessToken: string;
}
) {
const span = trace
.getTracer("ano-file")
.startSpan("GHRepository.getCommitInfo");
span.setAttribute("owner", this.owner);
span.setAttribute("repo", this.repo);
try {
const oct = octokit(opt.accessToken);
const commit = await oct.repos.getCommit({
owner: this.owner,
repo: this.repo,
ref: sha,
});
return commit.data;
} finally {
span.end();
}
}
async branches(opt: {
accessToken: string;
force?: boolean;
}): Promise<Branch[]> {
const span = trace.getTracer("ano-file").startSpan("GHRepository.branches");
span.setAttribute("owner", this.owner);
span.setAttribute("repo", this.repo);
try {
if (
!this._data.branches ||
this._data.branches.length == 0 ||
opt?.force === true
) {
// get the list of repo from github
const oct = octokit(opt.accessToken);
try {
const branches = (
await oct.paginate("GET /repos/{owner}/{repo}/branches", {
owner: this.owner,
repo: this.repo,
per_page: 100,
})
).map((b) => {
return {
name: b.name,
commit: b.commit.sha,
readme: this._data.branches?.filter(
(f: Branch) => f.name == b.name
)[0]?.readme,
} as Branch;
});
this._data.branches = branches;
if (isConnected) {
await RepositoryModel.updateOne(
{ externalId: this.id },
{ $set: { branches } }
);
}
} catch (error) {
span.recordException(error as Error);
throw new AnonymousError("repo_not_found", {
httpStatus: (error as any).status,
cause: error as Error,
object: this,
});
}
} else if (isConnected) {
const q = await RepositoryModel.findOne({ externalId: this.id }).select(
"branches"
);
this._data.branches = q?.branches;
}
return this._data.branches || [];
} finally {
span.end();
}
}
/**
 * Return the README of a branch, downloading it from GitHub when it is not
 * stored on the branch yet or when `opt.force` is set.
 *
 * `opt.branch` falls back to the repository default branch, then "master";
 * the resolved value is written back onto `opt`.
 *
 * @param opt branch to read, force flag and GitHub access token
 * @returns the README content decoded as UTF-8
 * @throws AnonymousError `repo_not_found` when the repository is not in the
 * database, `readme_not_available` when the branch is unknown or the README
 * cannot be fetched
 */
async readme(opt: {
  branch?: string;
  force?: boolean;
  accessToken: string;
}): Promise<string | undefined> {
  const span = trace.getTracer("ano-file").startSpan("GHRepository.readme");
  span.setAttribute("owner", this.owner);
  span.setAttribute("repo", this.repo);
  try {
    if (!opt.branch) {
      opt.branch = this._data.defaultBranch || "master";
    }

    const model = await RepositoryModel.findOne({
      externalId: this.id,
    }).select("branches");
    if (!model) {
      throw new AnonymousError("repo_not_found", { httpStatus: 404 });
    }

    // refresh the branch list and mirror it on the database document
    this._data.branches = await this.branches(opt);
    model.branches = this._data.branches;

    const selected = model.branches.find((b) => b.name == opt.branch);
    if (!selected) {
      throw new AnonymousError("readme_not_available", {
        httpStatus: 404,
        object: this,
      });
    }

    if (!selected.readme || opt.force === true) {
      // fetch the README of the branch head commit from GitHub
      const oct = octokit(opt.accessToken);
      try {
        const { data } = await oct.repos.getReadme({
          owner: this.owner,
          repo: this.repo,
          ref: selected.commit,
        });
        selected.readme = Buffer.from(
          data.content,
          data.encoding as BufferEncoding
        ).toString("utf-8");
        await model.save();
      } catch (error) {
        span.recordException(error as Error);
        throw new AnonymousError("readme_not_available", {
          httpStatus: 404,
          cause: error as Error,
          object: this,
        });
      }
    }
    return selected.readme;
  } finally {
    span.end();
  }
}
/**
 * Owner part of the repository full name.
 *
 * Falls back to the full name when the parser reports no owner.
 *
 * @throws AnonymousError `invalid_repo` (400) when the full name is missing
 * or cannot be parsed
 */
public get owner(): string {
  const invalidRepo = () =>
    new AnonymousError("invalid_repo", {
      httpStatus: 400,
      object: this,
    });
  const fullName = this.fullName;
  if (!fullName) throw invalidRepo();
  const parsed = gh(fullName);
  if (!parsed) throw invalidRepo();
  return parsed.owner || fullName;
}
/**
 * Repository-name part of the repository full name.
 *
 * Falls back to the full name when the parser reports no name.
 *
 * @throws AnonymousError `invalid_repo` (400) when the full name is missing
 * or cannot be parsed
 */
public get repo(): string {
  const invalidRepo = () =>
    new AnonymousError("invalid_repo", {
      httpStatus: 400,
      object: this,
    });
  const fullName = this.fullName;
  if (!fullName) throw invalidRepo();
  const parsed = gh(fullName);
  if (!parsed) throw invalidRepo();
  return parsed.name || fullName;
}
}
/**
 * Fetch the metadata of a repository from GitHub and wrap it in a
 * GitHubRepository, reusing the stored database document when one exists.
 *
 * A trailing ".git" is removed from `opt.repo` (the normalized value is
 * written back onto `opt`). Only the suffix is stripped so that names that
 * merely contain ".git", such as "user.github.io", are left intact.
 *
 * @param opt owner, repository name and GitHub access token
 * @returns the repository wrapper; the model is saved only when the database
 * is connected
 * @throws AnonymousError `repo_not_found` when GitHub does not return the
 * repository
 */
export async function getRepositoryFromGitHub(opt: {
  owner: string;
  repo: string;
  accessToken: string;
}) {
  const span = trace
    .getTracer("ano-file")
    .startSpan("GHRepository.getRepositoryFromGitHub");
  span.setAttribute("owner", opt.owner);
  span.setAttribute("repo", opt.repo);
  try {
    const gitSuffix = ".git";
    if (opt.repo.endsWith(gitSuffix)) {
      opt.repo = opt.repo.slice(0, -gitSuffix.length);
    }
    const oct = octokit(opt.accessToken);
    let r: RestEndpointMethodTypes["repos"]["get"]["response"]["data"];
    try {
      r = (
        await oct.repos.get({
          owner: opt.owner,
          repo: opt.repo,
        })
      ).data;
    } catch (error) {
      span.recordException(error as Error);
      throw new AnonymousError("repo_not_found", {
        httpStatus: (error as any).status,
        object: {
          owner: opt.owner,
          repo: opt.repo,
        },
        cause: error as Error,
      });
    }
    if (!r)
      throw new AnonymousError("repo_not_found", {
        httpStatus: 404,
        object: {
          owner: opt.owner,
          repo: opt.repo,
        },
      });
    // start from a fresh document and swap in the stored one when available
    let model = new RepositoryModel({ externalId: "gh_" + r.id });
    if (isConnected) {
      const dbModel = await RepositoryModel.findOne({
        externalId: "gh_" + r.id,
      });
      if (dbModel) {
        model = dbModel;
      }
    }
    model.name = r.full_name;
    model.url = r.html_url;
    model.size = r.size;
    model.defaultBranch = r.default_branch;
    model.hasPage = r.has_pages;
    if (model.hasPage) {
      // the page source is only requested when GitHub Pages is enabled
      const ghPageRes = await oct.repos.getPages({
        owner: opt.owner,
        repo: opt.repo,
      });
      model.pageSource = ghPageRes.data.source;
    }
    if (isConnected) {
      await model.save();
    }
    return new GitHubRepository(model);
  } finally {
    span.end();
  }
}

View File

@@ -0,0 +1,301 @@
import AnonymizedFile from "../AnonymizedFile";
import GitHubBase, { GitHubBaseData } from "./GitHubBase";
import storage from "../storage";
import { Tree } from "../types";
import * as path from "path";
import got from "got";
import * as stream from "stream";
import AnonymousError from "../AnonymousError";
import config from "../../config";
import { trace } from "@opentelemetry/api";
import { FILE_TYPE } from "../storage/Storage";
import { octokit } from "../GitHubUtils";
export default class GitHubStream extends GitHubBase {
type: "GitHubDownload" | "GitHubStream" | "Zip" = "GitHubStream";
constructor(data: GitHubBaseData) {
super(data);
}
/**
 * Open a stream on the raw content of a git blob through the GitHub API.
 *
 * @param token GitHub token sent in the authorization header
 * @param sha sha of the blob to download
 * @returns the `got` stream of the blob content
 * @throws AnonymousError `repo_not_accessible` (404) when the request cannot
 * be built or started
 */
downloadFile(token: string, sha: string) {
  const span = trace.getTracer("ano-file").startSpan("GHStream.downloadFile");
  span.setAttribute("sha", sha);
  const oct = octokit(token);
  try {
    // octokit is only used to compute the URL; the download goes through got
    const blobEndpoint = oct.rest.git.getBlob.endpoint({
      owner: this.data.organization,
      repo: this.data.repoName,
      file_sha: sha,
    });
    const headers = {
      "X-GitHub-Api-Version": "2022-11-28",
      accept: "application/vnd.github.raw+json",
      authorization: `token ${token}`,
    };
    return got.stream(blobEndpoint.url, { headers });
  } catch (cause) {
    console.error(cause);
    throw new AnonymousError("repo_not_accessible", {
      httpStatus: 404,
      object: this.data,
      cause: cause as Error,
    });
  } finally {
    span.end();
  }
}
/**
 * Return the content of a file, served from the storage when it is already
 * cached and downloaded from GitHub otherwise.
 *
 * A downloaded file is duplicated: one copy is returned to the caller, the
 * other is written to the storage as a best-effort cache.
 *
 * @param file the anonymized file to read
 * @returns a readable stream of the original file content
 * @throws AnonymousError `folder_not_supported` (400) when the path is a
 * folder, `file_not_accessible` (404) when the file sha is unknown
 */
async getFileContent(file: AnonymizedFile): Promise<stream.Readable> {
  const span = trace
    .getTracer("ano-file")
    .startSpan("GHStream.getFileContent");
  span.setAttribute("repoId", file.repository.repoId);
  span.setAttribute("file", file.anonymizedPath);
  try {
    try {
      file.filePath;
    } catch (_) {
      // compute the original path if ambiguous
      await file.originalPath();
    }
    const fileInfo = await storage.exists(
      file.repository.repoId,
      file.filePath
    );
    if (fileInfo == FILE_TYPE.FILE) {
      return storage.read(file.repository.repoId, file.filePath);
    } else if (fileInfo == FILE_TYPE.FOLDER) {
      throw new AnonymousError("folder_not_supported", {
        httpStatus: 400,
        object: file,
      });
    }
    span.setAttribute("path", file.filePath);
    const file_sha = await file.sha();
    if (!file_sha) {
      throw new AnonymousError("file_not_accessible", {
        httpStatus: 404,
        object: file,
      });
    }
    const content = this.downloadFile(await this.data.getToken(), file_sha);

    // duplicate the stream to write it to the storage
    const stream1 = content.pipe(new stream.PassThrough());
    const stream2 = content.pipe(new stream.PassThrough());

    // pipe() does not forward errors, so propagate them to both copies
    content.on("error", (error) => {
      error = new AnonymousError("file_not_found", {
        httpStatus: (error as any).status || (error as any).httpStatus,
        cause: error as Error,
        object: file,
      });
      stream1.emit("error", error);
      stream2.emit("error", error);
    });

    // The cache write runs in the background. Its failure must not become an
    // unhandled promise rejection: the caller still gets the content through
    // stream2, so the error is only logged.
    storage
      .write(file.repository.repoId, file.filePath, stream1, this.type)
      .catch((err) => {
        console.error(
          `[ERROR] unable to store ${file.filePath} in the storage`,
          err
        );
      });
    return stream2;
  } finally {
    span.end();
  }
}
/**
 * List the files of the repository at the configured commit.
 *
 * @returns the file tree built from the GitHub tree of `this.data.commit`
 */
async getFiles() {
  const span = trace.getTracer("ano-file").startSpan("GHStream.getFiles");
  span.setAttribute("repoId", this.data.repoId);
  try {
    // await here so that the span covers the retrieval instead of being
    // closed as soon as the promise is created
    return await this.getTree(this.data.commit);
  } finally {
    span.end();
  }
}
/**
 * Build the file tree of a git tree, using one recursive GitHub request and
 * falling back to `getTruncatedTree` when GitHub truncates the answer.
 *
 * @param sha sha of the tree to list
 * @param truncatedTree tree that receives the entries (mutated in place)
 * @param parentPath path prepended to every entry
 * @param count shared counters of listed files and GitHub requests
 * @returns the populated tree, or `{ __: {} }` when GitHub answers 409
 * @throws AnonymousError `repo_not_accessible` for any other GitHub error
 */
private async getTree(
  sha: string,
  truncatedTree: Tree = {},
  parentPath: string = "",
  count = {
    file: 0,
    request: 0,
  }
) {
  const span = trace.getTracer("ano-file").startSpan("GHStream.getTree");
  span.setAttribute("sha", sha);
  // a single finally closes the span on every path, including failures of
  // tree2Tree / getTruncatedTree
  try {
    let ghRes: Awaited<ReturnType<typeof this.getGHTree>>;
    try {
      count.request++;
      ghRes = await this.getGHTree(sha, { recursive: true });
    } catch (error) {
      console.error(error);
      span.recordException(error as Error);
      if ((error as any).status == 409) {
        // cannot be empty otherwise it would try to download it again
        return { __: {} };
      }
      const err = new AnonymousError("repo_not_accessible", {
        httpStatus: (error as any).status,
        cause: error as Error,
        object: {
          tree_sha: sha,
        },
      });
      span.recordException(err);
      throw err;
    }
    const tree = this.tree2Tree(ghRes.tree, truncatedTree, parentPath);
    count.file += ghRes.tree.length;
    if (ghRes.truncated) {
      await this.getTruncatedTree(sha, tree, parentPath, count);
    }
    return tree;
  } finally {
    span.end();
  }
}
/**
 * Request a git tree from the GitHub API.
 *
 * @param sha sha of the tree to fetch
 * @param opt `recursive` asks GitHub to include the sub-trees
 * @returns the `data` payload of the GitHub response
 */
private async getGHTree(sha: string, opt = { recursive: true }) {
  const span = trace.getTracer("ano-file").startSpan("GHStream.getGHTree");
  span.setAttribute("sha", sha);
  try {
    const token = await this.data.getToken();
    const { data } = await octokit(token).git.getTree({
      owner: this.data.organization,
      repo: this.data.repoName,
      tree_sha: sha,
      recursive: opt.recursive ? "1" : undefined,
    });
    return data;
  } finally {
    span.end();
  }
}
/**
 * Complete a tree that GitHub returned truncated by listing it level by level.
 *
 * The tree `sha` is first fetched non-recursively and merged into
 * `truncatedTree`. Then either each sub-tree is processed by a recursive call
 * (when this level has fewer than 100 entries and fewer than 200 requests
 * were issued so far) or the tree is fetched again recursively in one request.
 *
 * Errors of the GitHub requests are recorded on the span and swallowed: the
 * resulting tree may then be incomplete.
 *
 * @param sha sha of the tree to list
 * @param truncatedTree tree that receives the entries (mutated in place)
 * @param parentPath path of the tree inside the repository
 * @param count counters shared by all the recursive calls
 * @param depth recursion depth; only forwarded to the recursive calls
 */
private async getTruncatedTree(
sha: string,
truncatedTree: Tree = {},
parentPath: string = "",
count = {
file: 0,
request: 0,
},
depth = 0
) {
const span = trace
.getTracer("ano-file")
.startSpan("GHStream.getTruncatedTree");
span.setAttribute("sha", sha);
span.setAttribute("parentPath", parentPath);
try {
count.request++;
let data = null;
try {
// list only the direct children of the tree
data = await this.getGHTree(sha, {
recursive: false,
});
this.tree2Tree(data.tree, truncatedTree, parentPath);
} catch (error) {
span.recordException(error as Error);
return;
}
count.file += data.tree.length;
if (data.tree.length < 100 && count.request < 200) {
// small level and request budget left: explore the sub-trees in parallel
const promises: Promise<any>[] = [];
for (const file of data.tree) {
if (file.type == "tree" && file.path && file.sha) {
const elementPath = path.join(parentPath, file.path);
promises.push(
this.getTruncatedTree(
file.sha,
truncatedTree,
elementPath,
count,
depth + 1
)
);
}
}
await Promise.all(promises);
} else {
// otherwise retry a single recursive listing of this tree
try {
const data = await this.getGHTree(sha, {
recursive: true,
});
this.tree2Tree(data.tree, truncatedTree, parentPath);
if (data.truncated) {
// TODO: TRUNCATED
// NOTE(review): a truncated answer is accepted as is here, so files can be missing
}
} catch (error) {
span.recordException(error as Error);
}
}
} finally {
span.end();
}
}
/**
 * Merge a flat list of GitHub tree entries into a nested Tree.
 *
 * Folders become nested objects and blobs become `{ size, sha }` leaves.
 * Path segments starting with "$" are stored with a leading backslash.
 * A blob is skipped when its folder already holds more than
 * `config.MAX_FILE_FOLDER` entries.
 *
 * @param tree entries as returned by the GitHub git tree API
 * @param partialTree tree to fill (mutated in place)
 * @param parentPath path prepended to every entry path
 * @returns `partialTree`
 */
private tree2Tree(
  tree: {
    path?: string;
    mode?: string;
    type?: string;
    sha?: string;
    size?: number;
    url?: string;
  }[],
  partialTree: Tree = {},
  parentPath: string = ""
) {
  const span = trace.getTracer("ano-file").startSpan("GHStream.tree2Tree");
  span.setAttribute("parentPath", parentPath);
  // presumably "$" is escaped because of the database key rules — TODO confirm
  const toKey = (segment: string) =>
    segment[0] == "$" ? "\\" + segment : segment;
  try {
    for (const entry of tree) {
      if (!entry.path) continue;
      const segments = path.join(parentPath, entry.path).split("/");
      // every segment of a folder is a folder; for anything else the last
      // segment is the entry name
      const folderCount =
        entry.type == "tree" ? segments.length : segments.length - 1;

      let node = partialTree;
      for (const segment of segments.slice(0, folderCount)) {
        const key = toKey(segment);
        if (!node[key]) {
          node[key] = {};
        }
        node = node[key] as Tree;
      }

      if (entry.type != "blob") continue;
      if (Object.keys(node).length > config.MAX_FILE_FOLDER) {
        // TODO: TRUNCATED
        continue;
      }
      node[toKey(segments[folderCount])] = {
        size: entry.size || 0,
        sha: entry.sha || "",
      };
    }
    return partialTree;
  } finally {
    span.end();
  }
}
}

24
src/core/source/Source.ts Normal file
View File

@@ -0,0 +1,24 @@
import { Readable } from "stream";
import AnonymizedFile from "../AnonymizedFile";
import { Tree } from "../types";
import GitHubDownload from "./GitHubDownload";
import GitHubStream from "./GitHubStream";
import Zip from "./Zip";
/** Union of the concrete source implementations. */
export type Source = GitHubDownload | GitHubStream | Zip;
/** Contract shared by the sources that provide the files of a repository. */
export interface SourceBase {
/** Identifier of the source implementation */
readonly type: string;
/**
* Retrieve the file content
* @param file the file of the content to retrieve
* @returns a readable stream of the content
*/
getFileContent(file: AnonymizedFile): Promise<Readable>;
/**
* Get all the files from a specific source
* @param progress optional callback that receives a status string
* @returns the tree of the files
*/
getFiles(progress?: (status: string) => void): Promise<Tree>;
}

28
src/core/source/Zip.ts Normal file
View File

@@ -0,0 +1,28 @@
import * as stream from "stream";
import AnonymizedFile from "../AnonymizedFile";
import storage from "../storage";
import { SourceBase } from "./Source";
/**
 * Source whose files are read directly from the storage of the repository.
 */
export default class Zip implements SourceBase {
  type = "Zip";
  url?: string;

  /**
   * @param data source description; only `url` is read
   * @param repoId id of the repository that owns the files
   */
  constructor(data: any, readonly repoId: string) {
    const { url } = data;
    this.url = url;
  }

  /** List the files stored for the repository. */
  async getFiles() {
    return storage.listFiles(this.repoId);
  }

  /** Read the file content from the storage. */
  async getFileContent(file: AnonymizedFile): Promise<stream.Readable> {
    const repoId = file.repository.repoId;
    const filePath = file.filePath;
    return storage.read(repoId, filePath);
  }

  /** Only the source type is serialized. */
  toJSON(): any {
    const { type } = this;
    return { type };
  }
}

7
src/core/storage.ts Normal file
View File

@@ -0,0 +1,7 @@
import config from "../config";
import FileSystem from "./storage/FileSystem";
import S3Storage from "./storage/S3";
// Storage backend shared by the application: S3 when config.STORAGE is "s3",
// the local file system otherwise.
const useS3 = config.STORAGE == "s3";
export default useS3 ? new S3Storage() : new FileSystem();

View File

@@ -0,0 +1,224 @@
import { Tree } from "../types";
import config from "../../config";
import * as fs from "fs";
import { Extract } from "unzip-stream";
import { join, basename, dirname } from "path";
import { Response } from "express";
import { Readable, pipeline, Transform } from "stream";
import * as archiver from "archiver";
import { promisify } from "util";
import { lookup } from "mime-types";
import { trace } from "@opentelemetry/api";
import StorageBase, { FILE_TYPE } from "./Storage";
export default class FileSystem extends StorageBase {
type = "FileSystem";
constructor() {
super();
}
/**
 * Check what exists at a path of the repository folder.
 *
 * @override
 * @param repoId id of the repository
 * @param p path relative to the repository folder
 * @returns FILE, FOLDER, or NOT_FOUND when the path cannot be stat-ed or is
 * neither a file nor a directory
 */
async exists(repoId: string, p: string = ""): Promise<FILE_TYPE> {
  const fullPath = join(config.FOLDER, this.repoPath(repoId), p);
  return trace
    .getTracer("ano-file")
    .startActiveSpan("fs.exists", async (span) => {
      span.setAttribute("path", p);
      span.setAttribute("full-path", fullPath);
      try {
        const stat = await fs.promises.stat(fullPath);
        if (stat.isDirectory()) return FILE_TYPE.FOLDER;
        if (stat.isFile()) return FILE_TYPE.FILE;
      } catch (_) {
        // ignore file not found or not downloaded
      } finally {
        // close the span on every path, including the early returns above
        span.end();
      }
      return FILE_TYPE.NOT_FOUND;
    });
}
/**
 * Send a stored file as the HTTP response.
 *
 * @override
 * @param repoId id of the repository
 * @param p path relative to the repository folder
 * @param res express response that receives the file
 */
async send(repoId: string, p: string, res: Response) {
  const fullPath = join(config.FOLDER, this.repoPath(repoId), p);
  const tracer = trace.getTracer("ano-file");
  return tracer.startActiveSpan("fs.send", async (span) => {
    span.setAttribute("path", fullPath);
    // the span is closed once express reports the end of the transfer
    const onSent = (err?: Error) => {
      if (err) {
        span.recordException(err);
      }
      span.end();
    };
    res.sendFile(fullPath, { dotfiles: "allow" }, onSent);
  });
}
/**
 * Open a read stream on a stored file.
 *
 * @override
 * @param repoId id of the repository
 * @param p path relative to the repository folder
 */
async read(repoId: string, p: string): Promise<Readable> {
  const repoFolder = this.repoPath(repoId);
  return fs.createReadStream(join(config.FOLDER, repoFolder, p));
}
/**
 * Describe a stored file or folder.
 *
 * @param repoId id of the repository
 * @param path path relative to the repository folder
 * @returns size, modification date and content type; folders are reported
 * as "application/x-directory"
 */
async fileInfo(repoId: string, path: string) {
  const fullPath = join(config.FOLDER, this.repoPath(repoId), path);
  const stats = await fs.promises.stat(fullPath);
  const contentType = stats.isDirectory()
    ? "application/x-directory"
    : (lookup(fullPath) as string);
  return {
    size: stats.size,
    lastModified: stats.mtime,
    contentType,
  };
}
/**
 * Write a string or a stream to a file of the repository folder, creating
 * the parent folders first.
 *
 * Failures are recorded on the span and NOT re-thrown: the returned promise
 * resolves even when the write failed.
 *
 * @override
 * @param repoId id of the repository
 * @param p path relative to the repository folder
 * @param data content to write
 */
async write(
repoId: string,
p: string,
data: string | Readable
): Promise<void> {
const span = trace.getTracer("ano-file").startSpan("fs.write");
const fullPath = join(config.FOLDER, this.repoPath(repoId), p);
span.setAttribute("path", fullPath);
try {
await this.mk(repoId, dirname(p));
if (data instanceof Readable) {
// remove the partially written file when the source stream fails
data.on("error", (err) => {
this.rm(repoId, p);
});
}
return await fs.promises.writeFile(fullPath, data, "utf-8");
} catch (err: any) {
span.recordException(err);
// throw err;
} finally {
span.end();
}
}
/**
 * Remove a file or a folder (recursively); a missing path is not an error.
 *
 * @override
 * @param repoId id of the repository
 * @param dir path relative to the repository folder
 */
async rm(repoId: string, dir: string = ""): Promise<void> {
  const span = trace.getTracer("ano-file").startSpan("fs.rm");
  const target = join(config.FOLDER, this.repoPath(repoId), dir);
  span.setAttribute("path", target);
  const options = { force: true, recursive: true };
  try {
    await fs.promises.rm(target, options);
  } finally {
    span.end();
  }
}
/**
 * Create a folder and its missing parents.
 *
 * @override
 * @param repoId id of the repository
 * @param dir path relative to the repository folder
 * @throws any file system error other than EEXIST
 */
async mk(repoId: string, dir: string = ""): Promise<void> {
  const span = trace.getTracer("ano-file").startSpan("fs.mk");
  span.setAttribute("path", dir);
  const target = join(config.FOLDER, this.repoPath(repoId), dir);
  try {
    await fs.promises.mkdir(target, { recursive: true });
  } catch (err: any) {
    // an already existing folder is fine
    if (err.code === "EEXIST") return;
    span.recordException(err);
    throw err;
  } finally {
    span.end();
  }
}
/**
 * List recursively the files stored under a folder of the repository.
 *
 * Names starting with "$" are stored in the tree with a leading backslash.
 * Entries that cannot be stat-ed or listed are recorded on the span and
 * skipped.
 *
 * @override
 * @param repoId id of the repository
 * @param dir folder to list, relative to the repository folder
 * @param opt `onEntry` is called for every file with its path (relative to
 * the repository folder) and its size
 * @returns the tree of the files below `dir`
 */
async listFiles(
  repoId: string,
  dir: string = "",
  opt: {
    onEntry?: (file: { path: string; size: number }) => void;
  } = {}
): Promise<Tree> {
  return trace
    .getTracer("ano-file")
    .startActiveSpan("fs.listFiles", async (span) => {
      span.setAttribute("path", dir);
      try {
        const fullPath = join(config.FOLDER, this.repoPath(repoId), dir);
        const files = await fs.promises.readdir(fullPath);
        const output: Tree = {};
        for (const file of files) {
          // path of the entry relative to the repository folder
          const filePath = join(dir, file);
          try {
            // fullPath already contains dir: only the entry name is appended,
            // otherwise dir would be applied twice and the content of every
            // sub-folder would fail to stat
            const stats = await fs.promises.stat(join(fullPath, file));
            const key = file[0] == "$" ? "\\" + file : file;
            if (stats.isDirectory()) {
              output[key] = await this.listFiles(repoId, filePath, opt);
            } else if (stats.isFile()) {
              if (opt.onEntry) {
                opt.onEntry({
                  path: filePath,
                  size: stats.size,
                });
              }
              output[key] = { size: stats.size, sha: stats.ino.toString() };
            }
          } catch (error) {
            span.recordException(error as Error);
          }
        }
        return output;
      } finally {
        // also closes the span when readdir fails
        span.end();
      }
    });
}
/**
 * Extract a zip stream into a folder of the repository, dropping the first
 * path segment of every entry name.
 *
 * @override
 * @param repoId id of the repository
 * @param p destination folder, relative to the repository folder
 * @param data stream of the zip archive
 */
async extractZip(repoId: string, p: string, data: Readable): Promise<void> {
  const destination = join(config.FOLDER, this.repoPath(repoId), p);
  const extractor = Extract({
    path: destination,
    decodeString: (buf) => {
      const entryName = buf.toString();
      // keep what follows the first "/" (the whole name when there is none)
      const stripped = entryName.substring(entryName.indexOf("/") + 1);
      return stripped == "" ? "/dev/null" : stripped;
    },
  });
  return promisify(pipeline)(data, extractor);
}
/**
 * Build an archive with the files stored under a folder of the repository.
 *
 * Every file reported by listFiles is appended to the archive, optionally
 * piped through `opt.fileTransformer`; the archive is finalized once the
 * listing is done.
 *
 * @override
 * @param repoId id of the repository
 * @param dir folder to archive, relative to the repository folder
 * @param opt archive format ("zip" by default) and optional per-file transformer
 * @returns the archiver instance
 */
async archive(
repoId: string,
dir: string,
opt?: {
format?: "zip" | "tar";
fileTransformer?: (path: string) => Transform;
}
) {
const archive = archiver(opt?.format || "zip", {});
const fullPath = join(config.FOLDER, this.repoPath(repoId), dir);
await this.listFiles(repoId, dir, {
// NOTE(review): listFiles does not await onEntry, so the appends below run
// asynchronously; confirm they always complete before finalize()
onEntry: async (file) => {
let rs = await this.read(repoId, file.path);
if (opt?.fileTransformer) {
// apply transformation on the stream
rs = rs.pipe(opt.fileTransformer(file.path));
}
// NOTE(review): file.path comes from listFiles as a relative path, so
// removing the absolute fullPath looks like a no-op — confirm
const f = file.path.replace(fullPath, "");
archive.append(rs, {
name: basename(f),
prefix: dirname(f),
});
},
}).then(() => {
archive.finalize();
});
return archive;
}
}

391
src/core/storage/S3.ts Normal file
View File

@@ -0,0 +1,391 @@
import {
GetObjectCommand,
ListObjectsV2CommandOutput,
PutObjectCommandInput,
S3,
} from "@aws-sdk/client-s3";
import { Upload } from "@aws-sdk/lib-storage";
import { NodeHttpHandler } from "@smithy/node-http-handler";
import config from "../../config";
import { pipeline, Readable, Transform } from "stream";
import ArchiveStreamToS3 from "decompress-stream-to-s3";
import { Response } from "express";
import { contentType } from "mime-types";
import * as archiver from "archiver";
import { trace } from "@opentelemetry/api";
import { dirname, basename, join } from "path";
import { Tree, TreeFile } from "../types";
import AnonymousError from "../AnonymousError";
import StorageBase, { FILE_TYPE } from "./Storage";
export default class S3Storage extends StorageBase {
type = "AWS";
constructor() {
super();
if (!config.S3_BUCKET)
throw new AnonymousError("s3_config_not_provided", {
httpStatus: 500,
});
}
/**
 * Create an S3 client from the configuration.
 *
 * @param timeout request and connection timeout passed to the HTTP handler
 * @throws Error when one of the S3 settings is missing
 */
private client(timeout = 10000) {
  const {
    S3_CLIENT_ID: accessKeyId,
    S3_CLIENT_SECRET: secretAccessKey,
    S3_REGION: region,
    S3_ENDPOINT: endpoint,
  } = config;
  if (!accessKeyId) throw new Error("S3_CLIENT_ID not set");
  if (!secretAccessKey) throw new Error("S3_CLIENT_SECRET not set");
  if (!region) throw new Error("S3_REGION not set");
  if (!endpoint) throw new Error("S3_ENDPOINT not set");

  const requestHandler = new NodeHttpHandler({
    requestTimeout: timeout,
    connectionTimeout: timeout,
  });
  return new S3({
    credentials: { accessKeyId, secretAccessKey },
    region,
    endpoint,
    requestHandler,
  });
}
/**
 * Check what exists at a path of the repository.
 *
 * The path is a FILE when its object metadata can be read; otherwise it is a
 * FOLDER when at least one object key starts with it, and NOT_FOUND when
 * there is none.
 *
 * @override
 * @param repoId id of the repository
 * @param path path relative to the repository prefix
 */
async exists(repoId: string, path: string = ""): Promise<FILE_TYPE> {
  const span = trace.getTracer("ano-file").startSpan("s3.exists");
  span.setAttribute("path", path);
  try {
    if (!config.S3_BUCKET) throw new Error("S3_BUCKET not set");
    try {
      // if we can get the file info, it is a file
      await this.fileInfo(repoId, path);
      return FILE_TYPE.FILE;
    } catch (err) {
      // otherwise look for at least one object below the prefix
      const listing = await this.client().listObjectsV2({
        Bucket: config.S3_BUCKET,
        Prefix: join(this.repoPath(repoId), path),
        MaxKeys: 1,
      });
      const hasObject = (listing.Contents?.length || 0) > 0;
      if (hasObject) {
        return FILE_TYPE.FOLDER;
      }
      return FILE_TYPE.NOT_FOUND;
    }
  } finally {
    span.end();
  }
}
/**
 * No-op: nothing is created on S3 for a folder.
 *
 * @override
 */
async mk(repoId: string, dir: string = ""): Promise<void> {
// no need to create folder on S3
}
/**
 * Delete every object stored below a path of the repository.
 *
 * Objects are deleted by batches of at most 100 keys; the method calls
 * itself again while the listing is truncated.
 *
 * @override
 * @param repoId id of the repository
 * @param dir path relative to the repository prefix
 */
async rm(repoId: string, dir: string = ""): Promise<void> {
  const span = trace.getTracer("ano-file").startSpan("s3.rm");
  span.setAttribute("repoId", repoId);
  span.setAttribute("path", dir);
  try {
    if (!config.S3_BUCKET) throw new Error("S3_BUCKET not set");
    const listing = await this.client(200000).listObjectsV2({
      Bucket: config.S3_BUCKET,
      Prefix: join(this.repoPath(repoId), dir),
      MaxKeys: 100,
    });

    const objects = new Array<{ Key: string }>();
    for (const entry of listing.Contents || []) {
      if (entry.Key) {
        objects.push({ Key: entry.Key });
      }
    }
    if (objects.length == 0) {
      // nothing to remove
      return;
    }

    await this.client(200000).deleteObjects({
      Bucket: config.S3_BUCKET,
      Delete: { Objects: objects },
    });
    if (listing.IsTruncated) {
      await this.rm(repoId, dir);
    }
  } finally {
    span.end();
  }
}
/**
 * Stream a stored object as the HTTP response.
 *
 * On failure the exception is recorded on the span and, when no header has
 * been sent yet, the response is ended with a 500 status so that the client
 * is not left waiting for an answer.
 *
 * @override
 * @param repoId id of the repository
 * @param path path relative to the repository prefix
 * @param res express response that receives the object
 */
async send(repoId: string, path: string, res: Response) {
  const span = trace.getTracer("ano-file").startSpan("s3.send");
  span.setAttribute("repoId", repoId);
  span.setAttribute("path", path);
  try {
    if (!config.S3_BUCKET) throw new Error("S3_BUCKET not set");
    try {
      const command = new GetObjectCommand({
        Bucket: config.S3_BUCKET,
        Key: join(this.repoPath(repoId), path),
      });
      const s = await this.client().send(command);
      res.status(200);
      if (s.ContentType) {
        res.contentType(s.ContentType);
      }
      if (s.ContentLength) {
        res.set("Content-Length", s.ContentLength.toString());
      }
      if (s.Body) {
        (s.Body as Readable)?.pipe(res);
      } else {
        res.end();
      }
    } catch (error) {
      span.recordException(error as Error);
      try {
        // setting the status alone never answers the request: end it too
        if (!res.headersSent) {
          res.status(500).end();
        }
      } catch (err) {
        console.error(`[ERROR] S3 send ${path}`, err);
      }
    }
  } finally {
    span.end();
  }
}
/**
 * Read the metadata of a stored object.
 *
 * @param repoId id of the repository
 * @param path path relative to the repository prefix
 * @returns size, modification date and content type of the object; the
 * content type falls back to the one derived from the file name
 * @throws when the object metadata cannot be read
 */
async fileInfo(repoId: string, path: string) {
  const span = trace.getTracer("ano-file").startSpan("s3.fileInfo");
  span.setAttribute("repoId", repoId);
  span.setAttribute("path", path);
  try {
    if (!config.S3_BUCKET) throw new Error("S3_BUCKET not set");
    const info = await this.client(3000).headObject({
      Bucket: config.S3_BUCKET,
      Key: join(this.repoPath(repoId), path),
    });
    return {
      size: info.ContentLength,
      lastModified: info.LastModified,
      // mime-types treats a string containing "/" as a content type, not as
      // a file name, so only the base name is given to contentType()
      contentType: info.ContentType
        ? info.ContentType
        : (contentType(basename(path)) as string),
    };
  } finally {
    span.end();
  }
}
/**
 * Open a stream on a stored object.
 *
 * @override
 * @param repoId id of the repository
 * @param path path relative to the repository prefix
 * @returns the body of the object
 * @throws AnonymousError `file_not_found` (404) when the object has no body
 */
async read(repoId: string, path: string): Promise<Readable> {
  // the span name used to be misspelled "s3.rreadm"
  const span = trace.getTracer("ano-file").startSpan("s3.read");
  span.setAttribute("repoId", repoId);
  span.setAttribute("path", path);
  try {
    if (!config.S3_BUCKET) throw new Error("S3_BUCKET not set");
    const command = new GetObjectCommand({
      Bucket: config.S3_BUCKET,
      Key: join(this.repoPath(repoId), path),
    });
    const res = (await this.client(3000).send(command)).Body;
    if (!res) {
      throw new AnonymousError("file_not_found", {
        httpStatus: 404,
        object: join(this.repoPath(repoId), path),
      });
    }
    return res as Readable;
  } finally {
    span.end();
  }
}
/**
 * Upload a string or a stream as an object of the repository.
 *
 * @override
 * @param repoId id of the repository
 * @param path path relative to the repository prefix
 * @param data content to upload
 * @param source optional origin of the file, stored as the `source` tag
 * @throws when the bucket is not configured or the upload fails
 */
async write(
  repoId: string,
  path: string,
  data: string | Readable,
  source?: string
): Promise<void> {
  // the span used to be named "s3.rm"
  const span = trace.getTracer("ano-file").startSpan("s3.write");
  span.setAttribute("repoId", repoId);
  span.setAttribute("path", path);
  try {
    if (!config.S3_BUCKET) throw new Error("S3_BUCKET not set");

    if (data instanceof Readable) {
      data.on("error", (err) => {
        console.error(`[ERROR] S3 write ${path}`, err);
        span.recordException(err as Error);
        // best-effort cleanup of the partial object; its own failure must
        // not become an unhandled rejection
        this.rm(repoId, path).catch((rmError) => {
          console.error(`[ERROR] S3 write cleanup ${path}`, rmError);
        });
      });
    }

    // mime-types treats a string containing "/" as a content type and
    // returns false for unknown extensions: resolve the type from the base
    // name and leave ContentType unset when it is unknown
    const mimeType = contentType(basename(path));
    const params: PutObjectCommandInput = {
      Bucket: config.S3_BUCKET,
      Key: join(this.repoPath(repoId), path),
      Body: data,
      ContentType: mimeType ? mimeType : undefined,
    };
    if (source) {
      params.Tagging = `source=${source}`;
    }

    const parallelUploads3 = new Upload({
      // 30s timeout
      client: this.client(30000),
      params,
    });

    await parallelUploads3.done();
  } finally {
    span.end();
  }
}
/**
 * List the objects stored below a folder of the repository as a tree.
 *
 * Object keys are made relative to the listed folder and split on "/" to
 * build the nested tree. Objects without an ETag are not added as files.
 *
 * @override
 * @param repoId id of the repository
 * @param dir folder to list, relative to the repository prefix
 * @returns the tree of the files; the S3 ETag is used as `sha`
 */
async listFiles(repoId: string, dir: string = ""): Promise<Tree> {
const span = trace.getTracer("ano-file").startSpan("s3.listFiles");
span.setAttribute("path", dir);
try {
if (!config.S3_BUCKET) throw new Error("S3_BUCKET not set");
// make sure that a non-empty prefix ends with "/"
if (dir && dir[dir.length - 1] != "/") dir = dir + "/";
const out: Tree = {};
let req: ListObjectsV2CommandOutput;
let nextContinuationToken: string | undefined;
// one request per page of at most 250 keys
do {
req = await this.client(30000).listObjectsV2({
Bucket: config.S3_BUCKET,
Prefix: join(this.repoPath(repoId), dir),
MaxKeys: 250,
ContinuationToken: nextContinuationToken,
});
if (!req.Contents) return out;
nextContinuationToken = req.NextContinuationToken;
for (const f of req.Contents) {
if (!f.Key) continue;
// key relative to the listed folder
f.Key = f.Key.replace(join(this.repoPath(repoId), dir), "");
const paths = f.Key.split("/");
let current: Tree = out;
// create or reuse the folders that lead to the file
for (let i = 0; i < paths.length - 1; i++) {
let p = paths[i];
if (!p) continue;
if (!(current[p] as Tree)) {
current[p] = {} as Tree;
}
current = current[p] as Tree;
}
if (f.ETag) {
const fileInfo: TreeFile = { size: f.Size || 0, sha: f.ETag };
const fileName = paths[paths.length - 1];
if (fileName) current[fileName] = fileInfo;
}
}
} while (req && req.Contents && req.IsTruncated);
return out;
} finally {
span.end();
}
}
/**
 * Extract a zip stream into the repository prefix on S3, dropping the first
 * path segment of every entry name.
 *
 * @override
 * @param repoId id of the repository
 * @param path destination folder, relative to the repository prefix
 * @param data stream of the zip archive
 * @param source optional origin of the files, stored as tag and metadata
 * @returns a promise settled once, when the extraction finishes or fails
 */
async extractZip(
  repoId: string,
  path: string,
  data: Readable,
  source?: string
): Promise<void> {
  const span = trace.getTracer("ano-file").startSpan("s3.extractZip");
  span.setAttribute("path", path);
  return new Promise((resolve, reject) => {
    // Completion is reported both by the pipeline callback and by the stream
    // events: settle the promise and close the span only once.
    let settled = false;
    const succeed = () => {
      if (settled) return;
      settled = true;
      span.end();
      resolve();
    };
    const fail = (err: unknown) => {
      if (settled) return;
      settled = true;
      span.recordException(err as Error);
      span.end();
      reject(err);
    };

    if (!config.S3_BUCKET) return fail(new Error("S3_BUCKET not set"));
    try {
      const toS3 = new ArchiveStreamToS3({
        bucket: config.S3_BUCKET,
        prefix: join(this.repoPath(repoId), path),
        s3: this.client(2 * 60 * 60 * 1000), // 2h timeout
        type: "zip",
        onEntry: (header) => {
          // keep what follows the first "/" of the entry name
          header.name = header.name.substring(header.name.indexOf("/") + 1);
          if (source) {
            header.Tagging = `source=${source}`;
            header.Metadata = {
              source: source,
            };
          }
        },
        maxParallel: 10,
      });
      pipeline(data, toS3, (err) => {
        if (err) {
          return fail(err);
        }
        succeed();
      })
        .on("finish", succeed)
        .on("error", fail);
    } catch (err) {
      // e.g. missing S3 configuration while creating the client
      fail(err);
    }
  });
}
/**
 * Build an archive with the objects stored below a folder of the repository.
 *
 * @override
 * @param repoId id of the repository
 * @param dir folder to archive, relative to the repository prefix
 * @param opt archive format ("zip" by default) and optional per-file
 * transformer, which receives the object key
 * @returns the finalized archiver instance
 */
async archive(
  repoId: string,
  dir: string = "",
  opt?: {
    format?: "zip" | "tar";
    fileTransformer?: (p: string) => Transform;
  }
) {
  const span = trace.getTracer("ano-file").startSpan("s3.archive");
  span.setAttribute("repoId", repoId);
  span.setAttribute("path", dir);
  try {
    if (!config.S3_BUCKET) throw new Error("S3_BUCKET not set");
    const archive = archiver(opt?.format || "zip", {});
    if (dir && dir[dir.length - 1] != "/") dir = dir + "/";

    // prefix shared by every object key of the repository
    const repoPrefix = join(this.repoPath(repoId));
    const archivedPrefix = join(this.repoPath(repoId), dir);

    let req: ListObjectsV2CommandOutput;
    let nextContinuationToken: string | undefined;
    do {
      req = await this.client(30000).listObjectsV2({
        Bucket: config.S3_BUCKET,
        Prefix: archivedPrefix,
        MaxKeys: 250,
        ContinuationToken: nextContinuationToken,
      });

      nextContinuationToken = req.NextContinuationToken;
      for (const f of req.Contents || []) {
        if (!f.Key) continue;
        const filename = basename(f.Key);
        const prefix = dirname(f.Key.replace(archivedPrefix, ""));

        // read() prepends the repository prefix itself: give it the path
        // relative to that prefix, not the full object key
        const repoRelativePath = f.Key.startsWith(repoPrefix)
          ? f.Key.slice(repoPrefix.length)
          : f.Key;
        let rs = await this.read(repoId, repoRelativePath);
        if (opt?.fileTransformer) {
          // apply transformation on the stream
          rs = rs.pipe(opt.fileTransformer(f.Key));
        }

        archive.append(rs, {
          name: filename,
          prefix,
        });
      }
    } while (req && req.Contents?.length && req.IsTruncated);
    archive.finalize();
    return archive;
  } finally {
    span.end();
  }
}
}

118
src/core/storage/Storage.ts Normal file
View File

@@ -0,0 +1,118 @@
import { join } from "path";
import { Transform, Readable } from "stream";
import * as archiver from "archiver";
import { Response } from "express";
import { Tree } from "../types";
import S3Storage from "./S3";
import FileSystem from "./FileSystem";
/** Union of the concrete storage implementations. */
export type Storage = S3Storage | FileSystem;
/** Result of `exists`: what was found at the requested path. */
export enum FILE_TYPE {
FILE = "file",
FOLDER = "folder",
NOT_FOUND = "not_found",
}
/**
 * Contract of a storage backend. Every path is relative to the folder of the
 * repository identified by `repoId` (see `repoPath`).
 */
export default abstract class StorageBase {
/**
* The type of storage
*/
abstract type: string;
/**
* check if the path exists
* @param repoId the id of the repository
* @param path the path to check
* @returns whether the path is a file, a folder or does not exist
*/
abstract exists(repoId: string, path: string): Promise<FILE_TYPE>;
/**
* Send the content of a file as the HTTP response
* @param repoId the id of the repository
* @param path the path to the file
* @param res the response that receives the file
*/
abstract send(repoId: string, path: string, res: Response): Promise<void>;
/**
* Read the content of a file
* @param repoId the id of the repository
* @param path the path to the file
*/
abstract read(repoId: string, path: string): Promise<Readable>;
/**
* Get the size, the last modification date and the content type of a file
* @param repoId the id of the repository
* @param path the path to the file
*/
abstract fileInfo(
repoId: string,
path: string
): Promise<{
size: number | undefined;
lastModified: Date | undefined;
contentType: string;
}>;
/**
* Write data to a file
* @param repoId the id of the repository
* @param path the path to the file
* @param data the content of the file
* @param source the source of the file
*/
abstract write(
repoId: string,
path: string,
data: string | Readable,
source?: string
): Promise<void>;
/**
* List the files from dir
* @param repoId the id of the repository
* @param dir the folder to list
*/
abstract listFiles(repoId: string, dir: string): Promise<Tree>;
/**
* Extract the content of a zip archive to dir
* @param repoId the id of the repository
* @param dir the destination folder
* @param tar the stream of the archive to extract
* @param source the source of the files
*/
abstract extractZip(
repoId: string,
dir: string,
tar: Readable,
source?: string
): Promise<void>;
/**
* Remove the path
* @param repoId the id of the repository
* @param dir the path to remove
*/
abstract rm(repoId: string, dir: string): Promise<void>;
/**
* Archive the content of dir
* @param repoId the id of the repository
* @param dir the folder to archive
* @param opt the archive options
*/
abstract archive(
repoId: string,
dir: string,
opt?: {
/**
* Archive format
*/
format?: "zip" | "tar";
/**
* Transformer to apply on the content of the file
*/
fileTransformer?: (p: string) => Transform;
}
): Promise<archiver.Archiver>;
/**
* Create a directory
* @param repoId the id of the repository
* @param dir the directory to create
*/
abstract mk(repoId: string, dir: string): Promise<void>;
/**
* Folder of a repository inside the storage: `<repoId>/original` followed by
* the path separator ("\\" on win32, "/" elsewhere)
* @param repoId the id of the repository
*/
repoPath(repoId: string) {
return (
join(repoId, "original") + (process.platform === "win32" ? "\\" : "/")
);
}
}

32
src/core/types.ts Normal file
View File

@@ -0,0 +1,32 @@
/** A branch of a repository. */
export interface Branch {
/** name of the branch */
name: string;
/** sha of the commit the branch points to */
commit: string;
/** content of the README of the branch, when it has been fetched */
readme?: string;
}
/** Possible statuses of a repository. */
export enum RepositoryStatus {
QUEUE = "queue",
PREPARING = "preparing",
DOWNLOAD = "download",
READY = "ready",
EXPIRED = "expired",
EXPIRING = "expiring",
REMOVED = "removed",
REMOVING = "removing",
ERROR = "error",
}
/** Possible statuses of a conference. */
export type ConferenceStatus = "ready" | "expired" | "removed";
/** Availability of a source. */
export type SourceStatus = "available" | "unavailable";
/** A node of a file tree: either a folder (Tree) or a file (TreeFile). */
export type TreeElement = Tree | TreeFile;
/** A folder: maps the name of each child to its sub-folder or file. */
export interface Tree {
[key: string]: TreeElement;
}
/** A file of a tree. */
export interface TreeFile {
/** identifier of the content; its origin depends on what produced the tree (git blob sha, inode number or S3 ETag) */
sha: string;
/** size of the file as reported by what produced the tree */
size: number;
}