From 864031d13afed05fa70ad18eca2f5cab56a307b3 Mon Sep 17 00:00:00 2001
From: tdurieux
Date: Thu, 2 Feb 2023 15:45:01 +0100
Subject: [PATCH] fix: improve get tree in big repository by limiting the number of files

---
Review notes (this area is ignored when the patch is applied):

* Layout: the line structure, indentation and blank context lines of this
  patch were reconstructed so that every hunk matches the old/new line
  counts in its "@@" header. Indentation assumes 2-space formatting;
  verify whitespace against the target tree (or apply with
  --ignore-whitespace) before relying on it.
* Two generic type arguments were missing ("Awaited>" and "Promise[]").
  They are restored as Awaited<ReturnType<typeof this.getGHTree>> and
  Promise<void>[], inferred from how the values are used in the same
  hunks; confirm against the original commit.
* getTruncatedTree: the else branch fetches the same sha a second time
  (recursive) after the non-recursive result was already merged into
  truncatedTree.
* getTruncatedTree: the removed implementation wrapped its request in
  try/catch; the new one does not, and getTree calls it outside its own
  try block, so a failing request now rejects the getTree promise.
* getTruncatedTree: the "depth" parameter and "count.file" are written
  but never read in the hunks shown here; 100 and 200 are unexplained
  thresholds.
* "truckedFileList" reads like a misspelling of "truncated". It is a
  persisted schema field, so renaming it needs a data migration.
* config.ts context line: "100 * 1024 * 1024" is commented as "10MB".

 config.ts                                     |   2 +
 .../anonymizedRepositories.schema.ts          |   4 +
 .../anonymizedRepositories.types.ts           |   1 +
 src/queue.ts                                  |   2 +-
 src/source/GitHubStream.ts                    | 112 ++++++++++--------
 5 files changed, 72 insertions(+), 49 deletions(-)

diff --git a/config.ts b/config.ts
index be58e52..96b0869 100644
--- a/config.ts
+++ b/config.ts
@@ -7,6 +7,7 @@ interface Config {
   CLIENT_SECRET: string;
   GITHUB_TOKEN: string;
   DEFAULT_QUOTA: number;
+  MAX_FILE_FOLDER: number;
   MAX_FILE_SIZE: number;
   MAX_REPO_SIZE: number;
   AUTO_DOWNLOAD_REPO_SIZE: number;
@@ -38,6 +39,7 @@ const config: Config = {
   CLIENT_SECRET: "CLIENT_SECRET",
   GITHUB_TOKEN: "",
   DEFAULT_QUOTA: 2 * 1024 * 1024 * 1024 * 8,
+  MAX_FILE_FOLDER: 1000,
   MAX_FILE_SIZE: 100 * 1024 * 1024, // in b, 10MB
   MAX_REPO_SIZE: 60000, // in kb, 60MB
   AUTO_DOWNLOAD_REPO_SIZE: 150, // in kb, 150kb
diff --git a/src/database/anonymizedRepositories/anonymizedRepositories.schema.ts b/src/database/anonymizedRepositories/anonymizedRepositories.schema.ts
index a2fab65..43f8f23 100644
--- a/src/database/anonymizedRepositories/anonymizedRepositories.schema.ts
+++ b/src/database/anonymizedRepositories/anonymizedRepositories.schema.ts
@@ -25,6 +25,10 @@ const AnonymizedRepositorySchema = new Schema({
     repositoryName: String,
     accessToken: String,
   },
+  truckedFileList: {
+    type: Boolean,
+    default: false,
+  },
   originalFiles: Schema.Types.Mixed,
   options: {
     terms: [String],
diff --git a/src/database/anonymizedRepositories/anonymizedRepositories.types.ts b/src/database/anonymizedRepositories/anonymizedRepositories.types.ts
index fb33ff5..9f155e8 100644
--- a/src/database/anonymizedRepositories/anonymizedRepositories.types.ts
+++ b/src/database/anonymizedRepositories/anonymizedRepositories.types.ts
@@ -16,6 +16,7 @@ export interface IAnonymizedRepository {
     accessToken?: string;
   };
   owner: string;
+  truckedFileList: boolean;
   originalFiles: Tree;
   conference: string;
   options: {
diff --git a/src/queue.ts b/src/queue.ts
index 8547ffb..f5f9370 100644
--- a/src/queue.ts
+++ b/src/queue.ts
@@ -52,7 +52,7 @@ export function startWorker() {
     path.resolve("dist/src/processes/downloadRepository.js"),
     // downloadRepository,
     {
-      concurrency: 2,
+      concurrency: 3,
       connection: {
         host: config.REDIS_HOSTNAME,
         port: config.REDIS_PORT,
diff --git a/src/source/GitHubStream.ts b/src/source/GitHubStream.ts
index 71f5bc8..115bbc9 100644
--- a/src/source/GitHubStream.ts
+++ b/src/source/GitHubStream.ts
@@ -8,6 +8,7 @@ import * as path from "path";
 import * as stream from "stream";
 
 import AnonymousError from "../AnonymousError";
+import config from "../../config";
 
 export default class GitHubStream extends GitHubBase implements SourceBase {
   constructor(
@@ -83,21 +84,18 @@ export default class GitHubStream extends GitHubBase implements SourceBase {
   private async getTree(
     sha: string,
     truncatedTree: Tree = {},
-    parentPath: string = ""
+    parentPath: string = "",
+    count = {
+      file: 0,
+      request: 0,
+    }
   ) {
-    const octokit = new Octokit({
-      auth: await this.getToken(),
-    });
-
-    let ghRes;
+    this.repository.model.truckedFileList = false;
 
+    let ghRes: Awaited<ReturnType<typeof this.getGHTree>>;
     try {
-      ghRes = await octokit.git.getTree({
-        owner: this.githubRepository.owner,
-        repo: this.githubRepository.repo,
-        tree_sha: sha,
-        recursive: "1",
-      });
+      count.request++;
+      ghRes = await this.getGHTree(sha, { recursive: true });
     } catch (error) {
       if (error.status == 409) {
         // empty tree
@@ -106,6 +104,9 @@ export default class GitHubStream extends GitHubBase implements SourceBase {
         // cannot be empty otherwise it would try to download it again
         return { __: {} };
       } else {
+        console.log(
+          `[ERROR] getTree ${this.repository.repoId}@${sha}: ${error.message}`
+        );
         await this.repository.resetSate("error", "repo_not_accessible");
         throw new AnonymousError("repo_not_accessible", {
           httpStatus: error.status,
@@ -118,56 +119,67 @@ export default class GitHubStream extends GitHubBase implements SourceBase {
         });
       }
     }
-
-    const tree = this.tree2Tree(ghRes.data.tree, truncatedTree, parentPath);
-    if (ghRes.data.truncated) {
-      await this.getTruncatedTree(sha, tree, parentPath);
+    const tree = this.tree2Tree(ghRes.tree, truncatedTree, parentPath);
+    count.file += ghRes.tree.length;
+    if (ghRes.truncated) {
+      await this.getTruncatedTree(sha, tree, parentPath, count);
     }
     if (this.repository.status != "ready")
       await this.repository.updateStatus("ready");
     return tree;
   }
 
-  private async getTruncatedTree(
-    sha: string,
-    truncatedTree: Tree = {},
-    parentPath: string = ""
-  ) {
+  private async getGHTree(sha: string, opt = { recursive: true }) {
     const octokit = new Octokit({
       auth: await this.getToken(),
     });
-    try {
-      const ghRes = await octokit.git.getTree({
-        owner: this.githubRepository.owner,
-        repo: this.githubRepository.repo,
-        tree_sha: sha,
-      });
-      const tree = ghRes.data.tree;
+    const ghRes = await octokit.git.getTree({
+      owner: this.githubRepository.owner,
+      repo: this.githubRepository.repo,
+      tree_sha: sha,
+      recursive: opt.recursive ? "1" : undefined,
+    });
+    return ghRes.data;
+  }
 
-      for (let elem of tree) {
-        if (!elem.path) continue;
-        if (elem.type == "tree") {
-          const elementPath = path.join(parentPath, elem.path);
-          const paths = elementPath.split("/");
+  private async getTruncatedTree(
+    sha: string,
+    truncatedTree: Tree = {},
+    parentPath: string = "",
+    count = {
+      file: 0,
+      request: 0,
+    },
+    depth = 0
+  ) {
+    count.request++;
+    const data = await this.getGHTree(sha, { recursive: false });
+    this.tree2Tree(data.tree, truncatedTree, parentPath);
 
-          let current = truncatedTree;
-          for (let i = 0; i < paths.length; i++) {
-            let p = paths[i];
-            if (!current[p]) {
-              if (elem.sha)
-                await this.getTree(elem.sha, truncatedTree, elementPath);
-              break;
-            }
-            current = current[p] as Tree;
-          }
+    count.file += data.tree.length;
+    if (data.tree.length < 100 && count.request < 200) {
+      const promises: Promise<void>[] = [];
+      for (const file of data.tree) {
+        const elementPath = path.join(parentPath, file.path);
+        if (file.type == "tree") {
+          promises.push(
+            this.getTruncatedTree(
+              file.sha,
+              truncatedTree,
+              elementPath,
+              count,
+              depth + 1
+            )
+          );
         }
       }
-      this.tree2Tree(ghRes.data.tree, truncatedTree, parentPath);
-      return truncatedTree;
-    } catch (error) {
-      if (error.status == 409) {
+      await Promise.all(promises);
+    } else {
+      const data = await this.getGHTree(sha, { recursive: true });
+      this.tree2Tree(data.tree, truncatedTree, parentPath);
+      if (data.truncated) {
+        this.repository.model.truckedFileList = true;
       }
-      return truncatedTree;
     }
   }
 
@@ -205,6 +217,10 @@ export default class GitHubStream extends GitHubBase implements SourceBase {
 
       // if elem is a file add the file size in the file list
       if (elem.type == "blob") {
+        if (Object.keys(current).length > config.MAX_FILE_FOLDER) {
+          this.repository.model.truckedFileList = true;
+          continue;
+        }
         let p = paths[end];
         if (p[0] == "$") {
           p = "\\" + p;