fix: improve tree retrieval for big repositories by limiting the number of files

This commit is contained in:
tdurieux
2023-02-02 15:45:01 +01:00
parent 0e8a40c0d7
commit 864031d13a
5 changed files with 72 additions and 49 deletions

View File

@@ -7,6 +7,7 @@ interface Config {
CLIENT_SECRET: string;
GITHUB_TOKEN: string;
DEFAULT_QUOTA: number;
MAX_FILE_FOLDER: number;
MAX_FILE_SIZE: number;
MAX_REPO_SIZE: number;
AUTO_DOWNLOAD_REPO_SIZE: number;
@@ -38,6 +39,7 @@ const config: Config = {
CLIENT_SECRET: "CLIENT_SECRET",
GITHUB_TOKEN: "",
DEFAULT_QUOTA: 2 * 1024 * 1024 * 1024 * 8,
MAX_FILE_FOLDER: 1000,
MAX_FILE_SIZE: 100 * 1024 * 1024, // in bytes, 100MB
MAX_REPO_SIZE: 60000, // in kb, 60MB
AUTO_DOWNLOAD_REPO_SIZE: 150, // in kb, 150kb

View File

@@ -25,6 +25,10 @@ const AnonymizedRepositorySchema = new Schema({
repositoryName: String,
accessToken: String,
},
truckedFileList: {
type: Boolean,
default: false,
},
originalFiles: Schema.Types.Mixed,
options: {
terms: [String],

View File

@@ -16,6 +16,7 @@ export interface IAnonymizedRepository {
accessToken?: string;
};
owner: string;
truckedFileList: boolean;
originalFiles: Tree;
conference: string;
options: {

View File

@@ -52,7 +52,7 @@ export function startWorker() {
path.resolve("dist/src/processes/downloadRepository.js"),
// downloadRepository,
{
concurrency: 2,
concurrency: 3,
connection: {
host: config.REDIS_HOSTNAME,
port: config.REDIS_PORT,

View File

@@ -8,6 +8,7 @@ import * as path from "path";
import * as stream from "stream";
import AnonymousError from "../AnonymousError";
import config from "../../config";
export default class GitHubStream extends GitHubBase implements SourceBase {
constructor(
@@ -83,21 +84,18 @@ export default class GitHubStream extends GitHubBase implements SourceBase {
private async getTree(
sha: string,
truncatedTree: Tree = {},
parentPath: string = ""
parentPath: string = "",
count = {
file: 0,
request: 0,
}
) {
const octokit = new Octokit({
auth: await this.getToken(),
});
let ghRes;
this.repository.model.truckedFileList = false;
let ghRes: Awaited<ReturnType<typeof this.getGHTree>>;
try {
ghRes = await octokit.git.getTree({
owner: this.githubRepository.owner,
repo: this.githubRepository.repo,
tree_sha: sha,
recursive: "1",
});
count.request++;
ghRes = await this.getGHTree(sha, { recursive: true });
} catch (error) {
if (error.status == 409) {
// empty tree
@@ -106,6 +104,9 @@ export default class GitHubStream extends GitHubBase implements SourceBase {
// cannot be empty otherwise it would try to download it again
return { __: {} };
} else {
console.log(
`[ERROR] getTree ${this.repository.repoId}@${sha}: ${error.message}`
);
await this.repository.resetSate("error", "repo_not_accessible");
throw new AnonymousError("repo_not_accessible", {
httpStatus: error.status,
@@ -118,56 +119,67 @@ export default class GitHubStream extends GitHubBase implements SourceBase {
});
}
}
const tree = this.tree2Tree(ghRes.data.tree, truncatedTree, parentPath);
if (ghRes.data.truncated) {
await this.getTruncatedTree(sha, tree, parentPath);
const tree = this.tree2Tree(ghRes.tree, truncatedTree, parentPath);
count.file += ghRes.tree.length;
if (ghRes.truncated) {
await this.getTruncatedTree(sha, tree, parentPath, count);
}
if (this.repository.status != "ready")
await this.repository.updateStatus("ready");
return tree;
}
private async getTruncatedTree(
sha: string,
truncatedTree: Tree = {},
parentPath: string = ""
) {
private async getGHTree(sha: string, opt = { recursive: true }) {
const octokit = new Octokit({
auth: await this.getToken(),
});
try {
const ghRes = await octokit.git.getTree({
owner: this.githubRepository.owner,
repo: this.githubRepository.repo,
tree_sha: sha,
});
const tree = ghRes.data.tree;
const ghRes = await octokit.git.getTree({
owner: this.githubRepository.owner,
repo: this.githubRepository.repo,
tree_sha: sha,
recursive: opt.recursive ? "1" : undefined,
});
return ghRes.data;
}
for (let elem of tree) {
if (!elem.path) continue;
if (elem.type == "tree") {
const elementPath = path.join(parentPath, elem.path);
const paths = elementPath.split("/");
private async getTruncatedTree(
sha: string,
truncatedTree: Tree = {},
parentPath: string = "",
count = {
file: 0,
request: 0,
},
depth = 0
) {
count.request++;
const data = await this.getGHTree(sha, { recursive: false });
this.tree2Tree(data.tree, truncatedTree, parentPath);
let current = truncatedTree;
for (let i = 0; i < paths.length; i++) {
let p = paths[i];
if (!current[p]) {
if (elem.sha)
await this.getTree(elem.sha, truncatedTree, elementPath);
break;
}
current = current[p] as Tree;
}
count.file += data.tree.length;
if (data.tree.length < 100 && count.request < 200) {
const promises: Promise<any>[] = [];
for (const file of data.tree) {
const elementPath = path.join(parentPath, file.path);
if (file.type == "tree") {
promises.push(
this.getTruncatedTree(
file.sha,
truncatedTree,
elementPath,
count,
depth + 1
)
);
}
}
this.tree2Tree(ghRes.data.tree, truncatedTree, parentPath);
return truncatedTree;
} catch (error) {
if (error.status == 409) {
await Promise.all(promises);
} else {
const data = await this.getGHTree(sha, { recursive: true });
this.tree2Tree(data.tree, truncatedTree, parentPath);
if (data.truncated) {
this.repository.model.truckedFileList = true;
}
return truncatedTree;
}
}
@@ -205,6 +217,10 @@ export default class GitHubStream extends GitHubBase implements SourceBase {
// if elem is a file add the file size in the file list
if (elem.type == "blob") {
if (Object.keys(current).length > config.MAX_FILE_FOLDER) {
this.repository.model.truckedFileList = true;
continue;
}
let p = paths[end];
if (p[0] == "$") {
p = "\\" + p;