mirror of
https://github.com/tdurieux/anonymous_github.git
synced 2026-02-13 02:42:45 +00:00
v2 wip
This commit is contained in:
115
utils/anonymize.js
Normal file
115
utils/anonymize.js
Normal file
@@ -0,0 +1,115 @@
|
||||
const fs = require("fs").promises;
|
||||
const ofs = require("fs");
|
||||
const path = require("path");
|
||||
const fileUtils = require("./file");
|
||||
|
||||
/**
 * Anonymize the content of a text file according to the repository
 * configuration: strips markdown images/links when configured, rewrites
 * GitHub URLs of the repository to their anonymous.4open.science
 * equivalent, and masks every configured term with "XXX".
 *
 * @param {string} content the file content to anonymize
 * @param {object} repoConfig the repository configuration (options,
 *   fullName, branch, repoId, terms)
 * @returns {string} the anonymized content
 */
const ananymiseContent = (content, repoConfig) => {
  const urlRegex = /<?\b((https?|ftp|file):\/\/)[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]\b\/?>?/g;

  // Escape user-provided strings before embedding them in a RegExp so that
  // characters such as '.', '+' or '(' are matched literally (the original
  // injected terms and the repository name into the pattern unescaped).
  const escapeRegExp = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

  if (repoConfig.options.image === false) {
    // remove image in markdown
    content = content.replace(
      /!\[[^\]]*\]\((?<filename>.*?)(?=\"|\))(?<optionalpart>\".*\")?\)/g,
      ""
    );
  }

  if (!repoConfig.options.link) {
    // remove all links
    content = content.replace(urlRegex, "XXX");
  }

  // rewrite links that point to the original GitHub repository
  content = content.replace(
    new RegExp(
      `https://github.com/${escapeRegExp(repoConfig.fullName)}/blob/${escapeRegExp(repoConfig.branch)}\\b`,
      "gi"
    ),
    `https://anonymous.4open.science/r/${repoConfig.repoId}`
  );
  content = content.replace(
    new RegExp(
      `https://github.com/${escapeRegExp(repoConfig.fullName)}/tree/${escapeRegExp(repoConfig.branch)}\\b`,
      "gi"
    ),
    `https://anonymous.4open.science/r/${repoConfig.repoId}`
  );
  content = content.replace(
    new RegExp(`https://github.com/${escapeRegExp(repoConfig.fullName)}`, "gi"),
    `https://anonymous.4open.science/r/${repoConfig.repoId}`
  );

  for (let term of repoConfig.terms) {
    if (term.trim() == "") {
      continue;
    }
    // fresh regex each use: a /g regex keeps lastIndex state across .test calls
    const termRegex = () => new RegExp(`\\b${escapeRegExp(term)}\\b`, "gi");
    // remove whole url if it contains the term
    content = content.replace(urlRegex, (match) => {
      if (termRegex().test(match)) return "XXX";
      return match;
    });

    // remove the term in the text
    content = content.replace(termRegex(), "XXX");
  }
  return content;
};
|
||||
|
||||
/**
 * Anonymize a file path by masking every configured term with "XXX".
 *
 * @param {string} path the path to anonymize
 * @param {object} repoConfig the repository configuration (terms)
 * @returns {string} the anonymized path
 */
const ananymisePath = (path, repoConfig) => {
  // escape regex metacharacters so terms are matched literally
  // (the original injected the raw term into the pattern)
  const escapeRegExp = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  for (let term of repoConfig.terms) {
    if (term.trim() == "") {
      continue;
    }
    path = path.replace(new RegExp(escapeRegExp(term), "gi"), "XXX");
  }
  return path;
};
|
||||
|
||||
/**
 * Recursively iterate over every file below `dir`.
 *
 * @param {string} dir the root directory
 * @yields {string} the path of each regular file found
 */
async function* walk(dir) {
  for await (const entry of await fs.opendir(dir)) {
    const entryPath = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      yield* walk(entryPath);
    } else if (entry.isFile()) {
      yield entryPath;
    }
  }
}
|
||||
|
||||
/**
 * Anonymize every file of `root` into `destination`.
 * If any file fails, the partially written destination folder is removed
 * before the error is rethrown.
 *
 * @param {string} root the folder that contains the original files
 * @param {string} destination the output folder for the anonymized files
 * @param {object} repoConfig the repository configuration
 */
const anonymizeFolder = async (root, destination, repoConfig) => {
  if (!ofs.existsSync(destination)) {
    await fs.mkdir(destination, { recursive: true });
  }
  try {
    for await (const originalFilePath of walk(root)) {
      const destinationFilePath = path.join(
        destination,
        ananymisePath(originalFilePath.replace(root, ""), repoConfig)
      );
      const destinationFolder = path.dirname(destinationFilePath);
      if (!ofs.existsSync(destinationFolder)) {
        await fs.mkdir(destinationFolder, { recursive: true });
      }
      await ananymiseFile(originalFilePath, destinationFilePath, repoConfig);
    }
  } catch (error) {
    // clean up the partial output; awaited so the caller never observes a
    // half-removed folder (the original left this promise floating)
    await fs.rm(destination, { recursive: true, force: true });
    throw error;
  }
};
|
||||
|
||||
/**
 * Anonymize a single file from `filePath` into `target`.
 * Text files (as detected by fileUtils.isText) have their content passed
 * through ananymiseContent; any other file is copied verbatim.
 *
 * @param {string} filePath path of the original file
 * @param {string} target path of the anonymized output file
 * @param {object} repoConfig the repository configuration
 */
const ananymiseFile = async (filePath, target, repoConfig) => {
  // make sure the output folder exists before writing
  if (!ofs.existsSync(path.dirname(target))) {
    await fs.mkdir(path.dirname(target), { recursive: true });
  }
  if (fileUtils.isText(filePath)) {
    const content = ananymiseContent(
      (await fs.readFile(filePath)).toString(),
      repoConfig
    );
    await fs.writeFile(target, content);
  } else {
    // binary file: copy as-is
    await fs.copyFile(filePath, target);
  }
};
|
||||
|
||||
// public API of the anonymization helpers
Object.assign(module.exports, {
  ananymiseFile,
  ananymisePath,
  anonymizeFolder,
  ananymiseContent,
});
|
||||
39
utils/database.js
Normal file
39
utils/database.js
Normal file
@@ -0,0 +1,39 @@
|
||||
const config = require("../config");

var MongoClient = require("mongodb").MongoClient;
// NOTE(review): the connection string (including credentials) is hard-coded
// here while `config` is imported but unused — consider moving the URL and
// credentials to the configuration/environment.
const MONGO_URL = "mongodb://root:rootpassword@mongodb:27017/?authSource=admin";
// both are initialized by connect(); get()/close() must not be called before
let mongoClient = null;
let DB = null;
|
||||
|
||||
/**
 * Access the database or one of its collections.
 *
 * @param {string} [collection] the collection name; when omitted the whole
 *   database handle is returned
 * @returns the database handle or the requested collection
 */
module.exports.get = (collection) => {
  return collection ? DB.collection(collection) : DB;
};
|
||||
|
||||
/**
 * Open the MongoDB connection, select the anonymous_github database and
 * make sure the indexes used by the application exist.
 *
 * @returns {Promise} the database handle
 */
module.exports.connect = async () => {
  mongoClient = await MongoClient.connect(
    MONGO_URL,
    { useNewUrlParser: true, useUnifiedTopology: true }
  );
  DB = mongoClient.db("anonymous_github");
  // anonymized repositories are looked up by repoId (unique) and by fullName
  await DB.collection("anonymized_repositories").createIndex(
    { repoId: 1 },
    { unique: true, name: "repoId" }
  );
  await DB.collection("anonymized_repositories").createIndex(
    { fullName: 1 },
    { name: "fullName" }
  );
  // GitHub repository cache: one document per repository
  await DB.collection("repositories").createIndex(
    { fullName: 1 },
    { unique: true, name: "fullName" }
  );
  // users are identified by their GitHub username
  await DB.collection("users").createIndex(
    { username: 1 },
    { unique: true, name: "username" }
  );
  return DB;
};
|
||||
/**
 * Close the MongoDB connection opened by connect().
 *
 * @returns {Promise} resolves once the client is closed
 */
module.exports.close = () => mongoClient.close();
|
||||
489
utils/file.js
Normal file
489
utils/file.js
Normal file
@@ -0,0 +1,489 @@
|
||||
const ofs = require("fs");
|
||||
const fs = require("fs").promises;
|
||||
const path = require("path");
|
||||
const { Octokit } = require("@octokit/rest");
|
||||
const gh = require("parse-github-url");
|
||||
const loc = require("@umijs/linguist");
|
||||
const { isText } = require("istextorbinary");
|
||||
|
||||
const db = require("./database");
|
||||
const repoUtils = require("./repository");
|
||||
const githubUtils = require("./github");
|
||||
const anonymizeUtils = require("./anonymize");
|
||||
const config = require("../config");
|
||||
|
||||
/**
 * Recursively build the file tree of `dir`.
 * Each directory becomes a `{ child, sha }` node and each file a
 * `{ size, sha }` leaf, where `sha` is the inode number of the entry.
 *
 * @param {string} dir the directory to scan
 * @param {string} [root] root of the scan (defaults to `dir`)
 * @returns {Promise<object>} the tree of `dir`
 */
async function walk(dir, root) {
  if (root == null) {
    root = dir;
  }
  const output = { child: {} };
  const entries = await fs.readdir(dir);
  for (let name of entries) {
    const filePath = path.join(dir, name);
    const stats = await fs.stat(filePath);
    // escape names starting with '$' — presumably because such keys cannot
    // be stored as-is in MongoDB (TODO confirm)
    if (name[0] == "$") {
      name = "\\" + name;
    }
    if (stats.isDirectory()) {
      const subtree = await walk(filePath, root);
      subtree.sha = stats.ino;
      output.child[name] = subtree;
    } else if (stats.isFile()) {
      output.child[name] = { size: stats.size, sha: stats.ino };
    }
  }
  return output;
}
|
||||
/**
 * Merge a flat GitHub tree listing (array of {path, type, sha, size}) into
 * the nested tree structure used internally ({ child: { name: node } } with
 * { size, sha } leaves).
 *
 * @param {Array} tree flat list of tree entries from the GitHub API
 * @param {object} [partialTree] tree to merge into (created when omitted)
 * @param {string} [parentPath] path prefix applied to every entry
 * @returns {object} the merged tree
 */
function tree2tree(tree, partialTree, parentPath) {
  parentPath = parentPath || "";
  if (partialTree == null) {
    partialTree = { child: Object.create(null) };
  }
  // escape names starting with '$' — presumably because such keys cannot be
  // stored as-is in MongoDB (TODO confirm)
  const escape = (name) => (name[0] == "$" ? "\\" + name : name);

  for (const elem of tree) {
    const segments = path.join(parentPath, elem.path).split("/");
    // walk/create the folder nodes; for a blob stop before the file name
    const folderCount =
      elem.type == "tree" ? segments.length : segments.length - 1;
    let node = partialTree;
    for (let i = 0; i < folderCount; i++) {
      const key = escape(segments[i]);
      if (!node.child[key]) {
        node.child[key] = { child: Object.create(null) };
      }
      node = node.child[key];
    }
    if (elem.type == "blob") {
      // a file: record its size and sha in the parent folder
      node.child[escape(segments[folderCount])] = {
        size: elem.size,
        sha: elem.sha,
      };
    } else {
      node.sha = elem.sha;
    }
  }
  return partialTree;
}
|
||||
/**
 * Complete a truncated GitHub tree listing.
 * GitHub truncates recursive `getTree` responses on very large repositories;
 * this fetches the (non-recursive) tree at `sha` and recursively fills in
 * the sub-trees missing from `truncatedTree`.
 *
 * @param {object} repoConfig the repository configuration
 * @param {object} truncatedTree the partially filled tree (mutated in place)
 * @param {string} [sha] tree sha to fetch (defaults to the configured commit or HEAD)
 * @param {string} [parentPath] path prefix of this tree level
 * @returns {Promise<object>} the completed tree
 */
async function getTruncatedTree(repoConfig, truncatedTree, sha, parentPath) {
  const repo = gh(repoConfig.fullName);

  if (!sha) {
    sha = repoConfig.commit ? repoConfig.commit : "HEAD";
  }

  const octokit = new Octokit({
    auth: await githubUtils.getToken(repoConfig),
  });
  // non-recursive listing of the current tree level only
  const ghRes = await octokit.git.getTree({
    owner: repo.owner,
    repo: repo.name,
    tree_sha: sha,
  });
  const tree = ghRes.data.tree;

  for (let elem of tree) {
    if (elem.type == "tree") {
      const elementPath = path.join(parentPath, elem.path);
      const paths = elementPath.split("/");

      // check whether this sub-tree already exists in truncatedTree; if any
      // path segment is missing, fetch the whole sub-tree recursively
      let current = truncatedTree;
      for (let i = 0; i < paths.length; i++) {
        let p = paths[i];
        if (!current.child[p]) {
          await module.exports.getTree(
            repoConfig,
            elem.sha,
            truncatedTree,
            elementPath
          );
          break;
        }
        current = current.child[p];
      }
    }
  }

  // merge the blobs/trees of this level into the tree
  tree2tree(ghRes.data.tree, truncatedTree, parentPath);

  return truncatedTree;
}
|
||||
/**
 * Fetch the full file tree of a repository from the GitHub API.
 * Uses a recursive tree query and falls back to level-by-level fetching
 * (getTruncatedTree) when GitHub truncates the response.
 *
 * @param {object} repoConfig the repository configuration
 * @param {string} [sha] tree sha to fetch (defaults to the configured commit or HEAD)
 * @param {object} [truncatedTree] existing tree to merge into
 * @param {string} [parentPath] path prefix of the fetched tree (defaults to "")
 * @returns {Promise<object>} the file tree ({ child: ... })
 */
module.exports.getTree = async (repoConfig, sha, truncatedTree, parentPath) => {
  const repo = gh(repoConfig.fullName);

  if (!sha) {
    sha = repoConfig.commit ? repoConfig.commit : "HEAD";
  }
  if (!parentPath) parentPath = "";

  const octokit = new Octokit({
    auth: await githubUtils.getToken(repoConfig),
  });
  const ghRes = await octokit.git.getTree({
    owner: repo.owner,
    repo: repo.name,
    tree_sha: sha,
    recursive: true,
  });

  const tree = tree2tree(ghRes.data.tree, truncatedTree, parentPath);
  if (ghRes.data.truncated) {
    // the recursive listing was incomplete: fetch missing sub-trees one by one
    await getTruncatedTree(repoConfig, tree, sha, parentPath);
  }
  return tree;
};
|
||||
/**
 * Get the anonymized file tree of a repository.
 * The tree is cached in the `anonymized_repositories` collection; on a cache
 * miss it is built either from the GitHub API (stream mode) or from the
 * downloaded repository on disk (download mode), then stored.
 *
 * @param {object} options { repoConfig } or { repoId }
 * @returns {Promise<object>} the anonymized file tree
 * @throws {string} repo_not_found | non_supported_mode
 */
module.exports.getFileList = async (options) => {
  let repoConfig = options.repoConfig;
  if (!repoConfig) {
    repoConfig = await repoUtils.getConfig(options.repoId);
  }

  if (repoConfig == null) {
    throw "repo_not_found";
  }

  // return the cached tree when available
  const r = await db.get("anonymized_repositories").findOne(
    { repoId: repoConfig.repoId },
    {
      projection: { files: 1 },
    }
  );
  if (r && r.files) {
    return r.files;
  }

  if (repoConfig.options.mode == "stream") {
    // get file list from github
    const tree = await module.exports.getTree(repoConfig, repoConfig.commit);
    const files = anonymizeTree(tree, repoConfig);
    // cache both the original and the anonymized tree
    await db.get("anonymized_repositories").updateOne(
      { repoId: repoConfig.repoId },
      {
        $set: {
          originalFiles: tree.child,
          files,
        },
      },
      { upsert: true }
    );
    return files;
  } else if (repoConfig.options.mode == "download") {
    // build the tree from the files already downloaded on disk
    const originalFiles = await walk(
      repoUtils.getOriginalPath(repoConfig.repoId)
    );
    const files = anonymizeTree(originalFiles, repoConfig);
    await db.get("anonymized_repositories").updateOne(
      { repoId: repoConfig.repoId },
      {
        $set: {
          originalFiles: originalFiles.child,
          files,
        },
      },
      { upsert: true }
    );
    return files;
  } else {
    throw "non_supported_mode";
  }
};
|
||||
/**
 * Anonymize every path of a file tree (as produced by walk/tree2tree).
 * File leaves ({ size, sha }) are returned unchanged; folder nodes are
 * rebuilt with their child names anonymized.
 *
 * @param {object} tree the tree to anonymize
 * @param {object} repoConfig the repository configuration
 * @returns {object} the anonymized tree
 */
function anonymizeTree(tree, repoConfig) {
  // a file leaf carries an integer size and has no children to rename
  if (Number.isInteger(tree.size)) {
    return tree;
  }
  const output = {};
  for (const name in tree.child) {
    const anonymizedName = anonymizeUtils.ananymisePath(name, repoConfig);
    output[anonymizedName] = anonymizeTree(tree.child[name], repoConfig);
  }
  return output;
}
|
||||
|
||||
/**
 * Build a map from sha to file path for every node of a tree.
 *
 * @param {object} tree the `child` map of a tree node
 * @param {object} [output] accumulator (created on the first call)
 * @param {string} [parent] path prefix of `tree`
 * @returns {object} map of sha -> path
 */
function tree2sha(tree, output, parent) {
  if (!output) {
    output = {};
    parent = "";
  }
  for (const name in tree) {
    const node = tree[name];
    const nodePath = path.join(parent, name);
    if (node.sha) {
      output[node.sha] = nodePath;
    }
    if (node.child) {
      tree2sha(node.child, output, nodePath);
    }
  }
  return output;
}
|
||||
|
||||
/**
 * Look up `elementPath` inside a file tree (as returned by getFileList).
 *
 * @param {object} tree the file tree ({ child: ... } or a bare child map)
 * @param {string} elementPath the "/"-separated path to resolve
 * @returns {object|null} the node at `elementPath`, or null when not found
 */
function getFile(tree, elementPath) {
  // accept either a wrapped node ({ child: ... }) or a bare child map
  let current = tree.child ? tree : { child: tree };
  for (const segment of elementPath.trim().split("/")) {
    // skip empty segments produced by leading or doubled "/"
    if (segment == "") {
      continue;
    }
    const children = current.child ? current.child : current;
    if (!children[segment]) {
      return null;
    }
    current = children[segment];
  }
  return current;
}
|
||||
// Extensions (lower-cased) treated as text even when the `istextorbinary`
// heuristic cannot classify the file. Note that for names without a dot
// (e.g. "LICENSE", "Dockerfile") the whole file name acts as the extension.
module.exports.additionalExtensions = [
  "license",
  "dockerfile",
  "sbt",
  "ipynb",
  "gp",
  "out",
];
|
||||
/**
 * Check whether `p` points to a text file.
 * Combines the `istextorbinary` heuristic with the additionalExtensions
 * allow-list, matched on the last dot-separated segment of the file name.
 *
 * @param {string} p the file path
 * @returns {boolean} true when the file should be treated as text
 */
module.exports.isText = (p) => {
  if (isText(p)) {
    return true;
  }
  const filename = path.basename(p);
  const extensions = filename.split(".").reverse();
  // for a name without a dot this is the whole file name (e.g. "license")
  const extension = extensions[0].toLowerCase();
  if (module.exports.additionalExtensions.includes(extension)) {
    return true;
  }
  return false;
};
|
||||
/**
 * Check whether a file can be served by the anonymizer: any text file, plus
 * pdf and common image formats when the corresponding repository options
 * are enabled.
 *
 * @param {object} repoConfig the repository configuration
 * @param {string} p the file path
 * @returns {boolean} true when the file is supported
 */
module.exports.isFileSupported = (repoConfig, p) => {
  if (module.exports.isText(p)) {
    return true;
  }

  // last dot-separated segment of the file name, lower-cased
  const extension = path.basename(p).split(".").reverse()[0].toLowerCase();

  if (repoConfig.options.pdf && extension == "pdf") {
    return true;
  }
  const imageExtensions = ["png", "jpg", "jpeg", "gif"];
  if (repoConfig.options.image && imageExtensions.includes(extension)) {
    return true;
  }
  return false;
};
|
||||
/**
 * Check that an anonymized file path can be served, materializing the file
 * on disk when needed.
 * Resolution order: the anonymized cache on disk, then the original file on
 * disk (anonymized on the fly), then — in stream mode — a download of the
 * blob from the GitHub API.
 *
 * @param {object} options { path } plus { repoConfig } or { repoId }
 * @returns {Promise<boolean>} true when the file is available
 * @throws {string} invalid_path | repo_not_found | repository_expired |
 *   repository_not_ready | file_not_supported | file_not_found | is_folder |
 *   file_too_big | file_not_accessible | content_too_large |
 *   content_not_accessible | unable_to_write_file
 */
module.exports.isFilePathValid = async (options) => {
  if (options.path == null) {
    throw "invalid_path";
  }
  let repoConfig = options.repoConfig;
  if (!repoConfig) {
    repoConfig = await repoUtils.getConfig(options.repoId);
  }

  if (repoConfig == null) {
    throw "repo_not_found";
  }
  // the repository must be in the "ready" state to serve files
  if (repoConfig.status == "expired") {
    throw "repository_expired";
  }
  if (repoConfig.status == "removed") {
    throw "repository_expired";
  }
  if (repoConfig.status != "ready") {
    throw "repository_not_ready";
  }

  const anonymizedFilePath = path.join(
    repoUtils.getAnonymizedPath(repoConfig.repoId),
    options.path
  );

  if (!module.exports.isFileSupported(repoConfig, anonymizedFilePath)) {
    throw "file_not_supported";
  }

  // if the requested path contains an anonymized segment ("XXX"), map it
  // back to the original path via the sha of the file in the cached trees
  let unanonymizePath = options.path;
  if (unanonymizePath.indexOf("XXX") > -1) {
    const files = await module.exports.getFileList({ repoConfig });

    const file = getFile(files, options.path);
    if (file) {
      const r = await db
        .get("anonymized_repositories")
        .findOne(
          { repoId: repoConfig.repoId },
          { projection: { originalFiles: 1 } }
        );

      const shatree = tree2sha(r.originalFiles);
      if (shatree[file.sha]) {
        unanonymizePath = shatree[file.sha];
      }
    }
  }
  const orignalFilePath = path.join(
    repoUtils.getOriginalPath(repoConfig.repoId),
    unanonymizePath
  );
  // 1) already anonymized on disk
  if (ofs.existsSync(anonymizedFilePath)) {
    return true;
  }
  // 2) original on disk: anonymize it on the fly
  if (ofs.existsSync(orignalFilePath)) {
    if (!module.exports.isFileSupported(repoConfig, anonymizedFilePath)) {
      throw "file_not_supported";
    }
    await anonymizeUtils.ananymiseFile(
      orignalFilePath,
      anonymizedFilePath,
      repoConfig
    );
    return true;
  }
  // if stream mode check download the file
  if (repoConfig.options.mode == "stream") {
    const repo = gh(repoConfig.fullName);
    const files = await module.exports.getFileList({ repoConfig });
    let file = getFile(files, options.path);
    if (file == null) {
      throw "file_not_found";
    }
    // a folder node has no sha of its own in the anonymized tree
    if (!file.sha) {
      throw "is_folder";
    }
    if (file.size > config.MAX_FILE_SIZE) {
      // file bigger than the configured maximum
      throw "file_too_big";
    }
    const octokit = new Octokit({
      auth: await githubUtils.getToken(repoConfig),
    });

    let ghRes = null;
    // NOTE(review): `file` is always truthy here (checked above), so the
    // `else` branch below (octokit.repos.getContents) is unreachable
    if (file) {
      if (!module.exports.isFileSupported(repoConfig, anonymizedFilePath)) {
        throw "file_not_supported";
      }
      try {
        ghRes = await octokit.request(
          "GET /repos/{owner}/{repo}/git/blobs/{file_sha}",
          {
            owner: repo.owner,
            repo: repo.name,
            file_sha: file.sha,
          }
        );
      } catch (error) {
        // FIXME(review): `getZip` is not defined in this file (it exists in
        // utils/repository.js) and `response` is an implicit global — this
        // 401 branch will throw a ReferenceError; also note that even when
        // the retry "succeeds" control still falls through to
        // `throw "file_not_accessible"` below.
        if (error.status == 401 && config.GITHUB_TOKEN) {
          try {
            response = await getZip(config.GITHUB_TOKEN);
          } catch (error) {
            throw "repo_not_accessible";
          }
        } else if (error.status == 403) {
          throw "file_too_big";
        }
        console.error(error);
        throw "file_not_accessible";
      }
    } else {
      try {
        ghRes = await octokit.repos.getContents({
          owner: repo.owner,
          repo: repo.name,
          path: options.path,
          ref: repoConfig.commit ? repoConfig.commit : "HEAD",
        });
      } catch (error) {
        if (error.status == 404) {
          return false;
        }
        if (error.status == 403) {
          console.log(error);
          throw "content_too_large";
        }
        throw error;
      }
    }
    // a non-empty blob must come back with content
    if (!ghRes.data.content && ghRes.data.size != 0) {
      throw "content_not_accessible";
    }
    // empty file
    let content = "";
    if (ghRes.data.content) {
      // NOTE(review): `new Buffer.from(...)` works but Buffer.from is a
      // factory, not a constructor — should be `Buffer.from(...)`
      content = new Buffer.from(ghRes.data.content, ghRes.data.encoding);
    }

    // write the original file, then anonymize it into the cache
    try {
      await fs.mkdir(path.dirname(orignalFilePath), { recursive: true });
    } catch (_) {
      // ignore
    }
    try {
      await fs.writeFile(orignalFilePath, content, { encoding: "utf-8" });
      await anonymizeUtils.ananymiseFile(
        orignalFilePath,
        anonymizedFilePath,
        repoConfig
      );
    } catch (error) {
      console.error(error);
      throw "unable_to_write_file";
    }
    return true;
  }
  return false;
};
|
||||
|
||||
/**
 * Compute (and cache) the lines-of-code statistics of a repository with
 * linguist, based on the downloaded original files.
 *
 * @param {object} options { repoConfig } or { repoId }
 * @returns {Promise<object>} the linguist statistics
 * @throws {string} repo_not_found | stats_unsupported
 */
module.exports.getStats = async (options) => {
  let repoConfig = options.repoConfig;
  if (!repoConfig) {
    repoConfig = await repoUtils.getConfig(options.repoId);
  }

  if (repoConfig == null) {
    throw "repo_not_found";
  }
  // stats need the repository on disk, which stream mode does not provide.
  // BUG FIX: the original tested `repoConfig.mode`, which is never set —
  // the mode lives in `repoConfig.options.mode` everywhere else in the file
  if (repoConfig.options.mode == "stream") {
    throw "stats_unsupported";
  }

  // return the cached statistics when available
  if (repoConfig.loc) {
    return repoConfig.loc;
  }

  const repoCache = repoUtils.getOriginalPath(repoConfig.repoId);
  try {
    await fs.access(repoCache, ofs.constants.R_OK);
  } catch (error) {
    throw "repo_not_found";
  }
  const o = loc(repoCache);

  // cache the statistics for the next call
  await db.get("anonymized_repositories").updateOne(
    { repoId: repoConfig.repoId },
    {
      $set: {
        loc: o,
      },
    },
    { upsert: true }
  );
  return o;
};
|
||||
59
utils/github.js
Normal file
59
utils/github.js
Normal file
@@ -0,0 +1,59 @@
|
||||
const ofs = require("fs");
|
||||
|
||||
const db = require("./database");
|
||||
const repoUtils = require("./repository");
|
||||
const fileUtils = require("./file");
|
||||
|
||||
const config = require("../config");
|
||||
|
||||
/**
 * Resolve the GitHub token to use for a repository: the owner's stored
 * access token when available, then the repository-specific token, and
 * finally the application-wide token.
 *
 * @param {object} repoConfig the repository configuration
 * @returns {Promise<string>} a GitHub access token
 */
module.exports.getToken = async (repoConfig) => {
  if (repoConfig.owner) {
    const owner = await db
      .get()
      .collection("users")
      .findOne(
        { username: repoConfig.owner },
        { projection: { accessToken: 1 } }
      );
    if (owner && owner.accessToken) {
      return owner.accessToken;
    }
  }
  return repoConfig.token ? repoConfig.token : config.GITHUB_TOKEN;
};
|
||||
|
||||
/**
 * Make sure the content of a repository is available locally.
 * In download mode the whole repository is downloaded; in stream mode only
 * the file list is fetched (files are downloaded on demand later).
 *
 * @param {object} repoConfig the repository configuration
 * @returns {Promise<boolean>} true when the repository is ready,
 *   false for an unknown mode
 */
module.exports.downloadRepoAndAnonymize = async (repoConfig) => {
  const cachePath = repoUtils.getAnonymizedPath(repoConfig.repoId);
  const originalPath = repoUtils.getOriginalPath(repoConfig.repoId);
  // already downloaded or anonymized: nothing to do
  if (ofs.existsSync(cachePath) || ofs.existsSync(originalPath)) {
    return true;
  }
  if (repoConfig.options.mode == "download") {
    // if cache folder does not exist download and anonymize it

    const originalPath = repoUtils.getOriginalPath(repoConfig.repoId);

    await repoUtils.updateStatus(repoConfig, "downloading");
    await repoUtils.downloadOriginalRepo(repoConfig, originalPath);
    await repoUtils.updateStatus(repoConfig, "ready");

    // NOTE(review): eager anonymization of the whole folder is disabled —
    // files are anonymized lazily when requested (see file.js)
    // anonymize all the files
    // await repoUtils.updateStatus(repoConfig, "anonymize");

    // await anonymizeUtils.anonymizeFolder(originalPath, cachePath, repoConfig);
    // await repoUtils.updateStatus(repoConfig, "anonymized");

    // clean up
    // await fs.rm(originalPath, { recursive: true, force: true });
    return true;
  } else if (repoConfig.options.mode == "stream") {
    // in stream mode only download the list of file from github
    await fileUtils.getFileList({ repoConfig });
    await repoUtils.updateStatus(repoConfig, "ready");
    return true;
  }
  return false;
};
|
||||
356
utils/repository.js
Normal file
356
utils/repository.js
Normal file
@@ -0,0 +1,356 @@
|
||||
const fs = require("fs").promises;
|
||||
const ofs = require("fs");
|
||||
const path = require("path");
|
||||
const gh = require("parse-github-url");
|
||||
const { Octokit } = require("@octokit/rest");
|
||||
const extract = require("extract-zip");
|
||||
|
||||
const db = require("./database");
|
||||
const githubUtils = require("./github");
|
||||
const config = require("../config");
|
||||
|
||||
// Resolve the on-disk folder of a repository, with optional sub-segments.
const repositoryPath = (repoId, ...segments) =>
  path.resolve(__dirname, "..", "repositories", repoId, ...segments);

/** Folder that contains everything related to `repoId`. */
module.exports.getPath = (repoId) => repositoryPath(repoId);
/** Folder that contains the original (non-anonymized) files of `repoId`. */
module.exports.getOriginalPath = (repoId) => repositoryPath(repoId, "original");
/** Folder that contains the anonymized (cached) files of `repoId`. */
module.exports.getAnonymizedPath = (repoId) => repositoryPath(repoId, "cache");
|
||||
|
||||
/**
 * Load the configuration of an anonymized repository from the database.
 * Only the fields needed by the application are projected (large file
 * trees are excluded).
 *
 * @param {string} repoId the repository id
 * @returns {Promise<object|null>} the repository configuration, or null
 *   when the repository does not exist
 */
module.exports.getConfig = async (repoId) => {
  const repo = await db
    .get()
    .collection("anonymized_repositories")
    .findOne(
      { repoId },
      {
        projection: {
          // files: 1,
          token: 1,
          branch: 1,
          commit: 1,
          owner: 1,
          fullName: 1,
          repoId: 1,
          terms: 1,
          options: 1,
          loc: 1,
          status: 1,
          lastView: 1,
        },
      }
    );

  // revive the dates stored as primitives in the document
  // NOTE(review): lastView is only converted when an expiration date is
  // configured — confirm this gating is intended
  if (repo && repo.options.expirationDate) {
    repo.options.expirationDate = new Date(repo.options.expirationDate);
    repo.lastView = new Date(repo.lastView);
  }
  return repo;
};
|
||||
|
||||
/**
 * Get the details of a GitHub repository, from the local cache unless
 * options.force is true. On a 401 the call is retried once with the
 * application token.
 *
 * @param {object} options { fullName } | { repoConfig } | { owner, repo },
 *   plus optional token and force
 * @returns {Promise<object>} the repository details
 * @throws {string} invalid_options | repo_not_found
 */
module.exports.getRepoDetails = async (options) => {
  const query = {};
  if (options.fullName) {
    query.fullName = options.fullName;
  } else if (options.repoConfig) {
    query.fullName = options.repoConfig.fullName;
    options.fullName = query.fullName;
  } else if (options.owner && options.repo) {
    query.fullName = `${options.owner}/${options.repo}`;
    options.fullName = query.fullName;
  } else {
    throw "invalid_options";
  }

  if (options.force !== true) {
    const repository = await db
      .get("repositories")
      .findOne(query, { projection: { readme: 0 } });
    if (repository && repository.id) return repository;
  }

  try {
    const repo = gh(options.fullName);

    const octokit = new Octokit({ auth: options.token });
    let ghRes = await octokit.repos.get({
      owner: repo.owner,
      repo: repo.name,
    });
    ghRes.data.fullName = ghRes.data.full_name;
    if (ghRes.data.fullName != query.fullName) {
      // repo renamed keep the old name
      ghRes.data.fullName = query.fullName;
    }
    if (ghRes.data.has_pages) {
      // BUG FIX: `ghPageRes` was an implicit global in the original
      const ghPageRes = await octokit.request(
        "GET /repos/{owner}/{repo}/pages",
        {
          owner: repo.owner,
          repo: repo.name,
        }
      );
      ghRes.data.pageSource = ghPageRes.data.source;
    }

    delete ghRes.data.full_name;
    await db
      .get("repositories")
      .updateOne(query, { $set: ghRes.data }, { upsert: true });
    return ghRes.data;
  } catch (error) {
    console.log(query, error);
    if (error.status == 401 && options.token != config.GITHUB_TOKEN) {
      // the provided token is invalid: retry once with the application token
      options.token = config.GITHUB_TOKEN;
      return await module.exports.getRepoDetails(options);
    }
    throw "repo_not_found";
  }
};
|
||||
|
||||
/**
 * Download the zipball of a repository at the configured commit into
 * `target`.
 * A 401 from GitHub triggers one retry with the application token; any
 * other failure is surfaced as "repo_not_accessible".
 *
 * @param {object} repoConfig the repository configuration
 * @param {string} target path of the zip file to write
 * @throws {string} repo_not_accessible
 */
module.exports.downloadRepoZip = async (repoConfig, target) => {
  const repo = gh(repoConfig.fullName);

  // download the zipball using the given token
  async function getZip(token) {
    const octokit = new Octokit({ auth: token });
    return await octokit.request("GET /repos/{owner}/{repo}/zipball/{ref}", {
      owner: repo.owner,
      repo: repo.name,
      ref: repoConfig.commit,
    });
  }
  let response = null;
  try {
    response = await getZip(await githubUtils.getToken(repoConfig));
  } catch (error) {
    // the repository token may be invalid/expired: retry with the app token
    if (error.status == 401 && config.GITHUB_TOKEN) {
      try {
        response = await getZip(config.GITHUB_TOKEN);
      } catch (error) {
        throw "repo_not_accessible";
      }
    } else {
      throw "repo_not_accessible";
    }
  }

  await fs.mkdir(path.dirname(target), { recursive: true });
  // response.data contains the raw zip bytes
  await fs.writeFile(target, Buffer.from(response.data), {
    encoding: "binary",
  });
};
|
||||
|
||||
/**
 * Update the status of an anonymized repository, both on the in-memory
 * configuration (mutated) and in the database.
 *
 * @param {object} repoConfig the repository configuration
 * @param {string} status the new status
 */
module.exports.updateStatus = async (repoConfig, status) => {
  repoConfig.status = status;
  const repositories = db.get("anonymized_repositories");
  await repositories.updateOne(
    { repoId: repoConfig.repoId },
    { $set: { status } }
  );
};
|
||||
|
||||
/**
 * Download the zip of a repository, extract it to `destination`, and clean
 * up the intermediate zip file and extraction folder.
 *
 * @param {object} repoConfig the repository configuration
 * @param {string} destination the folder that receives the repository content
 */
module.exports.downloadOriginalRepo = async (repoConfig, destination) => {
  const zipPath = path.join(
    module.exports.getPath(repoConfig.repoId),
    "content.zip"
  );
  const destinationZip = destination + "_zip";

  // download the repository and unzip it
  await module.exports.downloadRepoZip(repoConfig, zipPath);
  await extract(zipPath, { dir: destinationZip });

  // the zipball contains a single top-level folder: move it to destination.
  // BUG FIX: the rename was not awaited in the original, so the recursive
  // removal below could run before the move finished.
  const folders = await fs.readdir(destinationZip);
  await fs.rename(path.join(destinationZip, folders[0]), destination);
  await fs.rm(zipPath);
  await fs.rm(destinationZip, { recursive: true });
};
|
||||
|
||||
/**
 * Get the details of an anonymized repository owned by `user`.
 * Sensitive and heavy fields (token, file trees, loc) are excluded.
 *
 * @param {string} repoId the repository id
 * @param {object} user the authenticated user ({ username })
 * @returns {Promise<object|null>} the repository document, or null when the
 *   repository does not exist or is not owned by `user`
 */
module.exports.getAnonymizedRepoDetails = async (repoId, user) => {
  const query = { repoId, owner: user.username };
  const projection = { token: 0, files: 0, originalFiles: 0, loc: 0 };
  return db.get("anonymized_repositories").findOne(query, { projection });
};
|
||||
|
||||
/**
 * Get the commit sha of the configured branch of a repository.
 * Uses the cached branch list unless options.force is true; on a cache miss
 * the branch list is refreshed from GitHub.
 *
 * @param {object} options { repoConfig } or { repoId }, plus optional force
 * @returns {Promise<string>} the commit sha of the branch head
 * @throws {string} repo_not_found | branch_not_found
 */
module.exports.getRepoCommit = async (options) => {
  let repoConfig = options.repoConfig;
  if (!repoConfig) {
    repoConfig = await module.exports.getConfig(options.repoId);
  }

  if (repoConfig == null) {
    throw "repo_not_found";
  }

  if (options.force !== true) {
    // look up the branch in the cached repository document
    const query = { fullName: repoConfig.fullName };
    query["branches." + repoConfig.branch + ""] = { $exists: true };
    const repository = await db
      .get("repositories")
      .findOne(query, { projection: { branches: 1 } });
    if (
      repository &&
      repository.branches &&
      repository.branches[repoConfig.branch]
    )
      return repository.branches[repoConfig.branch].commit.sha;
  }
  // cache miss (or forced refresh): fetch the branch list from GitHub
  const branches = await module.exports.getRepoBranches({
    repoConfig,
    token: await githubUtils.getToken(repoConfig),
    force: options.force,
  });
  if (!branches[repoConfig.branch]) {
    console.log(branches, repoConfig.branch);
    throw "branch_not_found";
  }
  return branches[repoConfig.branch].commit.sha;
};
|
||||
|
||||
/**
 * Get the branches of a repository, indexed by name.
 * Served from the local cache unless options.force is true; on a 401 the
 * call is retried once with the application token.
 *
 * @param {object} options { fullName } | { repoConfig } | { owner, repo },
 *   plus optional token and force
 * @returns {Promise<object>} map of branch name -> branch details
 * @throws {Error|string} Invalid options | repo_not_found | branches_not_found
 */
module.exports.getRepoBranches = async (options) => {
  // resolve the repository full name from the supported option shapes
  const query = {};
  if (options.fullName) {
    query.fullName = options.fullName;
  } else if (options.repoConfig) {
    options.fullName = options.repoConfig.fullName;
    query.fullName = options.fullName;
  } else if (options.owner && options.repo) {
    options.fullName = `${options.owner}/${options.repo}`;
    query.fullName = options.fullName;
  } else {
    throw new Error("Invalid options");
  }

  // use the cached branch list unless a refresh is forced
  if (options.force !== true) {
    const repository = await db
      .get("repositories")
      .findOne(query, { projection: { branches: 1 } });
    if (repository && repository.branches) return repository.branches;
  }

  try {
    const repo = gh(options.fullName);
    const octokit = new Octokit({ auth: options.token });
    const data = await octokit.paginate(octokit.repos.listBranches, {
      owner: repo.owner,
      repo: repo.name,
      per_page: 100,
    });

    // index the branches by name and cache them
    const branches = {};
    for (const branch of data) {
      branches[branch.name] = branch;
    }
    await db
      .get("repositories")
      .updateOne(query, { $set: { branches } }, { upsert: true });
    return branches;
  } catch (error) {
    if (error.status == 401 && options.token != config.GITHUB_TOKEN) {
      // the provided token is invalid: retry once with the application token
      options.token = config.GITHUB_TOKEN;
      return await module.exports.getRepoBranches(options);
    }
    if (error.status == 404) {
      throw "repo_not_found";
    }
    console.error(error);
    throw "branches_not_found";
  }
};
|
||||
|
||||
/**
 * Get the README of a repository, from the local cache unless options.force
 * is true.
 *
 * @param {object} options { fullName } | { repoConfig } | { owner, repo },
 *   plus optional token and force
 * @returns {Promise<string>} the README content decoded as UTF-8
 * @throws {Error|string} Invalid options | readme_not_available
 */
module.exports.getRepoReadme = async (options) => {
  const query = {};
  if (options.fullName) {
    query.fullName = options.fullName;
  } else if (options.repoConfig) {
    query.fullName = options.repoConfig.fullName;
    options.fullName = query.fullName;
  } else if (options.owner && options.repo) {
    query.fullName = `${options.owner}/${options.repo}`;
    options.fullName = query.fullName;
  } else {
    throw new Error("Invalid options");
  }

  if (options.force !== true) {
    let repository = await db
      .get("repositories")
      .findOne(query, { projection: { readme: 1 } });
    if (repository && repository.readme) return repository.readme;
  }

  try {
    const repo = gh(options.fullName);

    const octokit = new Octokit({ auth: options.token });
    const ghRes = await octokit.repos.getReadme({
      owner: repo.owner,
      repo: repo.name,
    });
    // the API returns the content base64-encoded.
    // BUG FIX: the original used `new Buffer.from(...)` — Buffer.from is a
    // factory function, not a constructor
    const readme = Buffer.from(ghRes.data.content, "base64").toString("utf-8");
    await db
      .get("repositories")
      .updateOne(query, { $set: { readme } }, { upsert: true });
    return readme;
  } catch (error) {
    throw "readme_not_available";
  }
};
|
||||
|
||||
/**
 * Update an anonymized repository to the latest commit of its branch.
 * No-op when the repository is already up to date; concurrent updates are
 * rejected with "repo_is_updating".
 *
 * @param {object} repoConfig the repository configuration
 * @returns {Promise<boolean|undefined>} true when already up to date
 * @throws {string} repo_is_updating
 */
module.exports.updateAnonimizedRepository = async (repoConfig) => {
  if (repoConfig.status == "updating") {
    throw "repo_is_updating";
  }
  // reload the configuration to check the status stored in the database
  // NOTE(review): this check-then-set is not atomic — two concurrent calls
  // can both pass it
  repoConfig = await module.exports.getConfig(repoConfig.repoId);
  if (repoConfig.status == "updating") {
    throw "repo_is_updating";
  }
  // check new commit
  const commit = await module.exports.getRepoCommit({
    repoConfig,
    force: true,
  });
  if (commit == repoConfig.commit) {
    console.log(`${repoConfig.repoId} is up to date`);
    return true;
  }
  console.log(`${repoConfig.repoId} will be updated to ${commit}`);
  await module.exports.updateStatus(repoConfig, "updating");
  await db
    .get("anonymized_repositories")
    .updateOne({ repoId: repoConfig.repoId }, { $set: { commit } });
  // drop the old content and download the new commit
  await module.exports.removeRepository(repoConfig);
  await githubUtils.downloadRepoAndAnonymize(repoConfig);
  await module.exports.updateStatus(repoConfig, "ready");
};
|
||||
|
||||
/**
 * Remove the on-disk content (original and anonymized) of a repository and
 * drop its cached file trees and statistics from the database.
 *
 * @param {object} repoConfig the repository configuration
 */
module.exports.removeRepository = async (repoConfig) => {
  try {
    const folders = [
      module.exports.getOriginalPath(repoConfig.repoId),
      module.exports.getAnonymizedPath(repoConfig.repoId),
    ];
    for (const folder of folders) {
      if (ofs.existsSync(folder)) {
        await fs.rm(folder, { recursive: true, force: true });
      }
    }

    await db
      .get("anonymized_repositories")
      .updateOne(
        { repoId: repoConfig.repoId },
        { $unset: { files: "", originalFiles: "", loc: "" } }
      );
  } catch (error) {
    console.log(error);
    throw error;
  }
};
|
||||
Reference in New Issue
Block a user