refactor AnonymizedFile

This commit is contained in:
tdurieux
2021-08-13 00:03:28 +02:00
parent b2426f70b2
commit 47f44fe41e
6 changed files with 155 additions and 133 deletions

View File

@@ -2,71 +2,127 @@ import * as path from "path";
import * as express from "express";
import * as stream from "stream";
import Repository from "./Repository";
import { Tree, TreeFile } from "./types";
import { Tree, TreeElement, TreeFile } from "./types";
import storage from "./storage";
import config from "../config";
import { anonymizeStream } from "./anonymize-utils";
import { anonymizePath, anonymizeStream } from "./anonymize-utils";
function tree2sha(
tree: any,
output: { [key: string]: string } = {},
parent: string = ""
): { [key: string]: string } {
for (let i in tree) {
const sha = tree[i].sha as string;
const size = tree[i].size as number;
if (sha != null && size != null) {
output[sha] = path.join(parent, i);
} else if (tree[i].child) {
tree2sha(tree[i].child as Tree, output, path.join(parent, i));
} else {
tree2sha(tree[i] as Tree, output, path.join(parent, i));
}
}
return output;
}
/**
* Represent a file in a anonymized repository
*/
export default class AnonymizedFile {
repository: Repository;
sha?: string;
size?: number;
path?: string;
anonymizedPath: string;
private _originalPath: string;
private fileSize?: number;
constructor(
repository: Repository,
data: {
path?: string;
anonymizedPath: string;
sha?: string;
size?: number;
}
) {
this.repository = repository;
repository: Repository;
anonymizedPath: string;
sha?: string;
constructor(data: { repository: Repository; anonymizedPath: string }) {
this.repository = data.repository;
if (!this.repository.options.terms) throw new Error("terms_not_specified");
this.anonymizedPath = data.anonymizedPath;
if (data.path) {
this.path = data.path;
}
if (!data.anonymizedPath && this.path) {
// anonymize the path
this.anonymizedPath = this.path;
for (let term of this.repository.options.terms) {
if (term.trim() == "") {
continue;
}
this.anonymizedPath = this.anonymizedPath.replace(
new RegExp(term, "gi"),
config.ANONYMIZATION_MASK
);
}
}
if (!this.sha) this.sha = data.sha;
if (!this.size) this.size = data.size;
}
async send(res: express.Response): Promise<void> {
try {
const s = await this.anonymizedContent();
s.on("error", (err) => {
console.log(err);
res.status(500).send({ error: err.message });
});
s.pipe(res);
} catch (error) {
console.log("Error during anonymization", error);
res.status(500).send({ error: error.message });
/**
* De-anonymize the path
*
* @returns the origin relative path of the file
*/
async originalPath(): Promise<string> {
// console.log(new Error().stack);
if (this._originalPath) return this._originalPath;
if (!this.anonymizedPath) throw new Error("path_not_specified");
const paths = this.anonymizedPath.trim().split("/");
let currentAnonymized: TreeElement =
await this.repository.anonymizedFiles();
let currentOriginal: TreeElement = await this.repository.files();
let currentOriginalPath = "";
let isAmbiguous = false;
for (let i = 0; i < paths.length; i++) {
const fileName = paths[i];
if (fileName == "") {
continue;
}
if (!currentAnonymized[fileName]) {
throw new Error("file_not_found");
}
currentAnonymized = currentAnonymized[fileName];
if (!isAmbiguous && !currentOriginal[fileName]) {
// anonymize all the file in the folder and check if there is one that match the current filename
const options = [];
for (let originalFileName in currentOriginal) {
if (
anonymizePath(originalFileName, this.repository.options.terms) ==
fileName
) {
options.push(originalFileName);
}
}
// if only one option we found the original filename
if (options.length == 1) {
currentOriginalPath = path.join(currentOriginalPath, options[0]);
currentOriginal = currentOriginal[options[0]];
} else {
isAmbiguous = true;
}
} else if (!isAmbiguous) {
currentOriginalPath = path.join(currentOriginalPath, fileName);
currentOriginal = currentOriginal[fileName];
}
}
if (
currentAnonymized.sha === undefined ||
currentAnonymized.size === undefined
) {
throw new Error("folder_not_supported");
}
const file: TreeFile = currentAnonymized as TreeFile;
this.fileSize = file.size;
this.sha = file.sha;
if (isAmbiguous) {
// it should never happen
const shaTree = tree2sha(currentOriginal);
if (!currentAnonymized.sha || !shaTree[file.sha]) {
throw new Error("file_not_found");
}
this._originalPath = path.join(currentOriginalPath, shaTree[file.sha]);
} else {
this._originalPath = currentOriginalPath;
}
return this._originalPath;
}
async isFileSupported() {
this.path = await this.getOriginalPath();
const filename = path.basename(this.path);
const filename = path.basename(await this.originalPath());
const extensions = filename.split(".").reverse();
const extension = extensions[0].toLowerCase();
if (!this.repository.options.pdf && extension == "pdf") {
@@ -85,16 +141,8 @@ export default class AnonymizedFile {
return true;
}
get originalCachePath() {
if (!this.path) throw "path_not_defined";
return path.join(
this.repository.originalCachePath,
this.path
);
}
async content(): Promise<stream.Readable> {
if (this.size && this.size > config.MAX_FILE_SIZE) {
if (this.fileSize && this.fileSize > config.MAX_FILE_SIZE) {
throw new Error("file_too_big");
}
if (await storage.exists(this.originalCachePath)) {
@@ -105,64 +153,27 @@ export default class AnonymizedFile {
}
async anonymizedContent() {
await this.getOriginalPath();
if (!this.path) throw new Error("path_not_specified");
if (!this.repository.options.terms) throw new Error("terms_not_specified");
await this.originalPath();
const rs = await this.content();
const contentStream = rs.pipe(anonymizeStream(this.path, this.repository));
return contentStream;
return rs.pipe(anonymizeStream(await this.originalPath(), this.repository));
}
/**
* De-anonymize the path
*
* @returns the origin relative path of the file
*/
async getOriginalPath(): Promise<string> {
if (!this.anonymizedPath) throw new Error("path_not_specified");
get originalCachePath() {
if (!this.originalPath) throw new Error("path_not_defined");
return path.join(this.repository.originalCachePath, this._originalPath);
}
const files = await this.repository.files();
const paths = this.anonymizedPath.trim().split("/");
let current: any = await this.repository.anonymizedFiles();
for (let i = 0; i < paths.length; i++) {
const fileName = paths[i];
if (fileName == "") {
continue;
}
if (current[fileName]) {
current = current[fileName];
} else {
throw new Error("file_not_found");
}
async send(res: express.Response): Promise<void> {
try {
const s = await this.anonymizedContent();
s.on("error", (err) => {
console.log(err);
res.status(500).send({ error: err.message });
});
s.pipe(res);
} catch (error) {
console.log("Error during anonymization", error);
res.status(500).send({ error: error.message });
}
function tree2sha(
tree: any,
output: { [key: string]: string } = {},
parent: string = ""
): { [key: string]: string } {
for (let i in tree) {
const sha = tree[i].sha as string;
const size = tree[i].size as number;
if (sha != null && size != null) {
output[sha] = path.join(parent, i);
} else if (tree[i].child) {
tree2sha(tree[i].child as Tree, output, path.join(parent, i));
} else {
tree2sha(tree[i] as Tree, output, path.join(parent, i));
}
}
return output;
}
const shaTree = tree2sha(files);
if (!current.sha || !shaTree[current.sha]) {
throw new Error("file_not_found");
}
this.path = shaTree[current.sha];
this.sha = current.sha;
if ((current as TreeFile).size) this.size = (current as TreeFile).size;
return this.path;
}
}

View File

@@ -1,6 +1,6 @@
import * as path from "path";
import storage from "./storage";
import { RepositoryStatus, Source, Tree } from "./types";
import { RepositoryStatus, Source, Tree, TreeElement, TreeFile } from "./types";
import * as stream from "stream";
import User from "./User";
import GitHubStream from "./source/GitHubStream";
@@ -43,23 +43,23 @@ export default class Repository {
async anonymizedFiles(opt?: { force?: boolean }): Promise<Tree> {
const terms = this._model.options.terms || [];
function anonymizeTreeRecursive(tree: Tree): any {
if (Number.isInteger(tree.size)) {
return tree;
function anonymizeTreeRecursive(tree: TreeElement): TreeElement {
if (Number.isInteger(tree.size) && tree.sha !== undefined) {
return tree as TreeFile;
}
const output: any = {};
let current: any = tree;
if (current.child) {
current = current.child;
}
for (const file in current) {
const output: Tree = {};
for (const file in tree) {
const anonymizedPath = anonymizePath(file, terms);
output[anonymizedPath] = anonymizeTreeRecursive(current[file]);
if (output[anonymizedPath]) {
// file anonymization conflict
}
output[anonymizedPath] = anonymizeTreeRecursive(tree[file]);
}
return output;
}
return anonymizeTreeRecursive(await this.files(opt));
return anonymizeTreeRecursive(await this.files(opt)) as Tree;
}
/**
@@ -85,6 +85,9 @@ export default class Repository {
return files;
}
/**
* Check the status of the repository
*/
check() {
if (this._model.options.expirationMode != "never") {
if (this._model.options.expirationDate > new Date()) {

View File

@@ -37,7 +37,7 @@ export function anonymizeStream(filename: string, repository: Repository) {
if (isTextFile(filename, data)) {
data = anonymizeContent(data.toString(), repository);
}
chunks = [];
len = 0;
@@ -105,32 +105,38 @@ export function anonymizeContent(content: string, repository: Repository) {
);
}
for (let term of repository.options.terms || []) {
const terms = repository.options.terms || [];
for (let i = 0; i < terms.length; i++) {
const term = terms[i];
if (term.trim() == "") {
continue;
}
// remove whole url if it contains the term
content = content.replace(urlRegex, (match) => {
if (new RegExp(`\\b${term}\\b`, "gi").test(match))
return config.ANONYMIZATION_MASK;
return config.ANONYMIZATION_MASK + "-" + (i + 1);
return match;
});
// remove the term in the text
content = content.replace(
new RegExp(`\\b${term}\\b`, "gi"),
config.ANONYMIZATION_MASK
config.ANONYMIZATION_MASK + "-" + (i + 1)
);
}
return content;
}
export function anonymizePath(path: string, terms: string[]) {
for (let term of terms) {
for (let i = 0; i < terms.length; i++) {
const term = terms[i];
if (term.trim() == "") {
continue;
}
path = path.replace(new RegExp(term, "gi"), config.ANONYMIZATION_MASK);
path = path.replace(
new RegExp(term, "gi"),
config.ANONYMIZATION_MASK + "-" + (i + 1)
);
}
return path;
}

View File

@@ -19,7 +19,8 @@ router.get(
try {
await repo.countView();
const f = new AnonymizedFile(repo, {
const f = new AnonymizedFile({
repository: repo,
anonymizedPath,
});
if (!(await f.isFileSupported())) {

View File

@@ -34,7 +34,8 @@ async function webView(req: express.Request, res: express.Response) {
requestPath = path.join(requestPath, "index.html");
}
requestPath = requestPath;
const f = new AnonymizedFile(repo, {
const f = new AnonymizedFile({
repository: repo,
anonymizedPath: requestPath,
});
if (!(await f.isFileSupported())) {

View File

@@ -56,7 +56,7 @@ export default async function start() {
app.use("/github", rate, connection.router);
// app routes
// api routes
app.use("/api/user", rate, router.user);
app.use("/api/repo", rate, router.repositoryPublic);
app.use("/api/repo", rate, router.file);