From 66d5d91e3eadbba2c4a9331c1fd62b9044413003 Mon Sep 17 00:00:00 2001 From: tdurieux Date: Mon, 8 May 2023 13:56:15 +0200 Subject: [PATCH] fix(#206): make sure that all text files are anonymized --- src/AnonymizedFile.ts | 22 ++-- src/anonymize-utils.ts | 233 +++++++++++++++++++++++++++-------------- src/storage/S3.ts | 3 +- 3 files changed, 172 insertions(+), 86 deletions(-) diff --git a/src/AnonymizedFile.ts b/src/AnonymizedFile.ts index ef469b3..34cd11c 100644 --- a/src/AnonymizedFile.ts +++ b/src/AnonymizedFile.ts @@ -216,18 +216,28 @@ export default class AnonymizedFile { res.contentType("text/plain"); } res.header("Accept-Ranges", "none"); + let fileInfo: Awaited>; try { - const fileInfo = await storage.fileInfo(this.originalCachePath); - // the text files may be anonymized and therefore the size may be different - if (!isTextFile(this.anonymizedPath) && fileInfo.size) { - res.header("Content-Length", fileInfo.size.toString()); - } + fileInfo = await storage.fileInfo(this.originalCachePath); } catch (error) { // unable to get file size console.error(error); } + + const anonymizer = new AnonymizeTransformer(this); + + anonymizer.once("transform", (data) => { + if (data.isText && !mime) { + res.contentType("text/plain"); + } + if (fileInfo?.size && !data.wasAnonimized) { + // the text files may be anonymized and therefore the size may be different + res.header("Content-Length", fileInfo.size.toString()); + } + }); + content - .pipe(new AnonymizeTransformer(this)) + .pipe(anonymizer) .pipe(res) .on("close", () => { if (!content.closed && !content.destroyed) { diff --git a/src/anonymize-utils.ts b/src/anonymize-utils.ts index 97a5588..81ffc20 100644 --- a/src/anonymize-utils.ts +++ b/src/anonymize-utils.ts @@ -32,16 +32,41 @@ export function isTextFile(filePath: string, content?: Buffer) { } export class AnonymizeTransformer extends Transform { + public wasAnonimized = false; + public isText = false; + constructor(private readonly file: AnonymizedFile) { 
super(); } _transform(chunk: Buffer, encoding: string, callback: () => void) { - if (isTextFile(this.file.anonymizedPath, chunk)) { - chunk = Buffer.from( - anonymizeContent(chunk.toString(), this.file.repository) - ); + const isText = isTextFile(this.file.anonymizedPath, chunk); + + if (isText) { + this.isText = true; + const anonimizer = new ContentAnonimizer(chunk.toString(), { + repoId: this.file.repository.repoId, + image: this.file.repository.options.image, + link: this.file.repository.options.link, + terms: this.file.repository.options.terms, + repoName: (this.file.repository.source as GitHubBase).githubRepository + ?.fullName, + branchName: + (this.file.repository.source as GitHubBase).branch?.name || "main", + }); + anonimizer.anonymize(); + if (anonimizer.wasAnonymized) { + this.wasAnonimized = true; + chunk = Buffer.from(anonimizer.content); + } } + + this.emit("transform", { + isText, + wasAnonimized: this.wasAnonimized, + chunk, + }); + this.push(chunk); callback(); } @@ -61,86 +86,138 @@ interface Anonymizationptions { }; } +export class ContentAnonimizer { + public wasAnonymized = false; + + constructor( + public content: string, + readonly opt: { + image?: boolean; + link?: boolean; + terms?: string[]; + repoName?: string; + branchName?: string; + repoId?: string; + } + ) {} + + private removeImage() { + if (this.opt.image !== false) { + return; + } + // remove image in markdown + this.content = this.content.replace( + /!\[[^\]]*\]\((?.*?)(?=\"|\))(?\".*\")?\)/g, + () => { + this.wasAnonymized = true; + return config.ANONYMIZATION_MASK; + } + ); + } + private removeLink() { + if (this.opt.link !== false) { + return; + } + // remove image in markdown + this.content = this.content.replace(urlRegex, () => { + this.wasAnonymized = true; + return config.ANONYMIZATION_MASK; + }); + } + + private replaceGitHubSelfLinks() { + if (!this.opt.repoName || !this.opt.branchName) { + return; + } + const repoName = this.opt.repoName; + const branchName = 
this.opt.branchName; + + const replaceCallback = () => { + this.wasAnonymized = true; + return `https://${config.APP_HOSTNAME}/r/${this.opt.repoId}`; + }; + this.content = this.content.replace( + new RegExp( + `https://raw.githubusercontent.com/${repoName}/${branchName}\\b`, + "gi" + ), + replaceCallback + ); + this.content = this.content.replace( + new RegExp(`https://github.com/${repoName}/blob/${branchName}\\b`, "gi"), + replaceCallback + ); + this.content = this.content.replace( + new RegExp(`https://github.com/${repoName}/tree/${branchName}\\b`, "gi"), + replaceCallback + ); + this.content = this.content.replace( + new RegExp(`https://github.com/${repoName}`, "gi"), + replaceCallback + ); + } + + private replaceTerms() { + const terms = this.opt.terms || []; + for (let i = 0; i < terms.length; i++) { + let term = terms[i]; + if (term.trim() == "") { + continue; + } + const mask = config.ANONYMIZATION_MASK + "-" + (i + 1); + try { + new RegExp(term, "gi"); + } catch { + // escape regex characters + term = term.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&"); + } + // remove whole url if it contains the term + this.content = this.content.replace(urlRegex, (match) => { + if (new RegExp(`\\b${term}\\b`, "gi").test(match)) { + this.wasAnonymized = true; + return mask; + } + return match; + }); + + // remove the term in the text + this.content = this.content.replace( + new RegExp(`\\b${term}\\b`, "gi"), + () => { + this.wasAnonymized = true; + return mask; + } + ); + } + } + + anonymize() { + this.removeImage(); + this.removeLink(); + this.replaceGitHubSelfLinks(); + this.replaceTerms(); + return this.content; + } +} + export function anonymizeContent( content: string, repository: Anonymizationptions ) { - if (repository.options?.image === false) { - // remove image in markdown - content = content.replace( - /!\[[^\]]*\]\((?.*?)(?=\"|\))(?\".*\")?\)/g, - "" - ); - } - - if (!repository.options?.link) { - // remove all links - content = content.replace(urlRegex, 
config.ANONYMIZATION_MASK); - } - + let repoName: string | undefined; + let branchName: string | undefined; if (repository.source instanceof GitHubBase) { - content = content.replace( - new RegExp( - `https://raw.githubusercontent.com/${ - repository.source.githubRepository.fullName - }/${repository.source.branch?.name || "main"}\\b`, - "gi" - ), - `https://${config.APP_HOSTNAME}/r/${repository.repoId}` - ); - content = content.replace( - new RegExp( - `https://github.com/${ - repository.source.githubRepository.fullName - }/blob/${repository.source.branch?.name || "main"}\\b`, - "gi" - ), - `https://${config.APP_HOSTNAME}/r/${repository.repoId}` - ); - content = content.replace( - new RegExp( - `https://github.com/${ - repository.source.githubRepository.fullName - }/tree/${(repository.source as GitHubBase).branch?.name || "main"}\\b`, - "gi" - ), - `https://${config.APP_HOSTNAME}/r/${repository.repoId}` - ); - content = content.replace( - new RegExp( - `https://github.com/${repository.source.githubRepository.fullName}`, - "gi" - ), - `https://${config.APP_HOSTNAME}/r/${repository.repoId}` - ); + repoName = repository.source.githubRepository.fullName; + branchName = repository.source.branch.name; } - - const terms = repository.options.terms || []; - for (let i = 0; i < terms.length; i++) { - let term = terms[i]; - if (term.trim() == "") { - continue; - } - try { - new RegExp(term, "gi"); - } catch { - // escape regex characters - term = term.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&"); - } - // remove whole url if it contains the term - content = content.replace(urlRegex, (match) => { - if (new RegExp(`\\b${term}\\b`, "gi").test(match)) - return config.ANONYMIZATION_MASK + "-" + (i + 1); - return match; - }); - - // remove the term in the text - content = content.replace( - new RegExp(`\\b${term}\\b`, "gi"), - config.ANONYMIZATION_MASK + "-" + (i + 1) - ); - } - return content; + return new ContentAnonimizer(content, { + repoId: repository.repoId, + image: 
repository.options.image, + link: repository.options.link, + terms: repository.options.terms, + repoName, + branchName, + }).anonymize(); } export function anonymizePath(path: string, terms: string[]) { diff --git a/src/storage/S3.ts b/src/storage/S3.ts index b2d1835..610ae07 100644 --- a/src/storage/S3.ts +++ b/src/storage/S3.ts @@ -11,7 +11,6 @@ import { pipeline, Readable, Transform } from "stream"; import ArchiveStreamToS3 from "decompress-stream-to-s3"; import { Response } from "express"; import { lookup } from "mime-types"; -import * as flow from "xml-flow"; import * as archiver from "archiver"; import { dirname, basename } from "path"; import AnonymousError from "../AnonymousError"; @@ -239,7 +238,7 @@ export default class S3Storage implements StorageBase { s3: this.client(2 * 60 * 60 * 1000), // 2h timeout type: "zip", onEntry: (header) => { - header.name = header.name.substr(header.name.indexOf("/") + 1); + header.name = header.name.substring(header.name.indexOf("/") + 1); if (source) { header.Tagging = `source=${source.type}`; header.Metadata = {