mirror of
https://github.com/tdurieux/anonymous_github.git
synced 2026-02-13 10:52:53 +00:00
fix(#206): make sure that all text files are anonymized
This commit is contained in:
@@ -216,18 +216,28 @@ export default class AnonymizedFile {
|
||||
res.contentType("text/plain");
|
||||
}
|
||||
res.header("Accept-Ranges", "none");
|
||||
let fileInfo: Awaited<ReturnType<typeof storage.fileInfo>>;
|
||||
try {
|
||||
const fileInfo = await storage.fileInfo(this.originalCachePath);
|
||||
// the text files may be anonymized and therefore the size may be different
|
||||
if (!isTextFile(this.anonymizedPath) && fileInfo.size) {
|
||||
res.header("Content-Length", fileInfo.size.toString());
|
||||
}
|
||||
fileInfo = await storage.fileInfo(this.originalCachePath);
|
||||
} catch (error) {
|
||||
// unable to get file size
|
||||
console.error(error);
|
||||
}
|
||||
|
||||
const anonymizer = new AnonymizeTransformer(this);
|
||||
|
||||
anonymizer.once("transform", (data) => {
|
||||
if (data.isText && !mime) {
|
||||
res.contentType("text/plain");
|
||||
}
|
||||
if (fileInfo?.size && !data.wasAnonimized) {
|
||||
// the text files may be anonymized and therefore the size may be different
|
||||
res.header("Content-Length", fileInfo.size.toString());
|
||||
}
|
||||
});
|
||||
|
||||
content
|
||||
.pipe(new AnonymizeTransformer(this))
|
||||
.pipe(anonymizer)
|
||||
.pipe(res)
|
||||
.on("close", () => {
|
||||
if (!content.closed && !content.destroyed) {
|
||||
|
||||
@@ -32,16 +32,41 @@ export function isTextFile(filePath: string, content?: Buffer) {
|
||||
}
|
||||
|
||||
export class AnonymizeTransformer extends Transform {
|
||||
public wasAnonimized = false;
|
||||
public isText = false;
|
||||
|
||||
/**
 * Creates a transformer bound to `file`.
 *
 * `file` is stored as a private readonly member through the
 * parameter-property declaration; the `Transform` base class is constructed
 * without options.
 */
constructor(private readonly file: AnonymizedFile) {
|
||||
super();
|
||||
}
|
||||
|
||||
/**
 * Anonymizes one chunk of the stream and forwards it downstream.
 *
 * The chunk is run through `ContentAnonimizer` only when `isTextFile`
 * classifies it as text; it is replaced only when the anonymizer reports
 * that it changed something, so untouched chunks are forwarded as received.
 *
 * A "transform" event carrying `{ isText, wasAnonimized, chunk }` is emitted
 * for every chunk before it is pushed. `this.isText` and
 * `this.wasAnonimized` are only ever switched to `true` here, never back.
 *
 * @param chunk data read from the upstream source
 * @param encoding unused
 * @param callback invoked once the chunk has been pushed
 */
_transform(chunk: Buffer, encoding: string, callback: () => void) {
  const isText = isTextFile(this.file.anonymizedPath, chunk);

  if (isText) {
    this.isText = true;
    const repository = this.file.repository;
    const source = repository.source as GitHubBase;
    const anonimizer = new ContentAnonimizer(chunk.toString(), {
      repoId: repository.repoId,
      image: repository.options.image,
      link: repository.options.link,
      terms: repository.options.terms,
      repoName: source.githubRepository?.fullName,
      branchName: source.branch?.name || "main",
    });
    anonimizer.anonymize();
    if (anonimizer.wasAnonymized) {
      this.wasAnonimized = true;
      chunk = Buffer.from(anonimizer.content);
    }
  }

  this.emit("transform", {
    isText,
    wasAnonimized: this.wasAnonimized,
    chunk,
  });

  this.push(chunk);
  callback();
}
|
||||
@@ -61,86 +86,138 @@ interface Anonymizationptions {
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Rewrites a piece of text according to the anonymization options and
 * remembers whether any replacement actually happened.
 *
 * The rules run in a fixed order (see `anonymize`): markdown images, URLs,
 * links pointing at the GitHub repository itself, then the configured terms.
 * Every rule overwrites `content` with its result.
 */
export class ContentAnonimizer {
  /** Becomes `true` as soon as one rule replaced something in `content`. */
  public wasAnonymized = false;

  /**
   * @param content text to anonymize; overwritten by the rules
   * @param opt anonymization settings:
   *   - `image`: markdown images are masked only when this is exactly `false`
   *   - `link`: URLs are masked only when this is exactly `false`
   *   - `terms`: terms to mask; a term is used as a regular expression when
   *     it compiles and as a literal string otherwise
   *   - `repoName` / `branchName`: repository name and branch as they appear
   *     in GitHub URLs; self links are rewritten only when both are set
   *   - `repoId`: id inserted in the URL that replaces self links
   */
  constructor(
    public content: string,
    readonly opt: {
      image?: boolean;
      link?: boolean;
      terms?: string[];
      repoName?: string;
      branchName?: string;
      repoId?: string;
    }
  ) {}

  /** Escapes regular-expression metacharacters so `literal` matches itself. */
  private static escapeRegExp(literal: string): string {
    return literal.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  }

  /** Replaces markdown images by the anonymization mask. */
  private removeImage() {
    if (this.opt.image !== false) {
      return;
    }
    this.content = this.content.replace(
      /!\[[^\]]*\]\((?<filename>.*?)(?=\"|\))(?<optionalpart>\".*\")?\)/g,
      () => {
        this.wasAnonymized = true;
        return config.ANONYMIZATION_MASK;
      }
    );
  }

  /** Replaces every URL by the anonymization mask. */
  private removeLink() {
    if (this.opt.link !== false) {
      return;
    }
    this.content = this.content.replace(urlRegex, () => {
      this.wasAnonymized = true;
      return config.ANONYMIZATION_MASK;
    });
  }

  /**
   * Rewrites links to the GitHub repository itself (raw, blob, tree and
   * repository root) so that they point to the anonymized repository.
   */
  private replaceGitHubSelfLinks() {
    const { repoName, branchName } = this.opt;
    if (!repoName || !branchName) {
      return;
    }
    // Names are data, not patterns: without escaping, the "." of "my.repo"
    // would match any character and a "(" or "+" could break the expression.
    const repo = ContentAnonimizer.escapeRegExp(repoName);
    const branch = ContentAnonimizer.escapeRegExp(branchName);

    // A callback is used so that "$" sequences in the replacement are not
    // interpreted by String.prototype.replace.
    const replaceCallback = () => {
      this.wasAnonymized = true;
      return `https://${config.APP_HOSTNAME}/r/${this.opt.repoId}`;
    };

    // Most specific patterns first: the repository-root pattern would
    // otherwise consume the prefix of the three others.
    const patterns = [
      `https://raw\\.githubusercontent\\.com/${repo}/${branch}\\b`,
      `https://github\\.com/${repo}/blob/${branch}\\b`,
      `https://github\\.com/${repo}/tree/${branch}\\b`,
      `https://github\\.com/${repo}`,
    ];
    for (const pattern of patterns) {
      this.content = this.content.replace(
        new RegExp(pattern, "gi"),
        replaceCallback
      );
    }
  }

  /**
   * Masks every configured term. The mask is suffixed with the 1-based
   * position of the term in `opt.terms`. A URL containing the term is masked
   * as a whole before the remaining occurrences in the text are masked.
   */
  private replaceTerms() {
    const terms = this.opt.terms || [];
    for (let i = 0; i < terms.length; i++) {
      let term = terms[i];
      if (term.trim() === "") {
        continue;
      }
      const mask = config.ANONYMIZATION_MASK + "-" + (i + 1);
      try {
        new RegExp(term, "gi");
      } catch {
        // not a valid regular expression: treat the term as a literal
        term = term.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&");
      }
      // Built once per term instead of once per URL match. The URL check has
      // no "g" flag so that `test` keeps no state between calls.
      const termInUrl = new RegExp(`\\b${term}\\b`, "i");
      const termInText = new RegExp(`\\b${term}\\b`, "gi");

      // remove whole url if it contains the term
      this.content = this.content.replace(urlRegex, (match) => {
        if (termInUrl.test(match)) {
          this.wasAnonymized = true;
          return mask;
        }
        return match;
      });

      // remove the term in the text
      this.content = this.content.replace(termInText, () => {
        this.wasAnonymized = true;
        return mask;
      });
    }
  }

  /**
   * Runs every rule on `content`.
   *
   * @returns the anonymized content (also available as `this.content`)
   */
  anonymize() {
    this.removeImage();
    this.removeLink();
    this.replaceGitHubSelfLinks();
    this.replaceTerms();
    return this.content;
  }
}
|
||||
|
||||
/**
 * Anonymizes `content` with the options of `repository`.
 *
 * All replacement rules are delegated to `ContentAnonimizer`, so this
 * function and the streaming transformer apply the same logic. Note that
 * `ContentAnonimizer` masks images and links only when the corresponding
 * option is exactly `false`.
 *
 * Links to the repository itself are rewritten only when the source is a
 * `GitHubBase`; the branch falls back to "main" when the source has no
 * branch name.
 *
 * @param content text to anonymize
 * @param repository repository whose `options`, `repoId` and `source` drive
 *   the anonymization
 * @returns the anonymized text
 */
export function anonymizeContent(
  content: string,
  repository: Anonymizationptions
) {
  let repoName: string | undefined;
  let branchName: string | undefined;
  if (repository.source instanceof GitHubBase) {
    repoName = repository.source.githubRepository.fullName;
    branchName = repository.source.branch?.name || "main";
  }

  return new ContentAnonimizer(content, {
    repoId: repository.repoId,
    image: repository.options.image,
    link: repository.options.link,
    terms: repository.options.terms,
    repoName,
    branchName,
  }).anonymize();
}
|
||||
|
||||
export function anonymizePath(path: string, terms: string[]) {
|
||||
|
||||
@@ -11,7 +11,6 @@ import { pipeline, Readable, Transform } from "stream";
|
||||
import ArchiveStreamToS3 from "decompress-stream-to-s3";
|
||||
import { Response } from "express";
|
||||
import { lookup } from "mime-types";
|
||||
import * as flow from "xml-flow";
|
||||
import * as archiver from "archiver";
|
||||
import { dirname, basename } from "path";
|
||||
import AnonymousError from "../AnonymousError";
|
||||
@@ -239,7 +238,7 @@ export default class S3Storage implements StorageBase {
|
||||
s3: this.client(2 * 60 * 60 * 1000), // 2h timeout
|
||||
type: "zip",
|
||||
onEntry: (header) => {
|
||||
header.name = header.name.substr(header.name.indexOf("/") + 1);
|
||||
header.name = header.name.substring(header.name.indexOf("/") + 1);
|
||||
if (source) {
|
||||
header.Tagging = `source=${source.type}`;
|
||||
header.Metadata = {
|
||||
|
||||
Reference in New Issue
Block a user