mirror of
https://github.com/tdurieux/anonymous_github.git
synced 2026-05-16 06:49:09 +02:00
multiple fixes
This commit is contained in:
+10
-16
@@ -157,8 +157,18 @@ export default class Repository {
|
||||
files.forEach((f) => (f.repoId = this.repoId));
|
||||
await FileModel.insertMany(files);
|
||||
|
||||
const sourceWithTruncation = this.source as unknown as {
|
||||
truncatedFolderList?: string[];
|
||||
};
|
||||
if (Array.isArray(sourceWithTruncation.truncatedFolderList)) {
|
||||
this._model.truncatedFolders = sourceWithTruncation.truncatedFolderList;
|
||||
}
|
||||
|
||||
this._model.size = { storage: 0, file: 0 };
|
||||
await this.computeSize();
|
||||
if (isConnected) {
|
||||
await this._model.save();
|
||||
}
|
||||
}
|
||||
if (opt.path?.includes(config.ANONYMIZATION_MASK)) {
|
||||
const f = new AnonymizedFile({
|
||||
@@ -304,22 +314,6 @@ export default class Repository {
|
||||
force: true,
|
||||
});
|
||||
|
||||
if (ghRepo.size) {
|
||||
if (
|
||||
ghRepo.size > config.AUTO_DOWNLOAD_REPO_SIZE &&
|
||||
this.model.source.type == "GitHubDownload"
|
||||
) {
|
||||
this.model.source.type = "GitHubStream";
|
||||
await this.model.save();
|
||||
} else if (
|
||||
ghRepo.size < config.AUTO_DOWNLOAD_REPO_SIZE &&
|
||||
this.model.source.type == "GitHubStream"
|
||||
) {
|
||||
this.model.source.type = "GitHubDownload";
|
||||
await this.model.save();
|
||||
}
|
||||
}
|
||||
|
||||
// update the repository name if it has changed
|
||||
this.model.source.repositoryName = ghRepo.fullName;
|
||||
const branches = await ghRepo.branches({
|
||||
|
||||
+61
-15
@@ -1,5 +1,6 @@
|
||||
import { basename } from "path";
|
||||
import { Transform, Readable } from "stream";
|
||||
import { StringDecoder } from "string_decoder";
|
||||
import { isText } from "istextorbinary";
|
||||
|
||||
import config from "../config";
|
||||
@@ -30,8 +31,14 @@ export function isTextFile(filePath: string, content?: Buffer) {
|
||||
}
|
||||
|
||||
export class AnonymizeTransformer extends Transform {
|
||||
public isText: boolean | null = null;
|
||||
public isText: boolean;
|
||||
anonimizer: ContentAnonimizer;
|
||||
private decoder = new StringDecoder("utf8");
|
||||
// Trailing decoded text held back between chunks so that terms, URLs, or
|
||||
// markdown image patterns straddling a stream chunk boundary still match.
|
||||
// Must exceed the longest pattern we replace (terms + URLs + images).
|
||||
private pending = "";
|
||||
private static readonly OVERLAP = 4096;
|
||||
|
||||
constructor(
|
||||
readonly opt: {
|
||||
@@ -39,7 +46,11 @@ export class AnonymizeTransformer extends Transform {
|
||||
} & ConstructorParameters<typeof ContentAnonimizer>[0]
|
||||
) {
|
||||
super();
|
||||
this.isText = isTextFile(this.opt.filePath);
|
||||
// isTextFile may return null for unknown extensions; treat unknown as
|
||||
// binary. Sniffing from chunk content is unsafe — split archives,
|
||||
// compressed blobs, etc. can have an ASCII-looking first 64 KB and get
|
||||
// misclassified as text, which then UTF-8-round-trips and corrupts them.
|
||||
this.isText = isTextFile(this.opt.filePath) === true;
|
||||
this.anonimizer = new ContentAnonimizer(this.opt);
|
||||
}
|
||||
|
||||
@@ -48,23 +59,58 @@ export class AnonymizeTransformer extends Transform {
|
||||
}
|
||||
|
||||
_transform(chunk: Buffer, encoding: string, callback: () => void) {
|
||||
if (this.isText === null) {
|
||||
this.isText = isTextFile(this.opt.filePath, chunk);
|
||||
if (!this.isText) {
|
||||
this.emit("transform", {
|
||||
isText: this.isText,
|
||||
wasAnonimized: this.wasAnonimized,
|
||||
chunk,
|
||||
});
|
||||
this.push(chunk);
|
||||
return callback();
|
||||
}
|
||||
|
||||
// StringDecoder buffers trailing partial UTF-8 sequences across chunk
|
||||
// boundaries so we never decode half a codepoint into U+FFFD.
|
||||
this.pending += this.decoder.write(chunk);
|
||||
|
||||
if (this.pending.length > AnonymizeTransformer.OVERLAP) {
|
||||
let split = this.pending.length - AnonymizeTransformer.OVERLAP;
|
||||
// Avoid splitting a UTF-16 surrogate pair.
|
||||
const code = this.pending.charCodeAt(split);
|
||||
if (code >= 0xdc00 && code <= 0xdfff) {
|
||||
split -= 1;
|
||||
}
|
||||
const toProcess = this.pending.slice(0, split);
|
||||
this.pending = this.pending.slice(split);
|
||||
|
||||
const out = this.anonimizer.anonymize(toProcess);
|
||||
const outChunk = Buffer.from(out, "utf8");
|
||||
|
||||
this.emit("transform", {
|
||||
isText: this.isText,
|
||||
wasAnonimized: this.wasAnonimized,
|
||||
chunk: outChunk,
|
||||
});
|
||||
this.push(outChunk);
|
||||
}
|
||||
callback();
|
||||
}
|
||||
|
||||
_flush(callback: () => void) {
|
||||
if (this.isText) {
|
||||
const content = this.anonimizer.anonymize(chunk.toString());
|
||||
if (this.anonimizer.wasAnonymized) {
|
||||
chunk = Buffer.from(content);
|
||||
this.pending += this.decoder.end();
|
||||
if (this.pending) {
|
||||
const out = this.anonimizer.anonymize(this.pending);
|
||||
this.pending = "";
|
||||
const outChunk = Buffer.from(out, "utf8");
|
||||
this.emit("transform", {
|
||||
isText: this.isText,
|
||||
wasAnonimized: this.wasAnonimized,
|
||||
chunk: outChunk,
|
||||
});
|
||||
this.push(outChunk);
|
||||
}
|
||||
}
|
||||
|
||||
this.emit("transform", {
|
||||
isText: this.isText,
|
||||
wasAnonimized: this.wasAnonimized,
|
||||
chunk,
|
||||
});
|
||||
|
||||
this.push(chunk);
|
||||
callback();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -30,9 +30,9 @@ const AnonymizedRepositorySchema = new Schema({
|
||||
repositoryName: String,
|
||||
accessToken: String,
|
||||
},
|
||||
truckedFileList: {
|
||||
type: Boolean,
|
||||
default: false,
|
||||
truncatedFolders: {
|
||||
type: [String],
|
||||
default: [],
|
||||
},
|
||||
options: {
|
||||
terms: [String],
|
||||
|
||||
@@ -17,7 +17,7 @@ export interface IAnonymizedRepository {
|
||||
accessToken?: string;
|
||||
};
|
||||
owner: string;
|
||||
truckedFileList: boolean;
|
||||
truncatedFolders: string[];
|
||||
conference: string;
|
||||
options: {
|
||||
terms: string[];
|
||||
|
||||
@@ -21,6 +21,14 @@ const UserSchema = new Schema({
|
||||
},
|
||||
],
|
||||
isAdmin: { type: Boolean, default: false },
|
||||
apiTokens: [
|
||||
{
|
||||
tokenHash: { type: String, index: true },
|
||||
name: { type: String },
|
||||
createdAt: { type: Date, default: Date.now },
|
||||
lastUsedAt: { type: Date },
|
||||
},
|
||||
],
|
||||
photo: String,
|
||||
repositories: [
|
||||
{
|
||||
|
||||
@@ -12,6 +12,13 @@ export interface IUser {
|
||||
};
|
||||
username: string;
|
||||
isAdmin: boolean;
|
||||
apiTokens?: {
|
||||
_id?: string;
|
||||
tokenHash: string;
|
||||
name?: string;
|
||||
createdAt?: Date;
|
||||
lastUsedAt?: Date;
|
||||
}[];
|
||||
emails: {
|
||||
email: string;
|
||||
default: boolean;
|
||||
|
||||
@@ -209,8 +209,8 @@ export async function getRepositoryFromGitHub(opt: {
|
||||
accessToken: string;
|
||||
force?: boolean;
|
||||
}) {
|
||||
if (opt.repo.indexOf(".git") > -1) {
|
||||
opt.repo = opt.repo.replace(".git", "");
|
||||
if (opt.repo.endsWith(".git")) {
|
||||
opt.repo = opt.repo.slice(0, -4);
|
||||
}
|
||||
let dbModel;
|
||||
if (opt.repositoryID) {
|
||||
|
||||
@@ -15,10 +15,16 @@ import { IFile } from "../model/files/files.types";
|
||||
export default class GitHubStream extends GitHubBase {
|
||||
type: "GitHubDownload" | "GitHubStream" | "Zip" = "GitHubStream";
|
||||
|
||||
private _truncatedFolders: string[] = [];
|
||||
|
||||
constructor(data: GitHubBaseData) {
|
||||
super(data);
|
||||
}
|
||||
|
||||
get truncatedFolderList(): string[] {
|
||||
return this._truncatedFolders;
|
||||
}
|
||||
|
||||
downloadFile(token: string, sha: string) {
|
||||
const oct = octokit(token);
|
||||
try {
|
||||
@@ -106,6 +112,7 @@ export default class GitHubStream extends GitHubBase {
|
||||
}
|
||||
|
||||
async getFiles(progress?: (status: string) => void) {
|
||||
this._truncatedFolders = [];
|
||||
return this.getTruncatedTree(this.data.commit, progress);
|
||||
}
|
||||
|
||||
@@ -149,19 +156,32 @@ export default class GitHubStream extends GitHubBase {
|
||||
}
|
||||
},
|
||||
});
|
||||
if (data.truncated) {
|
||||
this._truncatedFolders.push(parentPath);
|
||||
}
|
||||
output.push(...this.tree2Tree(data.tree, parentPath));
|
||||
} catch (error) {
|
||||
console.log(error);
|
||||
if ((error as { status?: number }).status == 409 || (error as { status?: number }).status == 404) {
|
||||
// empty repo
|
||||
data = { tree: [] };
|
||||
} else {
|
||||
throw new AnonymousError("repo_not_found", {
|
||||
httpStatus: (error as { status?: number }).status || 404,
|
||||
const status = (error as { status?: number }).status;
|
||||
if (status === 409) {
|
||||
throw new AnonymousError("repo_empty", {
|
||||
httpStatus: 409,
|
||||
object: this.data,
|
||||
cause: error as Error,
|
||||
});
|
||||
}
|
||||
if (status === 404) {
|
||||
throw new AnonymousError("repo_not_found", {
|
||||
httpStatus: 404,
|
||||
object: this.data,
|
||||
cause: error as Error,
|
||||
});
|
||||
}
|
||||
throw new AnonymousError("repo_not_found", {
|
||||
httpStatus: status || 500,
|
||||
object: this.data,
|
||||
cause: error as Error,
|
||||
});
|
||||
}
|
||||
const promises: ReturnType<GitHubStream["getGHTree"]>[] = [];
|
||||
const parentPaths: string[] = [];
|
||||
@@ -183,7 +203,7 @@ export default class GitHubStream extends GitHubBase {
|
||||
}
|
||||
(await Promise.all(promises)).forEach((data, i) => {
|
||||
if (data.truncated) {
|
||||
// TODO: the tree is truncated
|
||||
this._truncatedFolders.push(parentPaths[i]);
|
||||
}
|
||||
output.push(...this.tree2Tree(data.tree, parentPaths[i]));
|
||||
});
|
||||
|
||||
@@ -0,0 +1,106 @@
|
||||
import got from "got";
|
||||
import { Parse } from "unzip-stream";
|
||||
import archiver = require("archiver");
|
||||
|
||||
import GitHubDownload from "./source/GitHubDownload";
|
||||
import { AnonymizeTransformer, anonymizePath } from "./anonymize-utils";
|
||||
|
||||
/**
 * Inputs required by `streamAnonymizedZip`.
 *
 * The first five members are handed unchanged to the `GitHubDownload` source
 * that resolves the archive URL; `anonymizerOptions` configures the
 * per-entry `AnonymizeTransformer`.
 */
export interface StreamAnonymizedZipOptions {
  /** Repository identifier forwarded to the `GitHubDownload` source. */
  repoId: string;
  /** Organization value forwarded to the `GitHubDownload` source. */
  organization: string;
  /** Repository name forwarded to the `GitHubDownload` source. */
  repoName: string;
  /** Commit value forwarded to the `GitHubDownload` source. */
  commit: string;
  /** Supplies the token, synchronously or asynchronously; forwarded to `GitHubDownload`. */
  getToken: () => string | Promise<string>;
  /**
   * Options used to build an `AnonymizeTransformer` for every file entry.
   * Its `terms` member is also used to anonymize entry paths.
   */
  anonymizerOptions: ConstructorParameters<typeof AnonymizeTransformer>[0];
}
|
||||
|
||||
/**
 * Stream the GitHub source zip for a repository, anonymize each entry on the
 * fly, and pipe the resulting archive into the provided writable response.
 *
 * No data is written to local storage — the zip flows GitHub → unzip → per
 * file anonymizer → archiver → response.
 *
 * The function only awaits the zip URL lookup; the returned promise resolves
 * once the pipeline is wired up, not when the archive has been fully written.
 *
 * @param opt - Repository coordinates, token provider and anonymizer options.
 * @param res - Writable destination of the anonymized archive. When it emits
 *   `error` or `close` the upstream download is destroyed.
 */
export async function streamAnonymizedZip(
  opt: StreamAnonymizedZipOptions,
  res: NodeJS.WritableStream & {
    on(event: string, listener: (...args: unknown[]) => void): unknown;
  }
): Promise<void> {
  const source = new GitHubDownload({
    repoId: opt.repoId,
    organization: opt.organization,
    repoName: opt.repoName,
    commit: opt.commit,
    getToken: opt.getToken,
  });

  const response = await source.getZipUrl();
  const downloadStream = got.stream(response.url);

  // Stop downloading as soon as the consumer fails or goes away.
  res.on("error", (error) => {
    console.error(error);
    downloadStream.destroy();
  });
  res.on("close", () => {
    downloadStream.destroy();
  });

  const archive = archiver("zip", {});

  // Several events below (download error/close, parser error/finish) each
  // request finalization. Only the first request is forwarded; the repeated
  // calls previously relied on the surrounding try/catch to swallow whatever
  // the extra finalize() produced.
  let finalizeRequested = false;
  const finalizeOnce = (): void => {
    if (finalizeRequested) {
      return;
    }
    finalizeRequested = true;
    try {
      // Promise.resolve() tolerates both promise and non-promise return
      // values, and the catch keeps a failed finalization from surfacing as
      // an unhandled rejection.
      Promise.resolve(archive.finalize()).catch((error: unknown) => {
        console.error(error);
      });
    } catch {
      /* ignored */
    }
  };

  downloadStream
    .on("error", (error) => {
      console.error(error);
      finalizeOnce();
    })
    .on("close", () => {
      finalizeOnce();
    })
    .pipe(Parse())
    .on("entry", (entry: NodeJS.ReadableStream & { type: string; path: string; autodrain: () => void }) => {
      if (entry.type !== "File") {
        // Directories and other non-file entries are discarded.
        entry.autodrain();
        return;
      }
      try {
        // Drop everything up to and including the first "/" of the entry
        // path, then anonymize the remaining path.
        const fileName = anonymizePath(
          entry.path.substring(entry.path.indexOf("/") + 1),
          opt.anonymizerOptions.terms || []
        );
        // The transformer decides text vs. binary from `filePath` inside its
        // constructor, so the path must be supplied at construction time.
        // A per-entry copy also keeps the caller's shared options object
        // from being mutated by every entry.
        const anonymizer = new AnonymizeTransformer({
          ...opt.anonymizerOptions,
          filePath: fileName,
        });
        archive.append(entry.pipe(anonymizer), { name: fileName });
      } catch (error) {
        entry.autodrain();
        console.error(error);
      }
    })
    .on("error", (error: Error) => {
      console.error(error);
      finalizeOnce();
    })
    .on("finish", () => {
      finalizeOnce();
    });

  // pipe() returns its destination, so this listener observes errors on `res`.
  archive.pipe(res).on("error", (error) => {
    console.error(error);
    (res as { end?: () => void }).end?.();
  });
}
|
||||
Reference in New Issue
Block a user