Multiple fixes: hold anonymization state across stream chunk boundaries, track truncated folders (replacing truckedFileList), strip only a trailing ".git" suffix, add user API tokens, and stream anonymized zip downloads without local storage

This commit is contained in:
tdurieux
2026-05-03 15:30:54 +02:00
parent 1968e3341a
commit a5f66d6844
31 changed files with 1513 additions and 464 deletions
+10 -16
View File
@@ -157,8 +157,18 @@ export default class Repository {
files.forEach((f) => (f.repoId = this.repoId));
await FileModel.insertMany(files);
const sourceWithTruncation = this.source as unknown as {
truncatedFolderList?: string[];
};
if (Array.isArray(sourceWithTruncation.truncatedFolderList)) {
this._model.truncatedFolders = sourceWithTruncation.truncatedFolderList;
}
this._model.size = { storage: 0, file: 0 };
await this.computeSize();
if (isConnected) {
await this._model.save();
}
}
if (opt.path?.includes(config.ANONYMIZATION_MASK)) {
const f = new AnonymizedFile({
@@ -304,22 +314,6 @@ export default class Repository {
force: true,
});
if (ghRepo.size) {
if (
ghRepo.size > config.AUTO_DOWNLOAD_REPO_SIZE &&
this.model.source.type == "GitHubDownload"
) {
this.model.source.type = "GitHubStream";
await this.model.save();
} else if (
ghRepo.size < config.AUTO_DOWNLOAD_REPO_SIZE &&
this.model.source.type == "GitHubStream"
) {
this.model.source.type = "GitHubDownload";
await this.model.save();
}
}
// update the repository name if it has changed
this.model.source.repositoryName = ghRepo.fullName;
const branches = await ghRepo.branches({
+61 -15
View File
@@ -1,5 +1,6 @@
import { basename } from "path";
import { Transform, Readable } from "stream";
import { StringDecoder } from "string_decoder";
import { isText } from "istextorbinary";
import config from "../config";
@@ -30,8 +31,14 @@ export function isTextFile(filePath: string, content?: Buffer) {
}
export class AnonymizeTransformer extends Transform {
public isText: boolean | null = null;
public isText: boolean;
anonimizer: ContentAnonimizer;
private decoder = new StringDecoder("utf8");
// Trailing decoded text held back between chunks so that terms, URLs, or
// markdown image patterns straddling a stream chunk boundary still match.
// Must exceed the longest pattern we replace (terms + URLs + images).
private pending = "";
private static readonly OVERLAP = 4096;
constructor(
readonly opt: {
@@ -39,7 +46,11 @@ export class AnonymizeTransformer extends Transform {
} & ConstructorParameters<typeof ContentAnonimizer>[0]
) {
super();
this.isText = isTextFile(this.opt.filePath);
// isTextFile may return null for unknown extensions; treat unknown as
// binary. Sniffing from chunk content is unsafe — split archives,
// compressed blobs, etc. can have an ASCII-looking first 64 KB and get
// misclassified as text, which then UTF-8-round-trips and corrupts them.
this.isText = isTextFile(this.opt.filePath) === true;
this.anonimizer = new ContentAnonimizer(this.opt);
}
@@ -48,23 +59,58 @@ export class AnonymizeTransformer extends Transform {
}
_transform(chunk: Buffer, encoding: string, callback: () => void) {
if (this.isText === null) {
this.isText = isTextFile(this.opt.filePath, chunk);
if (!this.isText) {
this.emit("transform", {
isText: this.isText,
wasAnonimized: this.wasAnonimized,
chunk,
});
this.push(chunk);
return callback();
}
// StringDecoder buffers trailing partial UTF-8 sequences across chunk
// boundaries so we never decode half a codepoint into U+FFFD.
this.pending += this.decoder.write(chunk);
if (this.pending.length > AnonymizeTransformer.OVERLAP) {
let split = this.pending.length - AnonymizeTransformer.OVERLAP;
// Avoid splitting a UTF-16 surrogate pair.
const code = this.pending.charCodeAt(split);
if (code >= 0xdc00 && code <= 0xdfff) {
split -= 1;
}
const toProcess = this.pending.slice(0, split);
this.pending = this.pending.slice(split);
const out = this.anonimizer.anonymize(toProcess);
const outChunk = Buffer.from(out, "utf8");
this.emit("transform", {
isText: this.isText,
wasAnonimized: this.wasAnonimized,
chunk: outChunk,
});
this.push(outChunk);
}
callback();
}
_flush(callback: () => void) {
if (this.isText) {
const content = this.anonimizer.anonymize(chunk.toString());
if (this.anonimizer.wasAnonymized) {
chunk = Buffer.from(content);
this.pending += this.decoder.end();
if (this.pending) {
const out = this.anonimizer.anonymize(this.pending);
this.pending = "";
const outChunk = Buffer.from(out, "utf8");
this.emit("transform", {
isText: this.isText,
wasAnonimized: this.wasAnonimized,
chunk: outChunk,
});
this.push(outChunk);
}
}
this.emit("transform", {
isText: this.isText,
wasAnonimized: this.wasAnonimized,
chunk,
});
this.push(chunk);
callback();
}
}
@@ -30,9 +30,9 @@ const AnonymizedRepositorySchema = new Schema({
repositoryName: String,
accessToken: String,
},
truckedFileList: {
type: Boolean,
default: false,
truncatedFolders: {
type: [String],
default: [],
},
options: {
terms: [String],
@@ -17,7 +17,7 @@ export interface IAnonymizedRepository {
accessToken?: string;
};
owner: string;
truckedFileList: boolean;
truncatedFolders: string[];
conference: string;
options: {
terms: string[];
+8
View File
@@ -21,6 +21,14 @@ const UserSchema = new Schema({
},
],
isAdmin: { type: Boolean, default: false },
apiTokens: [
{
tokenHash: { type: String, index: true },
name: { type: String },
createdAt: { type: Date, default: Date.now },
lastUsedAt: { type: Date },
},
],
photo: String,
repositories: [
{
+7
View File
@@ -12,6 +12,13 @@ export interface IUser {
};
username: string;
isAdmin: boolean;
apiTokens?: {
_id?: string;
tokenHash: string;
name?: string;
createdAt?: Date;
lastUsedAt?: Date;
}[];
emails: {
email: string;
default: boolean;
+2 -2
View File
@@ -209,8 +209,8 @@ export async function getRepositoryFromGitHub(opt: {
accessToken: string;
force?: boolean;
}) {
if (opt.repo.indexOf(".git") > -1) {
opt.repo = opt.repo.replace(".git", "");
if (opt.repo.endsWith(".git")) {
opt.repo = opt.repo.slice(0, -4);
}
let dbModel;
if (opt.repositoryID) {
+27 -7
View File
@@ -15,10 +15,16 @@ import { IFile } from "../model/files/files.types";
export default class GitHubStream extends GitHubBase {
type: "GitHubDownload" | "GitHubStream" | "Zip" = "GitHubStream";
private _truncatedFolders: string[] = [];
constructor(data: GitHubBaseData) {
super(data);
}
get truncatedFolderList(): string[] {
return this._truncatedFolders;
}
downloadFile(token: string, sha: string) {
const oct = octokit(token);
try {
@@ -106,6 +112,7 @@ export default class GitHubStream extends GitHubBase {
}
async getFiles(progress?: (status: string) => void) {
this._truncatedFolders = [];
return this.getTruncatedTree(this.data.commit, progress);
}
@@ -149,19 +156,32 @@ export default class GitHubStream extends GitHubBase {
}
},
});
if (data.truncated) {
this._truncatedFolders.push(parentPath);
}
output.push(...this.tree2Tree(data.tree, parentPath));
} catch (error) {
console.log(error);
if ((error as { status?: number }).status == 409 || (error as { status?: number }).status == 404) {
// empty repo
data = { tree: [] };
} else {
throw new AnonymousError("repo_not_found", {
httpStatus: (error as { status?: number }).status || 404,
const status = (error as { status?: number }).status;
if (status === 409) {
throw new AnonymousError("repo_empty", {
httpStatus: 409,
object: this.data,
cause: error as Error,
});
}
if (status === 404) {
throw new AnonymousError("repo_not_found", {
httpStatus: 404,
object: this.data,
cause: error as Error,
});
}
throw new AnonymousError("repo_not_found", {
httpStatus: status || 500,
object: this.data,
cause: error as Error,
});
}
const promises: ReturnType<GitHubStream["getGHTree"]>[] = [];
const parentPaths: string[] = [];
@@ -183,7 +203,7 @@ export default class GitHubStream extends GitHubBase {
}
(await Promise.all(promises)).forEach((data, i) => {
if (data.truncated) {
// TODO: the tree is truncated
this._truncatedFolders.push(parentPaths[i]);
}
output.push(...this.tree2Tree(data.tree, parentPaths[i]));
});
+106
View File
@@ -0,0 +1,106 @@
import got from "got";
import { Parse } from "unzip-stream";
import archiver = require("archiver");
import GitHubDownload from "./source/GitHubDownload";
import { AnonymizeTransformer, anonymizePath } from "./anonymize-utils";
/**
 * Options for {@link streamAnonymizedZip}.
 */
export interface StreamAnonymizedZipOptions {
  /** Database id of the anonymized repository. */
  repoId: string;
  /** GitHub organization (or user) that owns the source repository. */
  organization: string;
  /** GitHub repository name (without the organization prefix). */
  repoName: string;
  /** Commit SHA whose source archive is downloaded and anonymized. */
  commit: string;
  /** Supplies a GitHub access token; may be async so it can be refreshed. */
  getToken: () => string | Promise<string>;
  /**
   * Options forwarded to each per-file AnonymizeTransformer
   * (terms, image/link handling, etc.). `filePath` is overridden per entry.
   */
  anonymizerOptions: ConstructorParameters<typeof AnonymizeTransformer>[0];
}
/**
* Stream the GitHub source zip for a repository, anonymize each entry on the
* fly, and pipe the resulting archive into the provided writable response.
*
* No data is written to local storage — the zip flows GitHub → unzip → per
* file anonymizer → archiver → response.
*/
export async function streamAnonymizedZip(
  opt: StreamAnonymizedZipOptions,
  res: NodeJS.WritableStream & {
    on(event: string, listener: (...args: unknown[]) => void): unknown;
  }
): Promise<void> {
  const source = new GitHubDownload({
    repoId: opt.repoId,
    organization: opt.organization,
    repoName: opt.repoName,
    commit: opt.commit,
    getToken: opt.getToken,
  });

  const response = await source.getZipUrl();
  const downloadStream = got.stream(response.url);

  // If the client goes away (or errors), stop pulling from GitHub.
  res.on("error", (error) => {
    console.error(error);
    downloadStream.destroy();
  });
  res.on("close", () => {
    downloadStream.destroy();
  });

  const archive = archiver("zip", {});

  // archive.finalize() throws if called twice; several terminal events below
  // can race, so the guard lives in one place.
  const finalizeArchive = () => {
    try {
      archive.finalize();
    } catch {
      /* ignored */
    }
  };

  downloadStream
    .on("error", (error) => {
      console.error(error);
      finalizeArchive();
    })
    // NOTE(review): finalizing on the download stream's "close" may race with
    // entries still flowing through the unzip parser — confirm the parser's
    // "finish" handler below is the one that actually completes the archive.
    .on("close", finalizeArchive)
    .pipe(Parse())
    .on(
      "entry",
      (
        entry: NodeJS.ReadableStream & {
          type: string;
          path: string;
          autodrain: () => void;
        }
      ) => {
        if (entry.type === "File") {
          try {
            // Drop the "<repo>-<sha>/" prefix GitHub puts on every archive
            // entry, then anonymize the remaining path.
            const fileName = anonymizePath(
              entry.path.substring(entry.path.indexOf("/") + 1),
              opt.anonymizerOptions.terms || []
            );
            // The transformer decides text-vs-binary from opt.filePath in its
            // constructor, so the path must be set BEFORE construction —
            // assigning anonymizer.opt.filePath afterwards is too late and
            // would also mutate the shared anonymizerOptions object across
            // entries. Use a per-entry copy instead.
            const anonymizer = new AnonymizeTransformer({
              ...opt.anonymizerOptions,
              filePath: fileName,
            });
            const st = entry.pipe(anonymizer);
            archive.append(st, { name: fileName });
          } catch (error) {
            // Skip the broken entry but keep the archive going; the entry
            // must still be drained or the unzip stream stalls.
            entry.autodrain();
            console.error(error);
          }
        } else {
          // Directories and other non-file entries carry no content to keep.
          entry.autodrain();
        }
      }
    )
    .on("error", (error: Error) => {
      console.error(error);
      finalizeArchive();
    })
    .on("finish", finalizeArchive);

  archive.pipe(res).on("error", (error) => {
    console.error(error);
    (res as { end?: () => void }).end?.();
  });
}