multiple fixes

This commit is contained in:
tdurieux
2026-05-03 15:30:54 +02:00
parent 1968e3341a
commit a5f66d6844
31 changed files with 1513 additions and 464 deletions
+10 -16
View File
@@ -157,8 +157,18 @@ export default class Repository {
files.forEach((f) => (f.repoId = this.repoId));
await FileModel.insertMany(files);
const sourceWithTruncation = this.source as unknown as {
truncatedFolderList?: string[];
};
if (Array.isArray(sourceWithTruncation.truncatedFolderList)) {
this._model.truncatedFolders = sourceWithTruncation.truncatedFolderList;
}
this._model.size = { storage: 0, file: 0 };
await this.computeSize();
if (isConnected) {
await this._model.save();
}
}
if (opt.path?.includes(config.ANONYMIZATION_MASK)) {
const f = new AnonymizedFile({
@@ -304,22 +314,6 @@ export default class Repository {
force: true,
});
if (ghRepo.size) {
if (
ghRepo.size > config.AUTO_DOWNLOAD_REPO_SIZE &&
this.model.source.type == "GitHubDownload"
) {
this.model.source.type = "GitHubStream";
await this.model.save();
} else if (
ghRepo.size < config.AUTO_DOWNLOAD_REPO_SIZE &&
this.model.source.type == "GitHubStream"
) {
this.model.source.type = "GitHubDownload";
await this.model.save();
}
}
// update the repository name if it has changed
this.model.source.repositoryName = ghRepo.fullName;
const branches = await ghRepo.branches({
+61 -15
View File
@@ -1,5 +1,6 @@
import { basename } from "path";
import { Transform, Readable } from "stream";
import { StringDecoder } from "string_decoder";
import { isText } from "istextorbinary";
import config from "../config";
@@ -30,8 +31,14 @@ export function isTextFile(filePath: string, content?: Buffer) {
}
export class AnonymizeTransformer extends Transform {
public isText: boolean | null = null;
public isText: boolean;
anonimizer: ContentAnonimizer;
private decoder = new StringDecoder("utf8");
// Trailing decoded text held back between chunks so that terms, URLs, or
// markdown image patterns straddling a stream chunk boundary still match.
// Must exceed the longest pattern we replace (terms + URLs + images).
private pending = "";
private static readonly OVERLAP = 4096;
constructor(
readonly opt: {
@@ -39,7 +46,11 @@ export class AnonymizeTransformer extends Transform {
} & ConstructorParameters<typeof ContentAnonimizer>[0]
) {
super();
this.isText = isTextFile(this.opt.filePath);
// isTextFile may return null for unknown extensions; treat unknown as
// binary. Sniffing from chunk content is unsafe — split archives,
// compressed blobs, etc. can have an ASCII-looking first 64 KB and get
// misclassified as text, which then UTF-8-round-trips and corrupts them.
this.isText = isTextFile(this.opt.filePath) === true;
this.anonimizer = new ContentAnonimizer(this.opt);
}
@@ -48,23 +59,58 @@ export class AnonymizeTransformer extends Transform {
}
_transform(chunk: Buffer, encoding: string, callback: () => void) {
if (this.isText === null) {
this.isText = isTextFile(this.opt.filePath, chunk);
if (!this.isText) {
this.emit("transform", {
isText: this.isText,
wasAnonimized: this.wasAnonimized,
chunk,
});
this.push(chunk);
return callback();
}
// StringDecoder buffers trailing partial UTF-8 sequences across chunk
// boundaries so we never decode half a codepoint into U+FFFD.
this.pending += this.decoder.write(chunk);
if (this.pending.length > AnonymizeTransformer.OVERLAP) {
let split = this.pending.length - AnonymizeTransformer.OVERLAP;
// Avoid splitting a UTF-16 surrogate pair.
const code = this.pending.charCodeAt(split);
if (code >= 0xdc00 && code <= 0xdfff) {
split -= 1;
}
const toProcess = this.pending.slice(0, split);
this.pending = this.pending.slice(split);
const out = this.anonimizer.anonymize(toProcess);
const outChunk = Buffer.from(out, "utf8");
this.emit("transform", {
isText: this.isText,
wasAnonimized: this.wasAnonimized,
chunk: outChunk,
});
this.push(outChunk);
}
callback();
}
_flush(callback: () => void) {
if (this.isText) {
const content = this.anonimizer.anonymize(chunk.toString());
if (this.anonimizer.wasAnonymized) {
chunk = Buffer.from(content);
this.pending += this.decoder.end();
if (this.pending) {
const out = this.anonimizer.anonymize(this.pending);
this.pending = "";
const outChunk = Buffer.from(out, "utf8");
this.emit("transform", {
isText: this.isText,
wasAnonimized: this.wasAnonimized,
chunk: outChunk,
});
this.push(outChunk);
}
}
this.emit("transform", {
isText: this.isText,
wasAnonimized: this.wasAnonimized,
chunk,
});
this.push(chunk);
callback();
}
}
@@ -30,9 +30,9 @@ const AnonymizedRepositorySchema = new Schema({
repositoryName: String,
accessToken: String,
},
truckedFileList: {
type: Boolean,
default: false,
truncatedFolders: {
type: [String],
default: [],
},
options: {
terms: [String],
@@ -17,7 +17,7 @@ export interface IAnonymizedRepository {
accessToken?: string;
};
owner: string;
truckedFileList: boolean;
truncatedFolders: string[];
conference: string;
options: {
terms: string[];
+8
View File
@@ -21,6 +21,14 @@ const UserSchema = new Schema({
},
],
isAdmin: { type: Boolean, default: false },
apiTokens: [
{
tokenHash: { type: String, index: true },
name: { type: String },
createdAt: { type: Date, default: Date.now },
lastUsedAt: { type: Date },
},
],
photo: String,
repositories: [
{
+7
View File
@@ -12,6 +12,13 @@ export interface IUser {
};
username: string;
isAdmin: boolean;
apiTokens?: {
_id?: string;
tokenHash: string;
name?: string;
createdAt?: Date;
lastUsedAt?: Date;
}[];
emails: {
email: string;
default: boolean;
+2 -2
View File
@@ -209,8 +209,8 @@ export async function getRepositoryFromGitHub(opt: {
accessToken: string;
force?: boolean;
}) {
if (opt.repo.indexOf(".git") > -1) {
opt.repo = opt.repo.replace(".git", "");
if (opt.repo.endsWith(".git")) {
opt.repo = opt.repo.slice(0, -4);
}
let dbModel;
if (opt.repositoryID) {
+27 -7
View File
@@ -15,10 +15,16 @@ import { IFile } from "../model/files/files.types";
export default class GitHubStream extends GitHubBase {
type: "GitHubDownload" | "GitHubStream" | "Zip" = "GitHubStream";
private _truncatedFolders: string[] = [];
constructor(data: GitHubBaseData) {
super(data);
}
get truncatedFolderList(): string[] {
return this._truncatedFolders;
}
downloadFile(token: string, sha: string) {
const oct = octokit(token);
try {
@@ -106,6 +112,7 @@ export default class GitHubStream extends GitHubBase {
}
async getFiles(progress?: (status: string) => void) {
this._truncatedFolders = [];
return this.getTruncatedTree(this.data.commit, progress);
}
@@ -149,19 +156,32 @@ export default class GitHubStream extends GitHubBase {
}
},
});
if (data.truncated) {
this._truncatedFolders.push(parentPath);
}
output.push(...this.tree2Tree(data.tree, parentPath));
} catch (error) {
console.log(error);
if ((error as { status?: number }).status == 409 || (error as { status?: number }).status == 404) {
// empty repo
data = { tree: [] };
} else {
throw new AnonymousError("repo_not_found", {
httpStatus: (error as { status?: number }).status || 404,
const status = (error as { status?: number }).status;
if (status === 409) {
throw new AnonymousError("repo_empty", {
httpStatus: 409,
object: this.data,
cause: error as Error,
});
}
if (status === 404) {
throw new AnonymousError("repo_not_found", {
httpStatus: 404,
object: this.data,
cause: error as Error,
});
}
throw new AnonymousError("repo_not_found", {
httpStatus: status || 500,
object: this.data,
cause: error as Error,
});
}
const promises: ReturnType<GitHubStream["getGHTree"]>[] = [];
const parentPaths: string[] = [];
@@ -183,7 +203,7 @@ export default class GitHubStream extends GitHubBase {
}
(await Promise.all(promises)).forEach((data, i) => {
if (data.truncated) {
// TODO: the tree is truncated
this._truncatedFolders.push(parentPaths[i]);
}
output.push(...this.tree2Tree(data.tree, parentPaths[i]));
});
+106
View File
@@ -0,0 +1,106 @@
import got from "got";
import { Parse } from "unzip-stream";
import archiver = require("archiver");
import GitHubDownload from "./source/GitHubDownload";
import { AnonymizeTransformer, anonymizePath } from "./anonymize-utils";
export interface StreamAnonymizedZipOptions {
repoId: string;
organization: string;
repoName: string;
commit: string;
getToken: () => string | Promise<string>;
anonymizerOptions: ConstructorParameters<typeof AnonymizeTransformer>[0];
}
/**
* Stream the GitHub source zip for a repository, anonymize each entry on the
* fly, and pipe the resulting archive into the provided writable response.
*
 * No data is written to local storage: the zip flows GitHub → unzip →
 * per-file anonymizer → archiver → response.
*/
export async function streamAnonymizedZip(
  opt: StreamAnonymizedZipOptions,
  res: NodeJS.WritableStream & {
    on(event: string, listener: (...args: unknown[]) => void): unknown;
  }
): Promise<void> {
  const source = new GitHubDownload({
    repoId: opt.repoId,
    organization: opt.organization,
    repoName: opt.repoName,
    commit: opt.commit,
    getToken: opt.getToken,
  });
  const response = await source.getZipUrl();
  const downloadStream = got.stream(response.url);
  const archive = archiver("zip", {});

  // The archive must be closed exactly once. On success we finalize (flush
  // the zip central directory); on failure we abort instead — finalizing
  // after an error would hand the client a valid-looking but silently
  // truncated zip with a success status.
  let closed = false;
  const finalizeOnce = () => {
    if (closed) return;
    closed = true;
    archive.finalize();
  };
  const abortOnce = (error: unknown) => {
    console.error(error);
    downloadStream.destroy();
    if (closed) return;
    closed = true;
    try {
      archive.abort();
    } catch {
      /* ignored */
    }
  };

  res.on("error", abortOnce);
  // Client disconnected: stop pulling data from GitHub.
  res.on("close", () => {
    downloadStream.destroy();
  });

  downloadStream
    .on("error", abortOnce)
    .pipe(Parse())
    .on(
      "entry",
      (
        entry: NodeJS.ReadableStream & {
          type: string;
          path: string;
          autodrain: () => void;
        }
      ) => {
        if (entry.type === "File") {
          try {
            // Strip the top-level "<repo>-<sha>/" folder GitHub puts in its
            // source zips, then anonymize the remaining path.
            const fileName = anonymizePath(
              entry.path.substring(entry.path.indexOf("/") + 1),
              opt.anonymizerOptions.terms || []
            );
            const anonymizer = new AnonymizeTransformer(opt.anonymizerOptions);
            anonymizer.opt.filePath = fileName;
            const st = entry.pipe(anonymizer);
            archive.append(st, { name: fileName });
          } catch (error) {
            entry.autodrain();
            console.error(error);
          }
        } else {
          // Directories and other non-file entries carry no content we keep.
          entry.autodrain();
        }
      }
    )
    .on("error", abortOnce)
    // Finalize only once the parser has emitted every entry. Finalizing on
    // the download stream's "close" could race entries still buffered in
    // the parser and drop them from the output archive.
    .on("finish", finalizeOnce);

  archive.pipe(res).on("error", (error) => {
    console.error(error);
    (res as { end?: () => void }).end?.();
  });
}
+2
View File
@@ -12,6 +12,7 @@ import * as compression from "compression";
import * as passport from "passport";
import { connect } from "./database";
import { initSession, router as connectionRouter } from "./routes/connection";
import { bearerTokenAuth } from "./routes/token-auth";
import router from "./routes";
import AnonymizedRepositoryModel from "../core/model/anonymizedRepositories/anonymizedRepositories.model";
import { conferenceStatusCheck, repositoryStatusCheck } from "./schedule";
@@ -56,6 +57,7 @@ export default async function start() {
app.use(initSession());
app.use(passport.initialize());
app.use(passport.session());
app.use(bearerTokenAuth);
startWorker();
+77
View File
@@ -0,0 +1,77 @@
import * as express from "express";
import { handleError, getUser, isOwnerOrAdmin } from "./route-utils";
import UserModel from "../../core/model/users/users.model";
import { generateToken, hashToken } from "./token-auth";
const router = express.Router();

// Guard every token route: the caller must be an authenticated admin
// (isOwnerOrAdmin with no owners only passes for admins).
router.use(async (req, res, next) => {
  try {
    isOwnerOrAdmin([], await getUser(req));
    next();
  } catch (error) {
    handleError(error, res, req);
  }
});

// List the caller's API tokens. Only metadata is returned — the token
// value itself is never stored and never re-shown.
router.get("/", async (req, res) => {
  try {
    const user = await getUser(req);
    const model = await UserModel.findById(user.model.id);
    if (!model) return res.status(404).json({ error: "user_not_found" });
    const tokens = [];
    for (const t of model.apiTokens || []) {
      tokens.push({
        id: t._id,
        name: t.name,
        createdAt: t.createdAt,
        lastUsedAt: t.lastUsedAt,
      });
    }
    res.json(tokens);
  } catch (error) {
    handleError(error, res, req);
  }
});

// Create a new API token. The plaintext token appears exactly once in this
// response; only its SHA-256 hash is persisted.
router.post("/", async (req, res) => {
  try {
    const user = await getUser(req);
    const trimmed = (req.body?.name || "").toString().trim();
    const name = trimmed || "unnamed";
    const plaintext = generateToken();
    const tokenHash = hashToken(plaintext);
    const model = await UserModel.findById(user.model.id);
    if (!model) return res.status(404).json({ error: "user_not_found" });
    if (!model.apiTokens) model.apiTokens = [];
    model.apiTokens.push({ tokenHash, name, createdAt: new Date() });
    await model.save();
    const created = model.apiTokens[model.apiTokens.length - 1];
    res.json({
      id: created._id,
      name: created.name,
      createdAt: created.createdAt,
      token: plaintext,
    });
  } catch (error) {
    handleError(error, res, req);
  }
});

// Revoke a token by its subdocument id.
router.delete("/:id", async (req, res) => {
  try {
    const user = await getUser(req);
    const result = await UserModel.updateOne(
      { _id: user.model.id },
      { $pull: { apiTokens: { _id: req.params.id } } }
    );
    res.json({ removed: result.modifiedCount });
  } catch (error) {
    handleError(error, res, req);
  }
});

export default router;
+3
View File
@@ -9,6 +9,7 @@ import Repository from "../../core/Repository";
import User from "../../core/User";
import { ensureAuthenticated } from "./connection";
import { handleError, getUser, isOwnerOrAdmin, getRepo } from "./route-utils";
import adminTokensRouter from "./admin-tokens";
const router = express.Router();
@@ -31,6 +32,8 @@ router.use(
}
);
router.use("/tokens", adminTokensRouter);
router.post("/queue/:name/:repo_id", async (req, res) => {
let queue: Queue<Repository, void>;
if (req.params.name == "download") {
+26 -11
View File
@@ -31,13 +31,23 @@ const verify = async (
): Promise<void> => {
let user: IUserDocument | null;
try {
const now = new Date();
user = await UserModel.findOne({ "externalIDs.github": profile.id });
if (user) {
user.accessTokens.github = accessToken;
await UserModel.updateOne(
{ _id: user._id },
{
$set: {
"accessTokens.github": accessToken,
"accessTokenDates.github": now,
},
}
);
await AnonymizedPullRequestModel.updateMany(
{ owner: user._id },
{ "source.accessToken": accessToken }
);
user = await UserModel.findById(user._id);
} else {
// Check if a user with this username already exists (e.g. created
// manually without externalIDs.github). Link the GitHub ID to the
@@ -45,8 +55,17 @@ const verify = async (
// the isAdmin flag.
user = await UserModel.findOne({ username: profile.username });
if (user) {
user.externalIDs.github = profile.id;
user.accessTokens.github = accessToken;
await UserModel.updateOne(
{ _id: user._id },
{
$set: {
"externalIDs.github": profile.id,
"accessTokens.github": accessToken,
"accessTokenDates.github": now,
},
}
);
user = await UserModel.findById(user._id);
} else {
const photo = profile.photos ? profile.photos[0]?.value : null;
user = new UserModel({
@@ -54,6 +73,9 @@ const verify = async (
accessTokens: {
github: accessToken,
},
accessTokenDates: {
github: now,
},
externalIDs: {
github: profile.id,
},
@@ -63,16 +85,9 @@ const verify = async (
photo,
});
if (user.emails?.length) user.emails[0].default = true;
await user.save();
}
}
if (!user.accessTokenDates) {
user.accessTokenDates = {
github: new Date(),
};
} else {
user.accessTokenDates.github = new Date();
}
await user.save();
done(null, {
username: profile.username,
accessToken,
-34
View File
@@ -17,7 +17,6 @@ import User from "../../core/User";
import { RepositoryStatus } from "../../core/types";
import { IUserDocument } from "../../core/model/users/users.types";
import { checkToken } from "../../core/GitHubUtils";
import config from "../../config";
const router = express.Router();
@@ -404,20 +403,6 @@ router.post(
httpStatus: 404,
});
}
if (repository.size) {
if (
repository.size > config.AUTO_DOWNLOAD_REPO_SIZE &&
repo.model.source.type == "GitHubDownload"
) {
repo.model.source.type = "GitHubStream";
} else if (
repository.size < config.AUTO_DOWNLOAD_REPO_SIZE &&
repo.model.source.type == "GitHubStream"
) {
repo.model.source.type = "GitHubDownload";
}
}
const removeRepoFromConference = async (conferenceID: string) => {
const conf = await ConferenceModel.findOne({
conferenceID,
@@ -528,25 +513,6 @@ router.post("/", async (req: express.Request, res: express.Response) => {
repo.source.accessToken = user.accessToken;
repo.source.repositoryId = repository.model.id;
repo.source.repositoryName = repoUpdate.fullName;
if (
repository.size !== undefined &&
repository.size < config.AUTO_DOWNLOAD_REPO_SIZE
) {
repo.source.type = "GitHubDownload";
}
if (repository.size) {
if (
repository.size > config.AUTO_DOWNLOAD_REPO_SIZE &&
repo.source.type == "GitHubDownload"
) {
repo.source.type = "GitHubStream";
} else if (
repository.size < config.AUTO_DOWNLOAD_REPO_SIZE &&
repo.source.type == "GitHubStream"
) {
repo.source.type = "GitHubDownload";
}
}
repo.conference = repoUpdate.conference;
+23 -6
View File
@@ -1,6 +1,4 @@
import { promisify } from "util";
import * as express from "express";
import * as stream from "stream";
import config from "../../config";
import got from "got";
import { join } from "path";
@@ -10,14 +8,14 @@ import AnonymousError from "../../core/AnonymousError";
import { downloadQueue } from "../../queue";
import { RepositoryStatus } from "../../core/types";
import User from "../../core/User";
import { streamAnonymizedZip } from "../../core/zipStream";
import gh = require("parse-github-url");
const router = express.Router();
router.get(
"/:repoId/zip",
async (req: express.Request, res: express.Response) => {
const pipeline = promisify(stream.pipeline);
try {
if (!config.ENABLE_DOWNLOAD) {
throw new AnonymousError("download_not_enabled", {
@@ -87,10 +85,28 @@ router.get(
}
res.attachment(`${repo.repoId}.zip`);
// cache the file for 6 hours
res.header("Cache-Control", "max-age=21600");
await pipeline(await repo.zip(), res);
const parsed = gh(repo.model.source.repositoryName || "");
if (!parsed?.owner || !parsed?.name) {
throw new AnonymousError("repo_not_found", {
httpStatus: 404,
object: repo.model.source.repositoryName,
});
}
const anonymizer = repo.generateAnonymizeTransformer("");
await streamAnonymizedZip(
{
repoId: repo.repoId,
organization: parsed.owner,
repoName: parsed.name,
commit: repo.model.source.commit || "HEAD",
getToken: () => repo.getToken(),
anonymizerOptions: anonymizer.opt,
},
res
);
} catch (error) {
handleError(error, res, req);
}
@@ -197,6 +213,7 @@ router.get(
isAdmin: user?.isAdmin === true,
isOwner: user?.id == repo.model.owner,
hasWebsite: !!repo.options.page && !!repo.options.pageSource,
truncatedFolders: repo.model.truncatedFolders || [],
});
} catch (error) {
handleError(error, res, req);
+46
View File
@@ -0,0 +1,46 @@
import * as express from "express";
import * as crypto from "crypto";
import UserModel from "../../core/model/users/users.model";
/**
 * SHA-256 hex digest of an API token. Tokens are stored and looked up by
 * this hash only, never in plaintext.
 */
export function hashToken(token: string): string {
  const digest = crypto.createHash("sha256");
  digest.update(token);
  return digest.digest("hex");
}
/**
 * Create a fresh API token: 32 cryptographically random bytes rendered as
 * a 64-character lowercase hex string.
 */
export function generateToken(): string {
  const raw = crypto.randomBytes(32);
  return raw.toString("hex");
}
/**
 * Express middleware: authenticate requests carrying an
 * `Authorization: Bearer <token>` header against stored API token hashes.
 * Always calls next() — a missing or unknown token simply leaves the
 * request unauthenticated rather than rejecting it.
 */
export async function bearerTokenAuth(
  req: express.Request,
  _res: express.Response,
  next: express.NextFunction
): Promise<void> {
  // Already authenticated (e.g. via session) — nothing to do.
  if (req.user) return next();

  const header = req.headers["authorization"];
  if (typeof header !== "string") return next();
  const match = /^Bearer\s+(.+)$/i.exec(header);
  if (match === null) return next();

  const tokenHash = hashToken(match[1].trim());
  try {
    const model = await UserModel.findOne({ "apiTokens.tokenHash": tokenHash });
    if (model) {
      // Mirror the shape produced by passport's verify() in connection.ts
      // so existing getUser()/route code works unchanged.
      req.user = {
        username: model.username,
        user: model,
      } as Express.User;
      // Fire-and-forget last-used timestamp update; failures are logged only.
      UserModel.updateOne(
        { _id: model._id, "apiTokens.tokenHash": tokenHash },
        { $set: { "apiTokens.$.lastUsedAt": new Date() } }
      ).catch((err) =>
        console.error("[token-auth] lastUsedAt update failed", err)
      );
    }
  } catch (err) {
    console.error("[token-auth] lookup failed", err);
  }
  return next();
}
+12 -72
View File
@@ -1,16 +1,12 @@
import * as express from "express";
import GitHubStream from "../core/source/GitHubStream";
import {
anonymizePath,
AnonymizeTransformer,
isTextFile,
} from "../core/anonymize-utils";
import { handleError } from "../server/routes/route-utils";
import { lookup } from "mime-types";
import GitHubDownload from "../core/source/GitHubDownload";
import got from "got";
import { Parse } from "unzip-stream";
import archiver = require("archiver");
import { streamAnonymizedZip } from "../core/zipStream";
export const router = express.Router();
@@ -24,73 +20,17 @@ router.post(
const anonymizerOptions = req.body.anonymizerOptions;
try {
const source = new GitHubDownload({
repoId,
organization: repoFullName[0],
repoName: repoFullName[1],
commit: commit,
getToken: () => token,
});
const response = await source.getZipUrl();
const downloadStream = got.stream(response.url);
res.on("error", (error) => {
console.error(error);
downloadStream.destroy();
});
res.on("close", () => {
downloadStream.destroy();
});
const archive = archiver("zip", {});
downloadStream
.on("error", (error) => {
console.error(error);
try {
archive.finalize();
} catch { /* ignored */ }
})
.on("close", () => {
try {
archive.finalize();
} catch { /* ignored */ }
})
.pipe(Parse())
.on("entry", (entry) => {
if (entry.type === "File") {
try {
const fileName = anonymizePath(
entry.path.substring(entry.path.indexOf("/") + 1),
anonymizerOptions.terms || []
);
const anonymizer = new AnonymizeTransformer(anonymizerOptions);
anonymizer.opt.filePath = fileName;
const st = entry.pipe(anonymizer);
archive.append(st, { name: fileName });
} catch (error) {
entry.autodrain();
console.error(error);
}
} else {
entry.autodrain();
}
})
.on("error", (error) => {
console.error(error);
try {
archive.finalize();
} catch { /* ignored */ }
})
.on("finish", () => {
try {
archive.finalize();
} catch { /* ignored */ }
});
archive.pipe(res).on("error", (error) => {
console.error(error);
res.end();
});
await streamAnonymizedZip(
{
repoId,
organization: repoFullName[0],
repoName: repoFullName[1],
commit,
getToken: () => token,
anonymizerOptions,
},
res
);
} catch (error) {
handleError(error, res);
}