multiple fixes

2026-05-16 14:59:07 +02:00 · 2026-05-05 10:32:31 +03:00
parent 5b72b630c4
commit f8c91ca0af
23 changed files with 1675 additions and 661 deletions
@@ -22,6 +22,7 @@ import {
 import { getToken } from "./GitHubUtils";
 import config from "../config";
 import FileModel from "./model/files/files.model";
+import AnonymizedRepositoryModel from "./model/anonymizedRepositories/anonymizedRepositories.model";
 import { IFile } from "./model/files/files.types";
 import AnonymizedFile from "./AnonymizedFile";
 import { FilterQuery } from "mongoose";
@@ -351,7 +352,7 @@ export default class Repository {
        );

        await this.resetSate(RepositoryStatus.PREPARING);
-        await downloadQueue.add(this.repoId, this, {
+        await downloadQueue.add(this.repoId, { repoId: this.repoId }, {
          jobId: this.repoId,
          attempts: 3,
        });
@@ -405,7 +406,16 @@ export default class Repository {
    this._model.statusDate = new Date();
    this._model.statusMessage = statusMessage;
    if (!isConnected) return this.model;
-    await this._model.save();
+    await AnonymizedRepositoryModel.updateOne(
+      { _id: this._model._id },
+      {
+        $set: {
+          status,
+          statusDate: this._model.statusDate,
+          statusMessage,
+        },
+      }
+    ).exec();
  }

  /**
@@ -80,37 +80,44 @@ export default class User {
        });
      });

-      // find the repositories that are already in the database
-      const finds = (
-        await RepositoryModel.find({
-          externalId: {
-            $in: repositories.map((repo) => repo.externalId),
-          },
-        }).select("externalId")
-      ).map((m) => m.externalId);
-
-      // save all the new repositories
-      await Promise.all(
-        repositories
-          .filter((r) => finds.indexOf(r.externalId) == -1)
-          .map((r) => r.save())
+      // find the repositories that are already in the database — fetch both
+      // externalId and id so we can both detect duplicates and reuse the
+      // ids of existing rows without re-querying.
+      const externalIds = repositories.map((repo) => repo.externalId);
+      const existing = await RepositoryModel.find({
+        externalId: { $in: externalIds },
+      }).select("id externalId");
+      const existingByExternalId = new Map(
+        existing.map((m) => [m.externalId, m.id])
      );

-      // save only the if of the repositories in the user model
-      this._model.repositories = (
-        await RepositoryModel.find({
-          externalId: {
-            $in: repositories.map((repo) => repo.externalId),
-          },
-        }).select("id")
-      ).map((m) => m.id);
+      // save all the new repositories
+      const newRepos = repositories.filter(
+        (r) => !existingByExternalId.has(r.externalId)
+      );
+      const saved = await Promise.all(newRepos.map((r) => r.save()));
+      for (const m of saved) {
+        existingByExternalId.set(m.externalId, m.id);
+      }
+
+      // collect ids in the order of the upstream repositories list
+      this._model.repositories = externalIds
+        .map((eid) => existingByExternalId.get(eid))
+        .filter((id) => !!id) as unknown as typeof this._model.repositories;

      // have the model
      await this._model.save();
      return repositories.map((r) => new GitHubRepository(r));
    } else {
+      // Only the fields read by GitHubRepository.toJSON() (and the immediate
+      // callers in user routes). Branches/readme are loaded on demand by
+      // GitHubRepository methods, which issue their own queries.
      const out = (
-        await RepositoryModel.find({ _id: { $in: this._model.repositories } })
+        await RepositoryModel.find({
+          _id: { $in: this._model.repositories },
+        }).select(
+          "externalId name url size hasPage pageSource defaultBranch"
+        )
      ).map((i) => new GitHubRepository(i));
      return out;
    }
@@ -192,8 +192,62 @@ export class AnonymizeTransformer extends Transform {
  }
 }

+// Markdown image pattern hoisted out of removeImage() so we don't recompile
+// it on every chunk of every file streamed through the anonymizer.
+const markdownImageRegex =
+  /!\[[^\]]*\]\((?<filename>.*?)(?="|\))(?<optionalpart>".*")?\)/g;
+
+interface CompiledTermVariant {
+  // Global regex used to replace matches in content (and paths).
+  replaceRegex: RegExp;
+  // Non-global twin used inside the URL callback to test() without
+  // mutating shared lastIndex state.
+  testRegex: RegExp;
+  mask: string;
+}
+
+function compileTerms(terms: string[] | undefined): CompiledTermVariant[] {
+  if (!terms || terms.length === 0) return [];
+  const compiled: CompiledTermVariant[] = [];
+  for (let i = 0; i < terms.length; i++) {
+    const spec = terms[i];
+    if (spec.trim() === "") continue;
+    // #285 — entries of the form "term=>replacement" override the default
+    // XXXX-N mask so users can scrub with their preferred token.
+    const parsed = parseTermSpec(spec);
+    let term = parsed.term;
+    const mask =
+      parsed.replacement !== null
+        ? parsed.replacement
+        : config.ANONYMIZATION_MASK + "-" + (i + 1);
+    try {
+      new RegExp(term, "gi");
+    } catch {
+      term = term.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&");
+    }
+    for (const variant of termVariants(term)) {
+      const bounded = withWordBoundaries(variant.pattern, {
+        sniffSource: variant.sniff,
+        unicode: variant.unicode,
+      });
+      const baseFlags = variant.unicode ? "iu" : "i";
+      compiled.push({
+        replaceRegex: new RegExp(bounded, "g" + baseFlags),
+        testRegex: new RegExp(bounded, baseFlags),
+        mask,
+      });
+    }
+  }
+  return compiled;
+}
+
 export class ContentAnonimizer {
  public wasAnonymized = false;
+  // Compiled once per instance and reused for every anonymize() call.
+  // Streamed files invoke anonymize() many times per file (one per chunk),
+  // so caching here avoids rebuilding regexes on every chunk.
+  private compiledTerms: CompiledTermVariant[];
+  private selfLinkRegexes: RegExp[] | null = null;

  constructor(
    readonly opt: {
@@ -204,26 +258,33 @@ export class ContentAnonimizer {
      branchName?: string;
      repoId?: string;
    }
-  ) {}
+  ) {
+    this.compiledTerms = compileTerms(opt.terms);
+    if (opt.repoName && opt.branchName) {
+      const r = opt.repoName;
+      const b = opt.branchName;
+      this.selfLinkRegexes = [
+        new RegExp(`https://raw.githubusercontent.com/${r}/${b}\\b`, "gi"),
+        new RegExp(`https://github.com/${r}/blob/${b}\\b`, "gi"),
+        new RegExp(`https://github.com/${r}/tree/${b}\\b`, "gi"),
+        new RegExp(`https://github.com/${r}`, "gi"),
+      ];
+    }
+  }

  private removeImage(content: string): string {
    if (this.opt.image !== false) {
      return content;
    }
-    // remove image in markdown
-    return content.replace(
-      /!\[[^\]]*\]\((?<filename>.*?)(?="|\))(?<optionalpart>".*")?\)/g,
-      () => {
-        this.wasAnonymized = true;
-        return config.ANONYMIZATION_MASK;
-      }
-    );
+    return content.replace(markdownImageRegex, () => {
+      this.wasAnonymized = true;
+      return config.ANONYMIZATION_MASK;
+    });
  }
  private removeLink(content: string): string {
    if (this.opt.link !== false) {
      return content;
    }
-    // remove image in markdown
    return content.replace(urlRegex, () => {
      this.wasAnonymized = true;
      return config.ANONYMIZATION_MASK;
@@ -231,83 +292,33 @@ export class ContentAnonimizer {
  }

  private replaceGitHubSelfLinks(content: string): string {
-    if (!this.opt.repoName || !this.opt.branchName) {
-      return content;
-    }
-    const repoName = this.opt.repoName;
-    const branchName = this.opt.branchName;
-
-    const replaceCallback = () => {
+    if (!this.selfLinkRegexes) return content;
+    const replacement = `https://${config.APP_HOSTNAME}/r/${this.opt.repoId}`;
+    const cb = () => {
      this.wasAnonymized = true;
-      return `https://${config.APP_HOSTNAME}/r/${this.opt.repoId}`;
+      return replacement;
    };
-    content = content.replace(
-      new RegExp(
-        `https://raw.githubusercontent.com/${repoName}/${branchName}\\b`,
-        "gi"
-      ),
-      replaceCallback
-    );
-    content = content.replace(
-      new RegExp(`https://github.com/${repoName}/blob/${branchName}\\b`, "gi"),
-      replaceCallback
-    );
-    content = content.replace(
-      new RegExp(`https://github.com/${repoName}/tree/${branchName}\\b`, "gi"),
-      replaceCallback
-    );
-    return content.replace(
-      new RegExp(`https://github.com/${repoName}`, "gi"),
-      replaceCallback
-    );
+    for (const re of this.selfLinkRegexes) {
+      content = content.replace(re, cb);
+    }
+    return content;
  }

  private replaceTerms(content: string): string {
-    const terms = this.opt.terms || [];
-    for (let i = 0; i < terms.length; i++) {
-      const spec = terms[i];
-      if (spec.trim() == "") {
-        continue;
-      }
-      // #285 — entries of the form "term=>replacement" override the default
-      // XXXX-N mask so users can scrub with their preferred token (e.g.
-      // "ABC", "XYZ"), keeping anonymized identifiers valid in source code.
-      const parsed = parseTermSpec(spec);
-      let term = parsed.term;
-      const mask =
-        parsed.replacement !== null
-          ? parsed.replacement
-          : config.ANONYMIZATION_MASK + "-" + (i + 1);
-      try {
-        new RegExp(term, "gi");
-      } catch {
-        // escape regex characters
-        term = term.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&");
-      }
-
-      // Try the term verbatim first, then a diacritic-insensitive expansion
-      // so "Davo" anonymizes "Davó" (and vice versa). See term-matching.ts.
-      for (const variant of termVariants(term)) {
-        const bounded = withWordBoundaries(variant.pattern, {
-          sniffSource: variant.sniff,
-          unicode: variant.unicode,
-        });
-        const flags = variant.unicode ? "giu" : "gi";
-        // remove whole url if it contains the term
-        content = content.replace(urlRegex, (match) => {
-          if (new RegExp(bounded, flags).test(match)) {
-            this.wasAnonymized = true;
-            return mask;
-          }
-          return match;
-        });
-
-        // remove the term in the text
-        content = content.replace(new RegExp(bounded, flags), () => {
+    for (const c of this.compiledTerms) {
+      // remove whole url if it contains the term
+      content = content.replace(urlRegex, (match) => {
+        if (c.testRegex.test(match)) {
          this.wasAnonymized = true;
-          return mask;
-        });
-      }
+          return c.mask;
+        }
+        return match;
+      });
+      // remove the term in the text
+      content = content.replace(c.replaceRegex, () => {
+        this.wasAnonymized = true;
+        return c.mask;
+      });
    }
    return content;
  }
@@ -322,24 +333,20 @@ export class ContentAnonimizer {
 }

 export function anonymizePath(path: string, terms: string[]) {
-  for (let i = 0; i < terms.length; i++) {
-    const spec = terms[i];
-    if (spec.trim() == "") {
-      continue;
-    }
-    const parsed = parseTermSpec(spec);
-    let term = parsed.term;
-    const mask =
-      parsed.replacement !== null
-        ? parsed.replacement
-        : config.ANONYMIZATION_MASK + "-" + (i + 1);
-    try {
-      new RegExp(term, "gi");
-    } catch {
-      // escape regex characters
-      term = term.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&");
-    }
-    path = path.replace(new RegExp(term, "gi"), mask);
+  return anonymizePathCompiled(path, compileTerms(terms));
+}
+
+// Variant that accepts pre-compiled term regexes — call sites that anonymize
+// many paths in a row (tree traversal) should compile once and reuse.
+export function anonymizePathCompiled(
+  path: string,
+  compiled: CompiledTermVariant[]
+) {
+  for (const c of compiled) {
+    path = path.replace(c.replaceRegex, c.mask);
  }
  return path;
 }
+
+export { compileTerms };
+export type { CompiledTermVariant };