mirror of
https://github.com/tdurieux/anonymous_github.git
synced 2026-07-02 20:05:50 +02:00
Fix .bat anonymization, truncated-tree misses, submodule warning, account deletion (#742)
* fix: anonymize Windows batch scripts (#735)
  mime-types maps .bat to application/x-msdownload, the same MIME type as .exe/.dll, so batch scripts were classified as binary and streamed through without any anonymization. Special-case .bat/.cmd as text before the MIME lookup, keeping .exe/.dll binary.
* fix: recover files missing from truncated tree listings (#738)
  GitHub truncates tree listings of very large repositories. Folders whose listing was truncated are recorded in truncatedFolders, but files that fell outside the listing never reached the database, so requesting them returned 404 file_not_found even though they exist on GitHub — and a force refresh could not help. When a file lookup misses and its directory is under a truncated folder, fetch the file metadata directly from GitHub's contents API (object media type, so it works past the 1MB inline limit), cache it in the database, and serve it normally.
* feat: warn when a repository uses git submodules (#737)
  GitHub archives and tree listings never include submodule contents, so submodules end up as empty folders in the anonymized repository, which surprises users. Detect a root .gitmodules file and show a warning banner in the explorer explaining that submodule contents are not included.
* feat: allow users to delete their account (#741)
  Add DELETE /api/user: removes all anonymized repositories, gists, and pull requests owned by the user, best-effort revokes the GitHub OAuth grant, and scrubs personal data (username, emails, tokens, GitHub id, photo) from the user record. The record itself is kept with a placeholder username so removed repoIds stay reserved and owner references remain resolvable. The settings page gains an Account section with a confirmed delete button.
* fix: add missing error translations for token_expired and job_is_active
  The error-code coverage test failed because both backend codes had no frontend translation.
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"core.min.js": "core.6332b3c288.min.js",
|
||||
"vendor.min.js": "vendor.d7d972f465.min.js",
|
||||
"vendor.min.js": "vendor.2b972dfbd5.min.js",
|
||||
"mermaid.min.js": "mermaid.f848a72d16.min.js",
|
||||
"all.min.css": "all.1a9babcb45.min.css"
|
||||
}
|
||||
@@ -108,12 +108,15 @@
|
||||
"cannot_coauthor_self": "You cannot add yourself as a co-author.",
|
||||
"storage_write_size_mismatch": "The downloaded file was smaller than expected. The upstream source may have returned an incomplete response — please try again.",
|
||||
"storage_read_error": "An error occurred while reading the file from storage — please try again.",
|
||||
"upstream_error": "A temporary error occurred while fetching from GitHub — please try again."
|
||||
"upstream_error": "A temporary error occurred while fetching from GitHub — please try again.",
|
||||
"token_expired": "Your GitHub access token has expired. Please log out and log in again to refresh it.",
|
||||
"job_is_active": "This job is currently running — wait for it to finish or remove it first."
|
||||
},
|
||||
"WARNINGS": {
|
||||
"page_not_enabled_on_repo": "GitHub Pages is not enabled on this repository. Enable it in the repository's Settings → Pages on GitHub, then refresh.",
|
||||
"page_branch_mismatch": "GitHub Pages on this repository is served from the '{{pageBranch}}' branch, but you selected '{{selectedBranch}}'. Switch the branch above to '{{pageBranch}}' to anonymize the Pages site.",
|
||||
"folder_truncated": "This folder has more than 10,000 entries; only a partial listing is shown.",
|
||||
"repo_truncated": "Some folders in this repository have too many files to be fully listed. Affected folders are marked with a warning icon."
|
||||
"repo_truncated": "Some folders in this repository have too many files to be fully listed. Affected folders are marked with a warning icon.",
|
||||
"submodules_not_included": "This repository uses git submodules. Submodule contents are not included in the anonymized repository and appear as empty folders."
|
||||
}
|
||||
}
|
||||
|
||||
@@ -50,6 +50,14 @@
|
||||
<i class="fas fa-exclamation-triangle"></i>
|
||||
{{ 'WARNINGS.repo_truncated' | translate }}
|
||||
</div>
|
||||
<div
|
||||
ng-if="options.hasSubmodules"
|
||||
class="paper-inline-warning"
|
||||
role="alert"
|
||||
>
|
||||
<i class="fas fa-exclamation-triangle"></i>
|
||||
{{ 'WARNINGS.submodules_not_included' | translate }}
|
||||
</div>
|
||||
<tree class="files" file="files" search-query="fileSearchQuery" search-results="fileSearchResults"></tree>
|
||||
</div>
|
||||
<div class="leftCol-foot">
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
<a href="#settings-rendering">Rendering</a>
|
||||
<a href="#settings-features">Features</a>
|
||||
<a href="#settings-expiration">Expiration</a>
|
||||
<a href="#settings-account">Account</a>
|
||||
</nav>
|
||||
</aside>
|
||||
|
||||
@@ -168,6 +169,20 @@
|
||||
</button>
|
||||
</div>
|
||||
</form>
|
||||
|
||||
<section id="settings-account" class="paper-settings-section">
|
||||
<div class="paper-section-eyebrow">Account</div>
|
||||
<p class="form-text text-muted">
|
||||
Deleting your account removes all your anonymized repositories,
|
||||
gists, and pull requests, revokes the application's access to your
|
||||
GitHub account, and erases your personal data (username, email,
|
||||
access tokens) from our database. This cannot be undone.
|
||||
</p>
|
||||
<div class="alert alert-danger" role="alert" ng-if="deleteError" ng-bind="deleteError"></div>
|
||||
<button type="button" class="btn btn-danger" ng-click="deleteAccount()" ng-disabled="deletingAccount">
|
||||
{{ deletingAccount ? 'Deleting…' : 'Delete my account' }}
|
||||
</button>
|
||||
</section>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -1161,6 +1161,26 @@ angular
|
||||
}
|
||||
);
|
||||
};
|
||||
|
||||
// Delete the signed-in user's account. Asks for an explicit confirmation,
// then issues DELETE /api/user. On success the browser is sent back to the
// home page; on failure the button is re-enabled and an error is exposed on
// `$scope.deleteError` for the template to display.
$scope.deleteAccount = () => {
  const confirmed = confirm(
    "Delete your account? All your anonymized repositories, gists, and pull requests will be removed, and your personal data will be erased. This cannot be undone."
  );
  if (!confirmed) return;

  // Drop the message left by a previous failed attempt; otherwise the stale
  // error stays on screen next to the "Deleting…" state of the retry.
  $scope.deleteError = null;
  $scope.deletingAccount = true;

  $http.delete("/api/user").then(
    () => {
      // Full page navigation back to the home page.
      window.location.href = "/";
    },
    () => {
      $scope.deletingAccount = false;
      $scope.deleteError = "Unable to delete the account. Please try again.";
    }
  );
};
|
||||
},
|
||||
])
|
||||
.controller("claimController", [
|
||||
|
||||
Vendored
+1
-1
File diff suppressed because one or more lines are too long
@@ -147,6 +147,17 @@ export default class AnonymizedFile {
|
||||
this._file = res;
|
||||
return res;
|
||||
}
|
||||
// The stored tree can be incomplete: GitHub truncates tree listings of
|
||||
// very large repositories, and folders recorded in `truncatedFolders`
|
||||
// have entries that never made it into the database. Ask GitHub
|
||||
// directly for the path before concluding the file does not exist
|
||||
// (#738). Without an anonymization mask the anonymized path is the
|
||||
// original path, so it can be looked up as-is.
|
||||
const recovered = await this.recoverTruncatedFile(fileDir);
|
||||
if (recovered) {
|
||||
this._file = recovered;
|
||||
return recovered;
|
||||
}
|
||||
throw new AnonymousError("file_not_found", {
|
||||
object: this,
|
||||
httpStatus: 404,
|
||||
@@ -189,6 +200,39 @@ export default class AnonymizedFile {
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Last-chance lookup for a file that is absent from the database.
 *
 * A miss is only retried against the repository source when `fileDir` is a
 * folder recorded in `truncatedFolders` (or lies below one); an empty-string
 * entry means the root listing was truncated and therefore covers every
 * path. Any other miss is treated as genuine.
 *
 * @param fileDir directory of the requested file.
 * @returns the recovered file record, or `null` when the path is not under a
 *   truncated folder, the source cannot look up single paths, or the source
 *   reports nothing for the path.
 */
private async recoverTruncatedFile(fileDir: string): Promise<IFile | null> {
  const coversFileDir = (folder: string) =>
    folder === "" || fileDir === folder || fileDir.startsWith(`${folder}/`);
  const truncatedFolders = this.repository.model.truncatedFolders || [];
  if (!truncatedFolders.some(coversFileDir)) {
    return null;
  }

  // Not every source implements the single-path lookup, so probe for it.
  const source = this.repository.source as {
    fetchFileInfoFromPath?: (filePath: string) => Promise<IFile | null>;
  };
  if (typeof source.fetchFileInfoFromPath !== "function") {
    return null;
  }

  const file = await source.fetchFileInfoFromPath(this.anonymizedPath);
  if (!file) {
    return null;
  }

  file.repoId = this.repository.repoId;
  logger.info("recovered file from truncated tree", {
    repoId: this.repository.repoId,
    path: this.anonymizedPath,
  });

  // Persist the record so later requests hit the database. A failed write is
  // only logged: the file can still be served for this request.
  try {
    await FileModel.create(file);
  } catch (cacheError) {
    logger.warn(
      "failed to cache recovered file",
      serializeError(cacheError as Error)
    );
  }
  return file;
}
|
||||
|
||||
/**
|
||||
* De-anonymize the path
|
||||
*
|
||||
|
||||
@@ -101,6 +101,10 @@ function classifyByName(filePath: string): boolean | null {
|
||||
const extension = name.split(".").reverse()[0].toLowerCase();
|
||||
if (config.additionalExtensions.includes(extension)) return true;
|
||||
if (KNOWN_TEXT_FILENAMES.has(name.toLowerCase())) return true;
|
||||
// mime-types maps `.bat` to application/x-msdownload, the same MIME as
|
||||
// .exe/.dll, so the MIME allowlist can't distinguish them. Batch scripts
|
||||
// are text and must be anonymized (#735).
|
||||
if (extension === "bat" || extension === "cmd") return true;
|
||||
const mime = lookupMime(name);
|
||||
if (mime === false) return null;
|
||||
// mime-types treats `.ts` as video/mp2t; route.ts already special-cases it.
|
||||
|
||||
@@ -376,6 +376,44 @@ export default class GitHubStream extends GitHubBase {
|
||||
return this.getTruncatedTree(this.data.commit, progress);
|
||||
}
|
||||
|
||||
/**
 * Fetch a single file's blob metadata (sha and size) directly from GitHub's
 * contents API at the commit this source points to. Used as a fallback when
 * the stored tree is incomplete because GitHub truncated the tree listing
 * of a very large repository (#738).
 *
 * The `object` media type is requested because it is one of the formats
 * GitHub still serves for files above the 1MB inline limit (their `content`
 * field comes back empty), so sha/size remain available for large files.
 * Only the metadata is read here; any returned content is ignored.
 *
 * @param filePath repository-relative path of the file to look up.
 * @returns the file record, or `null` when the path is not a regular file
 *   or the lookup failed. Lookup failures are logged, never thrown; an
 *   error from `getToken()` itself still propagates to the caller.
 */
async fetchFileInfoFromPath(filePath: string): Promise<IFile | null> {
  const token = await this.data.getToken();
  const oct = octokit(token);
  try {
    await waitForTokenGate(token);
    const res = await oct.repos.getContent({
      owner: this.data.organization,
      repo: this.data.repoName,
      path: filePath,
      ref: this.data.commit,
      mediaType: { format: "object" },
    });
    const data = res.data as { type?: string; sha?: string; size?: number };
    // Anything that is not reported as a plain file (e.g. a directory) is
    // not a recoverable blob.
    if (data.type !== "file" || !data.sha) return null;
    const parent = dirname(filePath);
    return {
      name: basename(filePath),
      // dirname() yields "." for top-level files; the root is stored as "".
      path: parent === "." ? "" : parent,
      repoId: this.data.repoId,
      sha: data.sha,
      size: data.size,
    };
  } catch (error) {
    const status = (error as { status?: number } | undefined)?.status;
    const details = { filePath, error: serializeError(error) };
    if (status === 404) {
      // The path really does not exist upstream: an expected miss.
      logger.debug("fetchFileInfoFromPath miss", details);
    } else {
      // Rate limiting, auth or network failures are not misses; surface
      // them so they are not mistaken for absent files when debugging.
      logger.warn("fetchFileInfoFromPath failed", details);
    }
    return null;
  }
}
|
||||
|
||||
private async getGHTree(
|
||||
oct: ReturnType<typeof octokit>,
|
||||
token: string,
|
||||
|
||||
@@ -390,6 +390,15 @@ router.get(
|
||||
isOwner: user?.id == repo.model.owner,
|
||||
hasWebsite: !!repo.options.page && !!repo.options.pageSource,
|
||||
truncatedFolders: repo.model.truncatedFolders || [],
|
||||
// Submodule contents are not included in GitHub archives/trees, so
|
||||
// they end up as empty folders in the anonymized repository. Surface
|
||||
// a warning in the explorer when the repository uses submodules (#737).
|
||||
hasSubmodules:
|
||||
(await FileModel.exists({
|
||||
repoId: repo.repoId,
|
||||
name: ".gitmodules",
|
||||
path: "",
|
||||
})) != null,
|
||||
});
|
||||
} catch (error) {
|
||||
handleError(error, res, req);
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import * as express from "express";
|
||||
import got from "got";
|
||||
import config from "../../config";
|
||||
import { ensureAuthenticated } from "./connection";
|
||||
import { handleError, getUser, isOwnerOrAdmin } from "./route-utils";
|
||||
@@ -147,6 +148,77 @@ router.post("/default", async (req: express.Request, res: express.Response) => {
|
||||
}
|
||||
});
|
||||
|
||||
/**
 * DELETE / — account deletion (#741).
 *
 * Runs in this order:
 *   1. remove the anonymized repositories owned by the user, then the
 *      user's pull requests and gists (entries already "removed" are
 *      skipped);
 *   2. ask GitHub to revoke the OAuth grant (best-effort);
 *   3. scrub personal data from the user record;
 *   4. log the session out and answer `{ status: "ok" }`.
 *
 * The user record is not deleted: it is kept with a placeholder username so
 * removed repoIds stay reserved and owner references remain resolvable.
 */
router.delete("/", async (req: express.Request, res: express.Response) => {
  try {
    const user = await getUser(req);

    // --- 1. anonymized content ---------------------------------------
    const allRepositories = await user.getRepositories();
    const ownedRepositories = allRepositories.filter(
      (r) => r.owner.id === user.model.id && r.status !== "removed"
    );
    for (const repository of ownedRepositories) {
      await repository.remove();
    }

    const pullRequests = await user.getPullRequests();
    for (const pullRequest of pullRequests) {
      if (pullRequest.status === "removed") continue;
      await pullRequest.remove();
    }

    const gists = await user.getGists();
    for (const gist of gists) {
      if (gist.status === "removed") continue;
      await gist.remove();
    }

    // --- 2. OAuth grant ----------------------------------------------
    // Revoking the grant removes the application from the user's GitHub
    // authorized applications. A rejection by GitHub is only logged so the
    // account still gets scrubbed.
    try {
      const grantUrl = `https://api.github.com/applications/${config.CLIENT_ID}/grant`;
      await got.delete(grantUrl, {
        username: config.CLIENT_ID,
        password: config.CLIENT_SECRET,
        headers: { accept: "application/vnd.github+json" },
        json: { access_token: user.accessToken },
      });
    } catch (revocationError) {
      logger.warn(
        "oauth grant revocation failed",
        serializeError(revocationError)
      );
    }

    // --- 3. personal data --------------------------------------------
    await UserModel.updateOne(
      { _id: user.model._id },
      {
        $set: {
          status: "removed",
          username: `deleted-${user.model._id}`,
          emails: [],
          apiTokens: [],
          repositories: [],
        },
        $unset: {
          accessTokens: "",
          accessTokenDates: "",
          externalIDs: "",
          photo: "",
          default: "",
        },
      }
    ).exec();

    // --- 4. session --------------------------------------------------
    logger.info("account removed", { userId: user.model.id });
    req.logout((logoutError) => {
      // A failed logout is logged but does not change the response: the
      // account data has already been removed at this point.
      if (logoutError) {
        logger.error(
          "logout after account removal failed",
          serializeError(logoutError)
        );
      }
      res.json({ status: "ok" });
    });
  } catch (error) {
    handleError(error, res, req);
  }
});
|
||||
|
||||
router.get(
|
||||
"/anonymized_repositories",
|
||||
async (req: express.Request, res: express.Response) => {
|
||||
|
||||
@@ -219,3 +219,40 @@ describe("AnonymizedFile.isFileSupported()", function () {
|
||||
.false;
|
||||
});
|
||||
});
|
||||
|
||||
// ---------------------------------------------------------------------------
// Replicated logic from AnonymizedFile.recoverTruncatedFile (#738): a file
// missing from the database is only recovered from GitHub when its directory
// is under a folder whose tree listing was truncated.
// ---------------------------------------------------------------------------

/**
 * Tell whether `fileDir` is a truncated folder or lies below one.
 *
 * @param {string} fileDir directory of the requested file ("" for the root).
 * @param {string[]} [truncatedFolders] folders whose listing was truncated;
 *   an "" entry means the root listing was truncated and matches everything.
 * @returns {boolean} true when the directory is covered by a truncated folder.
 */
function isUnderTruncatedFolder(fileDir, truncatedFolders) {
  // Same fallback as the production code (`truncatedFolders || []`): a
  // repository with no recorded truncation matches nothing instead of throwing.
  const folders = truncatedFolders || [];
  return folders.some(
    (folder) =>
      // The "/" suffix keeps siblings that merely share a prefix
      // ("data" vs "database") from matching.
      folder === "" || fileDir === folder || fileDir.startsWith(folder + "/")
  );
}
|
||||
|
||||
// Pins the folder-matching rules used to decide whether a missing file may be
// recovered: exact folder, nested folder, truncated root, and the negative
// cases (prefix-sharing siblings, nothing truncated).
describe("AnonymizedFile truncated-folder matching (#738)", function () {
  it("matches files directly inside a truncated folder", function () {
    const matched = isUnderTruncatedFolder("data", ["data"]);
    expect(matched).to.be.true;
  });

  it("matches files nested under a truncated folder", function () {
    const matched = isUnderTruncatedFolder("data/sub/dir", ["data"]);
    expect(matched).to.be.true;
  });

  it("matches everything when the root listing was truncated", function () {
    // An empty-string entry stands for the repository root.
    for (const dir of ["", "any/dir"]) {
      expect(isUnderTruncatedFolder(dir, [""])).to.be.true;
    }
  });

  it("does not match sibling folders sharing a prefix", function () {
    for (const dir of ["database", "database/sub"]) {
      expect(isUnderTruncatedFolder(dir, ["data"])).to.be.false;
    }
  });

  it("does not match when nothing was truncated", function () {
    const matched = isUnderTruncatedFolder("data", []);
    expect(matched).to.be.false;
  });
});
|
||||
|
||||
@@ -28,6 +28,19 @@ describe("isTextFile", function () {
|
||||
expect(isTextFile("foo.zip")).to.equal(false);
|
||||
});
|
||||
|
||||
// Regression test for #735: mime-types reports application/x-msdownload for
// .bat — the MIME type it also gives .exe/.dll — so batch scripts used to be
// classified as binary and were never anonymized.
it("recognizes Windows batch scripts as text", function () {
  const batchScripts = [
    "script.bat",
    "SCRIPT.BAT",
    "script.cmd",
    "path/to/build.bat",
  ];
  for (const file of batchScripts) {
    expect(isTextFile(file)).to.equal(true);
  }

  // Same MIME type as .bat, but these must keep being treated as binary.
  const binaries = ["app.exe", "lib.dll"];
  for (const file of binaries) {
    expect(isTextFile(file)).to.equal(false);
  }
});
|
||||
|
||||
it("recognizes jsonl-family dataset extensions", function () {
|
||||
expect(isTextFile("data.jsonl")).to.equal(true);
|
||||
expect(isTextFile("data.ndjson")).to.equal(true);
|
||||
|
||||
Reference in New Issue
Block a user