fix(gbrain-sync): cut source-id slugs on hyphen boundaries (+ #1357)

Cherry-picked from #1481 by drummerms and extended with the explicit
HTTPS-remote regression case for #1357 (decision D2=A).

`constrainSourceId` truncated the slug with `slug.slice(-tailBudget)`,
which cut mid-word when the boundary fell inside a token. For a repo
where the combined `prefix-org-repo-pathhash` exceeded 32 chars, this
produced embarrassing artifacts like `gstack-code-kill-270c0001-c32152`
(from `drummerms-av-sow-wiz-skill-270c0001`).

Two changes carried from #1481, adapted for the #1468 hostpathhash:

1. `constrainSourceId` now walks hyphen-separated tokens from the right,
   accumulating whole tokens until adding the next would exceed
   `tailBudget`. When no token fits, it falls back to the existing
   `${prefix}-${hash}` form.

2. `deriveCodeSourceId` now retries with `repo-hostpathhash`
   (dropping the org segment) when the full `org-repo-hostpathhash`
   triggers truncation. This keeps the repo name readable whenever it
   fits within the length budget.

Also adds a new test asserting the source id is period-free for the exact
HTTPS-with-.git remote shape from #1357 (`https://github.com/foo/bar.git`).
`canonicalizeRemote` strips `.git`; the sanitizer strips any residual
non-alphanumeric characters. The test closes #1357 by pinning the property.

Closes #1357

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
drummerms
2026-05-16 13:56:23 -07:00
committed by Garry Tan
parent cdd3a2f77a
commit cfd2b5d792
2 changed files with 94 additions and 2 deletions
+26 -2
View File
@@ -186,7 +186,14 @@ function deriveCodeSourceId(repoPath: string): string {
if (remote) {
const segs = remote.split("/").filter(Boolean);
const slugSource = segs.slice(-2).join("-");
return constrainSourceId("gstack-code", `${slugSource}-${hostPathHash}`);
const fullId = constrainSourceId("gstack-code", `${slugSource}-${hostPathHash}`);
// If the org+repo+hostpathhash fits cleanly (suffix preserved), use it.
if (fullId.endsWith(`-${hostPathHash}`)) return fullId;
// Otherwise drop the org prefix and retry with just repo+hostpathhash so
// the repo name stays readable. If that still doesn't fit,
// constrainSourceId keeps whichever whole trailing tokens fit plus a
// deterministic hash, or only the hash when no token fits.
const repoOnly = segs[segs.length - 1] || "repo";
return constrainSourceId("gstack-code", `${repoOnly}-${hostPathHash}`);
}
const base = repoPath.split("/").pop() || "repo";
return constrainSourceId("gstack-code", `${base}-${hostPathHash}`);
@@ -383,6 +390,10 @@ export function removeOrphanedSource(oldId: string): boolean {
* Build a gbrain-valid source id (1-32 lowercase alnum + interior hyphens). Sanitizes
* `raw`, prefixes with `prefix`, and falls back to a hashed-tail form when total length
* would exceed 32 chars.
*
* Truncation cuts on hyphen boundaries (whole-word units) from the right, never
* mid-word. Inputs like "drummerms-av-sow-wiz-skill-270c0001" produce
* "${prefix}-270c0001-<hash>", not "${prefix}-kill-270c0001-<hash>".
*/
function constrainSourceId(prefix: string, raw: string): string {
const MAX = 32;
@@ -401,7 +412,20 @@ function constrainSourceId(prefix: string, raw: string): string {
// Total budget: prefix + "-" + tail + "-" + hash
const tailBudget = MAX - prefix.length - 2 - hash.length;
if (tailBudget < 1) return `${prefix}-${hash}`;
const tail = slug.slice(-tailBudget).replace(/^-+|-+$/g, "");
// Cut on hyphen boundaries instead of mid-word. Walk tokens from the right,
// accumulating until adding the next token would exceed tailBudget. This
// preserves readable suffixes (pathhash, repo name) and avoids embarrassing
// mid-word artifacts like "skill" → "kill".
const tokens = slug.split("-").filter(Boolean);
const kept: string[] = [];
let len = 0;
for (let i = tokens.length - 1; i >= 0; i--) {
const add = kept.length === 0 ? tokens[i].length : tokens[i].length + 1;
if (len + add > tailBudget) break;
kept.unshift(tokens[i]);
len += add;
}
const tail = kept.join("-");
return tail ? `${prefix}-${tail}-${hash}` : `${prefix}-${hash}`;
}