diff --git a/CHANGELOG.md b/CHANGELOG.md index f8b47c8f9..86c066df9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,47 @@ # Changelog +## [1.33.1.0] - 2026-05-11 + +## **Long skills stop drifting away from their starting context.** +## **`/investigate`, `/qa`, and `/ship` now pull learnings keyed to what they're actually about, and refresh that pull mid-flow as work shifts to new sub-tasks.** + +For the last 30+ versions, every gstack skill loaded learnings the same way: `gstack-learnings-search --limit 10` at the top, generic top-10 by confidence, no query, no refresh. Short skills were fine, they finish before the loaded learnings go stale. Long skills (`/investigate` walks 4 phases, `/qa` runs a multi-bug fix loop, `/ship` covers ~20 steps from test to bump to PR) drifted away from whatever was loaded at minute zero. By the time `/ship` reaches Step 12 (VERSION bump), the learnings it pulled at Step 1 are about whatever was the highest-confidence entry in your project, not about the headline feature you're shipping. + +Two changes ship in this release: per-skill task-shaped queries at the top of the three long skills, and a mid-flow refresh checkpoint inside each one that re-pulls learnings keyed to the sub-task that's about to begin. Both rely on a fix to `bin/gstack-learnings-search` itself. The binary's `--query` flag previously used whole-string substring match against key/insight/files, so a query like `"debug investigation"` would only match a learning whose insight contained that exact contiguous phrase. The flag is now token-OR: split on whitespace, match if ANY token appears in any haystack field. This is what most users expect from a search flag. + +### The numbers that matter + +Source: this project's local `learnings.jsonl` (35 entries as of this release). Same query, same flag, before vs after the binary fix: + +| Query | Before (substring) | After (token-OR) | Δ | +|-------|-------------------|------------------|---| +| `"debug investigation root cause"` | 0 entries matched | 5 entries matched | +5 | +| `"qa testing bug regression"` | 0 entries matched | 2 entries matched | +2 | +| `"release ship version changelog"` | 0 entries matched | 8 entries matched | +8 | +| `"skill resolver"` | 0 entries matched | 12 entries matched | +12 | + +Recall on the static skill-shaped queries went from zero to relevant. Without this fix, the rest of the change would have been silent. The bash would run, the binary would exit 0 with no output, and the skill would render the same empty section it always rendered. + +### What this means for builders + +If you run `/investigate` on a bug, the top-of-skill learnings pull now surfaces prior investigation patterns instead of unrelated top-10 confidence entries. When you finish Phase 1 (naming a root-cause hypothesis), a mid-flow refresh fires and re-pulls learnings keyed to your hypothesis keyword, so prior fixes for the same problem-shape land in the agent's context right when they're relevant. Same pattern for `/qa` (refresh before the fix loop, keyed to the buggy component) and `/ship` (refresh before the VERSION/CHANGELOG step, keyed to the headline feature). The other 13 short-lived skills are unchanged: their existing top-10 generic pull is still right for their attention span. + +### Itemized changes + +#### Changed + +- **`bin/gstack-learnings-search`** now uses token-OR `--query` semantics. Multi-word queries split on whitespace and match if ANY token appears as a substring in ANY of key/insight/files. Single-word queries behave exactly as before. No flag changes; same CLI surface. The old whole-string substring behavior was a silent footgun that returned nothing on real-world learnings stores. New test file `test/gstack-learnings-search.test.ts` covers the three branches (multi-token, single-token, no-query backwards compat). +- **`scripts/resolvers/learnings.ts`** `{{LEARNINGS_SEARCH}}` macro now accepts a `query=KEYWORD` argument. Empty value falls through to no-query (principle of least surprise: a stray `{{LEARNINGS_SEARCH:query=}}` placeholder gets today's behavior, not a build failure). Pattern reuses the parameterized-macro infrastructure from `composition.ts`. The 13 templates that don't pass a query stay byte-identical in their generated SKILL.md output. Shell-injection guard: the query value is whitelisted to `^[A-Za-z0-9 _-]+$` at gen-skill-docs time, so any `$()`, backticks, semicolons, or quotes in a future template throw a loud build error instead of emitting executable bash. +- **`investigate/SKILL.md.tmpl`** top-of-skill learnings pull keyed to `debug investigation root cause hypothesis bug fix`. New mid-flow refresh block between Phase 1 (hypothesis) and Phase 2 (analysis) instructs the agent to pick one alphanumeric-only keyword from the hypothesis and re-pull. Worked examples included (good: `auth-cookie`, `session-expiry`; bad: `auth.ts:47`, ``). +- **`qa/SKILL.md.tmpl`** top-of-skill pull keyed to `qa testing bug regression flake fixture`. Mid-flow refresh inserted between Phase 7 (triage) and Phase 8 (fix loop), keyed to the buggy component name. +- **`ship/SKILL.md.tmpl`** top-of-skill pull keyed to `release ship version changelog merge pr`. Mid-flow refresh inserted just before Step 12 (VERSION bump), keyed to the headline feature on this branch. +- **`test/gen-skill-docs.test.ts`** 5 new resolver assertions: no-args has no `--query`, claude+query=foo bar appears in BOTH cross-project and project-scoped branches, codex host gets `--query` in the codex bash variant, empty value `query=` falls through to no-query, AND shell-injection payloads (`$(whoami)`, backticks, `;`, `&`, `"`, `\`, `$x`) throw a build error. +- **All generated `SKILL.md` files for the 3 long skills + 4 host outputs** regenerated. The other 13 skills' generated output is byte-identical (backwards-compat verified via diff). + +#### For contributors + +- Contributed by @Fergtic ([chronicle-write-up](https://github.com/Fergtic/chronicle-write-up)) who flagged the load-once + no-refresh pattern and the spend-per-success data point that motivated this work. The static-skill query expansion was also informed by a key fact-check from Codex outside-voice review: the binary's `--query` was single-substring match, not token-OR, which silently invalidated any multi-word query in the wild. + ## [1.33.0.0] - 2026-05-11 ## **`/sync-gbrain` memory stage no longer infinite-loops or silently throws away progress.** diff --git a/VERSION b/VERSION index 6309077ba..bb92f340d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.33.0.0 +1.33.1.0 diff --git a/bin/gstack-learnings-search b/bin/gstack-learnings-search index 3b39e4626..95825635a 100755 --- a/bin/gstack-learnings-search +++ b/bin/gstack-learnings-search @@ -48,7 +48,8 @@ cat "${FILES[@]}" 2>/dev/null | GSTACK_SEARCH_TYPE="$TYPE" GSTACK_SEARCH_QUERY=" const lines = (await Bun.stdin.text()).trim().split('\n').filter(Boolean); const now = Date.now(); const type = process.env.GSTACK_SEARCH_TYPE || ''; -const query = (process.env.GSTACK_SEARCH_QUERY || '').toLowerCase(); +const queryRaw = (process.env.GSTACK_SEARCH_QUERY || '').toLowerCase(); +const queryTokens = queryRaw.split(/\s+/).filter(Boolean); const limit = parseInt(process.env.GSTACK_SEARCH_LIMIT || '10', 10); const slug = process.env.GSTACK_SEARCH_SLUG || ''; @@ -94,12 +95,11 @@ let results = Array.from(seen.values()); // Filter by type if (type) results = results.filter(e => e.type === type); -// Filter by query -if (query) results = results.filter(e => - (e.key || '').toLowerCase().includes(query) || - (e.insight || '').toLowerCase().includes(query) || - (e.files || []).some(f => f.toLowerCase().includes(query)) -); +// Filter by query (token-OR: match if ANY whitespace-split token appears in ANY haystack) +if (queryTokens.length > 0) results = results.filter(e => { + const haystacks = [(e.key || '').toLowerCase(), (e.insight || '').toLowerCase(), ...(e.files || []).map(f => f.toLowerCase())]; + return queryTokens.some(tok => haystacks.some(h => h.includes(tok))); +}); // Sort by effective confidence desc, then recency results.sort((a, b) => { diff --git a/investigate/SKILL.md b/investigate/SKILL.md index f69aefe17..9e2b23f0f 100644 --- a/investigate/SKILL.md +++ b/investigate/SKILL.md @@ -823,9 +823,9 @@ Search for relevant learnings from previous sessions: _CROSS_PROJ=$(~/.claude/skills/gstack/bin/gstack-config get cross_project_learnings 2>/dev/null || echo "unset") echo "CROSS_PROJECT: $_CROSS_PROJ" if [ "$_CROSS_PROJ" = "true" ]; then - ~/.claude/skills/gstack/bin/gstack-learnings-search --limit 10 --cross-project 2>/dev/null || true + ~/.claude/skills/gstack/bin/gstack-learnings-search --limit 10 --query "debug investigation root cause hypothesis bug fix" --cross-project 2>/dev/null || true else - ~/.claude/skills/gstack/bin/gstack-learnings-search --limit 10 2>/dev/null || true + ~/.claude/skills/gstack/bin/gstack-learnings-search --limit 10 --query "debug investigation root cause hypothesis bug fix" 2>/dev/null || true fi ``` @@ -855,6 +855,20 @@ smarter on their codebase over time. Output: **"Root cause hypothesis: ..."** — a specific, testable claim about what is wrong and why. +### Refresh learnings for the hypothesis you just named + +The top-of-skill learnings pull above is keyed to "debug investigation" broadly. Now that you have a specific hypothesis, re-pull learnings keyed to that hypothesis so prior fixes for the same problem-shape surface. + +Pick ONE keyword from the hypothesis. The keyword should be a noun: the failing component name, the basename of the file you suspect (without extension), or the bug noun. The keyword MUST be alphanumeric or hyphen only — no quotes, slashes, dots, colons, or whitespace. If your candidate has any of those, simplify to just the alphanumeric stem. + +Worked examples (investigate-specific): good keywords are `auth-cookie`, `session-expiry`, `redirect-loop`. Bad: `auth.ts:47`, `fix the auth bug`, ``. + +```bash +~/.claude/skills/gstack/bin/gstack-learnings-search --query "" --limit 5 2>/dev/null || true +``` + +If any learnings come back, name which one applies to your investigation in one sentence. If none come back, continue without reference — the absence of a matching prior learning is itself useful information. + --- ## Scope Lock diff --git a/investigate/SKILL.md.tmpl b/investigate/SKILL.md.tmpl index fb649f026..20cc2e26d 100644 --- a/investigate/SKILL.md.tmpl +++ b/investigate/SKILL.md.tmpl @@ -93,10 +93,24 @@ Gather context before forming any hypothesis. 5. **Check investigation history:** Search prior learnings for investigations on the same files. Recurring bugs in the same area are an architectural smell. If prior investigations exist, note patterns and check if the root cause was structural. -{{LEARNINGS_SEARCH}} +{{LEARNINGS_SEARCH:query=debug investigation root cause hypothesis bug fix}} Output: **"Root cause hypothesis: ..."** — a specific, testable claim about what is wrong and why. +### Refresh learnings for the hypothesis you just named + +The top-of-skill learnings pull above is keyed to "debug investigation" broadly. Now that you have a specific hypothesis, re-pull learnings keyed to that hypothesis so prior fixes for the same problem-shape surface. + +Pick ONE keyword from the hypothesis. The keyword should be a noun: the failing component name, the basename of the file you suspect (without extension), or the bug noun. The keyword MUST be alphanumeric or hyphen only — no quotes, slashes, dots, colons, or whitespace. If your candidate has any of those, simplify to just the alphanumeric stem. + +Worked examples (investigate-specific): good keywords are `auth-cookie`, `session-expiry`, `redirect-loop`. Bad: `auth.ts:47`, `fix the auth bug`, ``. + +```bash +~/.claude/skills/gstack/bin/gstack-learnings-search --query "" --limit 5 2>/dev/null || true +``` + +If any learnings come back, name which one applies to your investigation in one sentence. If none come back, continue without reference — the absence of a matching prior learning is itself useful information. + --- ## Scope Lock diff --git a/package.json b/package.json index e07e329f7..87c4c7992 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "gstack", - "version": "1.33.0.0", + "version": "1.33.1.0", "description": "Garry's Stack — Claude Code skills + fast headless browser. One repo, one install, entire AI engineering workflow.", "license": "MIT", "type": "module", diff --git a/qa/SKILL.md b/qa/SKILL.md index 41f84fccc..0b56e53e2 100644 --- a/qa/SKILL.md +++ b/qa/SKILL.md @@ -1068,9 +1068,9 @@ Search for relevant learnings from previous sessions: _CROSS_PROJ=$(~/.claude/skills/gstack/bin/gstack-config get cross_project_learnings 2>/dev/null || echo "unset") echo "CROSS_PROJECT: $_CROSS_PROJ" if [ "$_CROSS_PROJ" = "true" ]; then - ~/.claude/skills/gstack/bin/gstack-learnings-search --limit 10 --cross-project 2>/dev/null || true + ~/.claude/skills/gstack/bin/gstack-learnings-search --limit 10 --query "qa testing bug regression flake fixture" --cross-project 2>/dev/null || true else - ~/.claude/skills/gstack/bin/gstack-learnings-search --limit 10 2>/dev/null || true + ~/.claude/skills/gstack/bin/gstack-learnings-search --limit 10 --query "qa testing bug regression flake fixture" 2>/dev/null || true fi ``` @@ -1426,6 +1426,20 @@ Sort all discovered issues by severity, then decide which to fix based on the se Mark issues that cannot be fixed from source code (e.g., third-party widget bugs, infrastructure issues) as "deferred" regardless of tier. +### Refresh learnings for the component/page where the bug lives + +The top-of-skill learnings pull was keyed to "qa testing" broadly. Before the fix loop, re-pull learnings keyed to the component or page where the bug you're about to fix lives so prior fixes for the same component-shape surface. + +Pick ONE keyword that names the buggy component or page. The keyword should be a noun: the failing component name, the page route base, or the feature noun. The keyword MUST be alphanumeric or hyphen only — no quotes, slashes, dots, colons, or whitespace. If your candidate has any of those, simplify to just the alphanumeric stem. + +Worked examples (qa-specific): good keywords are `checkout-button`, `signup-form`, `payment`. Bad: `tests are failing`, ``, `app/views/_checkout.html.erb`. + +```bash +~/.claude/skills/gstack/bin/gstack-learnings-search --query "" --limit 5 2>/dev/null || true +``` + +If any learnings come back, name which one applies to the fix you're about to make in one sentence. If none come back, continue without reference — the absence is itself useful information. + --- ## Phase 8: Fix Loop diff --git a/qa/SKILL.md.tmpl b/qa/SKILL.md.tmpl index 62081d2c1..11997f7b8 100644 --- a/qa/SKILL.md.tmpl +++ b/qa/SKILL.md.tmpl @@ -100,7 +100,7 @@ mkdir -p .gstack/qa-reports/screenshots --- -{{LEARNINGS_SEARCH}} +{{LEARNINGS_SEARCH:query=qa testing bug regression flake fixture}} ## Test Plan Context @@ -154,6 +154,20 @@ Sort all discovered issues by severity, then decide which to fix based on the se Mark issues that cannot be fixed from source code (e.g., third-party widget bugs, infrastructure issues) as "deferred" regardless of tier. +### Refresh learnings for the component/page where the bug lives + +The top-of-skill learnings pull was keyed to "qa testing" broadly. Before the fix loop, re-pull learnings keyed to the component or page where the bug you're about to fix lives so prior fixes for the same component-shape surface. + +Pick ONE keyword that names the buggy component or page. The keyword should be a noun: the failing component name, the page route base, or the feature noun. The keyword MUST be alphanumeric or hyphen only — no quotes, slashes, dots, colons, or whitespace. If your candidate has any of those, simplify to just the alphanumeric stem. + +Worked examples (qa-specific): good keywords are `checkout-button`, `signup-form`, `payment`. Bad: `tests are failing`, ``, `app/views/_checkout.html.erb`. + +```bash +~/.claude/skills/gstack/bin/gstack-learnings-search --query "" --limit 5 2>/dev/null || true +``` + +If any learnings come back, name which one applies to the fix you're about to make in one sentence. If none come back, continue without reference — the absence is itself useful information. + --- ## Phase 8: Fix Loop diff --git a/scripts/resolvers/learnings.ts b/scripts/resolvers/learnings.ts index 685188fb2..8182251d0 100644 --- a/scripts/resolvers/learnings.ts +++ b/scripts/resolvers/learnings.ts @@ -13,7 +13,27 @@ */ import type { TemplateContext } from './types'; -export function generateLearningsSearch(ctx: TemplateContext): string { +// Whitelist for query= macro values. Allows alphanumeric, space, hyphen, underscore. +// Anything else (e.g. $, backticks, quotes, ;) is a shell-injection vector when the +// emitted bash interpolates the value into `--query "${queryArg}"`. Static template +// queries hand-written in gstack are safe, but the resolver API must defend against +// future contributors writing dangerous values. +const QUERY_SAFE_RE = /^[A-Za-z0-9 _-]+$/; + +export function generateLearningsSearch(ctx: TemplateContext, args?: string[]): string { + // Parse query= arg. Empty value falls through to no-query (principle of least surprise: + // a stray {{LEARNINGS_SEARCH:query=}} placeholder gets today's behavior, not a build error). + const queryArg = (args || []) + .filter(a => a.startsWith('query=')) + .map(a => a.slice(6)) + .filter(Boolean)[0]; + if (queryArg && !QUERY_SAFE_RE.test(queryArg)) { + throw new Error( + `{{LEARNINGS_SEARCH:query=...}} value must match ${QUERY_SAFE_RE} (alphanumeric, space, hyphen, underscore). Got: ${JSON.stringify(queryArg)}` + ); + } + const queryFlag = queryArg ? ` --query "${queryArg}"` : ''; + if (ctx.host === 'codex') { // Codex: simpler version, no cross-project, uses $GSTACK_BIN return `## Prior Learnings @@ -21,7 +41,7 @@ export function generateLearningsSearch(ctx: TemplateContext): string { Search for relevant learnings from previous sessions on this project: \`\`\`bash -$GSTACK_BIN/gstack-learnings-search --limit 10 2>/dev/null || true +$GSTACK_BIN/gstack-learnings-search --limit 10${queryFlag} 2>/dev/null || true \`\`\` If learnings are found, incorporate them into your analysis. When a review finding @@ -36,9 +56,9 @@ Search for relevant learnings from previous sessions: _CROSS_PROJ=$(${ctx.paths.binDir}/gstack-config get cross_project_learnings 2>/dev/null || echo "unset") echo "CROSS_PROJECT: $_CROSS_PROJ" if [ "$_CROSS_PROJ" = "true" ]; then - ${ctx.paths.binDir}/gstack-learnings-search --limit 10 --cross-project 2>/dev/null || true + ${ctx.paths.binDir}/gstack-learnings-search --limit 10${queryFlag} --cross-project 2>/dev/null || true else - ${ctx.paths.binDir}/gstack-learnings-search --limit 10 2>/dev/null || true + ${ctx.paths.binDir}/gstack-learnings-search --limit 10${queryFlag} 2>/dev/null || true fi \`\`\` diff --git a/ship/SKILL.md b/ship/SKILL.md index c1b25c95a..25119fb39 100644 --- a/ship/SKILL.md +++ b/ship/SKILL.md @@ -1824,9 +1824,9 @@ Search for relevant learnings from previous sessions: _CROSS_PROJ=$(~/.claude/skills/gstack/bin/gstack-config get cross_project_learnings 2>/dev/null || echo "unset") echo "CROSS_PROJECT: $_CROSS_PROJ" if [ "$_CROSS_PROJ" = "true" ]; then - ~/.claude/skills/gstack/bin/gstack-learnings-search --limit 10 --cross-project 2>/dev/null || true + ~/.claude/skills/gstack/bin/gstack-learnings-search --limit 10 --query "release ship version changelog merge pr" --cross-project 2>/dev/null || true else - ~/.claude/skills/gstack/bin/gstack-learnings-search --limit 10 2>/dev/null || true + ~/.claude/skills/gstack/bin/gstack-learnings-search --limit 10 --query "release ship version changelog merge pr" 2>/dev/null || true fi ``` @@ -2459,6 +2459,20 @@ already knows. A good test: would this insight save time in a future session? If +### Refresh learnings for the headline feature on this branch + +The top-of-skill learnings pull was keyed to "release ship" broadly. Before the VERSION/CHANGELOG step, re-pull learnings keyed to THIS branch's headline feature so any prior version-bump or CHANGELOG pitfalls for similar features surface. + +Pick ONE keyword that names the headline feature you're shipping. The keyword should be a noun: the primary skill or module name, the central feature noun, or the binary you changed. The keyword MUST be alphanumeric or hyphen only — no quotes, slashes, dots, colons, or whitespace. If your candidate has any of those, simplify to just the alphanumeric stem. + +Worked examples (ship-specific): good keywords are `learnings-search`, `pacing`, `worktree-ship`. Bad: `the branch headline`, `v1.31.1.0`, `feat: token-or search`. + +```bash +~/.claude/skills/gstack/bin/gstack-learnings-search --query "" --limit 5 2>/dev/null || true +``` + +If any learnings come back, name which one applies to the version bump or CHANGELOG framing in one sentence. If none come back, continue without reference — the absence is itself useful information. + ## Step 12: Version bump (auto-decide) **Idempotency check:** Before bumping, classify the state by comparing `VERSION` against the base branch AND against `package.json`'s `version` field. Four states: FRESH (do bump), ALREADY_BUMPED (skip bump), DRIFT_STALE_PKG (sync pkg only, no re-bump), DRIFT_UNEXPECTED (stop and ask). diff --git a/ship/SKILL.md.tmpl b/ship/SKILL.md.tmpl index a1c18ecf5..5a7c34661 100644 --- a/ship/SKILL.md.tmpl +++ b/ship/SKILL.md.tmpl @@ -283,7 +283,7 @@ If multiple suites need to run, run them sequentially (each needs a test lane). {{PLAN_VERIFICATION_EXEC}} -{{LEARNINGS_SEARCH}} +{{LEARNINGS_SEARCH:query=release ship version changelog merge pr}} {{SCOPE_DRIFT}} @@ -401,6 +401,20 @@ For each comment in `comments`: {{GBRAIN_SAVE_RESULTS}} +### Refresh learnings for the headline feature on this branch + +The top-of-skill learnings pull was keyed to "release ship" broadly. Before the VERSION/CHANGELOG step, re-pull learnings keyed to THIS branch's headline feature so any prior version-bump or CHANGELOG pitfalls for similar features surface. + +Pick ONE keyword that names the headline feature you're shipping. The keyword should be a noun: the primary skill or module name, the central feature noun, or the binary you changed. The keyword MUST be alphanumeric or hyphen only — no quotes, slashes, dots, colons, or whitespace. If your candidate has any of those, simplify to just the alphanumeric stem. + +Worked examples (ship-specific): good keywords are `learnings-search`, `pacing`, `worktree-ship`. Bad: `the branch headline`, `v1.31.1.0`, `feat: token-or search`. + +```bash +~/.claude/skills/gstack/bin/gstack-learnings-search --query "" --limit 5 2>/dev/null || true +``` + +If any learnings come back, name which one applies to the version bump or CHANGELOG framing in one sentence. If none come back, continue without reference — the absence is itself useful information. + ## Step 12: Version bump (auto-decide) **Idempotency check:** Before bumping, classify the state by comparing `VERSION` against the base branch AND against `package.json`'s `version` field. Four states: FRESH (do bump), ALREADY_BUMPED (skip bump), DRIFT_STALE_PKG (sync pkg only, no re-bump), DRIFT_UNEXPECTED (stop and ask). diff --git a/test/fixtures/golden/claude-ship-SKILL.md b/test/fixtures/golden/claude-ship-SKILL.md index c1b25c95a..25119fb39 100644 --- a/test/fixtures/golden/claude-ship-SKILL.md +++ b/test/fixtures/golden/claude-ship-SKILL.md @@ -1824,9 +1824,9 @@ Search for relevant learnings from previous sessions: _CROSS_PROJ=$(~/.claude/skills/gstack/bin/gstack-config get cross_project_learnings 2>/dev/null || echo "unset") echo "CROSS_PROJECT: $_CROSS_PROJ" if [ "$_CROSS_PROJ" = "true" ]; then - ~/.claude/skills/gstack/bin/gstack-learnings-search --limit 10 --cross-project 2>/dev/null || true + ~/.claude/skills/gstack/bin/gstack-learnings-search --limit 10 --query "release ship version changelog merge pr" --cross-project 2>/dev/null || true else - ~/.claude/skills/gstack/bin/gstack-learnings-search --limit 10 2>/dev/null || true + ~/.claude/skills/gstack/bin/gstack-learnings-search --limit 10 --query "release ship version changelog merge pr" 2>/dev/null || true fi ``` @@ -2459,6 +2459,20 @@ already knows. A good test: would this insight save time in a future session? If +### Refresh learnings for the headline feature on this branch + +The top-of-skill learnings pull was keyed to "release ship" broadly. Before the VERSION/CHANGELOG step, re-pull learnings keyed to THIS branch's headline feature so any prior version-bump or CHANGELOG pitfalls for similar features surface. + +Pick ONE keyword that names the headline feature you're shipping. The keyword should be a noun: the primary skill or module name, the central feature noun, or the binary you changed. The keyword MUST be alphanumeric or hyphen only — no quotes, slashes, dots, colons, or whitespace. If your candidate has any of those, simplify to just the alphanumeric stem. + +Worked examples (ship-specific): good keywords are `learnings-search`, `pacing`, `worktree-ship`. Bad: `the branch headline`, `v1.31.1.0`, `feat: token-or search`. + +```bash +~/.claude/skills/gstack/bin/gstack-learnings-search --query "" --limit 5 2>/dev/null || true +``` + +If any learnings come back, name which one applies to the version bump or CHANGELOG framing in one sentence. If none come back, continue without reference — the absence is itself useful information. + ## Step 12: Version bump (auto-decide) **Idempotency check:** Before bumping, classify the state by comparing `VERSION` against the base branch AND against `package.json`'s `version` field. Four states: FRESH (do bump), ALREADY_BUMPED (skip bump), DRIFT_STALE_PKG (sync pkg only, no re-bump), DRIFT_UNEXPECTED (stop and ask). diff --git a/test/fixtures/golden/codex-ship-SKILL.md b/test/fixtures/golden/codex-ship-SKILL.md index d09e39e98..7770a8906 100644 --- a/test/fixtures/golden/codex-ship-SKILL.md +++ b/test/fixtures/golden/codex-ship-SKILL.md @@ -310,6 +310,26 @@ Effort both-scales: when an option involves effort, label both human-team and CC Net line closes the tradeoff. Per-skill instructions may add stricter rules. +12. **Non-ASCII characters — write directly, never \u-escape.** When any + string field (question, option label, option description) contains + Chinese (繁體/簡體), Japanese, Korean, or other non-ASCII text, emit + the literal UTF-8 characters in the JSON string. **Never escape them + as `\uXXXX`.** Claude Code's tool parameter pipe is UTF-8 native + and passes characters through unchanged. Manually escaping requires + recalling each codepoint from training, which is unreliable for long + CJK strings — the model regularly emits the wrong codepoint (e.g. + writes `\u3103` thinking it is 管 U+7BA1, but `\u3103` is + actually ㄃, so the user sees `管理工具` rendered as `㄃3用箱`). + The trigger is long, multi-line questions with hundreds of CJK + characters: that is exactly when reflexive escaping kicks in and + exactly when miscoding is most damaging. Long ≠ escape. Keep + characters literal. + + Wrong: `"question": "請選擇\uXXXX\uXXXX\uXXXX\uXXXX"` + Right: `"question": "請選擇管理工具"` + + Only JSON-mandatory escapes remain allowed: `\n`, `\t`, `\"`, `\\`. + ### Self-check before emitting Before calling AskUserQuestion, verify: @@ -322,6 +342,7 @@ Before calling AskUserQuestion, verify: - [ ] Dual-scale effort labels on effort-bearing options (human / CC) - [ ] Net line closes the decision - [ ] You are calling the tool, not writing prose +- [ ] Non-ASCII characters (CJK / accents) written directly, NOT \u-escaped ## Artifacts Sync (skill start) @@ -1789,7 +1810,7 @@ Add a `## Verification Results` section to the PR body (Step 19): Search for relevant learnings from previous sessions on this project: ```bash -$GSTACK_BIN/gstack-learnings-search --limit 10 2>/dev/null || true +$GSTACK_BIN/gstack-learnings-search --limit 10 --query "release ship version changelog merge pr" 2>/dev/null || true ``` If learnings are found, incorporate them into your analysis. When a review finding @@ -2053,6 +2074,20 @@ already knows. A good test: would this insight save time in a future session? If +### Refresh learnings for the headline feature on this branch + +The top-of-skill learnings pull was keyed to "release ship" broadly. Before the VERSION/CHANGELOG step, re-pull learnings keyed to THIS branch's headline feature so any prior version-bump or CHANGELOG pitfalls for similar features surface. + +Pick ONE keyword that names the headline feature you're shipping. The keyword should be a noun: the primary skill or module name, the central feature noun, or the binary you changed. The keyword MUST be alphanumeric or hyphen only — no quotes, slashes, dots, colons, or whitespace. If your candidate has any of those, simplify to just the alphanumeric stem. + +Worked examples (ship-specific): good keywords are `learnings-search`, `pacing`, `worktree-ship`. Bad: `the branch headline`, `v1.31.1.0`, `feat: token-or search`. + +```bash +$GSTACK_ROOT/bin/gstack-learnings-search --query "" --limit 5 2>/dev/null || true +``` + +If any learnings come back, name which one applies to the version bump or CHANGELOG framing in one sentence. If none come back, continue without reference — the absence is itself useful information. + ## Step 12: Version bump (auto-decide) **Idempotency check:** Before bumping, classify the state by comparing `VERSION` against the base branch AND against `package.json`'s `version` field. Four states: FRESH (do bump), ALREADY_BUMPED (skip bump), DRIFT_STALE_PKG (sync pkg only, no re-bump), DRIFT_UNEXPECTED (stop and ask). diff --git a/test/fixtures/golden/factory-ship-SKILL.md b/test/fixtures/golden/factory-ship-SKILL.md index ec849bcce..baae7421d 100644 --- a/test/fixtures/golden/factory-ship-SKILL.md +++ b/test/fixtures/golden/factory-ship-SKILL.md @@ -312,6 +312,26 @@ Effort both-scales: when an option involves effort, label both human-team and CC Net line closes the tradeoff. Per-skill instructions may add stricter rules. +12. **Non-ASCII characters — write directly, never \u-escape.** When any + string field (question, option label, option description) contains + Chinese (繁體/簡體), Japanese, Korean, or other non-ASCII text, emit + the literal UTF-8 characters in the JSON string. **Never escape them + as `\uXXXX`.** Claude Code's tool parameter pipe is UTF-8 native + and passes characters through unchanged. Manually escaping requires + recalling each codepoint from training, which is unreliable for long + CJK strings — the model regularly emits the wrong codepoint (e.g. + writes `\u3103` thinking it is 管 U+7BA1, but `\u3103` is + actually ㄃, so the user sees `管理工具` rendered as `㄃3用箱`). + The trigger is long, multi-line questions with hundreds of CJK + characters: that is exactly when reflexive escaping kicks in and + exactly when miscoding is most damaging. Long ≠ escape. Keep + characters literal. + + Wrong: `"question": "請選擇\uXXXX\uXXXX\uXXXX\uXXXX"` + Right: `"question": "請選擇管理工具"` + + Only JSON-mandatory escapes remain allowed: `\n`, `\t`, `\"`, `\\`. + ### Self-check before emitting Before calling AskUserQuestion, verify: @@ -324,6 +344,7 @@ Before calling AskUserQuestion, verify: - [ ] Dual-scale effort labels on effort-bearing options (human / CC) - [ ] Net line closes the decision - [ ] You are calling the tool, not writing prose +- [ ] Non-ASCII characters (CJK / accents) written directly, NOT \u-escaped ## Artifacts Sync (skill start) @@ -1794,9 +1815,9 @@ Search for relevant learnings from previous sessions: _CROSS_PROJ=$($GSTACK_BIN/gstack-config get cross_project_learnings 2>/dev/null || echo "unset") echo "CROSS_PROJECT: $_CROSS_PROJ" if [ "$_CROSS_PROJ" = "true" ]; then - $GSTACK_BIN/gstack-learnings-search --limit 10 --cross-project 2>/dev/null || true + $GSTACK_BIN/gstack-learnings-search --limit 10 --query "release ship version changelog merge pr" --cross-project 2>/dev/null || true else - $GSTACK_BIN/gstack-learnings-search --limit 10 2>/dev/null || true + $GSTACK_BIN/gstack-learnings-search --limit 10 --query "release ship version changelog merge pr" 2>/dev/null || true fi ``` @@ -2429,6 +2450,20 @@ already knows. A good test: would this insight save time in a future session? If +### Refresh learnings for the headline feature on this branch + +The top-of-skill learnings pull was keyed to "release ship" broadly. Before the VERSION/CHANGELOG step, re-pull learnings keyed to THIS branch's headline feature so any prior version-bump or CHANGELOG pitfalls for similar features surface. + +Pick ONE keyword that names the headline feature you're shipping. The keyword should be a noun: the primary skill or module name, the central feature noun, or the binary you changed. The keyword MUST be alphanumeric or hyphen only — no quotes, slashes, dots, colons, or whitespace. If your candidate has any of those, simplify to just the alphanumeric stem. + +Worked examples (ship-specific): good keywords are `learnings-search`, `pacing`, `worktree-ship`. Bad: `the branch headline`, `v1.31.1.0`, `feat: token-or search`. + +```bash +$GSTACK_ROOT/bin/gstack-learnings-search --query "" --limit 5 2>/dev/null || true +``` + +If any learnings come back, name which one applies to the version bump or CHANGELOG framing in one sentence. If none come back, continue without reference — the absence is itself useful information. + ## Step 12: Version bump (auto-decide) **Idempotency check:** Before bumping, classify the state by comparing `VERSION` against the base branch AND against `package.json`'s `version` field. Four states: FRESH (do bump), ALREADY_BUMPED (skip bump), DRIFT_STALE_PKG (sync pkg only, no re-bump), DRIFT_UNEXPECTED (stop and ask). diff --git a/test/gen-skill-docs.test.ts b/test/gen-skill-docs.test.ts index b30a32464..4bf8abeee 100644 --- a/test/gen-skill-docs.test.ts +++ b/test/gen-skill-docs.test.ts @@ -3047,3 +3047,51 @@ describe('GSTACK REVIEW REPORT delete-then-append flow', () => { expect(src).not.toContain('If it was found mid-file, move it'); }); }); + +describe('LEARNINGS_SEARCH resolver: query parameter', () => { + // Lazy-load resolver and types after describe block to keep test file self-contained. + const { generateLearningsSearch } = require('../scripts/resolvers/learnings'); + const { HOST_PATHS } = require('../scripts/resolvers/types'); + + const claudeCtx = { + skillName: 'test', + tmplPath: 'test/SKILL.md.tmpl', + host: 'claude', + paths: HOST_PATHS.claude, + }; + const codexCtx = { ...claudeCtx, host: 'codex', paths: HOST_PATHS.codex }; + + test('no args → bash does not contain --query (backwards-compat)', () => { + const out = generateLearningsSearch(claudeCtx); + expect(out).not.toContain('--query'); + }); + + test('claude host + query=foo bar → both cross-project and project-scoped branches contain --query', () => { + const out = generateLearningsSearch(claudeCtx, ['query=foo bar']); + // Both branches of the if/else must carry the flag. + const lines = out.split('\n').filter(l => l.includes('gstack-learnings-search')); + expect(lines.length).toBeGreaterThanOrEqual(2); + for (const line of lines) { + expect(line).toContain('--query "foo bar"'); + } + }); + + test('codex host + query=foo bar → codex bash variant contains --query', () => { + const out = generateLearningsSearch(codexCtx, ['query=foo bar']); + expect(out).toContain('--query "foo bar"'); + expect(out).toContain('$GSTACK_BIN/gstack-learnings-search'); + }); + + test('empty value query= → bash does not contain --query (locked semantics: falls through)', () => { + const claudeOut = generateLearningsSearch(claudeCtx, ['query=']); + expect(claudeOut).not.toContain('--query'); + const codexOut = generateLearningsSearch(codexCtx, ['query=']); + expect(codexOut).not.toContain('--query'); + }); + + test('shell-injection chars in query= → throws at gen-time (defense in depth)', () => { + for (const bad of ['$(whoami)', '`cmd`', 'a;b', 'a&b', 'a"b', 'a\\b', 'foo$x']) { + expect(() => generateLearningsSearch(claudeCtx, [`query=${bad}`])).toThrow(/alphanumeric/); + } + }); +}); diff --git a/test/gstack-learnings-search.test.ts b/test/gstack-learnings-search.test.ts new file mode 100644 index 000000000..7218d60f1 --- /dev/null +++ b/test/gstack-learnings-search.test.ts @@ -0,0 +1,60 @@ +import { describe, test, expect, beforeAll, afterAll } from 'bun:test'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { execFileSync } from 'child_process'; + +const ROOT = path.resolve(import.meta.dir, '..'); +const BIN = path.join(ROOT, 'bin', 'gstack-learnings-search'); + +const tmpHome = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-search-test-')); +const tmpCwd = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-search-cwd-')); +// gstack-slug derives slug from git remote (none here) → falls back to basename of cwd. +const slug = path.basename(tmpCwd).replace(/[^a-zA-Z0-9._-]/g, ''); +const projDir = path.join(tmpHome, 'projects', slug); + +function run(args: string[]): string { + return execFileSync(BIN, args, { + env: { ...process.env, GSTACK_HOME: tmpHome }, + cwd: tmpCwd, + encoding: 'utf-8', + }); +} + +beforeAll(() => { + fs.mkdirSync(projDir, { recursive: true }); + const entries = [ + { ts: '2026-05-01T00:00:00Z', skill: 'test', type: 'pattern', key: 'foo-pattern', insight: 'A foo-related insight', confidence: 8, source: 'observed', files: [] }, + { ts: '2026-05-02T00:00:00Z', skill: 'test', type: 'pitfall', key: 'bar-pitfall', insight: 'A bar-related insight', confidence: 8, source: 'observed', files: [] }, + { ts: '2026-05-03T00:00:00Z', skill: 'test', type: 'pattern', key: 'baz-pattern', insight: 'A baz-related insight', confidence: 8, source: 'observed', files: [] }, + ]; + fs.writeFileSync(path.join(projDir, 'learnings.jsonl'), entries.map(e => JSON.stringify(e)).join('\n') + '\n'); +}); + +afterAll(() => { + fs.rmSync(tmpHome, { recursive: true, force: true }); + fs.rmSync(tmpCwd, { recursive: true, force: true }); +}); + +describe('gstack-learnings-search token-OR query semantics', () => { + test('multi-token query returns entries matching ANY token', () => { + const out = run(['--query', 'foo bar']); + expect(out).toContain('foo-pattern'); + expect(out).toContain('bar-pitfall'); + expect(out).not.toContain('baz-pattern'); + }); + + test('single-token query returns only entries matching that token', () => { + const out = run(['--query', 'foo']); + expect(out).toContain('foo-pattern'); + expect(out).not.toContain('bar-pitfall'); + expect(out).not.toContain('baz-pattern'); + }); + + test('no --query flag returns all entries (backwards-compat)', () => { + const out = run(['--limit', '10']); + expect(out).toContain('foo-pattern'); + expect(out).toContain('bar-pitfall'); + expect(out).toContain('baz-pattern'); + }); +});