From e0a6b893991a15cee0ad9795679418306215a2aa Mon Sep 17 00:00:00 2001
From: Garry Tan <garrytan@gmail.com>
Date: Sat, 14 Mar 2026 22:04:32 -0500
Subject: [PATCH] feat: merge TODO.md + TODOS.md into unified backlog with
 shared format reference

Merge TODO.md (roadmap) and TODOS.md (near-term) into one file organized by
skill/component with P0-P4 priority ordering and Completed section. Add shared
review/TODOS-format.md for canonical format. Add static validation tests.
---
 TODO.md                       | 120 ----------
 TODOS.md                      | 397 ++++++++++++++++++++++++++++++++--
 review/TODOS-format.md        |  62 ++++++
 test/skill-validation.test.ts |  23 ++
 4 files changed, 464 insertions(+), 138 deletions(-)
 delete mode 100644 TODO.md
 create mode 100644 review/TODOS-format.md
diff --git a/TODO.md b/TODO.md
deleted file mode 100644
index 1485eee4..00000000
--- a/TODO.md
+++ /dev/null
@@ -1,120 +0,0 @@
-# TODO — gstack roadmap
-
-## Phase 1: Foundations (v0.2.0)
-  - [x] Rename to gstack
-  - [x] Restructure to monorepo layout
-  - [x] Setup script for skill symlinks
-  - [x] Snapshot command with ref-based element selection
-  - [x] Snapshot tests
-
-## Phase 2: Enhanced Browser (v0.2.0) ✅
-  - [x] Annotated screenshots (--annotate flag, ref labels overlaid on screenshot)
-  - [x] Snapshot diffing (--diff flag, unified diff against previous snapshot)
-  - [x] Dialog handling (auto-accept/dismiss, dialog buffer, prevents browser lockup)
-  - [x] File upload (upload <sel> <files>)
-  - [x] Cursor-interactive elements (-C flag, cursor:pointer/onclick/tabindex scan)
-  - [x] Element state checks (is visible/hidden/enabled/disabled/checked/editable/focused)
-  - [x] CircularBuffer — O(1) ring buffer for console/network/dialog (was O(n) array+shift)
-  - [x] Async buffer flush with Bun.write() (was appendFileSync)
-  - [x] Health check with page.evaluate('1') + 2s timeout
-  - [x] Playwright error wrapping — actionable messages for AI agents
-  - [x] Fix useragent — context recreation preserves cookies/storage/URLs
-  - [x] DRY: getCleanText exported, command sets in chain updated
-  - [x] 148 integration tests (was ~63)
-
-## Phase 3: QA Testing Agent (v0.3.0)
-  - [x] `/qa` SKILL.md — 6-phase workflow: Initialize → Authenticate → Orient → Explore → Document → Wrap up
-  - [x] Issue taxonomy reference (7 categories: visual, functional, UX, content, performance, console, accessibility)
-  - [x] Severity classification (critical/high/medium/low)
-  - [x] Exploration checklist per page
-  - [x] Report template (structured markdown with per-issue evidence)
-  - [x] Repro-first philosophy: every issue gets evidence before moving on
-  - [x] Two evidence tiers: interactive bugs (multi-step screenshots), static bugs (single annotated screenshot)
-  - [x] Key guidance: 5-10 well-documented issues per session, depth over breadth, write incrementally
-  - [x] Three modes: full (systematic), quick (30-second smoke test), regression (compare against baseline)
-  - [x] Framework detection guidance (Next.js, Rails, WordPress, SPA)
-  - [x] Health score rubric (7 categories, weighted average)
-  - [x] `wait --networkidle` / `wait --load` / `wait --domcontentloaded`
-  - [x] `console --errors` (filter to error/warning only)
-  - [x] `cookie-import <json-file>` (bulk cookie import with auto-fill domain)
-  - [x] `browse/bin/find-browse` (DRY binary discovery across skills)
-  - [ ] Video recording (deferred to Phase 5 — recreateContext destroys page state)
-
-## Phase 3.5: Browser Cookie Import (v0.3.x)
-  - [x] `cookie-import-browser` command (Chromium cookie DB decryption)
-  - [x] Cookie picker web UI (served from browse server)
-  - [x] `/setup-browser-cookies` skill
-  - [x] Unit tests with encrypted cookie fixtures (18 tests)
-  - [x] Browser registry (Comet, Chrome, Arc, Brave, Edge)
-
-## Phase 3.6: Visual PR Annotations + S3 Upload
-  - [ ] `/setup-gstack-upload` skill (configure S3 bucket for image hosting)
-  - [ ] `browse/bin/gstack-upload` helper (upload file to S3, return public URL)
-  - [ ] `/ship` Step 7.5: visual verification with screenshots in PR body
-  - [ ] `/review` Step 4.5: visual review with annotated screenshots in PR
-  - [ ] WebM → GIF conversion (ffmpeg) for video evidence in PRs
-  - [ ] README documentation for visual PR annotations
-
-## Phase 4: Skill + Browser Integration
-  - [ ] ship + browse: post-deploy verification
-    - Browse staging/preview URL after push
-    - Screenshot key pages
-    - Check console for JS errors
-    - Compare staging vs prod via snapshot diff
-    - Include verification screenshots in PR body
-    - STOP if critical errors found
-  - [ ] review + browse: visual diff review
-    - Browse PR's preview deploy
-    - Annotated screenshots of changed pages
-    - Compare against production visually
-    - Check responsive layouts (mobile/tablet/desktop)
-    - Verify accessibility tree hasn't regressed
-  - [ ] deploy-verify skill: lightweight post-deploy smoke test
-    - Hit key URLs, verify 200s
-    - Screenshot critical pages
-    - Console error check
-    - Compare against baseline snapshots
-    - Pass/fail with evidence
-
-## Phase 5: State & Sessions
-  - [ ] Bundle server.ts into compiled binary (eliminate resolveServerScript() fallback chain entirely) (P2, M)
-  - [ ] v20 encryption format support (AES-256-GCM) — future Chromium versions may change from v10
-  - [ ] Sessions (isolated browser instances with separate cookies/storage/history)
-  - [ ] State persistence (save/load cookies + localStorage to JSON files)
-  - [ ] Auth vault (encrypted credential storage, referenced by name, LLM never sees passwords)
-  - [ ] Video recording (record start/stop — needs sessions for clean context lifecycle)
-  - [ ] retro + browse: deployment health tracking
-    - Screenshot production state
-    - Check perf metrics (page load times)
-    - Count console errors across key pages
-    - Track trends over retro window
-
-## Phase 6: Advanced Browser
-  - [ ] Iframe support (frame <sel>, frame main)
-  - [ ] Semantic locators (find role/label/text/placeholder/testid with actions)
-  - [ ] Device emulation presets (set device "iPhone 16 Pro")
-  - [ ] Network mocking/routing (intercept, block, mock requests)
-  - [ ] Download handling (click-to-download with path control)
-  - [ ] Content safety (--max-output truncation, --allowed-domains)
-  - [ ] Streaming (WebSocket live preview for pair browsing)
-  - [ ] CDP mode (connect to already-running Chrome/Electron apps)
-
-## Future Ideas
-  - [ ] Linux/Windows cookie decryption (GNOME Keyring / kwallet / DPAPI)
-  - [ ] Trend tracking across QA runs — compare baseline.json over time, detect regressions (P2, S)
-  - [ ] CI/CD integration — `/qa` as GitHub Action step, fail PR if health score drops (P2, M)
-  - [ ] Accessibility audit mode — `--a11y` flag for focused accessibility testing (P3, S)
-  - [ ] Greptile training feedback loop — export suppression patterns to Greptile team for model improvement (P3, S)
-  - [x] E2E test cost tracking — track cumulative API spend, warn if over threshold (P3, S)
-  - [ ] E2E model pinning — pin E2E tests to claude-sonnet-4-6 for cost efficiency, add retry:2 for flaky LLM (P2, XS)
-  - [ ] Smart default QA tier — after a few runs, check index.md for user's usual tier pick, skip the question (P2, S)
-
-## Ideas & Notes
-  - Browser is the nervous system — every skill should be able to see, interact with, and verify the web
-  - Skills are the product; the browser enables them
-  - One repo, one install, entire AI engineering workflow
-  - Bun compiled binary matches Rust CLI performance for this use case (bottleneck is Chromium, not CLI parsing)
-  - Accessibility tree snapshots use ~200-400 tokens vs ~3000-5000 for full DOM — critical for AI context efficiency
-  - Locator map approach for refs: store Map<string, Locator> on BrowserManager, no DOM mutation, no CSP issues
-  - Snapshot scoping (-i, -c, -d, -s flags) is critical for performance on large pages
-  - All new commands follow existing pattern: add to command set, add switch case, return string
diff --git a/TODOS.md b/TODOS.md
index 1ded3ba7..e98fcc1c 100644
--- a/TODOS.md
+++ b/TODOS.md
@@ -1,36 +1,397 @@
 # TODOS
 
-## Auto-upgrade mode (zero-prompt)
+## Browse
 
-**What:** Add a `GSTACK_AUTO_UPGRADE=1` env var or `~/.gstack/config` option that skips the AskUserQuestion prompt and upgrades automatically when a new version is detected.
+### Bundle server.ts into compiled binary
 
-**Why:** Power users and CI environments may want zero-friction upgrades without being asked every time.
+**What:** Eliminate `resolveServerScript()` fallback chain entirely — bundle server.ts into the compiled browse binary.
 
-**Context:** The current upgrade system (v0.3.4) always prompts via AskUserQuestion. This TODO adds an opt-in bypass. Implementation is ~10 lines in the preamble instructions: check for the env var/config before calling AskUserQuestion, and if set, go straight to the upgrade flow. Depends on the full upgrade system being stable first — wait for user feedback on the prompt-based flow before adding this.
+**Why:** The current fallback chain (check adjacent to cli.ts, check global install) is fragile and caused bugs in v0.3.2. A single compiled binary is simpler and more reliable.
 
-**Effort:** S (small)
-**Priority:** P3 (nice-to-have, revisit after adoption data)
+**Context:** Bun's `--compile` flag can bundle multiple entry points. The server is currently resolved at runtime via file path lookup. Bundling it removes the resolution step entirely.
 
-## GitHub Actions eval upload
+**Effort:** M
+**Priority:** P2
+**Depends on:** None
+
+### Sessions (isolated browser instances)
+
+**What:** Isolated browser instances with separate cookies/storage/history, addressable by name.
+
+**Why:** Enables parallel testing of different user roles, A/B test verification, and clean auth state management.
+
+**Context:** Requires Playwright browser context isolation. Each session gets its own context with independent cookies/localStorage. Prerequisite for video recording (clean context lifecycle) and auth vault.
+
+**Effort:** L
+**Priority:** P3
+
+### Video recording
+
+**What:** Record browser interactions as video (start/stop controls).
+
+**Why:** Video evidence in QA reports and PR bodies. Currently deferred because `recreateContext()` destroys page state.
+
+**Context:** Needs sessions for clean context lifecycle. Playwright supports video recording per context. Also needs WebM → GIF conversion for PR embedding.
+
+**Effort:** M
+**Priority:** P3
+**Depends on:** Sessions
+
+### v20 encryption format support
+
+**What:** AES-256-GCM support for future Chromium cookie DB versions (currently v10).
+
+**Why:** Future Chromium versions may change encryption format. Proactive support prevents breakage.
+
+**Effort:** S
+**Priority:** P3
+
+### State persistence
+
+**What:** Save/load cookies + localStorage to JSON files for reproducible test sessions.
+
+**Why:** Enables "resume where I left off" for QA sessions and repeatable auth states.
+
+**Effort:** M
+**Priority:** P3
+**Depends on:** Sessions
+
+### Auth vault
+
+**What:** Encrypted credential storage, referenced by name. LLM never sees passwords.
+
+**Why:** Security — currently auth credentials flow through the LLM context. Vault keeps secrets out of the AI's view.
+
+**Effort:** L
+**Priority:** P3
+**Depends on:** Sessions, State persistence
+
+### Iframe support
+
+**What:** `frame <sel>` and `frame main` commands for cross-frame interaction.
+
+**Why:** Many web apps use iframes (embeds, payment forms, ads). Currently invisible to browse.
+
+**Effort:** M
+**Priority:** P4
+
+### Semantic locators
+
+**What:** `find role/label/text/placeholder/testid` with attached actions.
+
+**Why:** More resilient element selection than CSS selectors or ref numbers.
+
+**Effort:** M
+**Priority:** P4
+
+### Device emulation presets
+
+**What:** `set device "iPhone 16 Pro"` for mobile/tablet testing.
+
+**Why:** Responsive layout testing without manual viewport resizing.
+
+**Effort:** S
+**Priority:** P4
+
+### Network mocking/routing
+
+**What:** Intercept, block, and mock network requests.
+
+**Why:** Test error states, loading states, and offline behavior.
+
+**Effort:** M
+**Priority:** P4
+
+### Download handling
+
+**What:** Click-to-download with path control.
+
+**Why:** Test file download flows end-to-end.
+
+**Effort:** S
+**Priority:** P4
+
+### Content safety
+
+**What:** `--max-output` truncation, `--allowed-domains` filtering.
+
+**Why:** Prevent context window overflow and restrict navigation to safe domains.
+
+**Effort:** S
+**Priority:** P4
+
+### Streaming (WebSocket live preview)
+
+**What:** WebSocket-based live preview for pair browsing sessions.
+
+**Why:** Enables real-time collaboration — human watches AI browse.
+
+**Effort:** L
+**Priority:** P4
+
+### CDP mode
+
+**What:** Connect to already-running Chrome/Electron apps via Chrome DevTools Protocol.
+
+**Why:** Test production apps, Electron apps, and existing browser sessions without launching new instances.
+
+**Effort:** M
+**Priority:** P4
+
+### Linux/Windows cookie decryption
+
+**What:** GNOME Keyring / kwallet / DPAPI support for non-macOS cookie import.
+
+**Why:** Cross-platform cookie import. Currently macOS-only (Keychain).
+
+**Effort:** L
+**Priority:** P4
+
+## Ship
+
+### Ship log — persistent record of /ship runs
+
+**What:** Append structured JSON entry to `.gstack/ship-log.json` at end of every /ship run (version, date, branch, PR URL, review findings, Greptile stats, todos completed, test results).
+
+**Why:** /retro has no structured data about shipping velocity. Ship log enables: PRs-per-week trending, review finding rates, Greptile signal over time, test suite growth.
+
+**Context:** /retro already reads greptile-history.md — same pattern. Eval persistence (eval-store.ts) shows the JSON append pattern exists in the codebase. ~15 lines in ship template.
+
+**Effort:** S
+**Priority:** P2
+**Depends on:** None
+
+### Post-deploy verification (ship + browse)
+
+**What:** After push, browse staging/preview URL, screenshot key pages, check console for JS errors, compare staging vs prod via snapshot diff. Include verification screenshots in PR body. STOP if critical errors found.
+
+**Why:** Catch deployment-time regressions (JS errors, broken layouts) before merge.
+
+**Context:** Requires S3 upload infrastructure for PR screenshots. Pairs with visual PR annotations.
+
+**Effort:** L
+**Priority:** P2
+**Depends on:** /setup-gstack-upload, visual PR annotations
+
+### Visual verification with screenshots in PR body
+
+**What:** /ship Step 7.5: screenshot key pages after push, embed in PR body.
+
+**Why:** Visual evidence in PRs. Reviewers see what changed without deploying locally.
+
+**Context:** Part of the former Phase 3.6 roadmap (Visual PR Annotations + S3 Upload). Needs S3 upload for image hosting.
+
+**Effort:** M
+**Priority:** P2
+**Depends on:** /setup-gstack-upload
+
+## Review
+
+### Inline PR annotations
+
+**What:** /ship and /review post inline review comments at specific file:line locations using `gh api` to create pull request review comments.
+
+**Why:** Line-level annotations are more actionable than top-level comments. The PR thread becomes a line-by-line conversation between Greptile, Claude, and human reviewers.
+
+**Context:** GitHub supports inline review comments via `gh api repos/$REPO/pulls/$PR/reviews`. Pairs naturally with the visual PR annotation items (formerly Phase 3.6).
+
+**Effort:** S
+**Priority:** P2
+**Depends on:** None
+
+### Greptile training feedback export
+
+**What:** Aggregate greptile-history.md into machine-readable JSON summary of false positive patterns, exportable to the Greptile team for model improvement.
+
+**Why:** Closes the feedback loop — Greptile can use FP data to stop making the same mistakes on your codebase.
+
+**Context:** Was a P3 Future Idea. Upgraded to P2 now that greptile-history.md data infrastructure exists. The signal data is already being collected; this just makes it exportable. ~40 lines.
+
+**Effort:** S
+**Priority:** P2
+**Depends on:** Enough FP data accumulated (10+ entries)
+
+### Visual review with annotated screenshots
+
+**What:** /review Step 4.5: browse PR's preview deploy, annotated screenshots of changed pages, compare against production, check responsive layouts, verify accessibility tree.
+
+**Why:** Visual diff catches layout regressions that code review misses.
+
+**Context:** Part of the former Phase 3.6 roadmap (Visual PR Annotations + S3 Upload). Needs S3 upload for image hosting.
+
+**Effort:** M
+**Priority:** P2
+**Depends on:** /setup-gstack-upload
+
+## QA
+
+### QA trend tracking
+
+**What:** Compare baseline.json over time, detect regressions across QA runs.
+
+**Why:** Spot quality trends — is the app getting better or worse?
+
+**Context:** QA already writes structured reports. This adds cross-run comparison.
+
+**Effort:** S
+**Priority:** P2
+
+### CI/CD QA integration
+
+**What:** `/qa` as GitHub Action step, fail PR if health score drops.
+
+**Why:** Automated quality gate in CI. Catch regressions before merge.
+
+**Effort:** M
+**Priority:** P2
+
+### Smart default QA tier
+
+**What:** After a few runs, check index.md for user's usual tier pick, skip the AskUserQuestion.
+
+**Why:** Reduces friction for repeat users.
+
+**Effort:** S
+**Priority:** P2
+
+### Accessibility audit mode
+
+**What:** `--a11y` flag for focused accessibility testing.
+
+**Why:** Dedicated accessibility testing beyond the general QA checklist.
+
+**Effort:** S
+**Priority:** P3
+
+## Retro
+
+### Deployment health tracking (retro + browse)
+
+**What:** Screenshot production state, check perf metrics (page load times), count console errors across key pages, track trends over retro window.
+
+**Why:** Retro should include production health alongside code metrics.
+
+**Context:** Requires browse integration. Screenshots + metrics fed into retro output.
+
+**Effort:** L
+**Priority:** P3
+**Depends on:** Browse sessions
+
+## Infrastructure
+
+### /setup-gstack-upload skill (S3 bucket)
+
+**What:** Configure S3 bucket for image hosting. One-time setup for visual PR annotations.
+
+**Why:** Prerequisite for visual PR annotations in /ship and /review.
+
+**Effort:** M
+**Priority:** P2
+
+### gstack-upload helper
+
+**What:** `browse/bin/gstack-upload` — upload file to S3, return public URL.
+
+**Why:** Shared utility for all skills that need to embed images in PRs.
+
+**Effort:** S
+**Priority:** P2
+**Depends on:** /setup-gstack-upload
+
+### WebM to GIF conversion
+
+**What:** ffmpeg-based WebM → GIF conversion for video evidence in PRs.
+
+**Why:** GitHub PR bodies render GIFs but not WebM. Needed for video recording evidence.
+
+**Effort:** S
+**Priority:** P3
+**Depends on:** Video recording
+
+### Deploy-verify skill
+
+**What:** Lightweight post-deploy smoke test: hit key URLs, verify 200s, screenshot critical pages, console error check, compare against baseline snapshots. Pass/fail with evidence.
+
+**Why:** Fast post-deploy confidence check, separate from full QA.
+
+**Effort:** M
+**Priority:** P2
+
+### GitHub Actions eval upload
 
 **What:** Run eval suite in CI, upload result JSON as artifact, post summary comment on PR.
 
-**Why:** Currently evals only run locally. CI integration would catch quality regressions before merge and provide a persistent record of eval results per PR.
+**Why:** CI integration catches quality regressions before merge and provides persistent eval records per PR.
 
-**Context:** Requires `ANTHROPIC_API_KEY` in CI secrets. Cost is ~$4/run. The eval persistence system (v0.3.6) writes JSON to `~/.gstack-dev/evals/` — CI would upload these as GitHub Actions artifacts and use `eval:compare` to post a delta comment on the PR.
+**Context:** Requires `ANTHROPIC_API_KEY` in CI secrets. Cost is ~$4/run. Eval persistence system (v0.3.6) writes JSON to `~/.gstack-dev/evals/` — CI would upload as GitHub Actions artifacts and use `eval:compare` to post delta comment.
 
-**Depends on:** Eval persistence shipping (v0.3.6).
-**Effort:** M (medium)
+**Effort:** M
+**Priority:** P2
+**Depends on:** Eval persistence (shipped in v0.3.6)
+
+### E2E model pinning
+
+**What:** Pin E2E tests to claude-sonnet-4-6 for cost efficiency, add retry:2 for flaky LLM responses.
+
+**Why:** Reduce E2E test cost and flakiness.
+
+**Effort:** XS
 **Priority:** P2
 
-## Eval web dashboard
+### Auto-upgrade mode (zero-prompt)
 
-**What:** `bun run eval:dashboard` serves local HTML with charts: cost trending, detection rate over time, pass/fail history.
+**What:** `GSTACK_AUTO_UPGRADE=1` env var or `~/.gstack/config` option that skips the AskUserQuestion prompt and upgrades automatically.
 
-**Why:** The CLI tools (`eval:list`, `eval:compare`, `eval:summary`) are good for quick checks but visual charts are better for spotting trends over many runs.
+**Why:** Power users and CI environments want zero-friction upgrades.
 
-**Context:** Reads the same `~/.gstack-dev/evals/*.json` files. ~200 lines HTML + chart.js code served via a simple Bun HTTP server. No external dependencies beyond what's already installed.
+**Context:** Current upgrade system (v0.3.4) always prompts. This adds opt-in bypass. ~10 lines in preamble instructions.
 
-**Depends on:** Eval persistence + eval:list shipping (v0.3.6).
-**Effort:** M (medium)
-**Priority:** P3 (nice-to-have, revisit after eval system sees regular use)
+**Effort:** S
+**Priority:** P3
+
+### Eval web dashboard
+
+**What:** `bun run eval:dashboard` serves local HTML with charts: cost trending, detection rate, pass/fail history.
+
+**Why:** Visual charts better for spotting trends than CLI tools.
+
+**Context:** Reads `~/.gstack-dev/evals/*.json`. ~200 lines HTML + chart.js via Bun HTTP server.
+
+**Effort:** M
+**Priority:** P3
+**Depends on:** Eval persistence (shipped in v0.3.6)
+
+## Completed
+
+### Phase 1: Foundations (v0.2.0)
+- Rename to gstack
+- Restructure to monorepo layout
+- Setup script for skill symlinks
+- Snapshot command with ref-based element selection
+- Snapshot tests
+**Completed:** v0.2.0
+
+### Phase 2: Enhanced Browser (v0.2.0)
+- Annotated screenshots, snapshot diffing, dialog handling, file upload
+- Cursor-interactive elements, element state checks
+- CircularBuffer, async buffer flush, health check
+- Playwright error wrapping, useragent fix
+- 148 integration tests
+**Completed:** v0.2.0
+
+### Phase 3: QA Testing Agent (v0.3.0)
+- /qa SKILL.md with 6-phase workflow, 3 modes (full/quick/regression)
+- Issue taxonomy, severity classification, exploration checklist
+- Report template, health score rubric, framework detection
+- wait/console/cookie-import commands, find-browse binary
+**Completed:** v0.3.0
+
+### Phase 3.5: Browser Cookie Import (v0.3.x)
+- cookie-import-browser command (Chromium cookie DB decryption)
+- Cookie picker web UI, /setup-browser-cookies skill
+- 18 unit tests, browser registry (Comet, Chrome, Arc, Brave, Edge)
+**Completed:** v0.3.1
+
+### E2E test cost tracking
+- Track cumulative API spend, warn if over threshold
+**Completed:** v0.3.6
diff --git a/review/TODOS-format.md b/review/TODOS-format.md
new file mode 100644
index 00000000..2619d5f3
--- /dev/null
+++ b/review/TODOS-format.md
@@ -0,0 +1,62 @@
+# TODOS.md Format Reference
+
+Shared reference for the canonical TODOS.md format. Referenced by `/ship` (Step 5.5), `/plan-ceo-review` (TODOS.md updates section), and `/plan-eng-review` to ensure consistent TODO item structure.
+
+---
+
+## File Structure
+
+```markdown
+# TODOS
+
+## <Skill/Component>     ← e.g., ## Browse, ## Ship, ## Review, ## Infrastructure
+<items sorted P0 first, then P1, P2, P3, P4>
+
+## Completed
+<finished items with completion annotation>
+```
+
+**Sections:** Organize by skill or component (`## Browse`, `## Ship`, `## Review`, `## QA`, `## Retro`, `## Infrastructure`). Within each section, sort items by priority (P0 at top).
+
+---
+
+## TODO Item Format
+
+Each item is an H3 under its section:
+
+```markdown
+### <Title>
+
+**What:** One-line description of the work.
+
+**Why:** The concrete problem it solves or value it unlocks.
+
+**Context:** Enough detail that someone picking this up in 3 months understands the motivation, the current state, and where to start.
+
+**Effort:** XS / S / M / L / XL
+**Priority:** P0 / P1 / P2 / P3 / P4
+**Depends on:** <prerequisites, or "None">
+```
+
+- **Required fields:** What, Why, Effort, Priority
+- **Optional fields:** Context, Depends on, Blocked by
+
+---
+
+## Priority Definitions
+
+- **P0** — Blocking: must be done before next release
+- **P1** — Critical: should be done this cycle
+- **P2** — Important: do when P0/P1 are clear
+- **P3** — Nice-to-have: revisit after adoption/usage data
+- **P4** — Someday: good idea, no urgency
+
+---
+
+## Completed Item Format
+
+When an item is completed, move it to the `## Completed` section preserving its original content and appending:
+
+```markdown
+**Completed:** vX.Y.Z (YYYY-MM-DD)
+```
diff --git a/test/skill-validation.test.ts b/test/skill-validation.test.ts
index 8cb2ecd6..05a39761 100644
--- a/test/skill-validation.test.ts
+++ b/test/skill-validation.test.ts
@@ -361,6 +361,29 @@ describe('Greptile history format consistency', () => {
   });
 });
 
+// --- Part 7b: TODOS-format.md reference consistency ---
+
+describe('TODOS-format.md reference consistency', () => {
+  test('review/TODOS-format.md exists and defines canonical format', () => {
+    const content = fs.readFileSync(path.join(ROOT, 'review', 'TODOS-format.md'), 'utf-8');
+    expect(content).toContain('**What:**');
+    expect(content).toContain('**Why:**');
+    expect(content).toContain('**Priority:**');
+    expect(content).toContain('**Effort:**');
+    expect(content).toContain('## Completed');
+  });
+
+  test('skills that write TODOs reference TODOS-format.md', () => {
+    const shipContent = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
+    const ceoPlanContent = fs.readFileSync(path.join(ROOT, 'plan-ceo-review', 'SKILL.md'), 'utf-8');
+    const engPlanContent = fs.readFileSync(path.join(ROOT, 'plan-eng-review', 'SKILL.md'), 'utf-8');
+
+    expect(shipContent).toContain('TODOS-format.md');
+    expect(ceoPlanContent).toContain('TODOS-format.md');
+    expect(engPlanContent).toContain('TODOS-format.md');
+  });
+});
+
 // --- Part 7: Planted-bug fixture validation (A4) ---
 
 describe('Planted-bug fixture validation', () => {