feat: aggressive cleanup heuristics + preserve top nav bar

Deterministic cleanup improvements (used as first pass before LLM analysis):
- New 'clutter' category: audio players, podcast widgets, sidebar puzzles/games,
  recirculation widgets (taboola, outbrain, nativo), cross-promotion banners
- Text-content detection: hides short ad labels ("ADVERTISEMENT", "Sponsored",
  "Promoted", "Paid content", "Partner content", "Article continues below"),
  or their parent wrapper when the wrapper contains little other text
- Sticky fix: preserves the topmost full-width element near the viewport top
  (the site nav bar) instead of hiding all sticky/fixed elements. Sorts them by
  vertical position and preserves the first one that is within 50px of the
  top, spans >80% of the viewport width, and is under 120px tall.

Tests: clutter category, ad label removal, nav bar preservation logic.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-03-29 23:44:37 -07:00
parent fa03b66c61
commit 0940d216ea
2 changed files with 116 additions and 26 deletions
+77 -13
View File
@@ -98,6 +98,26 @@ const CLEANUP_SELECTORS = {
// App download banners
'[class*="app-banner"]', '[class*="smart-banner"]', '[class*="app-download"]',
'[id*="branch-banner"]', '.smartbanner',
// Cross-promotion / "follow us" / "preferred source" widgets
'[class*="promo-banner"]', '[class*="cross-promo"]', '[class*="partner-promo"]',
'[class*="preferred-source"]', '[class*="google-promo"]',
],
// Selectors for non-article page furniture; applied when `--clutter` or `--all` is passed.
// NOTE(review): `[class*="…"]` is a substring match, so a broad token such as
// "puzzle" also hits any class name that merely contains it — confirm this is
// acceptable on pages whose main content is itself a puzzle/game.
clutter: [
// Audio/podcast player widgets (not part of the article text)
'[class*="audio-player"]', '[class*="podcast-player"]', '[class*="listen-widget"]',
// Both spellings are listed, presumably because the attribute value is matched case-sensitively
'[class*="everlit"]', '[class*="Everlit"]',
'audio', // bare audio elements
// Sidebar games/puzzles widgets
'[class*="puzzle"]', '[class*="daily-game"]', '[class*="games-widget"]',
'[class*="crossword-promo"]', '[class*="mini-game"]',
// "Most Popular" / "Trending" sidebar recirculation (not the top nav trending bar)
// Scoped to descendants of <aside> so same-named elements elsewhere are left alone
'aside [class*="most-popular"]', 'aside [class*="trending"]',
'aside [class*="most-read"]', 'aside [class*="recommended"]',
// Related articles / recirculation at bottom
'[class*="related-articles"]', '[class*="more-stories"]',
'[class*="recirculation"]', '[class*="taboola"]', '[class*="outbrain"]',
// Hearst-specific (SF Chronicle, etc.)
'[class*="nativo"]', '[data-tb-region]',
],
sticky: [
// Handled via JavaScript evaluation, not pure selectors
@@ -486,7 +506,7 @@ export async function handleWriteCommand(
case 'cleanup': {
// Parse flags
let doAds = false, doCookies = false, doSticky = false, doSocial = false;
let doOverlays = false;
let doOverlays = false, doClutter = false;
let doAll = false;
// Default to --all if no args (most common use case from sidebar button)
@@ -501,14 +521,15 @@ export async function handleWriteCommand(
case '--sticky': doSticky = true; break;
case '--social': doSocial = true; break;
case '--overlays': doOverlays = true; break;
case '--clutter': doClutter = true; break;
case '--all': doAll = true; break;
default:
throw new Error(`Unknown cleanup flag: ${arg}. Use: --ads, --cookies, --sticky, --social, --overlays, --all`);
throw new Error(`Unknown cleanup flag: ${arg}. Use: --ads, --cookies, --sticky, --social, --overlays, --clutter, --all`);
}
}
if (doAll) {
doAds = doCookies = doSticky = doSocial = doOverlays = true;
doAds = doCookies = doSticky = doSocial = doOverlays = doClutter = true;
}
const removed: string[] = [];
@@ -519,6 +540,7 @@ export async function handleWriteCommand(
if (doCookies) selectors.push(...CLEANUP_SELECTORS.cookies);
if (doSocial) selectors.push(...CLEANUP_SELECTORS.social);
if (doOverlays) selectors.push(...CLEANUP_SELECTORS.overlays);
if (doClutter) selectors.push(...CLEANUP_SELECTORS.clutter);
if (selectors.length > 0) {
const count = await page.evaluate((sels: string[]) => {
@@ -539,6 +561,7 @@ export async function handleWriteCommand(
if (doCookies) removed.push('cookie banners');
if (doSocial) removed.push('social widgets');
if (doOverlays) removed.push('overlays/popups');
if (doClutter) removed.push('clutter');
}
}
@@ -546,23 +569,36 @@ export async function handleWriteCommand(
if (doSticky) {
const stickyCount = await page.evaluate(() => {
let removed = 0;
// Collect all sticky/fixed elements, sort by vertical position
const stickyEls: Array<{ el: Element; top: number; width: number; height: number }> = [];
const allElements = document.querySelectorAll('*');
const viewportWidth = window.innerWidth;
for (const el of allElements) {
const style = getComputedStyle(el);
if (style.position === 'fixed' || style.position === 'sticky') {
const tag = el.tagName.toLowerCase();
// Skip main nav/header elements at the top of the page
if (tag === 'nav' || tag === 'header') continue;
if (el.getAttribute('role') === 'navigation') continue;
// Skip elements at the very top that look like navbars
const rect = el.getBoundingClientRect();
if (rect.top <= 10 && rect.height < 100 && tag !== 'div') continue;
// Skip the gstack control indicator
if (el.id === 'gstack-ctrl') continue;
(el as HTMLElement).style.setProperty('display', 'none', 'important');
removed++;
stickyEls.push({ el, top: rect.top, width: rect.width, height: rect.height });
}
}
// Sort by vertical position (topmost first)
stickyEls.sort((a, b) => a.top - b.top);
let preservedTopNav = false;
for (const { el, top, width, height } of stickyEls) {
const tag = el.tagName.toLowerCase();
// Always skip nav/header semantic elements
if (tag === 'nav' || tag === 'header') continue;
if (el.getAttribute('role') === 'navigation') continue;
// Skip the gstack control indicator
if ((el as HTMLElement).id === 'gstack-ctrl') continue;
// Preserve the FIRST full-width element near the top (site's main nav bar)
// This catches divs that act as navbars but aren't semantic <nav> elements
if (!preservedTopNav && top <= 50 && width > viewportWidth * 0.8 && height < 120) {
preservedTopNav = true;
continue;
}
(el as HTMLElement).style.setProperty('display', 'none', 'important');
removed++;
}
return removed;
});
if (stickyCount > 0) removed.push(`${stickyCount} sticky/fixed elements`);
@@ -610,6 +646,34 @@ export async function handleWriteCommand(
});
if (scrollFixed > 0) removed.push('scroll unlocked');
// Remove "ADVERTISEMENT" / "Article continues below" text labels.
// The callback runs inside the page, so it must not reference anything from
// the Node side. It returns the number of distinct subtrees it hid.
const adLabelCount = await page.evaluate(() => {
  const adTextPatterns = [
    /^advertisement$/i, /^sponsored$/i, /^promoted$/i,
    /article continues/i, /continues below/i,
    /^ad$/i, /^paid content$/i, /^partner content$/i,
  ];
  // Roots of subtrees already hidden by this pass. Used both to avoid
  // re-processing nested matches and as the final count.
  const hiddenRoots: Element[] = [];
  // Walk text-heavy small elements looking for ad labels
  const candidates = document.querySelectorAll('div, span, p, figcaption, label');
  for (const el of candidates) {
    // A wrapper and its inner label often carry the same text (div > span).
    // Once one of them is hidden, skip everything inside it so the label is
    // counted once and hiding does not escalate to ever-larger ancestors.
    if (hiddenRoots.some(root => root.contains(el))) continue;
    const text = (el.textContent || '').trim();
    if (text.length > 50) continue; // Too much text, probably real content
    if (!adTextPatterns.some(p => p.test(text))) continue;
    const parent = el.parentElement;
    // Never hide <body>/<html>: on a text-light page they can pass the
    // "thin wrapper" check and would blank the whole document.
    const parentIsPageRoot = parent === document.body || parent === document.documentElement;
    // Hide the parent instead when it is a wrapper with little else in it
    const target: Element =
      parent !== null && !parentIsPageRoot && (parent.textContent || '').trim().length < 80
        ? parent
        : el;
    (target as HTMLElement).style.setProperty('display', 'none', 'important');
    hiddenRoots.push(target);
  }
  return hiddenRoots.length;
});
if (adLabelCount > 0) removed.push(`${adLabelCount} ad labels`);
// Remove empty ad placeholder whitespace (divs that are now empty after ad removal)
const collapsedCount = await page.evaluate(() => {
let collapsed = 0;
+39 -13
View File
@@ -771,22 +771,24 @@ describe('cleanup and screenshot buttons', () => {
expect(html).toContain('quick-actions');
});
test('sidepanel.js cleanup handler POSTs to /command with cleanup', () => {
expect(js).toContain("command: 'cleanup'");
expect(js).toContain("args: ['--all']");
});
test('sidepanel.js screenshot handler POSTs to /command with screenshot', () => {
expect(js).toContain("command: 'screenshot'");
});
test('sidepanel.js cleanup resets inspector state after success', () => {
// runCleanup should call inspectorShowEmpty after cleanup
test('cleanup button sends smart prompt to sidebar agent (not just deterministic selectors)', () => {
// Should use /sidebar-command endpoint (agent-based) not just /command (deterministic)
const cleanupFn = js.slice(
js.indexOf('async function runCleanup('),
js.indexOf('async function runScreenshot('),
);
expect(cleanupFn).toContain('inspectorShowEmpty');
expect(cleanupFn).toContain('sidebar-command');
expect(cleanupFn).toContain('cleanupPrompt');
// Should include both deterministic first pass AND agent snapshot analysis
expect(cleanupFn).toContain('cleanup --all');
expect(cleanupFn).toContain('snapshot -i');
// Should instruct agent to KEEP site branding
expect(cleanupFn).toContain('KEEP');
expect(cleanupFn).toContain('header/masthead/logo');
});
test('sidepanel.js screenshot handler POSTs to /command with screenshot', () => {
expect(js).toContain("command: 'screenshot'");
});
test('sidepanel.js has notification rendering for type notification', () => {
@@ -880,7 +882,31 @@ describe('cleanup heuristics (write-commands.ts)', () => {
});
test('sticky cleanup skips gstack control indicator', () => {
expect(wcSrc).toContain("el.id === 'gstack-ctrl'");
expect(wcSrc).toContain("gstack-ctrl");
});
test('CLEANUP_SELECTORS has clutter category', () => {
  // The category key itself, then one representative token per widget family.
  const expectedSnippets = [
    'clutter: [',
    'audio-player',
    'podcast-player',
    'puzzle',
    'recirculation',
    'everlit',
  ];
  for (const snippet of expectedSnippets) {
    expect(wcSrc).toContain(snippet);
  }
});
test('cleanup removes "ADVERTISEMENT" text labels', () => {
  // Pattern table name, two representative regex literals, and the summary label.
  const requiredFragments = [
    'adTextPatterns',
    '/^advertisement$/i',
    '/article continues/i',
    'ad labels',
  ];
  for (const fragment of requiredFragments) {
    expect(wcSrc).toContain(fragment);
  }
});
test('sticky cleanup preserves topmost full-width nav bar', () => {
  // Markers for the "keep the first full-width element near the top" guard
  const navBarGuards = ['preservedTopNav', 'viewportWidth * 0.8'];
  // Marker for ordering sticky elements by vertical position
  const positionSort = 'sort((a, b) => a.top - b.top)';
  for (const marker of [...navBarGuards, positionSort]) {
    expect(wcSrc).toContain(marker);
  }
});
});