Files
Shadowbroker/backend/services/test_scraper.py
T
2026-03-04 22:44:08 -07:00

68 lines
2.4 KiB
Python

import requests
from bs4 import BeautifulSoup
import json
def scrape_broadcastify_top():
print("Scraping Broadcastify Top Feeds...")
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
try:
# The top 50 feeds page provides a wealth of listening data
res = requests.get("https://www.broadcastify.com/listen/top", headers=headers, timeout=10)
if res.status_code != 200:
print(f"Failed HTTP {res.status_code}")
return []
soup = BeautifulSoup(res.text, 'html.parser')
# The table of feeds is in a standard class
table = soup.find('table', {'class': 'btable'})
if not table:
print("Could not find feeds table.")
return []
feeds = []
rows = table.find_all('tr')[1:] # Skip header
for row in rows:
cols = row.find_all('td')
if len(cols) >= 5:
# Top layout: [Listeners, Feed ID (hidden), Location, Feed Name, Category, Genre]
listeners_str = cols[0].text.strip().replace(',', '')
listeners = int(listeners_str) if listeners_str.isdigit() else 0
# The link is usually in the Feed Name column
link_tag = cols[2].find('a')
if not link_tag:
continue
href = link_tag.get('href', '')
feed_id = href.split('/')[-1] if '/listen/feed/' in href else None
if not feed_id:
continue
location = cols[1].text.strip()
name = cols[2].text.strip()
feeds.append({
"id": feed_id,
"listeners": listeners,
"location": location,
"name": name,
"stream_url": f"https://broadcastify.cdnstream1.com/{feed_id}"
})
print(f"Successfully scraped {len(feeds)} top feeds.")
return feeds
except Exception as e:
print(f"Scrape error: {e}")
return []
if __name__ == "__main__":
top_feeds = scrape_broadcastify_top()
print(json.dumps(top_feeds[:3], indent=2))