Files
Shadowbroker/scripts/extract_plane_alert_additions.py
T
BigBodyCobain a0c79c2044 data: sync plane-alert VIP tracking with real names only
Import oligarchs, royals, and curated celebrities from plane-alert-db while excluding joke tag labels from tracked_names. Sync plane_alert_db.json metadata, add import scripts, and map oligarch/royal/celebrity colors in the legend.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-24 20:02:03 -06:00

224 lines
8.2 KiB
Python

#!/usr/bin/env python3
"""Extract plane-alert-db entries missing from tracked_names.json."""
from __future__ import annotations
import csv
import json
import re
from pathlib import Path
# Two levels above this file, i.e. the parent of the directory holding the script.
ROOT = Path(__file__).resolve().parents[1]
# JSON file of already-tracked names that CSV rows are compared against.
SB = ROOT.joinpath("backend", "data", "tracked_names.json")
# Directory the plane-alert-db CSV files are read from (under the user's home).
PAD = Path.home().joinpath("Downloads", "plane-alert-db-main", "plane-alert-db-main")
# plane-alert-db categories whose rows are always imported into tracked_names.
IMPORT_CATS = {
    "Don't you know who I am?",
    "Oligarch",
    "Royal Aircraft",
    "Football",
    "Head of State",
    "Dictator Alert",
}

# Categories imported only when the row passes the person heuristic
# (As Seen on TV, Bizjets and Vanity Plate).
PERSON_CATS = {"As Seen on TV", "Bizjets", "Vanity Plate"}

# Words that mark an operator as a company or other generic organisation.
CORP_RE = re.compile(
    r"\b(inc|llc|ltd|corp|company|co\.|group|holdings|university|air force|"
    r"airlines|aviation|services|systems|international|global|partners|"
    r"foundation|bank|pharma|laboratories|transportation|motors|enterprises)\b",
    flags=re.IGNORECASE,
)

# Occupation / title words that suggest the row describes an individual.
CELEB_HINTS = re.compile(
    r"\b(actor|actress|singer|rapper|musician|celebrity|nfl|nba|f1|formula|"
    r"royal|prince|princess|king|queen|duke|sheik|sultan|oligarch|billionaire|"
    r"mogul|tycoon|founder|ceo|president|senator|governor|judge|athlete|"
    r"footballer|golfer|tennis|director|producer|host|comedian|model|"
    r"influencer|youtuber|podcast|chef|author|writer|artist|designer)\b",
    flags=re.IGNORECASE,
)

# Lower-case names (full names and bare surnames) treated as known people.
KNOWN_PERSON_NAMES = {
    "elon musk", "jay z", "jay-z", "kanye", "west", "kim kardashian", "taylor swift",
    "beyonce", "drake", "rihanna", "oprah", "gates", "bezos", "zuckerberg",
    "buffett", "dalio", "icahn", "ackman", "soros", "thiel", "musk", "cruise",
    "dicaprio", "pitt", "jolie", "clooney", "hanks", "spielberg", "lucas",
    "branson", "trump", "biden", "obama", "clinton", "bush", "romney",
    "ramaswamy", "benioff", "blavatnik", "abramovich", "abramov", "potanin",
    "fridman", "deripaska", "kerimov", "tinkov", "mordashov", "rybolovlev",
    "lisin", "vekselberg", "medvedchuk", "alekperov", "mikhelson", "diddy",
    "combs", "sean combs", "ronaldo", "messi", "mbappe", "beckham", "jordan",
    "lebron", "brady", "mahomes", "kroenke", "kraft", "jones", "snyder",
    "sheindlin", "judge judy", "elton john", "moss", "ambani", "adani",
    "lowry", "ecclestone", "hamilton", "verstappen", "schumacher", "woods",
    "nicklaus", "federer", "nadal", "djokovic", "osaka", "williams", "serena",
    "venus", "sharapova", "mcgregor", "mayweather", "paul", "logan paul",
    "jake paul", "mrbeast", "pewdiepie", "charlie munger", "larry ellison",
    "michael dell", "tim cook", "satya nadella", "sundar pichai", "jensen huang",
    "gisele", "tom brady", "gwyneth", "howard stern", "howard marks",
    "steven cohen", "ken griffin", "david tepper", "ray dalio", "peter thiel",
    "paul allen", "steve ballmer", "mark cuban", "richard branson", "larry page",
    "sergey brin", "eric schmidt", "reid hoffman", "marc andreessen",
    "chamath", "naval", "andretti", "penske", "hendrick", "rick hendrick",
}
def norm_reg(s: str) -> str:
    """Return *s* stripped and upper-cased; falsy input (``""``/``None``) gives ``""``."""
    if not s:
        return ""
    return s.strip().upper()
def norm_name(s: str) -> str:
    """Return *s* trimmed, with every run of whitespace collapsed to one space.

    Falsy input (``""`` or ``None``) yields ``""``.
    """
    trimmed = s.strip() if s else ""
    return re.sub(r"\s+", " ", trimmed)
# Whole-word matcher for KNOWN_PERSON_NAMES. Plain substring tests let short
# entries such as "west", "naval" or "pitt" fire inside unrelated words
# ("Southwest Airlines", "Naval Air Systems", "Pittsburgh"). Sorting only makes
# the generated pattern deterministic.
_KNOWN_PERSON_RE = re.compile(
    r"(?<!\w)(?:"
    + "|".join(re.escape(name) for name in sorted(KNOWN_PERSON_NAMES))
    + r")(?!\w)"
)


def looks_like_person(operator: str, tag1: str, tag2: str, tag3: str) -> bool:
    """Heuristically decide whether a CSV row refers to an individual.

    The operator and the three tags are joined into one lower-cased text and
    checked in this order:

    1. A whole-word KNOWN_PERSON_NAMES match returns True, even when the text
       also contains a corporate word (e.g. operator "Falcon Landing LLC"
       tagged "Elon Musk").
    2. A CORP_RE match returns False.
    3. A CELEB_HINTS match returns True.
    4. Otherwise the operator counts as a weak person signal when it is two
       to four words long and already title-cased.

    Args:
        operator: Operator/owner text of the row.
        tag1: First tag of the row.
        tag2: Second tag of the row.
        tag3: Third tag of the row.

    Returns:
        True when the row looks like it belongs to a person, False otherwise
        (including when the combined text is shorter than three characters).
    """
    blob = " ".join([operator, tag1, tag2, tag3]).strip()
    if len(blob) < 3:
        return False
    low = blob.lower()
    if _KNOWN_PERSON_RE.search(low):
        return True
    if CORP_RE.search(low):
        return False
    if CELEB_HINTS.search(low):
        return True
    # Two to four capitalized words: weak person signal. Corporate text was
    # already rejected above, so no second CORP_RE check is needed here.
    words = operator.split()
    return 2 <= len(words) <= 4 and operator == operator.title()
def sb_category_for(cat: str, operator: str) -> str:
    """Translate a plane-alert-db category into a tracked_names category.

    Checks run in priority order and the first hit wins:
    "Oligarch"/"Dictator Alert" -> "Oligarch"; category "Royal Aircraft" or
    "royal" anywhere in the operator -> "Royal"; "Football" -> "Sports";
    "Head of State" -> "Government"; a sports keyword in the operator ->
    "Sports"; anything else -> "Celebrity".

    Args:
        cat: Category value from the CSV row.
        operator: Name text that is searched case-insensitively for keywords.

    Returns:
        One of "Oligarch", "Royal", "Sports", "Government" or "Celebrity".
    """
    op_lower = operator.lower()
    if cat == "Oligarch" or cat == "Dictator Alert":
        return "Oligarch"
    if "royal" in op_lower or cat == "Royal Aircraft":
        return "Royal"
    direct = {"Football": "Sports", "Head of State": "Government"}
    mapped = direct.get(cat)
    if mapped is not None:
        return mapped
    sport_words = ("nfl", "nba", "mlb", "football", "basketball", "soccer", "f1", "formula")
    for word in sport_words:
        if word in op_lower:
            return "Sports"
    return "Celebrity"
def row_get(row: dict[str, str], *keys: str) -> str:
    """Return the first truthy ``row[key]`` among *keys*, as a stripped string.

    Keys are tried in the order given; missing keys and empty values are
    skipped. Returns ``""`` when no key yields a value.
    """
    hit = next((row[key] for key in keys if row.get(key)), None)
    if hit is None:
        return ""
    return str(hit).strip()
def _choose_display_name(cat: str, op_display: str, tags: tuple[str, ...]) -> str:
    """Pick the name a row is filed under, preferring a person tag over the operator.

    Tags are scanned in order. The first tag containing a KNOWN_PERSON_NAMES
    entry (substring match) is returned immediately. For the
    "Don't you know who I am?" category, a tag of at most four words that
    starts with an upper-case letter, does not contain "llc" and is not
    "Bizjet"/"Pusher Prop" replaces the current choice; scanning continues,
    so the last such tag wins unless a known-person tag follows.
    """
    display = op_display
    for tag in tags:
        if not tag:
            continue
        lowered = tag.lower()
        if any(h in lowered for h in KNOWN_PERSON_NAMES):
            return tag
        if (
            len(tag.split()) <= 4
            and tag[0].isupper()
            and "llc" not in lowered
            and cat == "Don't you know who I am?"
            and tag not in {"Bizjet", "Pusher Prop"}
        ):
            display = tag
    return display


def main() -> None:
    """Report plane-alert-db rows missing from tracked_names.json and save them.

    Loads the tracked-names JSON at SB, scans the plane-alert-db CSV files
    under PAD and keeps rows whose category is in IMPORT_CATS, or whose
    category is in PERSON_CATS and which pass looks_like_person. Rows whose
    registration is already tracked are dropped. Each remaining registration
    is filed under a display name: into "merge" when that name already exists
    in the tracked details, otherwise into "additions". A summary is printed
    and both mappings are written to scripts/plane_alert_additions.json under
    ROOT.
    """
    with SB.open(encoding="utf-8") as f:
        sb = json.load(f)
    details = sb.get("details", {})

    # Registrations already tracked, normalised the same way as the CSV values.
    sb_regs: set[str] = set()
    for info in details.values():
        for reg in info.get("registrations", []):
            r = norm_reg(reg)
            if r:
                sb_regs.add(r)

    additions: dict[str, dict] = {}
    merge: dict[str, list[str]] = {}
    csv_paths = [
        PAD / "plane-alert-db.csv",
        PAD / "plane-alert-civ.csv",
        PAD / "plane-alert-gov.csv",
        PAD / "plane-alert-mil.csv",
    ]
    if not any(path.exists() for path in csv_paths):
        # Without this the script would quietly write an empty result.
        print(f"Warning: no plane-alert-db CSV files found under {PAD}")

    seen: set[tuple[str, str]] = set()
    person_hits = 0
    for path in csv_paths:
        if not path.exists():
            continue
        # newline="" hands line-ending handling to the csv module, as its
        # documentation requires for file objects.
        with path.open(encoding="utf-8", errors="replace", newline="") as f:
            for row in csv.DictReader(f):
                cat = row_get(row, "Category")
                reg = norm_reg(row_get(row, "$Registration", "Registration"))
                op_display = norm_name(row_get(row, "$Operator", "Operator"))
                tags = (
                    row_get(row, "$Tag 1", "Tag 1"),
                    row_get(row, "#Tag 2", "$#Tag 2"),
                    row_get(row, "#Tag 3", "$#Tag 3"),
                )
                # Handle each (registration, category) pair only once, even
                # when it appears in several of the CSV files.
                if not reg or (reg, cat) in seen:
                    continue
                seen.add((reg, cat))

                include = cat in IMPORT_CATS
                if (
                    not include
                    and cat in PERSON_CATS
                    and looks_like_person(op_display, *tags)
                ):
                    include = True
                    person_hits += 1
                if not include or reg in sb_regs:
                    continue

                # Prefer tag person name over shell company
                key = _choose_display_name(cat, op_display, tags)
                if key in details:
                    regs = merge.setdefault(key, [])
                else:
                    entry = additions.setdefault(
                        key,
                        {"category": sb_category_for(cat, key), "registrations": []},
                    )
                    regs = entry["registrations"]
                # Deduplicate in both mappings; the same registration can be
                # seen again under a different category.
                if reg not in regs:
                    regs.append(reg)

    print(f"New named entries: {len(additions)}")
    print(f"Merge into existing: {len(merge)}")
    print(f"Person-heuristic hits (ASTV/Bizjets): {person_hits}")
    print()

    # Group the new entries by category for the console summary.
    by_cat: dict[str, list[tuple[str, list[str]]]] = {}
    for name, info in sorted(additions.items()):
        by_cat.setdefault(info["category"], []).append((name, info["registrations"]))
    for cat in sorted(by_cat):
        items = by_cat[cat]
        print(f"## {cat} ({len(items)})")
        # Show at most 40 names per category to keep the summary readable.
        for name, regs in items[:40]:
            print(f" {name}: {', '.join(regs)}")
        if len(items) > 40:
            print(f" ... +{len(items)-40} more")
        print()

    out = ROOT / "scripts" / "plane_alert_additions.json"
    out.write_text(
        json.dumps({"additions": additions, "merge": merge}, indent=2),
        encoding="utf-8",
    )
    print(f"Wrote {out}")


if __name__ == "__main__":
    main()