From 24313603c5717c023a5e8dac4d95eee7de290c97 Mon Sep 17 00:00:00 2001
From: ggman12 <goodejonah@gmail.com>
Date: Wed, 18 Feb 2026 23:03:02 -0500
Subject: [PATCH] Add TheAirTraffic CSV scraper and output validator

---
 scripts/scrape_theairtraffic.py   | 242 ++++++++++++++++++++++++++++++
 scripts/validate_theairtraffic.py |  69 +++++++++
 2 files changed, 311 insertions(+)
 create mode 100644 scripts/scrape_theairtraffic.py
 create mode 100644 scripts/validate_theairtraffic.py
diff --git a/scripts/scrape_theairtraffic.py b/scripts/scrape_theairtraffic.py
new file mode 100644
index 0000000..6b785cf
--- /dev/null
+++ b/scripts/scrape_theairtraffic.py
@@ -0,0 +1,242 @@
+#!/usr/bin/env python3
+"""
+Parse TheAirTraffic Database CSV and produce community_submission.v1 JSON.
+
+Source: "TheAirTraffic Database - Aircraft 2.csv"
+Output: community/YYYY-MM-DD/theairtraffic_YYYY-MM-DD.json
+
+Categories in the spreadsheet columns (paired: name, registrations, separator):
+  Col  1-3:  Business
+  Col  4-6:  Government
+  Col  7-9:  People
+  Col 10-12: Sports
+  Col 13-15: Celebrity
+  Col 16-18: State Govt./Law
+  Col 19-21: Other
+  Col 22-24: Test Aircraft
+  Col 25-27: YouTubers
+  Col 28-30: Formula 1 VIP's
+  Col 31-33: Active GII's and GIII's  (test/demo aircraft)
+  Col 34-37: Russia & Ukraine          (extra col for old/new)
+  Col 38-40: Helicopters & Blimps
+  Col 41-43: Unique Reg's
+  Col 44-46: Saudi & UAE
+  Col 47-49: Schools
+  Col 50-52: Special Charter
+  Col 53-55: Unknown Owners
+  Col 56-59: Frequent Flyers           (extra cols: name, aircraft, logged, hours)
+"""
+
+import csv
+import json
+import hashlib
+import re
+import sys
+import uuid
+from datetime import datetime, timezone
+from pathlib import Path
+
+# ── Category mapping ────────────────────────────────────────────────────────
+# Entries are (name_col, reg_col, tags); reg_col is always the column after name_col.
+CATEGORY_COLUMNS = [
+    (name_col, name_col + 1, tags)
+    for name_col, tags in (
+        (1, {"owner_category_0": "business"}),  # Business
+        (4, {"owner_category_0": "government"}),  # Government
+        (7, {"owner_category_0": "celebrity"}),  # People
+        (10, {"owner_category_0": "sports"}),  # Sports
+        (13, {"owner_category_0": "celebrity"}),  # Celebrity
+        (16, {"owner_category_0": "government", "owner_category_1": "law_enforcement"}),  # State Govt./Law
+        (19, {"owner_category_0": "other"}),  # Other
+        (22, {"owner_category_0": "test_aircraft"}),  # Test Aircraft
+        (25, {"owner_category_0": "youtuber", "owner_category_1": "celebrity"}),  # YouTubers
+        (28, {"owner_category_0": "celebrity", "owner_category_1": "motorsport"}),  # Formula 1 VIP's
+        (31, {"owner_category_0": "test_aircraft"}),  # Active GII's and GIII's
+        (34, {"owner_category_0": "celebrity"}),  # Russia & Ukraine (main() also tries col 36)
+        (38, {"owner_category_0": "celebrity", "category": "helicopter_or_blimp"}),  # Helicopters & Blimps
+        (41, {"owner_category_0": "other"}),  # Unique Reg's
+        (44, {"owner_category_0": "government", "owner_category_1": "royal_family"}),  # Saudi & UAE
+        (47, {"owner_category_0": "education"}),  # Schools
+        (50, {"owner_category_0": "charter"}),  # Special Charter
+        (53, {"owner_category_0": "unknown"}),  # Unknown Owners
+        (56, {"owner_category_0": "celebrity"}),  # Frequent Flyers (name col, aircraft col)
+    )
+]
+
+# 0-based index of the first CSV row that main() processes; earlier rows are skipped
+DATA_START_ROW = 4
+
+# ── Contributor info ────────────────────────────────────────────────────────
+CONTRIBUTOR_NAME = "TheAirTraffic"
+# Deterministic UUID v5 derived from the contributor's site URL (URL namespace)
+CONTRIBUTOR_UUID = str(uuid.uuid5(uuid.NAMESPACE_URL, "https://theairtraffic.com"))
+
+# Source spreadsheet URL, written into every entry's "citation_0" tag
+CITATION = "https://docs.google.com/spreadsheets/d/1JHhfJBnJPNBA6TgiSHjkXFkHBdVTTz_nXxaUDRWcHpk"
+
+
+# NN-NNNN serials (92-9000) or bare numeric IDs of 1-6 digits (929000, 01, 676).
+_MILITARY_SERIAL_RE = re.compile(r'\d{2}-\d{4}|\d{1,6}')
+
+
+def looks_like_military_serial(reg: str) -> bool:
+    """
+    Return True if *reg* is a military-style serial or a purely numeric ID.
+
+    Accepted shapes are NN-NNNN (92-9000, 82-8000, 98-0001) and one to six
+    digits (929000, 828000, 01, 676). Anything else returns False.
+    These aren't standard civil registrations; use openairframes_id.
+    """
+    # fullmatch() must consume the whole string; match() with '$' would also
+    # accept a trailing newline, misclassifying an un-normalized "929000\n".
+    return _MILITARY_SERIAL_RE.fullmatch(reg) is not None
+
+
+
+def normalize_reg(raw: str) -> str:
+    """Trim whitespace and trailing commas from *raw* and drop any CR/LF inside it."""
+    trimmed = raw.strip().rstrip(',').strip()
+    # Line breaks left in the middle of the value are deleted, not replaced.
+    without_breaks = trimmed.translate(str.maketrans('', '', '\r\n'))
+    return without_breaks.strip()
+
+
+def parse_regs(cell_value: str) -> list[str]:
+    """
+    Parse a cell holding zero or more registrations into a clean list.
+
+    Registrations may be separated by commas or by line breaks inside the
+    cell. Placeholder cells ('.', ','), cells containing an ADS-B Exchange
+    link, and URL / "Link" / "Section 1" fragments yield nothing.
+    """
+    if not cell_value or not cell_value.strip():
+        return []
+
+    # Some cells have ADS-B exchange URLs – skip those entirely
+    if 'globe.adsbexchange.com' in cell_value:
+        return []
+    if cell_value.strip() in ('.', ','):
+        return []
+
+    results = []
+    # Split on line breaks as well as commas: normalize_reg() deletes CR/LF,
+    # which would otherwise fuse "N1\nN2" into the bogus registration "N1N2".
+    for part in re.split(r'[,\r\n]+', cell_value):
+        reg = normalize_reg(part)
+        # Skip blanks, dot-only placeholders, URLs and section labels.
+        if not reg or reg in ('.', '..', '...'):
+            continue
+        if reg.startswith(('http', 'Link')) or reg == 'Section 1':
+            continue
+        results.append(reg)
+    return results
+
+
+def make_submission(
+    reg: str,
+    owner: str,
+    category_tags: dict[str, str],
+) -> dict:
+    """
+    Assemble one community_submission.v1 entry for a single aircraft.
+
+    *reg* is stored under "openairframes_id" when it looks like a military
+    serial, otherwise under "registration_number". The "tags" dict holds the
+    citation, then the stripped owner (omitted when empty), then every item
+    of *category_tags*, which overrides an earlier tag with the same key.
+    """
+    id_field = (
+        "openairframes_id"
+        if looks_like_military_serial(reg)
+        else "registration_number"
+    )
+
+    # Unpacking order below fixes both the key order and the precedence.
+    owner_tag = {"owner": owner.strip()} if owner else {}
+    return {
+        id_field: reg,
+        "tags": {"citation_0": CITATION, **owner_tag, **category_tags},
+    }
+
+
+def main():
+    """
+    Convert the TheAirTraffic CSV export into a community submission JSON file.
+
+    Reads the CSV named by the first command-line argument (default: the
+    export's file name in the current user's Downloads folder) and writes
+    PROJECT_ROOT/community/YYYY-MM-DD/theairtraffic_YYYY-MM-DD.json (UTC date).
+    """
+    if len(sys.argv) > 1:
+        csv_path = Path(sys.argv[1])
+    else:
+        # Resolved per user instead of hard-coding one developer's home directory.
+        csv_path = Path.home() / "Downloads" / "TheAirTraffic Database - Aircraft 2.csv"
+
+    if not csv_path.exists():
+        print(f"ERROR: CSV not found at {csv_path}", file=sys.stderr)
+        sys.exit(1)
+
+    # newline='' is what the csv docs ask for so quoted multi-line cells parse correctly.
+    with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
+        rows = list(csv.reader(f))
+
+    print(f"Read {len(rows)} rows from {csv_path.name}")
+    date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
+
+    submissions: list[dict] = []
+    seen: set[tuple] = set()  # (reg, owner) pairs already emitted
+
+    for row in rows[DATA_START_ROW:]:
+        for name_col, reg_col, cat_tags in CATEGORY_COLUMNS:
+            if reg_col >= len(row) or name_col >= len(row):
+                continue
+
+            owner = row[name_col].strip().rstrip(',').strip()
+            owner = owner.replace('\r', '').replace('\n', '').strip()
+            # Skip empty cells, placeholders and header-like values.
+            if not owner or owner in ('.', ',', 'Section 1'):
+                continue
+            if owner.startswith(('http', 'Link ')):
+                continue
+
+            regs = parse_regs(row[reg_col])
+            # Russia & Ukraine has an extra (old/new) reg column; use it as a fallback.
+            if not regs and name_col == 34 and reg_col + 1 < len(row):
+                regs = parse_regs(row[reg_col + 1])
+
+            for reg in regs:
+                key = (reg, owner)
+                if key in seen:
+                    continue
+                seen.add(key)
+                submissions.append(make_submission(reg, owner, cat_tags))
+
+    print(f"Generated {len(submissions)} submissions")
+
+    # Write output
+    proj_root = Path(__file__).resolve().parent.parent
+    out_dir = proj_root / "community" / date_str
+    out_dir.mkdir(parents=True, exist_ok=True)
+    out_file = out_dir / f"theairtraffic_{date_str}.json"
+
+    with open(out_file, 'w', encoding='utf-8') as f:
+        json.dump(submissions, f, indent=2, ensure_ascii=False)
+
+    print(f"Written to {out_file}")
+    # With no submissions there is no first entry to show (was an IndexError).
+    if submissions:
+        print(f"Sample entry:\n{json.dumps(submissions[0], indent=2)}")
+
+    # Quick stats
+    cats = {}
+    for s in submissions:
+        c = s['tags'].get('owner_category_0', 'NONE')
+        cats[c] = cats.get(c, 0) + 1
+    print("\nCategory breakdown:")
+    for c, n in sorted(cats.items(), key=lambda x: -x[1]):
+        print(f"  {c}: {n}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/validate_theairtraffic.py b/scripts/validate_theairtraffic.py
new file mode 100644
index 0000000..7faef22
--- /dev/null
+++ b/scripts/validate_theairtraffic.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+"""Validate the generated theairtraffic JSON output."""
+import glob
+import json
+import re
+import sys
+from collections import Counter
+from pathlib import Path
+
+TAG_KEY_RE = re.compile(r"[a-z][a-z0-9_]{0,63}")
+
+# Find the latest output: any date folder under the project root (where the
+# scraper writes), not just February 2026; ISO dates sort chronologically.
+root = Path(__file__).resolve().parent.parent
+files = sorted(glob.glob(str(root / "community" / "*" / "theairtraffic_*.json")))
+if not files:
+    print("No output files found!")
+    sys.exit(1)
+
+path = files[-1]
+print(f"Validating: {path}")
+
+with open(path, encoding="utf-8") as f:  # the scraper writes UTF-8
+    data = json.load(f)
+
+print(f"Total entries: {len(data)}")
+# Tags per entry; a missing "tags" becomes {} here and is reported below.
+all_tags = [d.get("tags", {}) for d in data]
+
+# Check military serial handling
+mil = [(d, t) for d, t in zip(data, all_tags) if "openairframes_id" in d]
+print(f"\nEntries using openairframes_id: {len(mil)}")
+for m, t in mil[:10]:
+    print(f"  {m['openairframes_id']} -> owner: {t.get('owner', '?')}")
+
+# Check youtuber entries
+yt = [(d, t) for d, t in zip(data, all_tags) if t.get("owner_category_0") == "youtuber"]
+print(f"\nYouTuber entries: {len(yt)}")
+for y, t in yt[:5]:
+    reg = y.get("registration_number", y.get("openairframes_id"))
+    c0, c1 = t.get("owner_category_0"), t.get("owner_category_1")
+    print(f"  {reg} -> owner: {t.get('owner', '?')}, cat0: {c0}, cat1: {c1}")
+
+# Check US Govt / military
+gov = [d for d, t in zip(data, all_tags) if t.get("owner") == "United States of America 747/757"]
+print(f"\nUSA 747/757 entries: {len(gov)}")
+for g in gov:
+    print(f"  {g.get('openairframes_id', g.get('registration_number'))}")
+
+# Schema validation
+issues = 0
+for i, (d, t) in enumerate(zip(data, all_tags)):
+    if not any(k in d for k in ("registration_number", "transponder_code_hex", "openairframes_id")):
+        print(f"  Entry {i}: no identifier!")
+        issues += 1
+    if "tags" not in d:
+        print(f"  Entry {i}: no tags!")
+        issues += 1
+    for k in t:
+        if not TAG_KEY_RE.fullmatch(k):
+            print(f"  Entry {i}: invalid tag key '{k}'")
+            issues += 1
+print(f"\nSchema issues: {issues}")
+
+# Category breakdown
+print("\nCategory breakdown:")
+for c, n in Counter(t.get("owner_category_0", "NONE") for t in all_tags).most_common():
+    print(f"  {c}: {n}")
+sys.exit(1 if issues else 0)  # non-zero lets callers detect a failed validation