From dd323f6e55406e31a1ad47eca13a0cc1baf5e065 Mon Sep 17 00:00:00 2001 From: ggman12 Date: Thu, 12 Feb 2026 17:25:50 -0500 Subject: [PATCH] delete old files --- ...create_daily_openairframes_adsb_release.py | 84 -------- src/create_daily_openairframes_faa_release.py | 33 --- src/get_latest_openairframes_release.py | 189 ------------------ 3 files changed, 306 deletions(-) delete mode 100644 src/create_daily_openairframes_adsb_release.py delete mode 100644 src/create_daily_openairframes_faa_release.py delete mode 100644 src/get_latest_openairframes_release.py diff --git a/src/create_daily_openairframes_adsb_release.py b/src/create_daily_openairframes_adsb_release.py deleted file mode 100644 index 0a5137e..0000000 --- a/src/create_daily_openairframes_adsb_release.py +++ /dev/null @@ -1,84 +0,0 @@ -from pathlib import Path -from datetime import datetime, timezone, timedelta -import sys - -import polars as pl - -# Add adsb directory to path -sys.path.insert(0, str(Path(__file__).parent / "adsb")) # TODO: Fix this hacky path manipulation - -from adsb.compress_adsb_to_aircraft_data import ( - load_historical_for_day, - concat_compressed_dfs, - get_latest_aircraft_adsb_csv_df, -) - -if __name__ == '__main__': - # Get yesterday's date (data for the previous day) - day = datetime.now(timezone.utc) - timedelta(days=1) - - # Find a day with complete data - max_attempts = 2 # Don't look back more than a week - for attempt in range(max_attempts): - date_str = day.strftime("%Y-%m-%d") - print(f"Processing ADS-B data for {date_str}") - - print("Loading new ADS-B data...") - df_new = load_historical_for_day(day) - if df_new.height == 0: - day = day - timedelta(days=1) - continue - max_time = df_new['time'].max() - if max_time is not None: - # Handle timezone - max_time_dt = max_time - if hasattr(max_time_dt, 'replace'): - max_time_dt = max_time_dt.replace(tzinfo=timezone.utc) - - end_of_day = day.replace(hour=23, minute=59, second=59, tzinfo=timezone.utc) - timedelta(minutes=5) - - # Convert polars datetime to python datetime if needed - if isinstance(max_time_dt, datetime): - if max_time_dt.replace(tzinfo=timezone.utc) >= end_of_day: - break - else: - # Polars returns python datetime already - if max_time >= day.replace(hour=23, minute=54, second=59): - break - - print(f"WARNING: Latest data time is {max_time}, which is more than 5 minutes before end of day.") - day = day - timedelta(days=1) - else: - raise RuntimeError(f"Could not find complete data in the last {max_attempts} days") - - try: - # Get the latest release data - print("Downloading latest ADS-B release...") - df_base, start_date_str = get_latest_aircraft_adsb_csv_df() - # Combine with historical data - print("Combining with historical data...") - df_combined = concat_compressed_dfs(df_base, df_new) - except Exception as e: - print(f"Error downloading latest ADS-B release: {e}") - df_combined = df_new - start_date_str = date_str - - # Sort by time for consistent ordering - df_combined = df_combined.sort('time') - - # Convert any list columns to strings for CSV compatibility - for col in df_combined.columns: - if df_combined[col].dtype == pl.List: - df_combined = df_combined.with_columns( - pl.col(col).list.join(",").alias(col) - ) - - # Save the result - OUT_ROOT = Path("data/openairframes") - OUT_ROOT.mkdir(parents=True, exist_ok=True) - - output_file = OUT_ROOT / f"openairframes_adsb_{start_date_str}_{date_str}.csv" - df_combined.write_csv(output_file) - - print(f"Saved: {output_file}") - print(f"Total aircraft: {df_combined.height}") diff --git a/src/create_daily_openairframes_faa_release.py b/src/create_daily_openairframes_faa_release.py deleted file mode 100644 index 25bb32b..0000000 --- a/src/create_daily_openairframes_faa_release.py +++ /dev/null @@ -1,33 +0,0 @@ -from pathlib import Path -from datetime import datetime, timezone -date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d") - -out_dir = Path("data/faa_releasable") -out_dir.mkdir(parents=True, exist_ok=True) -zip_name = f"ReleasableAircraft_{date_str}.zip" - -zip_path = out_dir / zip_name -if not zip_path.exists(): - # URL and paths - url = "https://registry.faa.gov/database/ReleasableAircraft.zip" - from urllib.request import Request, urlopen - - req = Request( - url, - headers={"User-Agent": "Mozilla/5.0"}, - method="GET", - ) - - with urlopen(req, timeout=120) as r: - body = r.read() - zip_path.write_bytes(body) - -OUT_ROOT = Path("data/openairframes") -OUT_ROOT.mkdir(parents=True, exist_ok=True) -from derive_from_faa_master_txt import convert_faa_master_txt_to_df, concat_faa_historical_df -from get_latest_openairframes_release import get_latest_aircraft_faa_csv_df -df_new = convert_faa_master_txt_to_df(zip_path, date_str) -df_base, start_date_str = get_latest_aircraft_faa_csv_df() -df_base = concat_faa_historical_df(df_base, df_new) -assert df_base['download_date'].is_monotonic_increasing, "download_date is not monotonic increasing" -df_base.to_csv(OUT_ROOT / f"openairframes_faa_{start_date_str}_{date_str}.csv", index=False) \ No newline at end of file diff --git a/src/get_latest_openairframes_release.py b/src/get_latest_openairframes_release.py deleted file mode 100644 index b29b82a..0000000 --- a/src/get_latest_openairframes_release.py +++ /dev/null @@ -1,189 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass -from pathlib import Path -from typing import Iterable, Optional -import re -import urllib.request -import urllib.error -import json - - -REPO = "PlaneQuery/openairframes" -LATEST_RELEASE_URL = f"https://api.github.com/repos/{REPO}/releases/latest" - - -@dataclass(frozen=True) -class ReleaseAsset: - name: str - download_url: str - size: int # bytes - - -def _http_get_json(url: str, headers: dict[str, str]) -> dict: - req = urllib.request.Request(url, headers=headers, method="GET") - with urllib.request.urlopen(req, timeout=120) as resp: - data = resp.read() - return json.loads(data.decode("utf-8")) - - -def get_latest_release_assets(repo: str = REPO, github_token: Optional[str] = None) -> list[ReleaseAsset]: - url = f"https://api.github.com/repos/{repo}/releases/latest" - headers = { - "Accept": "application/vnd.github+json", - "User-Agent": "openairframes-downloader/1.0", - } - if github_token: - headers["Authorization"] = f"Bearer {github_token}" - - payload = _http_get_json(url, headers=headers) - assets = [] - for a in payload.get("assets", []): - assets.append( - ReleaseAsset( - name=a["name"], - download_url=a["browser_download_url"], - size=int(a.get("size", 0)), - ) - ) - return assets - - -def pick_asset( - assets: Iterable[ReleaseAsset], - *, - exact_name: Optional[str] = None, - name_regex: Optional[str] = None, -) -> ReleaseAsset: - assets = list(assets) - - if exact_name: - for a in assets: - if a.name == exact_name: - return a - raise FileNotFoundError(f"No asset exactly named {exact_name!r}. Available: {[a.name for a in assets]}") - - if name_regex: - rx = re.compile(name_regex) - matches = [a for a in assets if rx.search(a.name)] - if not matches: - raise FileNotFoundError(f"No asset matched regex {name_regex!r}. Available: {[a.name for a in assets]}") - if len(matches) > 1: - raise FileExistsError(f"Regex {name_regex!r} matched multiple assets: {[m.name for m in matches]}") - return matches[0] - - raise ValueError("Provide either exact_name=... or name_regex=...") - - -def download_asset(asset: ReleaseAsset, out_path: Path, github_token: Optional[str] = None) -> Path: - out_path = Path(out_path) - out_path.parent.mkdir(parents=True, exist_ok=True) - - headers = { - "User-Agent": "openairframes-downloader/1.0", - "Accept": "application/octet-stream", - } - if github_token: - headers["Authorization"] = f"Bearer {github_token}" - - req = urllib.request.Request(asset.download_url, headers=headers, method="GET") - - try: - with urllib.request.urlopen(req, timeout=300) as resp, out_path.open("wb") as f: - # Stream download - while True: - chunk = resp.read(1024 * 1024) # 1 MiB - if not chunk: - break - f.write(chunk) - except urllib.error.HTTPError as e: - body = e.read().decode("utf-8", errors="replace") if hasattr(e, "read") else "" - raise RuntimeError(f"HTTPError {e.code} downloading {asset.name}: {body[:500]}") from e - - return out_path - - -def download_latest_aircraft_csv( - output_dir: Path = Path("downloads"), - github_token: Optional[str] = None, - repo: str = REPO, -) -> Path: - """ - Download the latest openairframes_faa_*.csv file from the latest GitHub release. - - Args: - output_dir: Directory to save the downloaded file (default: "downloads") - github_token: Optional GitHub token for authentication - repo: GitHub repository in format "owner/repo" (default: REPO) - - Returns: - Path to the downloaded file - """ - assets = get_latest_release_assets(repo, github_token=github_token) - try: - asset = pick_asset(assets, name_regex=r"^openairframes_faa_.*\.csv$") - except FileNotFoundError: - # Fallback to old naming pattern - asset = pick_asset(assets, name_regex=r"^openairframes_\d{4}-\d{2}-\d{2}_.*\.csv$") - saved_to = download_asset(asset, output_dir / asset.name, github_token=github_token) - print(f"Downloaded: {asset.name} ({asset.size} bytes) -> {saved_to}") - return saved_to - -def get_latest_aircraft_faa_csv_df(): - csv_path = download_latest_aircraft_csv() - import pandas as pd - df = pd.read_csv(csv_path, dtype={'transponder_code': str, - 'unique_regulatory_id': str, - 'registrant_county': str}) - df = df.fillna("") - # Extract start date from filename pattern: openairframes_faa_{start_date}_{end_date}.csv - match = re.search(r"openairframes_faa_(\d{4}-\d{2}-\d{2})_", str(csv_path)) - if not match: - # Fallback to old naming pattern: openairframes_{start_date}_{end_date}.csv - match = re.search(r"openairframes_(\d{4}-\d{2}-\d{2})_", str(csv_path)) - if not match: - raise ValueError(f"Could not extract date from filename: {csv_path.name}") - - date_str = match.group(1) - return df, date_str - - -def download_latest_aircraft_adsb_csv( - output_dir: Path = Path("downloads"), - github_token: Optional[str] = None, - repo: str = REPO, -) -> Path: - """ - Download the latest openairframes_adsb_*.csv file from the latest GitHub release. - - Args: - output_dir: Directory to save the downloaded file (default: "downloads") - github_token: Optional GitHub token for authentication - repo: GitHub repository in format "owner/repo" (default: REPO) - - Returns: - Path to the downloaded file - """ - assets = get_latest_release_assets(repo, github_token=github_token) - asset = pick_asset(assets, name_regex=r"^openairframes_adsb_.*\.csv$") - saved_to = download_asset(asset, output_dir / asset.name, github_token=github_token) - print(f"Downloaded: {asset.name} ({asset.size} bytes) -> {saved_to}") - return saved_to - - -def get_latest_aircraft_adsb_csv_df(): - csv_path = download_latest_aircraft_adsb_csv() - import pandas as pd - df = pd.read_csv(csv_path) - df = df.fillna("") - # Extract start date from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv - match = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path)) - if not match: - raise ValueError(f"Could not extract date from filename: {csv_path.name}") - - date_str = match.group(1) - return df, date_str - - -if __name__ == "__main__": - download_latest_aircraft_csv()