diff --git a/.github/workflows/historical-adsb.yaml b/.github/workflows/historical-adsb.yaml index 6c6f159..20b1c53 100644 --- a/.github/workflows/historical-adsb.yaml +++ b/.github/workflows/historical-adsb.yaml @@ -220,11 +220,11 @@ jobs: END_DATE: ${{ needs.generate-matrix.outputs.global_end }} run: | python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks --start-date "$START_DATE" --end-date "$END_DATE" --skip-base --stream - ls -lah data/planequery_aircraft/ + ls -lah data/openairframes/ - name: Upload final artifact uses: actions/upload-artifact@v4 with: - name: planequery_aircraft_adsb-${{ needs.generate-matrix.outputs.global_start }}-${{ needs.generate-matrix.outputs.global_end }} - path: data/planequery_aircraft/*.csv + name: openairframes_adsb-${{ needs.generate-matrix.outputs.global_start }}-${{ needs.generate-matrix.outputs.global_end }} + path: data/openairframes/*.csv retention-days: 30 diff --git a/.github/workflows/planequery-aircraft-daily-release.yaml b/.github/workflows/openairframes-daily-release.yaml similarity index 90% rename from .github/workflows/planequery-aircraft-daily-release.yaml rename to .github/workflows/openairframes-daily-release.yaml index 0af67c1..5a47629 100644 --- a/.github/workflows/planequery-aircraft-daily-release.yaml +++ b/.github/workflows/openairframes-daily-release.yaml @@ -1,4 +1,4 @@ -name: planequery-aircraft Daily Release +name: OpenAirframes Daily Release on: schedule: @@ -22,7 +22,7 @@ jobs: await github.rest.actions.createWorkflowDispatch({ owner: context.repo.owner, repo: context.repo.repo, - workflow_id: 'planequery-aircraft-daily-release.yaml', + workflow_id: 'openairframes-daily-release.yaml', ref: 'main' }); @@ -33,7 +33,7 @@ jobs: await github.rest.actions.createWorkflowDispatch({ owner: context.repo.owner, repo: context.repo.repo, - workflow_id: 'planequery-aircraft-daily-release.yaml', + workflow_id: 'openairframes-daily-release.yaml', ref: 'develop' }); @@ -58,16 +58,16 @@ jobs: - 
name: Run FAA release script run: | - python src/create_daily_planequery_aircraft_faa_release.py + python src/create_daily_faa_release.py ls -lah data/faa_releasable - ls -lah data/planequery_aircraft + ls -lah data/openairframes - name: Upload FAA artifacts uses: actions/upload-artifact@v4 with: name: faa-release path: | - data/planequery_aircraft/planequery_aircraft_faa_*.csv + data/openairframes/openairframes_faa_*.csv data/faa_releasable/ReleasableAircraft_*.zip retention-days: 1 @@ -214,13 +214,13 @@ jobs: mkdir -p data/output/adsb_chunks ls -lah data/output/adsb_chunks/ || echo "Directory empty or does not exist" python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks - ls -lah data/planequery_aircraft/ + ls -lah data/openairframes/ - name: Upload ADS-B artifacts uses: actions/upload-artifact@v4 with: name: adsb-release - path: data/planequery_aircraft/planequery_aircraft_adsb_*.csv + path: data/openairframes/openairframes_adsb_*.csv retention-days: 1 build-community: @@ -245,13 +245,13 @@ jobs: - name: Run Community release script run: | python -m src.contributions.create_daily_community_release - ls -lah data/planequery_aircraft + ls -lah data/openairframes - name: Upload Community artifacts uses: actions/upload-artifact@v4 with: name: community-release - path: data/planequery_aircraft/planequery_aircraft_community_*.csv + path: data/openairframes/openairframes_community_*.csv retention-days: 1 create-release: @@ -297,14 +297,14 @@ jobs: elif [ "$BRANCH_NAME" = "develop" ]; then BRANCH_SUFFIX="-develop" fi - TAG="planequery-aircraft-${DATE}${BRANCH_SUFFIX}" + TAG="openairframes-${DATE}${BRANCH_SUFFIX}" # Find files from artifacts using find (handles nested structures) - CSV_FILE_FAA=$(find artifacts/faa -name "planequery_aircraft_faa_*.csv" | head -1) + CSV_FILE_FAA=$(find artifacts/faa -name "openairframes_faa_*.csv" | head -1) CSV_BASENAME_FAA=$(basename "$CSV_FILE_FAA") - CSV_FILE_ADSB=$(find artifacts/adsb -name 
"planequery_aircraft_adsb_*.csv" | head -1) + CSV_FILE_ADSB=$(find artifacts/adsb -name "openairframes_adsb_*.csv" | head -1) CSV_BASENAME_ADSB=$(basename "$CSV_FILE_ADSB") - CSV_FILE_COMMUNITY=$(find artifacts/community -name "planequery_aircraft_community_*.csv" 2>/dev/null | head -1 || echo "") + CSV_FILE_COMMUNITY=$(find artifacts/community -name "openairframes_community_*.csv" 2>/dev/null | head -1 || echo "") CSV_BASENAME_COMMUNITY=$(basename "$CSV_FILE_COMMUNITY" 2>/dev/null || echo "") ZIP_FILE=$(find artifacts/faa -name "ReleasableAircraft_*.zip" | head -1) ZIP_BASENAME=$(basename "$ZIP_FILE") @@ -319,7 +319,7 @@ jobs: echo "csv_basename_community=$CSV_BASENAME_COMMUNITY" >> "$GITHUB_OUTPUT" echo "zip_file=$ZIP_FILE" >> "$GITHUB_OUTPUT" echo "zip_basename=$ZIP_BASENAME" >> "$GITHUB_OUTPUT" - echo "name=planequery-aircraft snapshot ($DATE)${BRANCH_SUFFIX}" >> "$GITHUB_OUTPUT" + echo "name=OpenAirframes snapshot ($DATE)${BRANCH_SUFFIX}" >> "$GITHUB_OUTPUT" - name: Checkout for gh CLI uses: actions/checkout@v4 diff --git a/.github/workflows/process-historical-faa.yaml b/.github/workflows/process-historical-faa.yaml new file mode 100644 index 0000000..a5cdc4e --- /dev/null +++ b/.github/workflows/process-historical-faa.yaml @@ -0,0 +1,171 @@ +name: Process Historical FAA Data + +on: + workflow_dispatch: # Manual trigger + +jobs: + generate-matrix: + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + steps: + - name: Generate date ranges + id: set-matrix + run: | + python3 << 'EOF' + import json + from datetime import datetime, timedelta + + start = datetime(2023, 8, 16) + end = datetime(2026, 1, 1) + + ranges = [] + current = start + + # Process in 4-day chunks + while current < end: + chunk_end = current + timedelta(days=4) + # Don't go past the end date + if chunk_end > end: + chunk_end = end + + ranges.append({ + "since": current.strftime("%Y-%m-%d"), + "until": chunk_end.strftime("%Y-%m-%d") + }) + + current = chunk_end + 
+ print(f"::set-output name=matrix::{json.dumps(ranges)}") + EOF + + clone-faa-repo: + runs-on: ubuntu-latest + steps: + - name: Cache FAA repository + id: cache-faa-repo + uses: actions/cache@v4 + with: + path: data/scrape-faa-releasable-aircraft + key: faa-repo-v1 + + - name: Clone FAA repository + if: steps.cache-faa-repo.outputs.cache-hit != 'true' + run: | + mkdir -p data + git clone https://github.com/simonw/scrape-faa-releasable-aircraft data/scrape-faa-releasable-aircraft + echo "Repository cloned successfully" + + process-chunk: + needs: [generate-matrix, clone-faa-repo] + runs-on: ubuntu-latest + strategy: + max-parallel: 5 # Process 5 chunks at a time + matrix: + range: ${{ fromJson(needs.generate-matrix.outputs.matrix) }} + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Restore FAA repository cache + uses: actions/cache/restore@v4 + with: + path: data/scrape-faa-releasable-aircraft + key: faa-repo-v1 + fail-on-cache-miss: true + + - name: Install dependencies + run: | + pip install -r requirements.txt + + - name: Process chunk ${{ matrix.range.since }} to ${{ matrix.range.until }} + run: | + python src/get_historical_faa.py "${{ matrix.range.since }}" "${{ matrix.range.until }}" + + - name: Upload CSV artifact + uses: actions/upload-artifact@v4 + with: + name: csv-${{ matrix.range.since }}-to-${{ matrix.range.until }} + path: data/faa_releasable_historical/*.csv + retention-days: 1 + + create-release: + needs: process-chunk + runs-on: ubuntu-latest + permissions: + contents: write + steps: + - name: Download all artifacts + uses: actions/download-artifact@v4 + with: + path: artifacts + + - name: Prepare release files + run: | + mkdir -p release-files + find artifacts -name "*.csv" -exec cp {} release-files/ \; + ls -lh release-files/ + + - name: Create Release + uses: softprops/action-gh-release@v1 + with: + tag_name: 
historical-faa-${{ github.run_number }} + name: Historical FAA Data Release ${{ github.run_number }} + body: | + Automated release of historical FAA aircraft data + Processing period: 2023-08-16 to 2026-01-01 + Generated: ${{ github.event.repository.updated_at }} + files: release-files/*.csv + draft: false + prerelease: false + + concatenate-and-release: + needs: process-chunk + runs-on: ubuntu-latest + permissions: + contents: write + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Install dependencies + run: | + pip install -r requirements.txt + + - name: Download all artifacts + uses: actions/download-artifact@v4 + with: + path: artifacts + + - name: Prepare CSVs for concatenation + run: | + mkdir -p data/faa_releasable_historical + find artifacts -name "*.csv" -exec cp {} data/faa_releasable_historical/ \; + ls -lh data/faa_releasable_historical/ + + - name: Concatenate all CSVs + run: | + python scripts/concat_csvs.py + + - name: Create Combined Release + uses: softprops/action-gh-release@v1 + with: + tag_name: historical-faa-combined-${{ github.run_number }} + name: Historical FAA Data Combined Release ${{ github.run_number }} + body: | + Combined historical FAA aircraft data (all chunks concatenated) + Processing period: 2023-08-16 to 2026-01-01 + Generated: ${{ github.event.repository.updated_at }} + files: data/openairframes/*.csv + draft: false + prerelease: false \ No newline at end of file diff --git a/LICENSE b/LICENSE index 3d4fbf1..36310d8 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2026 PlaneQuery +Copyright (c) 2026 OpenAirframes Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/infra/stack.py b/infra/stack.py index a54bd79..81c501f 100644 --- a/infra/stack.py +++ b/infra/stack.py 
@@ -23,7 +23,7 @@ class AdsbProcessingStack(Stack): # --- S3 bucket for intermediate and final results --- bucket = s3.Bucket( self, "ResultsBucket", - bucket_name="planequery-aircraft-dev", + bucket_name="openairframes-dev", removal_policy=RemovalPolicy.DESTROY, auto_delete_objects=True, lifecycle_rules=[ diff --git a/schemas/community_submission.v1.schema.json b/schemas/community_submission.v1.schema.json index cc495b1..8d0d94c 100644 --- a/schemas/community_submission.v1.schema.json +++ b/schemas/community_submission.v1.schema.json @@ -1,6 +1,6 @@ { "$schema": "https://json-schema.org/draft/2020-12/schema", - "title": "PlaneQuery Aircraft Community Submission (v1)", + "title": "OpenAirframes Community Submission (v1)", "type": "object", "additionalProperties": false, "properties": { diff --git a/src/adsb/combine_chunks_to_csv.py b/src/adsb/combine_chunks_to_csv.py index 8edbfd7..b5afca3 100644 --- a/src/adsb/combine_chunks_to_csv.py +++ b/src/adsb/combine_chunks_to_csv.py @@ -27,7 +27,7 @@ from src.adsb.compress_adsb_to_aircraft_data import compress_multi_icao_df, COLU DEFAULT_CHUNK_DIR = os.path.join(OUTPUT_DIR, "adsb_chunks") -FINAL_OUTPUT_DIR = "./data/planequery_aircraft" +FINAL_OUTPUT_DIR = "./data/openairframes" os.makedirs(FINAL_OUTPUT_DIR, exist_ok=True) @@ -85,12 +85,12 @@ def combine_compressed_chunks(compressed_dfs: list[pl.DataFrame]) -> pl.DataFram def download_and_merge_base_release(compressed_df: pl.DataFrame) -> pl.DataFrame: """Download base release and merge with new data.""" - from src.get_latest_planequery_aircraft_release import download_latest_aircraft_adsb_csv + from src.get_latest_release import download_latest_aircraft_adsb_csv print("Downloading base ADS-B release...") try: base_path = download_latest_aircraft_adsb_csv( - output_dir="./data/planequery_aircraft_base" + output_dir="./data/openairframes_base" ) print(f"Download returned: {base_path}") @@ -176,7 +176,7 @@ def main(): if args.start_date and args.end_date: # Historical mode 
output_id = f"{args.start_date}_{args.end_date}" - output_filename = f"planequery_aircraft_adsb_{args.start_date}_{args.end_date}.csv" + output_filename = f"openairframes_adsb_{args.start_date}_{args.end_date}.csv" print(f"Combining chunks for date range: {args.start_date} to {args.end_date}") else: # Daily mode - use same date for start and end @@ -187,7 +187,7 @@ def main(): date_str = target_day.strftime("%Y-%m-%d") output_id = date_str - output_filename = f"planequery_aircraft_adsb_{date_str}_{date_str}.csv" + output_filename = f"openairframes_adsb_{date_str}_{date_str}.csv" print(f"Combining chunks for {date_str}") chunks_dir = args.chunks_dir diff --git a/src/adsb/compress_adsb_to_aircraft_data.py b/src/adsb/compress_adsb_to_aircraft_data.py index 5ae58cd..0938883 100644 --- a/src/adsb/compress_adsb_to_aircraft_data.py +++ b/src/adsb/compress_adsb_to_aircraft_data.py @@ -253,7 +253,7 @@ def concat_compressed_dfs(df_base, df_new): def get_latest_aircraft_adsb_csv_df(): """Download and load the latest ADS-B CSV from GitHub releases.""" - from get_latest_planequery_aircraft_release import download_latest_aircraft_adsb_csv + from get_latest_release import download_latest_aircraft_adsb_csv import re csv_path = download_latest_aircraft_adsb_csv() @@ -264,8 +264,8 @@ def get_latest_aircraft_adsb_csv_df(): if df[col].dtype == pl.Utf8: df = df.with_columns(pl.col(col).fill_null("")) - # Extract start date from filename pattern: planequery_aircraft_adsb_{start_date}_{end_date}.csv - match = re.search(r"planequery_aircraft_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path)) + # Extract start date from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv + match = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path)) if not match: raise ValueError(f"Could not extract date from filename: {csv_path.name}") diff --git a/src/adsb/reducer.py b/src/adsb/reducer.py index 3ad7f40..9dcdb91 100644 --- a/src/adsb/reducer.py +++ b/src/adsb/reducer.py @@ -76,8 +76,8 
@@ def main(): print(f"After dedup: {df_accumulated.height} rows") # Write and upload final result - output_name = f"planequery_aircraft_adsb_{global_start}_{global_end}.csv.gz" - csv_output = Path(f"/tmp/planequery_aircraft_adsb_{global_start}_{global_end}.csv") + output_name = f"openairframes_adsb_{global_start}_{global_end}.csv.gz" + csv_output = Path(f"/tmp/openairframes_adsb_{global_start}_{global_end}.csv") gz_output = Path(f"/tmp/{output_name}") df_accumulated.write_csv(csv_output) diff --git a/src/contributions/create_daily_community_release.py b/src/contributions/create_daily_community_release.py index 210e810..ec4d060 100644 --- a/src/contributions/create_daily_community_release.py +++ b/src/contributions/create_daily_community_release.py @@ -17,7 +17,7 @@ import pandas as pd COMMUNITY_DIR = Path(__file__).parent.parent.parent / "community" -OUT_ROOT = Path("data/planequery_aircraft") +OUT_ROOT = Path("data/openairframes") def read_all_submissions(community_dir: Path) -> list[dict]: @@ -127,7 +127,7 @@ def main(): # Output OUT_ROOT.mkdir(parents=True, exist_ok=True) - output_file = OUT_ROOT / f"planequery_aircraft_community_{start_date_str}_{date_str}.csv" + output_file = OUT_ROOT / f"openairframes_community_{start_date_str}_{date_str}.csv" df.to_csv(output_file, index=False) diff --git a/src/contributions/schema.py b/src/contributions/schema.py index 949aa84..e2ee54b 100644 --- a/src/contributions/schema.py +++ b/src/contributions/schema.py @@ -111,7 +111,7 @@ def download_github_attachment(url: str) -> str | None: import urllib.error try: - req = urllib.request.Request(url, headers={"User-Agent": "PlaneQuery-Bot"}) + req = urllib.request.Request(url, headers={"User-Agent": "OpenAirframes-Bot"}) with urllib.request.urlopen(req, timeout=30) as response: return response.read().decode("utf-8") except (urllib.error.URLError, urllib.error.HTTPError, UnicodeDecodeError) as e: diff --git a/src/create_daily_planequery_aircraft_adsb_release.py 
b/src/create_daily_adsb_release.py similarity index 95% rename from src/create_daily_planequery_aircraft_adsb_release.py rename to src/create_daily_adsb_release.py index e5de1f8..0a5137e 100644 --- a/src/create_daily_planequery_aircraft_adsb_release.py +++ b/src/create_daily_adsb_release.py @@ -74,10 +74,10 @@ if __name__ == '__main__': ) # Save the result - OUT_ROOT = Path("data/planequery_aircraft") + OUT_ROOT = Path("data/openairframes") OUT_ROOT.mkdir(parents=True, exist_ok=True) - output_file = OUT_ROOT / f"planequery_aircraft_adsb_{start_date_str}_{date_str}.csv" + output_file = OUT_ROOT / f"openairframes_adsb_{start_date_str}_{date_str}.csv" df_combined.write_csv(output_file) print(f"Saved: {output_file}") diff --git a/src/create_daily_planequery_aircraft_faa_release.py b/src/create_daily_faa_release.py similarity index 82% rename from src/create_daily_planequery_aircraft_faa_release.py rename to src/create_daily_faa_release.py index 559f8fc..c0bd6d0 100644 --- a/src/create_daily_planequery_aircraft_faa_release.py +++ b/src/create_daily_faa_release.py @@ -22,12 +22,12 @@ if not zip_path.exists(): body = r.read() zip_path.write_bytes(body) -OUT_ROOT = Path("data/planequery_aircraft") +OUT_ROOT = Path("data/openairframes") OUT_ROOT.mkdir(parents=True, exist_ok=True) from derive_from_faa_master_txt import convert_faa_master_txt_to_df, concat_faa_historical_df -from get_latest_planequery_aircraft_release import get_latest_aircraft_faa_csv_df +from get_latest_release import get_latest_aircraft_faa_csv_df df_new = convert_faa_master_txt_to_df(zip_path, date_str) df_base, start_date_str = get_latest_aircraft_faa_csv_df() df_base = concat_faa_historical_df(df_base, df_new) assert df_base['download_date'].is_monotonic_increasing, "download_date is not monotonic increasing" -df_base.to_csv(OUT_ROOT / f"planequery_aircraft_faa_{start_date_str}_{date_str}.csv", index=False) \ No newline at end of file +df_base.to_csv(OUT_ROOT / 
f"openairframes_faa_{start_date_str}_{date_str}.csv", index=False) \ No newline at end of file diff --git a/src/create_daily_openairframes_adsb_release.py b/src/create_daily_openairframes_adsb_release.py new file mode 100644 index 0000000..0a5137e --- /dev/null +++ b/src/create_daily_openairframes_adsb_release.py @@ -0,0 +1,84 @@ +from pathlib import Path +from datetime import datetime, timezone, timedelta +import sys + +import polars as pl + +# Add adsb directory to path +sys.path.insert(0, str(Path(__file__).parent / "adsb")) # TODO: Fix this hacky path manipulation + +from adsb.compress_adsb_to_aircraft_data import ( + load_historical_for_day, + concat_compressed_dfs, + get_latest_aircraft_adsb_csv_df, +) + +if __name__ == '__main__': + # Get yesterday's date (data for the previous day) + day = datetime.now(timezone.utc) - timedelta(days=1) + + # Find a day with complete data + max_attempts = 2 # Don't look back more than 2 days + for attempt in range(max_attempts): + date_str = day.strftime("%Y-%m-%d") + print(f"Processing ADS-B data for {date_str}") + + print("Loading new ADS-B data...") + df_new = load_historical_for_day(day) + if df_new.height == 0: + day = day - timedelta(days=1) + continue + max_time = df_new['time'].max() + if max_time is not None: + # Handle timezone + max_time_dt = max_time + if hasattr(max_time_dt, 'replace'): + max_time_dt = max_time_dt.replace(tzinfo=timezone.utc) + + end_of_day = day.replace(hour=23, minute=59, second=59, tzinfo=timezone.utc) - timedelta(minutes=5) + + # Convert polars datetime to python datetime if needed + if isinstance(max_time_dt, datetime): + if max_time_dt.replace(tzinfo=timezone.utc) >= end_of_day: + break + else: + # Polars returns python datetime already + if max_time >= day.replace(hour=23, minute=54, second=59): + break + + print(f"WARNING: Latest data time is {max_time}, which is more than 5 minutes before end of day.") + day = day - timedelta(days=1) + else: + raise RuntimeError(f"Could not find 
complete data in the last {max_attempts} days") + + try: + # Get the latest release data + print("Downloading latest ADS-B release...") + df_base, start_date_str = get_latest_aircraft_adsb_csv_df() + # Combine with historical data + print("Combining with historical data...") + df_combined = concat_compressed_dfs(df_base, df_new) + except Exception as e: + print(f"Error downloading latest ADS-B release: {e}") + df_combined = df_new + start_date_str = date_str + + # Sort by time for consistent ordering + df_combined = df_combined.sort('time') + + # Convert any list columns to strings for CSV compatibility + for col in df_combined.columns: + if df_combined[col].dtype == pl.List: + df_combined = df_combined.with_columns( + pl.col(col).list.join(",").alias(col) + ) + + # Save the result + OUT_ROOT = Path("data/openairframes") + OUT_ROOT.mkdir(parents=True, exist_ok=True) + + output_file = OUT_ROOT / f"openairframes_adsb_{start_date_str}_{date_str}.csv" + df_combined.write_csv(output_file) + + print(f"Saved: {output_file}") + print(f"Total aircraft: {df_combined.height}") diff --git a/src/create_daily_openairframes_faa_release.py b/src/create_daily_openairframes_faa_release.py new file mode 100644 index 0000000..25bb32b --- /dev/null +++ b/src/create_daily_openairframes_faa_release.py @@ -0,0 +1,33 @@ +from pathlib import Path +from datetime import datetime, timezone +date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d") + +out_dir = Path("data/faa_releasable") +out_dir.mkdir(parents=True, exist_ok=True) +zip_name = f"ReleasableAircraft_{date_str}.zip" + +zip_path = out_dir / zip_name +if not zip_path.exists(): + # URL and paths + url = "https://registry.faa.gov/database/ReleasableAircraft.zip" + from urllib.request import Request, urlopen + + req = Request( + url, + headers={"User-Agent": "Mozilla/5.0"}, + method="GET", + ) + + with urlopen(req, timeout=120) as r: + body = r.read() + zip_path.write_bytes(body) + +OUT_ROOT = Path("data/openairframes") 
+OUT_ROOT.mkdir(parents=True, exist_ok=True) +from derive_from_faa_master_txt import convert_faa_master_txt_to_df, concat_faa_historical_df +from get_latest_openairframes_release import get_latest_aircraft_faa_csv_df +df_new = convert_faa_master_txt_to_df(zip_path, date_str) +df_base, start_date_str = get_latest_aircraft_faa_csv_df() +df_base = concat_faa_historical_df(df_base, df_new) +assert df_base['download_date'].is_monotonic_increasing, "download_date is not monotonic increasing" +df_base.to_csv(OUT_ROOT / f"openairframes_faa_{start_date_str}_{date_str}.csv", index=False) \ No newline at end of file diff --git a/src/get_latest_planequery_aircraft_release.py b/src/get_latest_openairframes_release.py similarity index 82% rename from src/get_latest_planequery_aircraft_release.py rename to src/get_latest_openairframes_release.py index 9867a5d..b29b82a 100644 --- a/src/get_latest_planequery_aircraft_release.py +++ b/src/get_latest_openairframes_release.py @@ -9,7 +9,7 @@ import urllib.error import json -REPO = "PlaneQuery/planequery-aircraft" +REPO = "PlaneQuery/openairframes" LATEST_RELEASE_URL = f"https://api.github.com/repos/{REPO}/releases/latest" @@ -31,7 +31,7 @@ def get_latest_release_assets(repo: str = REPO, github_token: Optional[str] = No url = f"https://api.github.com/repos/{repo}/releases/latest" headers = { "Accept": "application/vnd.github+json", - "User-Agent": "planequery-aircraft-downloader/1.0", + "User-Agent": "openairframes-downloader/1.0", } if github_token: headers["Authorization"] = f"Bearer {github_token}" @@ -80,7 +80,7 @@ def download_asset(asset: ReleaseAsset, out_path: Path, github_token: Optional[s out_path.parent.mkdir(parents=True, exist_ok=True) headers = { - "User-Agent": "planequery-aircraft-downloader/1.0", + "User-Agent": "openairframes-downloader/1.0", "Accept": "application/octet-stream", } if github_token: @@ -109,7 +109,7 @@ def download_latest_aircraft_csv( repo: str = REPO, ) -> Path: """ - Download the latest 
planequery_aircraft_faa_*.csv file from the latest GitHub release. + Download the latest openairframes_faa_*.csv file from the latest GitHub release. Args: output_dir: Directory to save the downloaded file (default: "downloads") @@ -121,10 +121,10 @@ def download_latest_aircraft_csv( """ assets = get_latest_release_assets(repo, github_token=github_token) try: - asset = pick_asset(assets, name_regex=r"^planequery_aircraft_faa_.*\.csv$") + asset = pick_asset(assets, name_regex=r"^openairframes_faa_.*\.csv$") except FileNotFoundError: # Fallback to old naming pattern - asset = pick_asset(assets, name_regex=r"^planequery_aircraft_\d{4}-\d{2}-\d{2}_.*\.csv$") + asset = pick_asset(assets, name_regex=r"^openairframes_\d{4}-\d{2}-\d{2}_.*\.csv$") saved_to = download_asset(asset, output_dir / asset.name, github_token=github_token) print(f"Downloaded: {asset.name} ({asset.size} bytes) -> {saved_to}") return saved_to @@ -136,11 +136,11 @@ def get_latest_aircraft_faa_csv_df(): 'unique_regulatory_id': str, 'registrant_county': str}) df = df.fillna("") - # Extract start date from filename pattern: planequery_aircraft_faa_{start_date}_{end_date}.csv - match = re.search(r"planequery_aircraft_faa_(\d{4}-\d{2}-\d{2})_", str(csv_path)) + # Extract start date from filename pattern: openairframes_faa_{start_date}_{end_date}.csv + match = re.search(r"openairframes_faa_(\d{4}-\d{2}-\d{2})_", str(csv_path)) if not match: - # Fallback to old naming pattern: planequery_aircraft_{start_date}_{end_date}.csv - match = re.search(r"planequery_aircraft_(\d{4}-\d{2}-\d{2})_", str(csv_path)) + # Fallback to old naming pattern: openairframes_{start_date}_{end_date}.csv + match = re.search(r"openairframes_(\d{4}-\d{2}-\d{2})_", str(csv_path)) if not match: raise ValueError(f"Could not extract date from filename: {csv_path.name}") @@ -154,7 +154,7 @@ def download_latest_aircraft_adsb_csv( repo: str = REPO, ) -> Path: """ - Download the latest planequery_aircraft_adsb_*.csv file from the latest GitHub 
release. + Download the latest openairframes_adsb_*.csv file from the latest GitHub release. Args: output_dir: Directory to save the downloaded file (default: "downloads") @@ -165,7 +165,7 @@ def download_latest_aircraft_adsb_csv( Path to the downloaded file """ assets = get_latest_release_assets(repo, github_token=github_token) - asset = pick_asset(assets, name_regex=r"^planequery_aircraft_adsb_.*\.csv$") + asset = pick_asset(assets, name_regex=r"^openairframes_adsb_.*\.csv$") saved_to = download_asset(asset, output_dir / asset.name, github_token=github_token) print(f"Downloaded: {asset.name} ({asset.size} bytes) -> {saved_to}") return saved_to @@ -176,8 +176,8 @@ def get_latest_aircraft_adsb_csv_df(): import pandas as pd df = pd.read_csv(csv_path) df = df.fillna("") - # Extract start date from filename pattern: planequery_aircraft_adsb_{start_date}_{end_date}.csv - match = re.search(r"planequery_aircraft_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path)) + # Extract start date from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv + match = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path)) if not match: raise ValueError(f"Could not extract date from filename: {csv_path.name}") diff --git a/src/get_latest_release.py b/src/get_latest_release.py new file mode 100644 index 0000000..b29b82a --- /dev/null +++ b/src/get_latest_release.py @@ -0,0 +1,189 @@ +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Iterable, Optional +import re +import urllib.request +import urllib.error +import json + + +REPO = "PlaneQuery/openairframes" +LATEST_RELEASE_URL = f"https://api.github.com/repos/{REPO}/releases/latest" + + +@dataclass(frozen=True) +class ReleaseAsset: + name: str + download_url: str + size: int # bytes + + +def _http_get_json(url: str, headers: dict[str, str]) -> dict: + req = urllib.request.Request(url, headers=headers, method="GET") + with urllib.request.urlopen(req, 
timeout=120) as resp: + data = resp.read() + return json.loads(data.decode("utf-8")) + + +def get_latest_release_assets(repo: str = REPO, github_token: Optional[str] = None) -> list[ReleaseAsset]: + url = f"https://api.github.com/repos/{repo}/releases/latest" + headers = { + "Accept": "application/vnd.github+json", + "User-Agent": "openairframes-downloader/1.0", + } + if github_token: + headers["Authorization"] = f"Bearer {github_token}" + + payload = _http_get_json(url, headers=headers) + assets = [] + for a in payload.get("assets", []): + assets.append( + ReleaseAsset( + name=a["name"], + download_url=a["browser_download_url"], + size=int(a.get("size", 0)), + ) + ) + return assets + + +def pick_asset( + assets: Iterable[ReleaseAsset], + *, + exact_name: Optional[str] = None, + name_regex: Optional[str] = None, +) -> ReleaseAsset: + assets = list(assets) + + if exact_name: + for a in assets: + if a.name == exact_name: + return a + raise FileNotFoundError(f"No asset exactly named {exact_name!r}. Available: {[a.name for a in assets]}") + + if name_regex: + rx = re.compile(name_regex) + matches = [a for a in assets if rx.search(a.name)] + if not matches: + raise FileNotFoundError(f"No asset matched regex {name_regex!r}. Available: {[a.name for a in assets]}") + if len(matches) > 1: + raise FileExistsError(f"Regex {name_regex!r} matched multiple assets: {[m.name for m in matches]}") + return matches[0] + + raise ValueError("Provide either exact_name=... 
or name_regex=...") + + +def download_asset(asset: ReleaseAsset, out_path: Path, github_token: Optional[str] = None) -> Path: + out_path = Path(out_path) + out_path.parent.mkdir(parents=True, exist_ok=True) + + headers = { + "User-Agent": "openairframes-downloader/1.0", + "Accept": "application/octet-stream", + } + if github_token: + headers["Authorization"] = f"Bearer {github_token}" + + req = urllib.request.Request(asset.download_url, headers=headers, method="GET") + + try: + with urllib.request.urlopen(req, timeout=300) as resp, out_path.open("wb") as f: + # Stream download + while True: + chunk = resp.read(1024 * 1024) # 1 MiB + if not chunk: + break + f.write(chunk) + except urllib.error.HTTPError as e: + body = e.read().decode("utf-8", errors="replace") if hasattr(e, "read") else "" + raise RuntimeError(f"HTTPError {e.code} downloading {asset.name}: {body[:500]}") from e + + return out_path + + +def download_latest_aircraft_csv( + output_dir: Path = Path("downloads"), + github_token: Optional[str] = None, + repo: str = REPO, +) -> Path: + """ + Download the latest openairframes_faa_*.csv file from the latest GitHub release. 
+ + Args: + output_dir: Directory to save the downloaded file (default: "downloads") + github_token: Optional GitHub token for authentication + repo: GitHub repository in format "owner/repo" (default: REPO) + + Returns: + Path to the downloaded file + """ + assets = get_latest_release_assets(repo, github_token=github_token) + try: + asset = pick_asset(assets, name_regex=r"^openairframes_faa_.*\.csv$") + except FileNotFoundError: + # Fallback to old naming pattern + asset = pick_asset(assets, name_regex=r"^openairframes_\d{4}-\d{2}-\d{2}_.*\.csv$") + saved_to = download_asset(asset, output_dir / asset.name, github_token=github_token) + print(f"Downloaded: {asset.name} ({asset.size} bytes) -> {saved_to}") + return saved_to + +def get_latest_aircraft_faa_csv_df(): + csv_path = download_latest_aircraft_csv() + import pandas as pd + df = pd.read_csv(csv_path, dtype={'transponder_code': str, + 'unique_regulatory_id': str, + 'registrant_county': str}) + df = df.fillna("") + # Extract start date from filename pattern: openairframes_faa_{start_date}_{end_date}.csv + match = re.search(r"openairframes_faa_(\d{4}-\d{2}-\d{2})_", str(csv_path)) + if not match: + # Fallback to old naming pattern: openairframes_{start_date}_{end_date}.csv + match = re.search(r"openairframes_(\d{4}-\d{2}-\d{2})_", str(csv_path)) + if not match: + raise ValueError(f"Could not extract date from filename: {csv_path.name}") + + date_str = match.group(1) + return df, date_str + + +def download_latest_aircraft_adsb_csv( + output_dir: Path = Path("downloads"), + github_token: Optional[str] = None, + repo: str = REPO, +) -> Path: + """ + Download the latest openairframes_adsb_*.csv file from the latest GitHub release. 
+ + Args: + output_dir: Directory to save the downloaded file (default: "downloads") + github_token: Optional GitHub token for authentication + repo: GitHub repository in format "owner/repo" (default: REPO) + + Returns: + Path to the downloaded file + """ + assets = get_latest_release_assets(repo, github_token=github_token) + asset = pick_asset(assets, name_regex=r"^openairframes_adsb_.*\.csv$") + saved_to = download_asset(asset, output_dir / asset.name, github_token=github_token) + print(f"Downloaded: {asset.name} ({asset.size} bytes) -> {saved_to}") + return saved_to + + +def get_latest_aircraft_adsb_csv_df(): + csv_path = download_latest_aircraft_adsb_csv() + import pandas as pd + df = pd.read_csv(csv_path) + df = df.fillna("") + # Extract start date from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv + match = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path)) + if not match: + raise ValueError(f"Could not extract date from filename: {csv_path.name}") + + date_str = match.group(1) + return df, date_str + + +if __name__ == "__main__": + download_latest_aircraft_csv()