diff --git a/.github/workflows/faa-daily-release.yaml b/.github/workflows/planequery-aircraft-daily-release.yaml similarity index 68% rename from .github/workflows/faa-daily-release.yaml rename to .github/workflows/planequery-aircraft-daily-release.yaml index 2fee3ed..37309ca 100644 --- a/.github/workflows/faa-daily-release.yaml +++ b/.github/workflows/planequery-aircraft-daily-release.yaml @@ -1,4 +1,4 @@ -name: FAA daily snapshot + release +name: planequery-aircraft Daily Release on: schedule: @@ -29,18 +29,24 @@ jobs: python -m pip install --upgrade pip pip install -r requirements.txt - - name: Run snapshot script + - name: Run daily release script run: | - python src/snapshot_faa.py + python src/create_daily_planequery_aircraft_release.py ls -lah data/faa_releasable + ls -lah data/planequery_aircraft - name: Prepare release metadata id: meta run: | DATE=$(date -u +"%Y-%m-%d") - TAG="faa-${DATE}" + TAG="planequery-aircraft-${DATE}" + # Find the CSV file in data/planequery_aircraft matching the pattern + CSV_FILE=$(ls data/planequery_aircraft/planequery_aircraft_*_${DATE}.csv | head -1) + CSV_BASENAME=$(basename "$CSV_FILE") echo "date=$DATE" >> "$GITHUB_OUTPUT" echo "tag=$TAG" >> "$GITHUB_OUTPUT" + echo "csv_file=$CSV_FILE" >> "$GITHUB_OUTPUT" + echo "csv_basename=$CSV_BASENAME" >> "$GITHUB_OUTPUT" echo "name=FAA ReleasableAircraft snapshot ($DATE)" >> "$GITHUB_OUTPUT" - name: Create GitHub Release and upload assets @@ -52,10 +58,10 @@ jobs: Automated daily snapshot generated at 06:00 UTC for ${{ steps.meta.outputs.date }}. 
# .github/workflows/process-historical-faa.yaml
#
# Manually triggered backfill: splits the historical period into 4-day
# chunks, converts each chunk with src/get_historical_faa.py in a matrix job,
# then publishes the per-chunk CSVs and one concatenated CSV as releases.
name: Process Historical FAA Data

on:
  workflow_dispatch:  # Manual trigger

jobs:
  generate-matrix:
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
      - name: Generate date ranges
        id: set-matrix
        run: |
          python3 << 'EOF'
          import json
          import os
          from datetime import datetime, timedelta

          start = datetime(2023, 8, 16)
          end = datetime(2026, 1, 1)

          ranges = []
          current = start

          # Process in 4-day chunks
          while current < end:
              # Don't go past the end date
              chunk_end = min(current + timedelta(days=4), end)

              ranges.append({
                  "since": current.strftime("%Y-%m-%d"),
                  "until": chunk_end.strftime("%Y-%m-%d")
              })

              current = chunk_end

          # The `::set-output` workflow command is deprecated; step outputs
          # are appended to the file named by $GITHUB_OUTPUT instead.
          with open(os.environ["GITHUB_OUTPUT"], "a", encoding="utf-8") as fh:
              fh.write(f"matrix={json.dumps(ranges)}\n")
          EOF

  clone-faa-repo:
    runs-on: ubuntu-latest
    steps:
      - name: Cache FAA repository
        id: cache-faa-repo
        uses: actions/cache@v4
        with:
          path: data/scrape-faa-releasable-aircraft
          key: faa-repo-v1

      - name: Clone FAA repository
        if: steps.cache-faa-repo.outputs.cache-hit != 'true'
        run: |
          mkdir -p data
          git clone https://github.com/simonw/scrape-faa-releasable-aircraft data/scrape-faa-releasable-aircraft
          echo "Repository cloned successfully"

  process-chunk:
    needs: [generate-matrix, clone-faa-repo]
    runs-on: ubuntu-latest
    strategy:
      max-parallel: 5  # Process 5 chunks at a time
      matrix:
        range: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Restore FAA repository cache
        uses: actions/cache/restore@v4
        with:
          path: data/scrape-faa-releasable-aircraft
          key: faa-repo-v1
          fail-on-cache-miss: true

      - name: Install dependencies
        run: |
          pip install -r requirements.txt

      - name: Process chunk ${{ matrix.range.since }} to ${{ matrix.range.until }}
        run: |
          python src/get_historical_faa.py "${{ matrix.range.since }}" "${{ matrix.range.until }}"

      - name: Upload CSV artifact
        uses: actions/upload-artifact@v4
        with:
          name: csv-${{ matrix.range.since }}-to-${{ matrix.range.until }}
          path: data/faa_releasable_historical/*.csv
          retention-days: 1

  create-release:
    needs: process-chunk
    runs-on: ubuntu-latest
    permissions:
      contents: write
    steps:
      - name: Download all artifacts
        uses: actions/download-artifact@v4
        with:
          path: artifacts

      - name: Prepare release files
        run: |
          mkdir -p release-files
          find artifacts -name "*.csv" -exec cp {} release-files/ \;
          ls -lh release-files/

      - name: Create Release
        uses: softprops/action-gh-release@v1
        with:
          tag_name: historical-faa-${{ github.run_number }}
          name: Historical FAA Data Release ${{ github.run_number }}
          body: |
            Automated release of historical FAA aircraft data
            Processing period: 2023-08-16 to 2026-01-01
            Generated: ${{ github.event.repository.updated_at }}
          files: release-files/*.csv
          draft: false
          prerelease: false

  concatenate-and-release:
    needs: process-chunk
    runs-on: ubuntu-latest
    permissions:
      contents: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Install dependencies
        run: |
          pip install -r requirements.txt

      - name: Download all artifacts
        uses: actions/download-artifact@v4
        with:
          path: artifacts

      - name: Prepare CSVs for concatenation
        run: |
          # src/concat_csvs.py reads its inputs from data/concat when run as
          # a script, so the chunk CSVs have to be staged there.
          mkdir -p data/concat
          find artifacts -name "*.csv" -exec cp {} data/concat/ \;
          ls -lh data/concat/

      - name: Concatenate all CSVs
        run: |
          # The script lives under src/ (there is no scripts/ directory in
          # this change set).
          python src/concat_csvs.py

      - name: Create Combined Release
        uses: softprops/action-gh-release@v1
        with:
          tag_name: historical-faa-combined-${{ github.run_number }}
          name: Historical FAA Data Combined Release ${{ github.run_number }}
          body: |
            Combined historical FAA aircraft data (all chunks concatenated)
            Processing period: 2023-08-16 to 2026-01-01
            Generated: ${{ github.event.repository.updated_at }}
          files: data/planequery_aircraft/*.csv
          draft: false
          prerelease: false
# unique_regulatory_id
# 1. read historical and output
# 2. read sequentially

# Instead of reading all csvs I can read just the latest release csv to get everything.

from pathlib import Path

# Scratch script: walks the per-day directories for February 2024 and converts
# each day's Master.txt into a CSV.
#
# NOTE(review): `master_txt_to_releasable_csv` is neither defined nor imported
# in this file, so the loop raises NameError as soon as a Master.txt is found.
# Confirm which converter was intended before running this.
base = Path("data/faa_releasable_historical")
for day_dir in sorted(base.glob("2024-02-*")):
    master = day_dir / "Master.txt"
    if master.exists():
        out_csv = master_txt_to_releasable_csv(master, out_dir="data/faa_releasable_historical_csv")
        print(day_dir.name, "->", out_csv)
from pathlib import Path
import re

import pandas as pd

from derive_from_faa_master_txt import concat_faa_historical_df

# Columns forced to `str` on load so pandas does not infer a numeric dtype
# for them.
_CSV_DTYPES = {
    'transponder_code': str,
    'unique_regulatory_id': str,
    'registrant_county': str,
}


def _read_aircraft_csv(csv_path: Path) -> pd.DataFrame:
    """Load one chunk CSV with the string-typed columns pinned."""
    return pd.read_csv(csv_path, dtype=_CSV_DTYPES)


def concatenate_aircraft_csvs(
    input_dir: Path = Path("data/concat"),
    output_dir: Path = Path("data/planequery_aircraft"),
    filename_pattern: str = r"planequery_aircraft_(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})\.csv"
):
    """
    Read all CSVs matching the pattern from input_dir in date order,
    fold them together with concat_faa_historical_df, and write a single CSV.

    Args:
        input_dir: Directory containing the CSV files to concatenate
        output_dir: Directory where the output CSV will be saved
        filename_pattern: Regex with two groups (start date, end date) that
            is searched for in each CSV file name

    Returns:
        Path of the CSV that was written. Its name uses the start date of the
        first file and the end date of the last file.

    Raises:
        FileNotFoundError: No file in input_dir matches filename_pattern.
        AssertionError: The combined download_date column is not
            monotonically increasing.
    """
    input_dir = Path(input_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Collect (start_date, end_date, path) for every matching file.
    pattern = re.compile(filename_pattern)
    csv_files = []
    for csv_path in sorted(input_dir.glob("*.csv")):
        match = pattern.search(csv_path.name)
        if match:
            csv_files.append((match.group(1), match.group(2), csv_path))

    # ISO dates sort chronologically as plain strings.
    csv_files.sort(key=lambda item: (item[0], item[1]))

    if not csv_files:
        raise FileNotFoundError(f"No CSV files matching pattern found in {input_dir}")

    print(f"Found {len(csv_files)} CSV files to concatenate")

    # The earliest file is the base; later files are folded in one by one.
    first_start_date, _, first_path = csv_files[0]
    print(f"Reading base file: {first_path.name}")
    df_base = _read_aircraft_csv(first_path)

    for _, _, csv_path in csv_files[1:]:
        print(f"Concatenating: {csv_path.name}")
        df_base = concat_faa_historical_df(df_base, _read_aircraft_csv(csv_path))

    # Raised explicitly (not via `assert`) so the check survives `python -O`.
    if not df_base['download_date'].is_monotonic_increasing:
        raise AssertionError("download_date is not monotonic increasing")

    # Output filename uses first start date and last end date
    _, last_end_date, _ = csv_files[-1]
    output_filename = f"planequery_aircraft_{first_start_date}_{last_end_date}.csv"
    output_path = output_dir / output_filename

    print(f"Writing output to: {output_path}")
    df_base.to_csv(output_path, index=False)
    print(f"Successfully concatenated {len(csv_files)} files into {output_filename}")
    print(f"Total rows: {len(df_base)}")

    return output_path


if __name__ == "__main__":
    # Example usage - modify these paths as needed
    concatenate_aircraft_csvs(
        input_dir=Path("data/concat"),
        output_dir=Path("data/planequery_aircraft")
    )
"""Build today's planequery_aircraft CSV.

Downloads today's FAA ReleasableAircraft.zip (unless it is already on disk),
converts it to a DataFrame, appends it to the CSV from the latest GitHub
release, and writes the combined CSV to data/planequery_aircraft.
"""
from datetime import datetime, timezone
from pathlib import Path
from urllib.request import Request, urlopen

from derive_from_faa_master_txt import convert_faa_master_txt_to_df, concat_faa_historical_df
from get_latest_planequery_aircraft_release import get_latest_aircraft_csv_df

date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")

out_dir = Path("data/faa_releasable")
out_dir.mkdir(parents=True, exist_ok=True)
zip_name = f"ReleasableAircraft_{date_str}.zip"

zip_path = out_dir / zip_name
if not zip_path.exists():
    url = "https://registry.faa.gov/database/ReleasableAircraft.zip"

    req = Request(
        url,
        headers={"User-Agent": "Mozilla/5.0"},
        method="GET",
    )

    # Write to a sibling ".part" file and rename on success. Writing straight
    # to zip_path could leave a truncated archive behind, which the exists()
    # check above would then reuse on the next run.
    tmp_path = zip_path.with_name(zip_name + ".part")
    with urlopen(req, timeout=120) as r:
        tmp_path.write_bytes(r.read())
    tmp_path.replace(zip_path)

OUT_ROOT = Path("data/planequery_aircraft")
OUT_ROOT.mkdir(parents=True, exist_ok=True)

df_new = convert_faa_master_txt_to_df(zip_path, date_str)
# start_date_str is the first date in the latest release's file name; it is
# carried over so the new file name keeps the same start date.
df_base, start_date_str = get_latest_aircraft_csv_df()
df_base = concat_faa_historical_df(df_base, df_new)

# Raised explicitly (not via `assert`) so the check survives `python -O`.
if not df_base['download_date'].is_monotonic_increasing:
    raise AssertionError("download_date is not monotonic increasing")

df_base.to_csv(OUT_ROOT / f"planequery_aircraft_{start_date_str}_{date_str}.csv", index=False)
from pathlib import Path
import zipfile
import pandas as pd
from faa_aircraft_registry import read


def _flatten_dict_column(df: pd.DataFrame, column: str, prefix: str) -> pd.DataFrame:
    """Replace dict-valued `column` with one `prefix`-named column per key."""
    # Missing entries are swapped for {} before flattening, the same guard
    # used for every nested column.
    values = df[column].where(df[column].notna(), {})
    flattened = pd.json_normalize(values).add_prefix(prefix)
    return df.drop(columns=column).join(flattened)


def convert_faa_master_txt_to_df(zip_path: Path, date: str) -> pd.DataFrame:
    """Convert an FAA ReleasableAircraft zip into a flat DataFrame.

    Args:
        zip_path: Path to the zip archive handed to `faa_aircraft_registry.read`.
        date: Value stored in the `download_date` column of every row.

    Returns:
        One row per entry of the parsed `master` table, with the nested
        `registrant`, `aircraft`, `engine` and `certification` records
        expanded into prefixed columns, a derived `planequery_airframe_id`
        column, and every NaN replaced by an empty string.
    """
    with zipfile.ZipFile(zip_path) as z:
        registrations = read(z)

    df = pd.DataFrame(registrations['master'].values())

    df.insert(0, "download_date", date)

    df = _flatten_dict_column(df, "registrant", "registrant_")

    # Place transponder_code_hex at position 1, directly after download_date.
    cols = df.columns.tolist()
    cols.remove("transponder_code_hex")
    cols.insert(1, "transponder_code_hex")
    df = df[cols]

    # The top-level *_type columns are renamed before flattening.
    # NOTE(review): presumably to avoid clashing with the flattened
    # "aircraft_type" / "engine_type" columns - confirm against the parser.
    df = df.rename(columns={"aircraft_type": "aircraft_type_2"})
    df = _flatten_dict_column(df, "aircraft", "aircraft_")
    df = df.rename(columns={"engine_type": "engine_type_2"})
    df = _flatten_dict_column(df, "engine", "engine_")
    # The prefix is "certificate_", not "certification_".
    df = _flatten_dict_column(df, "certification", "certificate_")

    # Create planequery_airframe_id from normalized manufacturer|model|serial.
    df["planequery_airframe_id"] = (
        normalize(df["aircraft_manufacturer"])
        + "|"
        + normalize(df["aircraft_model"])
        + "|"
        + normalize(df["serial_number"])
    )

    # Move planequery_airframe_id to come after registration_number
    cols = df.columns.tolist()
    cols.remove("planequery_airframe_id")
    reg_idx = cols.index("registration_number")
    cols.insert(reg_idx + 1, "planequery_airframe_id")
    df = df[cols]

    # Convert all NaN to empty strings
    df = df.fillna("")

    return df
def normalize(s: pd.Series) -> pd.Series:
    """Return an upper-cased copy of `s` keeping only word characters and "-".

    Missing values become empty strings. Whitespace is collapsed first, and
    the final substitution then removes every character that is neither a
    word character nor a hyphen (which includes the remaining spaces).
    """
    return (
        s.fillna("")
        .astype(str)
        .str.upper()
        .str.strip()
        # collapse whitespace
        .str.replace(r"\s+", " ", regex=True)
        # remove characters that cause false mismatches
        .str.replace(r"[^\w\-]", "", regex=True)
    )


def concat_faa_historical_df(df_base, df_new):
    """Append `df_new` to `df_base`, dropping rows whose content already exists.

    Two rows count as duplicates when every column except `download_date`
    has the same normalized value; the earliest occurrence is kept. The
    normalization treats "", "nan" and NaN as empty, compares lists (or their
    string representation) regardless of element order, and compares numbers
    by value ("007", 7 and "7.0" are equal).

    Args:
        df_base: Rows accumulated so far. A frame without columns is treated
            as "no history yet" and the result is built from `df_new` alone.
        df_new: Rows to append. Must contain every column of `df_base`;
            columns missing from `df_base` are dropped.

    Returns:
        A new DataFrame in `df_base` column order. Neither input is modified.

    Raises:
        KeyError: `df_new` lacks a column that `df_base` has.
    """
    # Local import: this block sits in the middle of the module.
    import ast

    def fingerprint_value(val) -> str:
        """Map one cell to the canonical string used for comparison."""
        # Handle lists (sort them for consistent comparison)
        if isinstance(val, list):
            return "|".join(sorted(str(v) for v in val))

        text = str(val).strip()
        if text in ("", "nan"):
            return ""

        # A list that was stringified, e.g. "['a', 'b']".
        if text.startswith('[') and text.endswith(']'):
            try:
                parsed = ast.literal_eval(text)
            except (ValueError, SyntaxError):
                parsed = None  # Not a valid literal, fall through
            if isinstance(parsed, list):
                return "|".join(sorted(str(v) for v in parsed))

        # Integer literals are parsed exactly. Going through float would
        # merge distinct integers above 2**53.
        try:
            return str(int(text))
        except ValueError:
            pass

        try:
            number = float(text)
            # Whole-valued floats are written without the trailing ".0".
            if number == int(number):
                return str(int(number))
            return str(number)
        except (ValueError, OverflowError):
            # Not a number (or nan/inf): compare as plain text.
            return text

    if len(df_base.columns) == 0:
        combined = df_new.reset_index(drop=True)
    else:
        combined = pd.concat([df_base, df_new[df_base.columns]], ignore_index=True)

    content_cols = [c for c in combined.columns if c != "download_date"]

    # The fingerprint is a separate Series, so no helper column is added.
    fingerprint = (
        combined[content_cols]
        .apply(lambda col: col.apply(fingerprint_value), axis=0)
        .apply(lambda row: "|".join(row), axis=1)
    )
    return combined[~fingerprint.duplicated(keep="first")]
"""
Rebuild historical FAA snapshots from the scrape-faa-releasable-aircraft
git history and fold them into one CSV.

Usage: python src/get_historical_faa.py SINCE UNTIL   (dates as YYYY-MM-DD)

For each commit-day in the requested range (last commit per day):
- Write the auxiliary FAA text files from that commit into
  data/faa_releasable_historical/YYYY-MM-DD/
    ACFTREF.txt, DEALER.txt, DOCINDEX.txt, ENGINE.txt, RESERVED.txt
- Recombine MASTER-*.txt into MASTER.txt
- Zip the day's files and convert the zip with convert_faa_master_txt_to_df
- Append the result to the running DataFrame and delete the day directory

The combined frame is written to
data/faa_releasable_historical/planequery_aircraft_<first day>_<last day>.csv

Assumes the non-master files are present in every commit.
"""
import argparse
import re
import shutil
import subprocess
import zipfile
from collections import OrderedDict
from datetime import datetime, timedelta
from pathlib import Path

import pandas as pd

from derive_from_faa_master_txt import convert_faa_master_txt_to_df, concat_faa_historical_df

# Parse command line arguments
parser = argparse.ArgumentParser(description="Process historical FAA data from git commits")
parser.add_argument("since", help="Start date (YYYY-MM-DD)")
parser.add_argument("until", help="End date (YYYY-MM-DD)")
args = parser.parse_args()

# Existing clone of the scraper repository; this script does not clone it.
REPO = Path("data/scrape-faa-releasable-aircraft")
OUT_ROOT = Path("data/faa_releasable_historical")
OUT_ROOT.mkdir(parents=True, exist_ok=True)


def run_git_text(*git_args: str) -> str:
    """Run `git -C REPO <git_args>` and return its stripped text output."""
    return subprocess.check_output(["git", "-C", str(REPO), *git_args], text=True).strip()


def run_git_bytes(*git_args: str) -> bytes:
    """Run `git -C REPO <git_args>` and return its raw output bytes."""
    return subprocess.check_output(["git", "-C", str(REPO), *git_args])


# Parse dates and adjust --since to the day before
# NOTE(review): presumably so commits made on the `since` day itself are not
# cut off by git's date handling - confirm; it can also pull in the day before.
since_date = datetime.strptime(args.since, "%Y-%m-%d")
adjusted_since = (since_date - timedelta(days=1)).strftime("%Y-%m-%d")

# All commits in specified date range (oldest -> newest)
log = run_git_text(
    "log",
    "--reverse",
    "--format=%H %cs",
    f"--since={adjusted_since}",
    f"--until={args.until}",
)
lines = [ln for ln in log.splitlines() if ln.strip()]
if not lines:
    raise SystemExit(f"No commits found between {args.since} and {args.until}.")

# date -> last SHA that day (later commits overwrite earlier ones)
date_to_sha = OrderedDict()
for ln in lines:
    sha, date = ln.split()
    date_to_sha[date] = sha

OTHER_FILES = ["ACFTREF.txt", "DEALER.txt", "DOCINDEX.txt", "ENGINE.txt", "RESERVED.txt"]
master_re = re.compile(r"^MASTER-(\d+)\.txt$")

df_base = pd.DataFrame()
start_date = None
end_date = None
for date, sha in date_to_sha.items():
    if start_date is None:
        start_date = date
    end_date = date
    day_dir = OUT_ROOT / date
    day_dir.mkdir(parents=True, exist_ok=True)

    # Write auxiliary files (assumed present)
    for fname in OTHER_FILES:
        (day_dir / fname).write_bytes(run_git_bytes("show", f"{sha}:{fname}"))

    # Recombine MASTER parts in numeric order
    names = run_git_text("ls-tree", "--name-only", sha).splitlines()
    parts = sorted(
        (int(m.group(1)), n)
        for n in names
        if (m := master_re.match(n))
    )
    if not parts:
        raise RuntimeError(f"{date} {sha[:7]}: no MASTER-*.txt parts found")

    master_path = day_dir / "MASTER.txt"
    with master_path.open("wb") as w:
        for _, fname in parts:
            data = run_git_bytes("show", f"{sha}:{fname}")
            w.write(data)
            # Keep the parts line-separated when a part lacks a final newline.
            if data and not data.endswith(b"\n"):
                w.write(b"\n")

    # Zip the day's files. The archive lives in the same directory, so it
    # must be skipped or it would be added to itself while being written.
    zip_path = day_dir / "ReleasableAircraft.zip"
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as z:
        for p in day_dir.iterdir():
            if p != zip_path:
                z.write(p, arcname=p.name)

    print(f"{date} {sha[:7]} -> {day_dir} (master parts: {len(parts)})")

    # Convert ZIP -> DataFrame and fold it into the running result
    df_new = convert_faa_master_txt_to_df(zip_path, date)
    if df_base.empty:
        df_base = df_new
    else:
        df_base = concat_faa_historical_df(df_base, df_new)

    # The extracted files are only needed for the conversion above.
    shutil.rmtree(day_dir)
    print(len(df_base), "total entries so far")

# Raised explicitly (not via `assert`) so the check survives `python -O`.
if not df_base['download_date'].is_monotonic_increasing:
    raise AssertionError("download_date is not monotonic increasing")

df_base.to_csv(OUT_ROOT / f"planequery_aircraft_{start_date}_{end_date}.csv", index=False)
# TODO: get average number of new rows per day.
from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, Optional
import re
import urllib.request
import urllib.error
import json


REPO = "PlaneQuery/planequery-aircraft"
LATEST_RELEASE_URL = f"https://api.github.com/repos/{REPO}/releases/latest"


@dataclass(frozen=True)
class ReleaseAsset:
    """One downloadable file attached to a GitHub release."""

    name: str
    download_url: str
    size: int  # bytes


def _http_get_json(url: str, headers: dict[str, str]) -> dict:
    """GET `url` with `headers` and decode the UTF-8 JSON response body."""
    req = urllib.request.Request(url, headers=headers, method="GET")
    with urllib.request.urlopen(req, timeout=120) as resp:
        data = resp.read()
    return json.loads(data.decode("utf-8"))


def get_latest_release_assets(repo: str = REPO, github_token: Optional[str] = None) -> list[ReleaseAsset]:
    """List the assets of the latest release of `repo`.

    Args:
        repo: GitHub repository in "owner/repo" form.
        github_token: Optional token sent as a Bearer Authorization header.

    Returns:
        One ReleaseAsset per entry of the response's "assets" list (empty
        when the key is absent).
    """
    url = f"https://api.github.com/repos/{repo}/releases/latest"
    headers = {
        "Accept": "application/vnd.github+json",
        "User-Agent": "planequery-aircraft-downloader/1.0",
    }
    if github_token:
        headers["Authorization"] = f"Bearer {github_token}"

    payload = _http_get_json(url, headers=headers)
    return [
        ReleaseAsset(
            name=a["name"],
            download_url=a["browser_download_url"],
            size=int(a.get("size", 0)),
        )
        for a in payload.get("assets", [])
    ]


def pick_asset(
    assets: Iterable[ReleaseAsset],
    *,
    exact_name: Optional[str] = None,
    name_regex: Optional[str] = None,
) -> ReleaseAsset:
    """Select a single asset by exact name or by regex.

    `exact_name` takes precedence when both selectors are given.

    Raises:
        FileNotFoundError: Nothing matches the selector.
        FileExistsError: `name_regex` matches more than one asset.
        ValueError: Neither selector was provided.
    """
    assets = list(assets)

    if exact_name:
        for a in assets:
            if a.name == exact_name:
                return a
        raise FileNotFoundError(f"No asset exactly named {exact_name!r}. Available: {[a.name for a in assets]}")

    if name_regex:
        rx = re.compile(name_regex)
        matches = [a for a in assets if rx.search(a.name)]
        if not matches:
            raise FileNotFoundError(f"No asset matched regex {name_regex!r}. Available: {[a.name for a in assets]}")
        if len(matches) > 1:
            raise FileExistsError(f"Regex {name_regex!r} matched multiple assets: {[m.name for m in matches]}")
        return matches[0]

    raise ValueError("Provide either exact_name=... or name_regex=...")


def download_asset(asset: ReleaseAsset, out_path: Path, github_token: Optional[str] = None) -> Path:
    """Stream `asset` to `out_path` and return that path.

    The data is written to a sibling "<name>.part" file that is renamed onto
    `out_path` only after the transfer finishes, so a failed download never
    leaves a truncated file at the destination.

    Raises:
        RuntimeError: The server answered with an HTTP error status.
    """
    out_path = Path(out_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    headers = {
        "User-Agent": "planequery-aircraft-downloader/1.0",
        "Accept": "application/octet-stream",
    }
    if github_token:
        headers["Authorization"] = f"Bearer {github_token}"

    req = urllib.request.Request(asset.download_url, headers=headers, method="GET")
    tmp_path = out_path.with_name(out_path.name + ".part")

    try:
        with urllib.request.urlopen(req, timeout=300) as resp, tmp_path.open("wb") as f:
            # Stream download in 1 MiB chunks
            while chunk := resp.read(1024 * 1024):
                f.write(chunk)
        tmp_path.replace(out_path)
    except urllib.error.HTTPError as e:
        body = e.read().decode("utf-8", errors="replace") if hasattr(e, "read") else ""
        raise RuntimeError(f"HTTPError {e.code} downloading {asset.name}: {body[:500]}") from e
    finally:
        # No-op after a successful rename; removes the partial file otherwise.
        tmp_path.unlink(missing_ok=True)

    return out_path


def download_latest_aircraft_csv(
    output_dir: Path = Path("downloads"),
    github_token: Optional[str] = None,
    repo: str = REPO,
) -> Path:
    """
    Download the latest planequery_aircraft_*.csv file from the latest GitHub release.

    Args:
        output_dir: Directory to save the downloaded file (default: "downloads")
        github_token: Optional GitHub token for authentication
        repo: GitHub repository in format "owner/repo" (default: REPO)

    Returns:
        Path to the downloaded file
    """
    assets = get_latest_release_assets(repo, github_token=github_token)
    asset = pick_asset(assets, name_regex=r"^planequery_aircraft_.*\.csv$")
    saved_to = download_asset(asset, output_dir / asset.name, github_token=github_token)
    print(f"Downloaded: {asset.name} ({asset.size} bytes) -> {saved_to}")
    return saved_to


def get_latest_aircraft_csv_df():
    """Download the latest release CSV and load it as a DataFrame.

    Returns:
        (df, start_date): the CSV contents with NaN replaced by "", and the
        first YYYY-MM-DD date found in the file name.

    Raises:
        ValueError: The file name does not contain the expected date.
    """
    csv_path = download_latest_aircraft_csv()
    # Imported here so the download helpers stay usable without pandas.
    import pandas as pd
    df = pd.read_csv(csv_path, dtype={'transponder_code': str,
                                      'unique_regulatory_id': str,
                                      'registrant_county': str})
    df = df.fillna("")
    # Extract date from filename pattern: planequery_aircraft_{date}_{date}.csv
    # Only the file name is searched, matching what the error message reports.
    match = re.search(r"planequery_aircraft_(\d{4}-\d{2}-\d{2})_", csv_path.name)
    if not match:
        raise ValueError(f"Could not extract date from filename: {csv_path.name}")

    date_str = match.group(1)
    return df, date_str


if __name__ == "__main__":
    download_latest_aircraft_csv()
"Mozilla/5.0"}, - method="GET", -) - -with urlopen(req, timeout=120) as r: - body = r.read() - zip_path.write_bytes(body) - -with zipfile.ZipFile(zip_path) as z: - registrations = read(z) - -df = pd.DataFrame(registrations['master'].values()) -col = "transponder_code_hex" -df = df[[col] + [c for c in df.columns if c != col]] -df = df.rename(columns={"transponder_code_hex": "icao"}) -registrant = pd.json_normalize(df["registrant"]).add_prefix("registrant_") -df = df.drop(columns="registrant").join(registrant) -df = df.rename(columns={"aircraft_type": "aircraft_type_2"}) -aircraft = pd.json_normalize(df["aircraft"]).add_prefix("aircraft_") -df = df.drop(columns="aircraft").join(aircraft) -df = df.rename(columns={"engine_type": "engine_type_2"}) -engine = pd.json_normalize(df["engine"].where(df["engine"].notna(), {})).add_prefix("engine_") -df = df.drop(columns="engine").join(engine) -df = df.sort_values(by=["icao"]) -df.to_csv(csv_path, index=False) - diff --git a/ui/.gitkeep b/ui/.gitkeep new file mode 100644 index 0000000..e69de29