Compare commits

...

32 Commits

Author SHA1 Message Date
ggman12 65d9417cb2 put back csv_path 2026-02-02 20:28:58 -05:00
ggman12 87e37df035 handle duplictes much better 2026-02-02 20:28:09 -05:00
ggman12 250ac98e66 NaN to empty string 2026-02-02 20:04:55 -05:00
ggman12 44e2af3239 udpate to use latest_aircraft_csv_download 2026-02-02 19:05:08 -05:00
ggman12 7abd1c2169 update workflow 2026-02-02 19:03:14 -05:00
ggman12 6d1be9dc66 add ui folder 2026-02-02 18:57:51 -05:00
ggman12 6fcf179fc4 move concat_csvs 2026-02-02 18:57:47 -05:00
ggman12 f3cca9037b adjust --since date to one day earlier for accurate commit range 2026-02-01 23:40:57 -05:00
ggman12 725a42d075 change days to 5 2026-02-01 22:37:57 -05:00
ggman12 cc186f3707 update 2026-02-01 22:33:55 -05:00
ggman12 2f41c6e954 update to 1 day chunk 2026-02-01 22:31:49 -05:00
ggman12 6a9f81a8bf remove out_csv 2026-02-01 22:11:30 -05:00
ggman12 960661eebd add community folder 2026-02-01 21:51:09 -05:00
ggman12 1ea839669c add concat csvs 2026-02-01 21:47:07 -05:00
ggman12 2763e923fc use cache from github workflows 2026-02-01 21:01:31 -05:00
ggman12 70cc0ab490 cache the repisitory 2026-02-01 20:58:51 -05:00
ggman12 d333f8a8e4 add .github/workflows/process-historical-faa.yaml 2026-02-01 20:50:15 -05:00
ggman12 01024c4695 fix args 2026-02-01 20:40:37 -05:00
ggman12 f2a64fc34c rename to src/create_daily_planequery_aircraft_release.py 2026-02-01 20:38:51 -05:00
ggman12 a2652fe64c feat: add functionality to download latest aircraft CSV from GitHub releases 2026-02-01 20:37:33 -05:00
ggman12 c2e174f22c Works. appending csv that already exisits. 2026-02-01 20:33:22 -05:00
ggman12 2c7c1a713a feat: implement concat_faa_historical_df function for deduplication and concatenation of historical FAA data 2026-02-01 19:39:41 -05:00
ggman12 5c7cdf12b1 fix: change command line arguments from optional to positional for date inputs 2026-02-01 19:08:31 -05:00
ggman12 4e22b19786 add cli arguments 2026-02-01 19:03:05 -05:00
ggman12 aedd7b3df5 fix: update repository path and improve cloning logic in FAA data retrieval 2026-02-01 19:01:42 -05:00
ggman12 60869bc178 feat: add download date and restructure columns in FAA master conversion 2026-02-01 19:01:37 -05:00
ggman12 66a1108ac9 belive this works. 2026-02-01 18:37:59 -05:00
ggman12 d1f5ab693b add download date, change days 2026-02-01 15:15:25 -05:00
ggman12 efe63743ab fix: handle missing aircraft and engine data in conversion process
feat: add combine_historical_faa.py to process historical FAA data
2026-02-01 14:44:27 -05:00
ggman12 8368bfcbc9 update name to Master_ .csv 2026-02-01 14:14:29 -05:00
ggman12 e701f424a2 moidfy src/get_historical_faa.py 2026-02-01 14:07:37 -05:00
ggman12 34a8cf1324 refactor: move FAA data conversion logic to a separate function 2026-02-01 14:06:26 -05:00
11 changed files with 676 additions and 87 deletions
@@ -1,4 +1,4 @@
name: FAA daily snapshot + release name: planequery-aircraft Daily Release
on: on:
schedule: schedule:
@@ -29,18 +29,24 @@ jobs:
python -m pip install --upgrade pip python -m pip install --upgrade pip
pip install -r requirements.txt pip install -r requirements.txt
- name: Run snapshot script - name: Run daily release script
run: | run: |
python src/snapshot_faa.py python src/create_daily_planequery_aircraft_release.py
ls -lah data/faa_releasable ls -lah data/faa_releasable
ls -lah data/planequery_aircraft
- name: Prepare release metadata - name: Prepare release metadata
id: meta id: meta
run: | run: |
DATE=$(date -u +"%Y-%m-%d") DATE=$(date -u +"%Y-%m-%d")
TAG="faa-${DATE}" TAG="planequery-aircraft-${DATE}"
# Find the CSV file in data/planequery_aircraft matching the pattern
CSV_FILE=$(ls data/planequery_aircraft/planequery_aircraft_*_${DATE}.csv | head -1)
CSV_BASENAME=$(basename "$CSV_FILE")
echo "date=$DATE" >> "$GITHUB_OUTPUT" echo "date=$DATE" >> "$GITHUB_OUTPUT"
echo "tag=$TAG" >> "$GITHUB_OUTPUT" echo "tag=$TAG" >> "$GITHUB_OUTPUT"
echo "csv_file=$CSV_FILE" >> "$GITHUB_OUTPUT"
echo "csv_basename=$CSV_BASENAME" >> "$GITHUB_OUTPUT"
echo "name=FAA ReleasableAircraft snapshot ($DATE)" >> "$GITHUB_OUTPUT" echo "name=FAA ReleasableAircraft snapshot ($DATE)" >> "$GITHUB_OUTPUT"
- name: Create GitHub Release and upload assets - name: Create GitHub Release and upload assets
@@ -52,10 +58,10 @@ jobs:
Automated daily snapshot generated at 06:00 UTC for ${{ steps.meta.outputs.date }}. Automated daily snapshot generated at 06:00 UTC for ${{ steps.meta.outputs.date }}.
Assets: Assets:
- ReleasableAircraft_${{ steps.meta.outputs.date }}.csv - ${{ steps.meta.outputs.csv_basename }}
- ReleasableAircraft_${{ steps.meta.outputs.date }}.zip - ReleasableAircraft_${{ steps.meta.outputs.date }}.zip
files: | files: |
data/faa_releasable/ReleasableAircraft_${{ steps.meta.outputs.date }}.csv ${{ steps.meta.outputs.csv_file }}
data/faa_releasable/ReleasableAircraft_${{ steps.meta.outputs.date }}.zip data/faa_releasable/ReleasableAircraft_${{ steps.meta.outputs.date }}.zip
env: env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -0,0 +1,171 @@
name: Process Historical FAA Data
on:
workflow_dispatch: # Manual trigger
jobs:
# Job: compute the list of {since, until} date ranges that process-chunk fans out over.
generate-matrix:
  runs-on: ubuntu-latest
  outputs:
    matrix: ${{ steps.set-matrix.outputs.matrix }}
  steps:
    - name: Generate date ranges
      id: set-matrix
      # The matrix is published as a step output by appending `matrix=<json>` to the
      # file named by $GITHUB_OUTPUT; the older `::set-output` workflow command is deprecated.
      run: |
        python3 << 'EOF'
        import json
        import os
        from datetime import datetime, timedelta

        start = datetime(2023, 8, 16)
        end = datetime(2026, 1, 1)
        ranges = []
        current = start
        # Process in 4-day chunks
        while current < end:
            # Don't go past the end date
            chunk_end = min(current + timedelta(days=4), end)
            ranges.append({
                "since": current.strftime("%Y-%m-%d"),
                "until": chunk_end.strftime("%Y-%m-%d")
            })
            current = chunk_end

        # json.dumps emits a single line, so the simple name=value form is sufficient.
        with open(os.environ["GITHUB_OUTPUT"], "a", encoding="utf-8") as fh:
            fh.write(f"matrix={json.dumps(ranges)}\n")
        EOF
# Job: make sure a clone of the upstream scrape repository exists under the cache
# key that process-chunk later restores.
clone-faa-repo:
runs-on: ubuntu-latest
steps:
- name: Cache FAA repository
id: cache-faa-repo
uses: actions/cache@v4
with:
path: data/scrape-faa-releasable-aircraft
# Constant key; the restore step in process-chunk uses the same value.
# NOTE(review): with a constant key a cached clone is presumably reused until the
# key is bumped - confirm that is intended.
key: faa-repo-v1
- name: Clone FAA repository
# Only clone when the cache step above did not report a hit.
if: steps.cache-faa-repo.outputs.cache-hit != 'true'
run: |
mkdir -p data
git clone https://github.com/simonw/scrape-faa-releasable-aircraft data/scrape-faa-releasable-aircraft
echo "Repository cloned successfully"
# Job: run src/get_historical_faa.py once per {since, until} entry of the matrix
# produced by generate-matrix, and upload the CSVs it leaves in
# data/faa_releasable_historical as a per-chunk artifact.
process-chunk:
needs: [generate-matrix, clone-faa-repo]
runs-on: ubuntu-latest
strategy:
max-parallel: 5 # Process 5 chunks at a time
matrix:
# The matrix value is the JSON list emitted by the generate-matrix job.
range: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.12'
- name: Restore FAA repository cache
uses: actions/cache/restore@v4
with:
path: data/scrape-faa-releasable-aircraft
# Same key that clone-faa-repo saves under.
key: faa-repo-v1
# Abort instead of continuing without the cloned repository.
fail-on-cache-miss: true
- name: Install dependencies
run: |
pip install -r requirements.txt
- name: Process chunk ${{ matrix.range.since }} to ${{ matrix.range.until }}
run: |
python src/get_historical_faa.py "${{ matrix.range.since }}" "${{ matrix.range.until }}"
- name: Upload CSV artifact
uses: actions/upload-artifact@v4
with:
# One artifact per date range; the release jobs download all of them.
name: csv-${{ matrix.range.since }}-to-${{ matrix.range.until }}
path: data/faa_releasable_historical/*.csv
retention-days: 1
# Job: publish every per-chunk CSV artifact, unmodified, as the assets of a single
# release tagged with the workflow run number.
create-release:
needs: process-chunk
runs-on: ubuntu-latest
permissions:
contents: write
steps:
- name: Download all artifacts
uses: actions/download-artifact@v4
with:
path: artifacts
# Flatten the downloaded artifacts into one directory of CSV files.
- name: Prepare release files
run: |
mkdir -p release-files
find artifacts -name "*.csv" -exec cp {} release-files/ \;
ls -lh release-files/
- name: Create Release
uses: softprops/action-gh-release@v1
with:
tag_name: historical-faa-${{ github.run_number }}
name: Historical FAA Data Release ${{ github.run_number }}
# NOTE(review): the "Generated" line is filled from
# github.event.repository.updated_at, which looks like the repository's
# last-update timestamp rather than the time of this run - confirm.
# NOTE(review): the processing period below is hard-coded text; keep it in
# sync with the start/end dates used in generate-matrix.
body: |
Automated release of historical FAA aircraft data
Processing period: 2023-08-16 to 2026-01-01
Generated: ${{ github.event.repository.updated_at }}
files: release-files/*.csv
draft: false
prerelease: false
# Job: merge every per-chunk CSV into one de-duplicated CSV and publish it as its
# own release.
concatenate-and-release:
  needs: process-chunk
  runs-on: ubuntu-latest
  permissions:
    contents: write
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.12'
    - name: Install dependencies
      run: |
        pip install -r requirements.txt
    - name: Download all artifacts
      uses: actions/download-artifact@v4
      with:
        path: artifacts
    # Stage the chunk CSVs in data/concat: the concatenation script's entry point
    # calls concatenate_aircraft_csvs(input_dir=Path("data/concat"), ...), so files
    # staged anywhere else are never found and the script raises FileNotFoundError.
    - name: Prepare CSVs for concatenation
      run: |
        mkdir -p data/concat
        find artifacts -name "*.csv" -exec cp {} data/concat/ \;
        ls -lh data/concat/
    # NOTE(review): the script imports derive_from_faa_master_txt - confirm that
    # module is importable when the script is launched from scripts/.
    - name: Concatenate all CSVs
      run: |
        python scripts/concat_csvs.py
    - name: Create Combined Release
      uses: softprops/action-gh-release@v1
      with:
        tag_name: historical-faa-combined-${{ github.run_number }}
        name: Historical FAA Data Combined Release ${{ github.run_number }}
        body: |
          Combined historical FAA aircraft data (all chunks concatenated)
          Processing period: 2023-08-16 to 2026-01-01
          Generated: ${{ github.event.repository.updated_at }}
        # The concatenation script writes its output under data/planequery_aircraft.
        files: data/planequery_aircraft/*.csv
        draft: false
        prerelease: false
View File
+14
View File
@@ -0,0 +1,14 @@
# unique_regulatory_id
# 1. read historical and output
# 2. read sequentially
# Instead of reading all CSVs, I can read just the latest release CSV to get everything.
from pathlib import Path
base = Path("data/faa_releasable_historical")
# Visit, in sorted order, every entry under `base` whose name starts with "2024-02-".
for day_dir in sorted(base.glob("2024-02-*")):
master = day_dir / "Master.txt"
# Entries without a Master.txt are skipped silently.
if master.exists():
# NOTE(review): master_txt_to_releasable_csv is neither defined nor imported in
# this file as shown - confirm where it is meant to come from.
out_csv = master_txt_to_releasable_csv(master, out_dir="data/faa_releasable_historical_csv")
print(day_dir.name, "->", out_csv)
+89
View File
@@ -0,0 +1,89 @@
from pathlib import Path
import pandas as pd
import re
from derive_from_faa_master_txt import concat_faa_historical_df
def concatenate_aircraft_csvs(
    input_dir: Path = Path("data/concat"),
    output_dir: Path = Path("data/planequery_aircraft"),
    filename_pattern: str = r"planequery_aircraft_(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})\.csv"
):
    """
    Merge date-ranged aircraft CSVs from ``input_dir`` into one CSV in ``output_dir``.

    Files are selected by ``filename_pattern`` (its first two groups are taken as
    the start and end date), ordered by (start, end), and folded together one at
    a time with ``concat_faa_historical_df``. The result is written as
    ``planequery_aircraft_<first start>_<last end>.csv``.

    Args:
        input_dir: Directory containing the CSV files to concatenate
        output_dir: Directory where the output CSV will be saved (created if missing)
        filename_pattern: Regex pattern to match CSV filenames

    Returns:
        Path of the CSV that was written.

    Raises:
        FileNotFoundError: no ``*.csv`` in ``input_dir`` matches the pattern.
        AssertionError: ``download_date`` of the merged frame is not monotonic increasing.
    """
    input_dir = Path(input_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Collect (start, end, path) for every CSV whose name matches the pattern.
    name_rx = re.compile(filename_pattern)
    dated_files = []
    for candidate in sorted(input_dir.glob("*.csv")):
        hit = name_rx.search(candidate.name)
        if hit is None:
            continue
        dated_files.append((hit.group(1), hit.group(2), candidate))

    # ISO dates sort chronologically as plain strings.
    dated_files.sort(key=lambda entry: entry[:2])

    if not dated_files:
        raise FileNotFoundError(f"No CSV files matching pattern found in {input_dir}")

    print(f"Found {len(dated_files)} CSV files to concatenate")

    # These columns are read as text so pandas does not reinterpret them as numbers.
    text_columns = {
        'transponder_code': str,
        'unique_regulatory_id': str,
        'registrant_county': str
    }

    # The earliest file is the base; every later file is folded into it in order.
    first_start_date, _, first_path = dated_files[0]
    print(f"Reading base file: {first_path.name}")
    df_base = pd.read_csv(first_path, dtype=text_columns)

    for _, _, later_path in dated_files[1:]:
        print(f"Concatenating: {later_path.name}")
        df_base = concat_faa_historical_df(
            df_base,
            pd.read_csv(later_path, dtype=text_columns),
        )

    # Verify monotonic increasing download_date
    assert df_base['download_date'].is_monotonic_increasing, "download_date is not monotonic increasing"

    # Output filename uses first start date and last end date
    last_end_date = dated_files[-1][1]
    output_filename = f"planequery_aircraft_{first_start_date}_{last_end_date}.csv"
    output_path = output_dir / output_filename

    print(f"Writing output to: {output_path}")
    df_base.to_csv(output_path, index=False)

    print(f"Successfully concatenated {len(dated_files)} files into {output_filename}")
    print(f"Total rows: {len(df_base)}")
    return output_path
if __name__ == "__main__":
# Script entry point.
# Example usage - modify these paths as needed
# (the values below are the same as the function's own defaults).
concatenate_aircraft_csvs(
input_dir=Path("data/concat"),
output_dir=Path("data/planequery_aircraft")
)
@@ -0,0 +1,33 @@
from pathlib import Path
from datetime import datetime, timezone
# Daily release build:
#   1. download today's FAA ReleasableAircraft.zip (skipped if already on disk),
#   2. convert it to a DataFrame stamped with today's UTC date,
#   3. append it to the CSV from the latest published release,
#   4. write the combined CSV under data/planequery_aircraft/.
date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")

out_dir = Path("data/faa_releasable")
out_dir.mkdir(parents=True, exist_ok=True)
zip_name = f"ReleasableAircraft_{date_str}.zip"
zip_path = out_dir / zip_name

# A ZIP already saved under today's name is reused rather than fetched again.
if not zip_path.exists():
    from urllib.request import Request, urlopen

    # URL and paths
    url = "https://registry.faa.gov/database/ReleasableAircraft.zip"
    req = Request(url, headers={"User-Agent": "Mozilla/5.0"}, method="GET")
    with urlopen(req, timeout=120) as r:
        body = r.read()
    zip_path.write_bytes(body)

OUT_ROOT = Path("data/planequery_aircraft")
OUT_ROOT.mkdir(parents=True, exist_ok=True)

from derive_from_faa_master_txt import convert_faa_master_txt_to_df, concat_faa_historical_df
from get_latest_planequery_aircraft_release import get_latest_aircraft_csv_df

# Today's snapshot, then the previously released history it is appended to.
df_new = convert_faa_master_txt_to_df(zip_path, date_str)
df_base, start_date_str = get_latest_aircraft_csv_df()
df_base = concat_faa_historical_df(df_base, df_new)

assert df_base['download_date'].is_monotonic_increasing, "download_date is not monotonic increasing"

# The output name carries the history's start date and today's date.
combined_csv_path = OUT_ROOT / f"planequery_aircraft_{start_date_str}_{date_str}.csv"
df_base.to_csv(combined_csv_path, index=False)
+127
View File
@@ -0,0 +1,127 @@
from pathlib import Path
import zipfile
import pandas as pd
from faa_aircraft_registry import read
def convert_faa_master_txt_to_df(zip_path: Path, date: str):
    """Flatten the master registry inside an FAA release ZIP into one DataFrame.

    Args:
        zip_path: ZIP archive opened with ``zipfile.ZipFile`` and handed to
            ``faa_aircraft_registry.read``.
        date: Value stored in the ``download_date`` column of every row.

    Returns:
        A DataFrame with one row per entry of ``registrations['master']``. The
        nested ``registrant``, ``aircraft``, ``engine`` and ``certification``
        values are expanded into ``registrant_*``, ``aircraft_*``, ``engine_*``
        and ``certificate_*`` columns, a ``planequery_airframe_id`` column is
        added, and every NaN is replaced by an empty string.
    """
    with zipfile.ZipFile(zip_path) as archive:
        registrations = read(archive)
        frame = pd.DataFrame(registrations['master'].values())

    frame.insert(0, "download_date", date)

    registrant_cols = pd.json_normalize(frame["registrant"]).add_prefix("registrant_")
    frame = frame.drop(columns="registrant").join(registrant_cols)

    # Put transponder_code_hex at position 1, i.e. directly after download_date.
    ordered = [c for c in frame.columns if c != "transponder_code_hex"]
    ordered.insert(1, "transponder_code_hex")
    frame = frame[ordered]

    # Rename the top-level *_type columns out of the way first so they cannot
    # clash with the prefixed columns produced by the expansions below.
    frame = frame.rename(
        columns={"aircraft_type": "aircraft_type_2", "engine_type": "engine_type_2"}
    )

    # Expand each nested column; missing entries are replaced by {} beforehand.
    for nested_col, prefix in (
        ("aircraft", "aircraft_"),
        ("engine", "engine_"),
        ("certification", "certificate_"),
    ):
        present = frame[nested_col].where(frame[nested_col].notna(), {})
        expanded = pd.json_normalize(present).add_prefix(prefix)
        frame = frame.drop(columns=nested_col).join(expanded)

    # Create planequery_airframe_id
    frame["planequery_airframe_id"] = (
        normalize(frame["aircraft_manufacturer"])
        + "|"
        + normalize(frame["aircraft_model"])
        + "|"
        + normalize(frame["serial_number"])
    )

    # Move planequery_airframe_id to come after registration_number
    ordered = [c for c in frame.columns if c != "planequery_airframe_id"]
    ordered.insert(ordered.index("registration_number") + 1, "planequery_airframe_id")
    frame = frame[ordered]

    # Convert all NaN to empty strings
    return frame.fillna("")
def normalize(s: pd.Series) -> pd.Series:
    """Return ``s`` as upper-cased text reduced to word characters and hyphens.

    Missing values become empty strings. Because the final step removes every
    character that is not ``\\w`` or ``-``, spaces and punctuation are dropped
    entirely (e.g. ``" cessna  172-s "`` becomes ``"CESSNA172-S"``).
    """
    text = s.fillna("").astype(str)
    text = text.str.upper().str.strip()
    # collapse whitespace
    text = text.str.replace(r"\s+", " ", regex=True)
    # remove characters that cause false mismatches
    return text.str.replace(r"[^\w\-]", "", regex=True)
def _normalize_fingerprint_value(val):
    """Return a canonical string for one cell so equivalent values compare equal."""
    # Handle lists (sort them for consistent comparison)
    if isinstance(val, list):
        return "|".join(sorted(str(v) for v in val))

    val_str = str(val).strip()

    # Empty strings and stringified NaN both count as "no value".
    if val_str in ("", "nan"):
        return ""

    # A list that was written to CSV comes back as its repr, e.g. "['a', 'b']".
    if val_str.startswith('[') and val_str.endswith(']'):
        import ast
        try:
            parsed = ast.literal_eval(val_str)
        except (ValueError, SyntaxError, TypeError):
            parsed = None  # Not a valid list literal, continue to other checks
        if isinstance(parsed, list):
            return "|".join(sorted(str(v) for v in parsed))

    # Numbers: "12", "12.0" and "012" all normalize to "12".
    try:
        num_val = float(val_str)
        # If it's a whole number, return as int string (no .0)
        if num_val == int(num_val):
            return str(int(num_val))
        return str(num_val)
    except (ValueError, OverflowError):
        # Not a number (or inf/nan spelled differently), return as-is
        return val_str


def concat_faa_historical_df(df_base, df_new):
    """Append ``df_new`` to ``df_base`` and drop rows whose content already exists.

    Two rows count as duplicates when every column except ``download_date`` is
    equal after normalization (numeric formatting, list ordering and NaN/empty
    differences are ignored). The first occurrence is kept, so a row already in
    ``df_base`` wins over an identical row in ``df_new``.

    Rows are compared column by column. They are deliberately not joined into a
    single delimited string, because values such as ``planequery_airframe_id``
    contain ``|`` themselves and a joined fingerprint could make different rows
    look identical.

    Args:
        df_base: Accumulated history. May have no columns at all (e.g.
            ``pd.DataFrame()``), in which case ``df_new`` defines the schema.
        df_new: Rows to append; must contain every column of ``df_base``.

    Returns:
        A new DataFrame with the columns of ``df_base`` (or of ``df_new`` when
        ``df_base`` has none). Surviving rows keep their position-based index
        from the concatenation, so the index may contain gaps.

    Raises:
        KeyError: ``df_new`` lacks a column that ``df_base`` has.
    """
    if len(df_base.columns) == 0:
        # Nothing accumulated yet: selecting df_base's (zero) columns from df_new
        # would throw the new data away, so adopt df_new as-is.
        combined = df_new.reset_index(drop=True)
    else:
        combined = pd.concat([df_base, df_new[df_base.columns]], ignore_index=True)

    if combined.empty:
        return combined

    content_cols = [c for c in combined.columns if c != "download_date"]
    if not content_cols:
        # With no content columns every row is equivalent; keep only the first.
        return combined.iloc[:1]

    normalized = combined[content_cols].apply(
        lambda col: col.map(_normalize_fingerprint_value)
    )
    is_repeat = normalized.duplicated(keep="first")
    return combined.loc[~is_repeat]
+86 -33
View File
@@ -1,63 +1,116 @@
'''Generated with ChatGPT 5.2 prompt """
scrape-faa-releasable-aircraft For each commit-day in Feb 2024 (last commit per day):
Every day it creates a new commit that takes ReleasableAircraft zip from FAA takes Master.txt to make these files (it does this so that all files stay under 100mb). For every commit day I want to recombine all the files into one Master.txt again. It has data/commits since 2023. - Write ALL FAA text files from that commit into: data/faa_releasable_historical/YYYY-MM-DD/
scrape-faa-releasable-aircraft % ls ACFTREF.txt, DEALER.txt, DOCINDEX.txt, ENGINE.txt, RESERVED.txt
ACFTREF.txt DOCINDEX.txt MASTER-1.txt MASTER-3.txt MASTER-5.txt MASTER-7.txt MASTER-9.txt RESERVED.txt - Recombine MASTER-*.txt into Master.txt
DEALER.txt ENGINE.txt MASTER-2.txt MASTER-4.txt MASTER-6.txt MASTER-8.txt README.md ardata.pdf - Produce Master.csv via convert_faa_master_txt_to_csv
'''
Assumes the non-master files are present in every commit.
"""
import subprocess, re import subprocess, re
from pathlib import Path from pathlib import Path
import shutil
from collections import OrderedDict from collections import OrderedDict
from derive_from_faa_master_txt import convert_faa_master_txt_to_df, concat_faa_historical_df
import zipfile
import pandas as pd
import argparse
from datetime import datetime, timedelta
def run(*args: str) -> str: # Parse command line arguments
return subprocess.check_output(args, text=True).strip() parser = argparse.ArgumentParser(description="Process historical FAA data from git commits")
parser.add_argument("since", help="Start date (YYYY-MM-DD)")
parser.add_argument("until", help="End date (YYYY-MM-DD)")
args = parser.parse_args()
# Get commits that touched any MASTER-*.txt, oldest -> newest # Clone repository if it doesn't exist
log = run("git", "log", "--reverse", "--format=%H %cs", "--", ".") REPO = Path("data/scrape-faa-releasable-aircraft")
# If you want to restrict to only commits that touched the master parts, use: OUT_ROOT = Path("data/faa_releasable_historical")
# log = run("git", "log", "--reverse", "--format=%H %cs", "--", "MASTER-1.txt") OUT_ROOT.mkdir(parents=True, exist_ok=True)
def run_git_text(*args: str) -> str:
return subprocess.check_output(["git", "-C", str(REPO), *args], text=True).strip()
def run_git_bytes(*args: str) -> bytes:
return subprocess.check_output(["git", "-C", str(REPO), *args])
# Parse dates and adjust --since to the day before
since_date = datetime.strptime(args.since, "%Y-%m-%d")
adjusted_since = (since_date - timedelta(days=1)).strftime("%Y-%m-%d")
# All commits in specified date range (oldest -> newest)
log = run_git_text(
"log",
"--reverse",
"--format=%H %cs",
f"--since={adjusted_since}",
f"--until={args.until}",
)
lines = [ln for ln in log.splitlines() if ln.strip()] lines = [ln for ln in log.splitlines() if ln.strip()]
if not lines: if not lines:
raise SystemExit("No commits found.") raise SystemExit(f"No commits found between {args.since} and {args.until}.")
# Map date -> last commit SHA on that date (Ordered by history) # date -> last SHA that day
date_to_sha = OrderedDict() date_to_sha = OrderedDict()
for ln in lines: for ln in lines:
sha, date = ln.split() sha, date = ln.split()
# keep last SHA per day
date_to_sha[date] = sha date_to_sha[date] = sha
out_root = Path("out_master_by_day") OTHER_FILES = ["ACFTREF.txt", "DEALER.txt", "DOCINDEX.txt", "ENGINE.txt", "RESERVED.txt"]
out_root.mkdir(exist_ok=True)
master_re = re.compile(r"^MASTER-(\d+)\.txt$") master_re = re.compile(r"^MASTER-(\d+)\.txt$")
df_base = pd.DataFrame()
start_date = None
end_date = None
for date, sha in date_to_sha.items(): for date, sha in date_to_sha.items():
# list files at this commit, filter MASTER-*.txt in repo root if start_date is None:
names = run("git", "ls-tree", "--name-only", sha).splitlines() start_date = date
end_date = date
day_dir = OUT_ROOT / date
day_dir.mkdir(parents=True, exist_ok=True)
# Write auxiliary files (assumed present)
for fname in OTHER_FILES:
(day_dir / fname).write_bytes(run_git_bytes("show", f"{sha}:{fname}"))
# Recombine MASTER parts
names = run_git_text("ls-tree", "--name-only", sha).splitlines()
parts = [] parts = []
for n in names: for n in names:
m = master_re.match(n) m = master_re.match(n)
if m: if m:
parts.append((int(m.group(1)), n)) parts.append((int(m.group(1)), n))
parts.sort() parts.sort()
if not parts: if not parts:
# no master parts in that commit/day; skip raise RuntimeError(f"{date} {sha[:7]}: no MASTER-*.txt parts found")
continue
day_dir = out_root / date master_path = day_dir / "MASTER.txt"
day_dir.mkdir(parents=True, exist_ok=True) with master_path.open("wb") as w:
out_path = day_dir / "Master.txt"
with out_path.open("wb") as w:
for _, fname in parts: for _, fname in parts:
data = subprocess.check_output(["git", "show", f"{sha}:{fname}"]) data = run_git_bytes("show", f"{sha}:{fname}")
w.write(data) w.write(data)
if data and not data.endswith(b"\n"): if data and not data.endswith(b"\n"):
w.write(b"\n") w.write(b"\n")
print(f"{date} {sha[:7]} -> {out_path} ({len(parts)} parts)") # 3) Zip the day's files
zip_path = day_dir / f"ReleasableAircraft.zip"
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as z:
for p in day_dir.iterdir():
z.write(p, arcname=p.name)
print(f"\nDone. Output root: {out_root.resolve()}") print(f"{date} {sha[:7]} -> {day_dir} (master parts: {len(parts)})")
# 4) Convert ZIP -> CSV
df_new = convert_faa_master_txt_to_df(zip_path, date)
if df_base.empty:
df_base = df_new
print(len(df_base), "total entries so far")
# Delete all files in the day directory
shutil.rmtree(day_dir)
continue
df_base = concat_faa_historical_df(df_base, df_new)
shutil.rmtree(day_dir)
print(len(df_base), "total entries so far")
assert df_base['download_date'].is_monotonic_increasing, "download_date is not monotonic increasing"
df_base.to_csv(OUT_ROOT / f"planequery_aircraft_{start_date}_{end_date}.csv", index=False)
# TODO: get average number of new rows per day.
@@ -0,0 +1,144 @@
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, Optional
import re
import urllib.request
import urllib.error
import json
# GitHub repository ("owner/name") whose releases are queried by default.
REPO = "PlaneQuery/planequery-aircraft"
# Releases-API URL of REPO's latest release.
# NOTE(review): nothing in the code shown here reads this constant;
# get_latest_release_assets builds the same URL from its `repo` argument.
LATEST_RELEASE_URL = f"https://api.github.com/repos/{REPO}/releases/latest"
@dataclass(frozen=True)
class ReleaseAsset:
    """Immutable description of one file attached to a GitHub release."""

    # File name of the asset as reported by the releases API ("name").
    name: str
    # URL the asset is fetched from ("browser_download_url" in the API payload).
    download_url: str
    # Size reported by the API, in bytes.
    size: int
def _http_get_json(url: str, headers: dict[str, str]) -> dict:
    """GET ``url`` with ``headers`` and return the UTF-8 JSON body, decoded.

    The request times out after 120 seconds; errors raised by ``urllib`` or by
    ``json.loads`` propagate to the caller.
    """
    request = urllib.request.Request(url, headers=headers, method="GET")
    with urllib.request.urlopen(request, timeout=120) as response:
        raw_body = response.read()
    text = raw_body.decode("utf-8")
    return json.loads(text)
def get_latest_release_assets(repo: str = REPO, github_token: Optional[str] = None) -> list[ReleaseAsset]:
    """List the assets attached to the latest release of ``repo``.

    Args:
        repo: Repository in "owner/name" form.
        github_token: When given, sent as a ``Bearer`` Authorization header.

    Returns:
        One ``ReleaseAsset`` per entry of the payload's ``assets`` list; an
        empty list when the payload has no ``assets`` key.
    """
    request_headers = {
        "Accept": "application/vnd.github+json",
        "User-Agent": "planequery-aircraft-downloader/1.0",
    }
    if github_token:
        request_headers["Authorization"] = f"Bearer {github_token}"

    release = _http_get_json(
        f"https://api.github.com/repos/{repo}/releases/latest",
        headers=request_headers,
    )

    # A missing "size" is recorded as 0 bytes.
    return [
        ReleaseAsset(
            name=entry["name"],
            download_url=entry["browser_download_url"],
            size=int(entry.get("size", 0)),
        )
        for entry in release.get("assets", [])
    ]
def pick_asset(
    assets: Iterable[ReleaseAsset],
    *,
    exact_name: Optional[str] = None,
    name_regex: Optional[str] = None,
) -> ReleaseAsset:
    """Select exactly one asset, by exact name or by regular expression.

    ``exact_name`` takes precedence when both selectors are given. With
    ``name_regex`` the pattern is applied with ``re.search`` and must match
    exactly one asset name.

    Raises:
        ValueError: neither selector was provided.
        FileNotFoundError: no asset satisfies the selector.
        FileExistsError: ``name_regex`` matched more than one asset.
    """
    candidates = list(assets)

    if not exact_name and not name_regex:
        raise ValueError("Provide either exact_name=... or name_regex=...")

    if exact_name:
        found = next((c for c in candidates if c.name == exact_name), None)
        if found is None:
            raise FileNotFoundError(f"No asset exactly named {exact_name!r}. Available: {[a.name for a in candidates]}")
        return found

    pattern = re.compile(name_regex)
    hits = [c for c in candidates if pattern.search(c.name)]
    if len(hits) == 1:
        return hits[0]
    if not hits:
        raise FileNotFoundError(f"No asset matched regex {name_regex!r}. Available: {[a.name for a in candidates]}")
    raise FileExistsError(f"Regex {name_regex!r} matched multiple assets: {[m.name for m in hits]}")
def download_asset(asset: ReleaseAsset, out_path: Path, github_token: Optional[str] = None) -> Path:
    """Stream ``asset.download_url`` to ``out_path`` and return the path written.

    Missing parent directories of ``out_path`` are created. The body is copied
    in 1 MiB pieces so the whole file is never held in memory.

    Args:
        asset: Asset to fetch; its ``download_url`` and ``name`` are used.
        out_path: Destination file (overwritten if it exists).
        github_token: When given, sent as a ``Bearer`` Authorization header.

    Raises:
        RuntimeError: the server answered with an HTTP error; the message holds
            the status code and up to 500 characters of the response body.
    """
    destination = Path(out_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    request_headers = {
        "User-Agent": "planequery-aircraft-downloader/1.0",
        "Accept": "application/octet-stream",
    }
    if github_token:
        request_headers["Authorization"] = f"Bearer {github_token}"

    request = urllib.request.Request(asset.download_url, headers=request_headers, method="GET")
    piece_size = 1024 * 1024  # 1 MiB

    try:
        with urllib.request.urlopen(request, timeout=300) as response, destination.open("wb") as sink:
            # Stream download: read() returns b"" once the body is exhausted.
            for piece in iter(lambda: response.read(piece_size), b""):
                sink.write(piece)
    except urllib.error.HTTPError as e:
        body = e.read().decode("utf-8", errors="replace") if hasattr(e, "read") else ""
        raise RuntimeError(f"HTTPError {e.code} downloading {asset.name}: {body[:500]}") from e

    return destination
def download_latest_aircraft_csv(
    output_dir: Path = Path("downloads"),
    github_token: Optional[str] = None,
    repo: str = REPO,
) -> Path:
    """
    Fetch the single ``planequery_aircraft_*.csv`` asset of the latest release.

    Args:
        output_dir: Directory the file is saved into, under the asset's own name
            (default: "downloads")
        github_token: Optional GitHub token, forwarded to the API and download calls
        repo: GitHub repository in format "owner/repo" (default: REPO)

    Returns:
        Path to the downloaded file

    Raises:
        FileNotFoundError: the release has no matching CSV asset.
        FileExistsError: the release has more than one matching CSV asset.
    """
    release_assets = get_latest_release_assets(repo, github_token=github_token)
    csv_asset = pick_asset(release_assets, name_regex=r"^planequery_aircraft_.*\.csv$")
    destination = output_dir / csv_asset.name
    saved_to = download_asset(csv_asset, destination, github_token=github_token)
    print(f"Downloaded: {csv_asset.name} ({csv_asset.size} bytes) -> {saved_to}")
    return saved_to
def get_latest_aircraft_csv_df():
    """Download the latest released CSV and load it as a DataFrame.

    Returns:
        ``(df, date_str)`` where ``df`` has every NaN replaced by an empty
        string and ``date_str`` is the first ``YYYY-MM-DD`` date embedded in
        the file name ``planequery_aircraft_{date}_{date}.csv``.

    Raises:
        ValueError: the downloaded file's path does not contain such a date.
    """
    import pandas as pd

    csv_path = download_latest_aircraft_csv()

    # These columns are read as text so pandas does not reinterpret them as numbers.
    text_columns = {
        'transponder_code': str,
        'unique_regulatory_id': str,
        'registrant_county': str,
    }
    df = pd.read_csv(csv_path, dtype=text_columns).fillna("")

    # Extract date from filename pattern: planequery_aircraft_{date}_{date}.csv
    found = re.search(r"planequery_aircraft_(\d{4}-\d{2}-\d{2})_", str(csv_path))
    if found is None:
        raise ValueError(f"Could not extract date from filename: {csv_path.name}")

    return df, found.group(1)
# Script entry point: download the latest release CSV using the default
# output directory, token and repository.
if __name__ == "__main__":
download_latest_aircraft_csv()
-48
View File
@@ -1,48 +0,0 @@
# Legacy daily snapshot script. The hunk header above (`@@ -1,48 +0,0 @@`) marks
# this whole file as removed by the change set; it is documented here only for reference.
# It downloads the FAA ReleasableAircraft.zip, flattens the master registry into a
# DataFrame and writes it to data/faa_releasable/ReleasableAircraft_<UTC date>.csv.
from faa_aircraft_registry import read
import pandas as pd
import zipfile
# (zipfile is imported twice; the second import is redundant.)
import zipfile
from pathlib import Path
from datetime import datetime, timezone
# Today's date in UTC is used in both output file names.
date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
out_dir = Path("data/faa_releasable")
out_dir.mkdir(parents=True, exist_ok=True)
zip_name = f"ReleasableAircraft_{date_str}.zip"
csv_name = f"ReleasableAircraft_{date_str}.csv"
zip_path = out_dir / zip_name
csv_path = out_dir / csv_name
# URL and paths
url = "https://registry.faa.gov/database/ReleasableAircraft.zip"
from urllib.request import Request, urlopen
# The request sends a browser-like User-Agent header.
req = Request(
url,
headers={"User-Agent": "Mozilla/5.0"},
method="GET",
)
# The whole archive is read into memory, then written to zip_path.
with urlopen(req, timeout=120) as r:
body = r.read()
zip_path.write_bytes(body)
with zipfile.ZipFile(zip_path) as z:
registrations = read(z)
df = pd.DataFrame(registrations['master'].values())
# Move transponder_code_hex to the first column, then expose it as "icao".
col = "transponder_code_hex"
df = df[[col] + [c for c in df.columns if c != col]]
df = df.rename(columns={"transponder_code_hex": "icao"})
# Expand the nested registrant / aircraft / engine values into prefixed columns.
registrant = pd.json_normalize(df["registrant"]).add_prefix("registrant_")
df = df.drop(columns="registrant").join(registrant)
# The top-level *_type columns are renamed before each expansion.
# NOTE(review): presumably to avoid clashing with the prefixed columns - confirm.
df = df.rename(columns={"aircraft_type": "aircraft_type_2"})
aircraft = pd.json_normalize(df["aircraft"]).add_prefix("aircraft_")
df = df.drop(columns="aircraft").join(aircraft)
df = df.rename(columns={"engine_type": "engine_type_2"})
# Missing engine entries are replaced by {} before flattening; the aircraft
# expansion above has no such guard.
engine = pd.json_normalize(df["engine"].where(df["engine"].notna(), {})).add_prefix("engine_")
df = df.drop(columns="engine").join(engine)
df = df.sort_values(by=["icao"])
df.to_csv(csv_path, index=False)
View File