Create derived csv daily. Get historical FAA data

This commit is contained in:
ggman12
2026-02-02 20:48:35 -05:00
parent 2e60c64f61
commit 16a0a5fec8
12 changed files with 677 additions and 88 deletions
+14
View File
@@ -0,0 +1,14 @@
#unique_regulatory_id
# 1. read historical and output
# 2. read sequentially
# Instead of reading all csvs I can read just the latest release csv to get everything.
from pathlib import Path
# Root folder that holds one sub-directory per historical FAA snapshot day.
base = Path("data/faa_releasable_historical")

# Walk the February 2024 day folders in name order and convert each Master.txt.
# NOTE(review): master_txt_to_releasable_csv is not imported or defined in the
# visible part of this file - confirm where it comes from.
for day_dir in sorted(base.glob("2024-02-*")):
    master = day_dir / "Master.txt"
    if not master.exists():
        # Day folder without a recombined Master.txt: nothing to convert.
        continue
    out_csv = master_txt_to_releasable_csv(master, out_dir="data/faa_releasable_historical_csv")
    print(day_dir.name, "->", out_csv)
+89
View File
@@ -0,0 +1,89 @@
from pathlib import Path
import pandas as pd
import re
from derive_from_faa_master_txt import concat_faa_historical_df
def concatenate_aircraft_csvs(
    input_dir: Path = Path("data/concat"),
    output_dir: Path = Path("data/planequery_aircraft"),
    filename_pattern: str = r"planequery_aircraft_(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})\.csv"
):
    """
    Read all CSVs matching the pattern from input_dir in order,
    concatenate them using concat_faa_historical_df, and output a single CSV.

    Files are processed in (start date, end date, path) order, where the two
    dates are the regex groups captured from the file name. The output file is
    named planequery_aircraft_{first start date}_{last end date}.csv.

    Args:
        input_dir: Directory containing the CSV files to concatenate
        output_dir: Directory where the output CSV will be saved
        filename_pattern: Regex pattern to match CSV filenames; group 1 is the
            start date and group 2 the end date

    Returns:
        Path of the written CSV file.

    Raises:
        FileNotFoundError: no file in input_dir matches filename_pattern.
        AssertionError: download_date stops being monotonic increasing after
            a concatenation step.
    """
    input_dir = Path(input_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Find all matching CSV files as (start_date, end_date, path) tuples.
    pattern = re.compile(filename_pattern)
    csv_files = []
    for csv_path in input_dir.glob("*.csv"):
        match = pattern.search(csv_path.name)
        if match:
            csv_files.append((match.group(1), match.group(2), csv_path))

    # Sort by start date, then end date; the path breaks remaining ties.
    csv_files.sort()

    if not csv_files:
        raise FileNotFoundError(f"No CSV files matching pattern found in {input_dir}")

    print(f"Found {len(csv_files)} CSV files to concatenate")

    # Identifier-like columns are forced to str so pandas does not parse them
    # as numbers; the same options are used for every file.
    read_options = {
        "dtype": {
            'transponder_code': str,
            'unique_regulatory_id': str,
            'registrant_county': str
        }
    }

    # Read first CSV as base
    first_start_date, first_end_date, first_path = csv_files[0]
    print(f"Reading base file: {first_path.name}")
    df_base = pd.read_csv(first_path, **read_options)

    # Concatenate remaining CSVs
    for start_date, end_date, csv_path in csv_files[1:]:
        print(f"Concatenating: {csv_path.name}")
        df_new = pd.read_csv(csv_path, **read_options)
        df_base = concat_faa_historical_df(df_base, df_new)

        # Verify monotonic increasing download_date. Raised explicitly instead
        # of using `assert`, which is removed when Python runs with -O.
        if not df_base['download_date'].is_monotonic_increasing:
            raise AssertionError("download_date is not monotonic increasing")

    # Output filename uses first start date and last end date
    last_end_date = csv_files[-1][1]
    output_filename = f"planequery_aircraft_{first_start_date}_{last_end_date}.csv"
    output_path = output_dir / output_filename

    print(f"Writing output to: {output_path}")
    df_base.to_csv(output_path, index=False)
    print(f"Successfully concatenated {len(csv_files)} files into {output_filename}")
    print(f"Total rows: {len(df_base)}")
    return output_path
if __name__ == "__main__":
    # Example usage - modify these paths as needed
    default_input_dir = Path("data/concat")
    default_output_dir = Path("data/planequery_aircraft")
    concatenate_aircraft_csvs(input_dir=default_input_dir, output_dir=default_output_dir)
@@ -0,0 +1,33 @@
from pathlib import Path
from datetime import datetime, timezone
# Today's date in UTC; used for the cached zip name, the download_date column
# and the end date in the output file name.
date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")

out_dir = Path("data/faa_releasable")
out_dir.mkdir(parents=True, exist_ok=True)
zip_name = f"ReleasableAircraft_{date_str}.zip"
zip_path = out_dir / zip_name

# Download only when today's zip is not already on disk.
if not zip_path.exists():
    # URL and paths
    url = "https://registry.faa.gov/database/ReleasableAircraft.zip"
    from urllib.request import Request, urlopen

    req = Request(url, headers={"User-Agent": "Mozilla/5.0"}, method="GET")
    with urlopen(req, timeout=120) as r:
        body = r.read()
    zip_path.write_bytes(body)

OUT_ROOT = Path("data/planequery_aircraft")
OUT_ROOT.mkdir(parents=True, exist_ok=True)

from derive_from_faa_master_txt import convert_faa_master_txt_to_df, concat_faa_historical_df
from get_latest_planequery_aircraft_release import get_latest_aircraft_csv_df

# Today's snapshot is appended to the table from the latest published release.
df_new = convert_faa_master_txt_to_df(zip_path, date_str)
df_base, start_date_str = get_latest_aircraft_csv_df()
df_base = concat_faa_historical_df(df_base, df_new)
assert df_base['download_date'].is_monotonic_increasing, "download_date is not monotonic increasing"

# The file name keeps the release's start date and ends at today's date.
df_base.to_csv(OUT_ROOT / f"planequery_aircraft_{start_date_str}_{date_str}.csv", index=False)
+127
View File
@@ -0,0 +1,127 @@
from pathlib import Path
import zipfile
import pandas as pd
from faa_aircraft_registry import read
def convert_faa_master_txt_to_df(zip_path: Path, date: str):
    """Load the ``master`` table of an FAA registry zip into one flat DataFrame.

    The nested ``registrant``, ``aircraft``, ``engine`` and ``certification``
    columns are expanded with ``pd.json_normalize`` into prefixed columns
    (``registrant_``, ``aircraft_``, ``engine_``, ``certificate_``). A
    ``download_date`` column holding ``date`` is inserted first, and a
    ``planequery_airframe_id`` column (normalized manufacturer | model |
    serial number) is placed right after ``registration_number``.

    Args:
        zip_path: Path of the zip archive handed to ``faa_aircraft_registry.read``.
        date: Value written to every row of ``download_date``.

    Returns:
        The flattened DataFrame with every NaN replaced by an empty string.
    """

    def _expand(frame, column, prefix, fill_missing):
        # Replace a column of nested records by one prefixed column per key.
        nested = frame[column]
        if fill_missing:
            nested = frame[column].where(frame[column].notna(), {})
        flat = pd.json_normalize(nested).add_prefix(prefix)
        return frame.drop(columns=column).join(flat)

    def _move(frame, column, position):
        # Re-order columns so that `column` sits at index `position`.
        names = frame.columns.tolist()
        names.remove(column)
        names.insert(position, column)
        return frame[names]

    with zipfile.ZipFile(zip_path) as z:
        registrations = read(z)
        df = pd.DataFrame(registrations['master'].values())

    df.insert(0, "download_date", date)
    df = _expand(df, "registrant", "registrant_", fill_missing=False)

    # Put transponder_code_hex at column index 1 (directly after download_date).
    df = _move(df, "transponder_code_hex", 1)

    # The top-level aircraft_type / engine_type columns are renamed first so
    # they do not collide with the prefixed columns produced by the expansion.
    df = df.rename(columns={"aircraft_type": "aircraft_type_2"})
    df = _expand(df, "aircraft", "aircraft_", fill_missing=True)
    df = df.rename(columns={"engine_type": "engine_type_2"})
    df = _expand(df, "engine", "engine_", fill_missing=True)
    df = _expand(df, "certification", "certificate_", fill_missing=True)

    # Create planequery_airframe_id
    id_parts = [
        normalize(df["aircraft_manufacturer"]),
        normalize(df["aircraft_model"]),
        normalize(df["serial_number"]),
    ]
    df["planequery_airframe_id"] = id_parts[0] + "|" + id_parts[1] + "|" + id_parts[2]

    # Move planequery_airframe_id to come after registration_number
    remaining = df.columns.tolist()
    remaining.remove("planequery_airframe_id")
    remaining.insert(remaining.index("registration_number") + 1, "planequery_airframe_id")
    df = df[remaining]

    # Convert all NaN to empty strings
    return df.fillna("")
def normalize(s: pd.Series) -> pd.Series:
    """Return an upper-cased, punctuation-free text version of ``s`` for matching.

    Missing values become empty strings. Every character that is not a word
    character or a hyphen (spaces included) is removed.
    """
    text = s.fillna("").astype(str)
    text = text.str.upper().str.strip()
    # collapse whitespace
    text = text.str.replace(r"\s+", " ", regex=True)
    # remove characters that cause false mismatches
    text = text.str.replace(r"[^\w\-]", "", regex=True)
    return text
def concat_faa_historical_df(df_base, df_new):
    """Append ``df_new`` to ``df_base`` and drop rows whose content already exists.

    Two rows count as duplicates when every column except ``download_date``
    is equal after normalization. Normalization strips whitespace, treats
    ``""``/``nan`` as empty, sorts list values (real lists or their string
    representation) and renders numbers canonically (``"007"``, ``7`` and
    ``7.0`` all compare equal). The first occurrence is kept, so a row keeps
    the earliest ``download_date`` it was seen with.

    Args:
        df_base: Accumulated history. A frame without columns is treated as
            "no history yet" and the columns of ``df_new`` are used.
        df_new: New snapshot; must contain every column of ``df_base``.

    Returns:
        The combined frame in original row order, without the duplicates.

    Raises:
        KeyError: ``df_new`` lacks a column present in ``df_base``.
    """
    # Function-scope import: `ast` is not imported at the top of this module.
    import ast

    # Columns are joined with the ASCII unit separator rather than "|", which
    # appears inside values (e.g. planequery_airframe_id, joined lists) and
    # could make different rows produce the same fingerprint.
    field_sep = "\x1f"

    if len(df_base.columns) > 0:
        df_new = df_new[df_base.columns]
    df_base = pd.concat([df_base, df_new], ignore_index=True)
    if df_base.empty:
        # Nothing to fingerprint.
        return df_base

    CONTENT_COLS = [c for c in df_base.columns if c not in {"download_date"}]

    def normalize_value(val):
        # Handle lists (sort them for consistent comparison)
        if isinstance(val, list):
            return "|".join(sorted(str(v) for v in val))

        val_str = str(val).strip()
        if val_str == "" or val_str == "nan":
            return ""

        # A string that looks like a list literal is parsed and treated as a list.
        if val_str.startswith('[') and val_str.endswith(']'):
            try:
                parsed = ast.literal_eval(val_str)
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                # literal_eval documents all of these for malformed input.
                parsed = None
            if isinstance(parsed, list):
                return "|".join(sorted(str(v) for v in parsed))

        # Plain integers are normalized exactly; going through float would
        # merge distinct values above 2**53.
        try:
            return str(int(val_str))
        except ValueError:
            pass

        # Other numeric spellings: whole numbers lose the ".0", the rest use
        # the float representation.
        try:
            num_val = float(val_str)
            if num_val == int(num_val):
                return str(int(num_val))
            return str(num_val)
        except (ValueError, OverflowError):
            # Not a number (or inf/nan spelling), return as-is
            return val_str

    def normalize_series(series):
        return series.apply(normalize_value)

    df_base["row_fingerprint"] = (
        df_base[CONTENT_COLS]
        .apply(normalize_series, axis=0)
        .apply(lambda row: field_sep.join(row), axis=1)
    )

    df_base = df_base.drop_duplicates(
        subset=["row_fingerprint"],
        keep="first"
    ).drop(columns=["row_fingerprint"])
    return df_base
+86 -33
View File
@@ -1,63 +1,116 @@
'''Generated with ChatGPT 5.2 prompt
scrape-faa-releasable-aircraft
Every day it creates a new commit that takes ReleasableAircraft zip from FAA takes Master.txt to make these files (it does this so that all files stay under 100mb). For every commit day I want to recombine all the files into one Master.txt again. It has data/commits since 2023.
scrape-faa-releasable-aircraft % ls
ACFTREF.txt DOCINDEX.txt MASTER-1.txt MASTER-3.txt MASTER-5.txt MASTER-7.txt MASTER-9.txt RESERVED.txt
DEALER.txt ENGINE.txt MASTER-2.txt MASTER-4.txt MASTER-6.txt MASTER-8.txt README.md ardata.pdf
'''
"""
For each commit-day in Feb 2024 (last commit per day):
- Write ALL FAA text files from that commit into: data/faa_releasable_historical/YYYY-MM-DD/
ACFTREF.txt, DEALER.txt, DOCINDEX.txt, ENGINE.txt, RESERVED.txt
- Recombine MASTER-*.txt into Master.txt
- Produce Master.csv via convert_faa_master_txt_to_csv
Assumes the non-master files are present in every commit.
"""
import subprocess, re
from pathlib import Path
import shutil
from collections import OrderedDict
from derive_from_faa_master_txt import convert_faa_master_txt_to_df, concat_faa_historical_df
import zipfile
import pandas as pd
import argparse
from datetime import datetime, timedelta
def run(*args: str) -> str:
    """Run the command given by ``args`` and return its stdout, stripped.

    Raises ``subprocess.CalledProcessError`` when the command exits non-zero.
    """
    stdout = subprocess.check_output(args, text=True)
    return stdout.strip()
# NOTE(review): this span comes from a rendered diff and appears to interleave
# removed and added lines (e.g. `log` is assigned here and again further down
# via run_git_text) - confirm which statements belong to the current version.
# Parse command line arguments: two positional dates, "since" and "until".
parser = argparse.ArgumentParser(description="Process historical FAA data from git commits")
parser.add_argument("since", help="Start date (YYYY-MM-DD)")
parser.add_argument("until", help="End date (YYYY-MM-DD)")
args = parser.parse_args()
# List commits for pathspec "." (the current working directory), oldest -> newest,
# one "<sha> <committer date, short>" line per commit.
log = run("git", "log", "--reverse", "--format=%H %cs", "--", ".")
# If you want to restrict to only commits that touched the master parts, use:
# log = run("git", "log", "--reverse", "--format=%H %cs", "--", "MASTER-1.txt")
# Clone repository if it doesn't exist
# NOTE(review): no clone step is visible here; REPO is presumably expected to exist already.
REPO = Path("data/scrape-faa-releasable-aircraft")
# Output folder for the per-day working directories and the final CSV.
OUT_ROOT = Path("data/faa_releasable_historical")
OUT_ROOT.mkdir(parents=True, exist_ok=True)
def run_git_text(*args: str) -> str:
    """Run ``git -C REPO <args>`` and return its stdout as stripped text."""
    command = ["git", "-C", str(REPO)]
    command.extend(args)
    output = subprocess.check_output(command, text=True)
    return output.strip()
def run_git_bytes(*args: str) -> bytes:
    """Run ``git -C REPO <args>`` and return its raw stdout bytes, unmodified."""
    command = ["git", "-C", str(REPO)]
    command.extend(args)
    return subprocess.check_output(command)
# Parse dates and adjust --since to the day before
# (the lower bound passed to git is one day earlier than the requested start date).
since_date = datetime.strptime(args.since, "%Y-%m-%d")
adjusted_since = (since_date - timedelta(days=1)).strftime("%Y-%m-%d")
# All commits in specified date range (oldest -> newest)
# Each output line is "<full sha> <committer date as YYYY-MM-DD>".
log = run_git_text(
"log",
"--reverse",
"--format=%H %cs",
f"--since={adjusted_since}",
f"--until={args.until}",
)
# Drop blank lines from the git output.
lines = [ln for ln in log.splitlines() if ln.strip()]
if not lines:
# NOTE(review): two consecutive raises - these look like the removed and the
# added version of the same line; only the first one can execute as written.
raise SystemExit("No commits found.")
raise SystemExit(f"No commits found between {args.since} and {args.until}.")
# Map date -> last commit SHA on that date (Ordered by history)
# date -> last SHA that day
date_to_sha = OrderedDict()
for ln in lines:
sha, date = ln.split()
# keep last SHA per day
# (log is oldest -> newest, so later commits of a day overwrite earlier ones)
date_to_sha[date] = sha
# NOTE(review): out_root ("out_master_by_day") and OUT_ROOT are both used below;
# presumably one of them is a leftover from the previous version - confirm.
out_root = Path("out_master_by_day")
out_root.mkdir(exist_ok=True)
# Non-master files copied from each commit into the day directory.
OTHER_FILES = ["ACFTREF.txt", "DEALER.txt", "DOCINDEX.txt", "ENGINE.txt", "RESERVED.txt"]
# Matches the split master parts; group 1 is the numeric part index.
master_re = re.compile(r"^MASTER-(\d+)\.txt$")
# Accumulated result across all days, plus the first/last processed dates
# used in the output file name.
df_base = pd.DataFrame()
start_date = None
end_date = None
# Process one commit per day: rebuild that day's FAA files, zip them, convert
# the zip to a DataFrame and fold it into df_base.
# NOTE(review): this loop comes from a rendered diff; several statements appear
# twice in an old and a new form (run vs run_git_text, out_root vs OUT_ROOT,
# Master.txt vs MASTER.txt, continue vs raise) - confirm the current version.
for date, sha in date_to_sha.items():
# list files at this commit, filter MASTER-*.txt in repo root
names = run("git", "ls-tree", "--name-only", sha).splitlines()
# Remember the first processed date; end_date ends up as the last one.
if start_date is None:
start_date = date
end_date = date
day_dir = OUT_ROOT / date
day_dir.mkdir(parents=True, exist_ok=True)
# Write auxiliary files (assumed present)
for fname in OTHER_FILES:
(day_dir / fname).write_bytes(run_git_bytes("show", f"{sha}:{fname}"))
# Recombine MASTER parts
names = run_git_text("ls-tree", "--name-only", sha).splitlines()
# Collect (part number, file name) pairs so the parts sort numerically.
parts = []
for n in names:
m = master_re.match(n)
if m:
parts.append((int(m.group(1)), n))
parts.sort()
if not parts:
# no master parts in that commit/day; skip
continue
raise RuntimeError(f"{date} {sha[:7]}: no MASTER-*.txt parts found")
day_dir = out_root / date
day_dir.mkdir(parents=True, exist_ok=True)
out_path = day_dir / "Master.txt"
with out_path.open("wb") as w:
master_path = day_dir / "MASTER.txt"
with master_path.open("wb") as w:
for _, fname in parts:
data = subprocess.check_output(["git", "show", f"{sha}:{fname}"])
data = run_git_bytes("show", f"{sha}:{fname}")
w.write(data)
# Make sure each part ends with a newline so the next part starts on its own line.
if data and not data.endswith(b"\n"):
w.write(b"\n")
print(f"{date} {sha[:7]} -> {out_path} ({len(parts)} parts)")
# 3) Zip the day's files
# NOTE(review): zip_path lives inside day_dir and the archive is open while
# day_dir.iterdir() runs, so the archive appears to be added to itself - confirm
# and consider skipping zip_path in the loop.
zip_path = day_dir / f"ReleasableAircraft.zip"
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as z:
for p in day_dir.iterdir():
z.write(p, arcname=p.name)
print(f"\nDone. Output root: {out_root.resolve()}")
print(f"{date} {sha[:7]} -> {day_dir} (master parts: {len(parts)})")
# 4) Convert ZIP -> CSV
df_new = convert_faa_master_txt_to_df(zip_path, date)
# First processed day: it becomes the base as-is, without deduplication.
if df_base.empty:
df_base = df_new
print(len(df_base), "total entries so far")
# Delete all files in the day directory
shutil.rmtree(day_dir)
continue
# Later days: append and drop rows whose content is already present.
df_base = concat_faa_historical_df(df_base, df_new)
shutil.rmtree(day_dir)
print(len(df_base), "total entries so far")
# Sanity check on row order; note that `assert` is skipped when Python runs with -O.
assert df_base['download_date'].is_monotonic_increasing, "download_date is not monotonic increasing"
# File name carries the first and last processed commit dates.
df_base.to_csv(OUT_ROOT / f"planequery_aircraft_{start_date}_{end_date}.csv", index=False)
# TODO: get average number of new rows per day.
@@ -0,0 +1,144 @@
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, Optional
import re
import urllib.request
import urllib.error
import json
# GitHub repository ("owner/name") whose releases are queried by default.
REPO = "PlaneQuery/planequery-aircraft"
# GitHub REST API URL of the latest release of REPO.
LATEST_RELEASE_URL = f"https://api.github.com/repos/{REPO}/releases/latest"
@dataclass(frozen=True)
class ReleaseAsset:
    """Immutable description of one file attached to a GitHub release."""

    # File name of the asset as listed in the release.
    name: str
    # URL the asset is downloaded from.
    download_url: str
    # Size reported for the asset.
    size: int  # bytes
def _http_get_json(url: str, headers: dict[str, str]) -> dict:
    """GET ``url`` with ``headers`` and return the UTF-8 JSON body, parsed.

    The request times out after 120 seconds.
    """
    request = urllib.request.Request(url, headers=headers, method="GET")
    with urllib.request.urlopen(request, timeout=120) as response:
        raw = response.read()
    text = raw.decode("utf-8")
    return json.loads(text)
def get_latest_release_assets(repo: str = REPO, github_token: Optional[str] = None) -> list[ReleaseAsset]:
    """Return the assets attached to the latest GitHub release of ``repo``.

    Args:
        repo: Repository in "owner/name" form.
        github_token: When given, sent as a ``Bearer`` Authorization header.

    Returns:
        One ReleaseAsset per entry of the release's "assets" list (empty when
        the payload has none); a missing "size" is stored as 0.
    """
    headers = {
        "Accept": "application/vnd.github+json",
        "User-Agent": "planequery-aircraft-downloader/1.0",
    }
    if github_token:
        headers["Authorization"] = f"Bearer {github_token}"

    payload = _http_get_json(f"https://api.github.com/repos/{repo}/releases/latest", headers=headers)
    return [
        ReleaseAsset(
            name=entry["name"],
            download_url=entry["browser_download_url"],
            size=int(entry.get("size", 0)),
        )
        for entry in payload.get("assets", [])
    ]
def pick_asset(
    assets: Iterable[ReleaseAsset],
    *,
    exact_name: Optional[str] = None,
    name_regex: Optional[str] = None,
) -> ReleaseAsset:
    """Select exactly one asset, by exact name or by regular expression.

    ``exact_name`` takes precedence when both selectors are given.

    Raises:
        FileNotFoundError: nothing matches the selector.
        FileExistsError: ``name_regex`` matches more than one asset.
        ValueError: neither selector was provided.
    """
    assets = list(assets)

    if exact_name:
        # First asset whose name equals exact_name, if any.
        hit = next((a for a in assets if a.name == exact_name), None)
        if hit is None:
            raise FileNotFoundError(f"No asset exactly named {exact_name!r}. Available: {[a.name for a in assets]}")
        return hit

    if not name_regex:
        raise ValueError("Provide either exact_name=... or name_regex=...")

    rx = re.compile(name_regex)
    matches = [a for a in assets if rx.search(a.name)]
    if len(matches) == 1:
        return matches[0]
    if not matches:
        raise FileNotFoundError(f"No asset matched regex {name_regex!r}. Available: {[a.name for a in assets]}")
    raise FileExistsError(f"Regex {name_regex!r} matched multiple assets: {[m.name for m in matches]}")
def download_asset(asset: ReleaseAsset, out_path: Path, github_token: Optional[str] = None) -> Path:
    """Stream ``asset`` to ``out_path`` and return that path.

    Parent directories of ``out_path`` are created as needed. The body is
    copied in 1 MiB chunks with a 300 second timeout.

    Raises:
        RuntimeError: the server answered with an HTTP error; the message
            carries the status code and up to 500 characters of the body.
    """
    out_path = Path(out_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    headers = {
        "User-Agent": "planequery-aircraft-downloader/1.0",
        "Accept": "application/octet-stream",
    }
    if github_token:
        headers["Authorization"] = f"Bearer {github_token}"

    req = urllib.request.Request(asset.download_url, headers=headers, method="GET")
    chunk_size = 1024 * 1024  # 1 MiB
    try:
        with urllib.request.urlopen(req, timeout=300) as resp, out_path.open("wb") as f:
            # Stream download: read until the response yields an empty chunk.
            for chunk in iter(lambda: resp.read(chunk_size), b""):
                f.write(chunk)
    except urllib.error.HTTPError as e:
        body = e.read().decode("utf-8", errors="replace") if hasattr(e, "read") else ""
        raise RuntimeError(f"HTTPError {e.code} downloading {asset.name}: {body[:500]}") from e
    return out_path
def download_latest_aircraft_csv(
    output_dir: Path = Path("downloads"),
    github_token: Optional[str] = None,
    repo: str = REPO,
) -> Path:
    """
    Fetch the planequery_aircraft_*.csv asset of the latest GitHub release.

    Exactly one asset of the release must match that name pattern.

    Args:
        output_dir: Directory to save the downloaded file (default: "downloads")
        github_token: Optional GitHub token for authentication
        repo: GitHub repository in format "owner/repo" (default: REPO)

    Returns:
        Path to the downloaded file
    """
    release_assets = get_latest_release_assets(repo, github_token=github_token)
    asset = pick_asset(release_assets, name_regex=r"^planequery_aircraft_.*\.csv$")
    target = output_dir / asset.name
    saved_to = download_asset(asset, target, github_token=github_token)
    print(f"Downloaded: {asset.name} ({asset.size} bytes) -> {saved_to}")
    return saved_to
def get_latest_aircraft_csv_df():
    """Download the latest release CSV and load it as a DataFrame.

    Returns:
        A ``(df, date_str)`` tuple: the CSV content with NaN replaced by empty
        strings, and the first date found in the file name
        (``planequery_aircraft_{date}_...``).

    Raises:
        ValueError: the file name does not contain such a date.
    """
    csv_path = download_latest_aircraft_csv()
    import pandas as pd

    # Identifier-like columns are read as text rather than parsed as numbers.
    text_columns = {
        'transponder_code': str,
        'unique_regulatory_id': str,
        'registrant_county': str,
    }
    df = pd.read_csv(csv_path, dtype=text_columns).fillna("")

    # Extract date from filename pattern: planequery_aircraft_{date}_{date}.csv
    match = re.search(r"planequery_aircraft_(\d{4}-\d{2}-\d{2})_", str(csv_path))
    if match is None:
        raise ValueError(f"Could not extract date from filename: {csv_path.name}")
    return df, match.group(1)
if __name__ == "__main__":
    # Run as a script: download the latest release CSV into the default folder.
    download_latest_aircraft_csv()
-48
View File
@@ -1,48 +0,0 @@
from faa_aircraft_registry import read
import pandas as pd
import zipfile
import zipfile
from pathlib import Path
from datetime import datetime, timezone
# Today's date in UTC names both the downloaded zip and the derived CSV.
date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")

out_dir = Path("data/faa_releasable")
out_dir.mkdir(parents=True, exist_ok=True)
zip_name = f"ReleasableAircraft_{date_str}.zip"
csv_name = f"ReleasableAircraft_{date_str}.csv"
zip_path = out_dir / zip_name
csv_path = out_dir / csv_name

# URL and paths
url = "https://registry.faa.gov/database/ReleasableAircraft.zip"
from urllib.request import Request, urlopen

req = Request(url, headers={"User-Agent": "Mozilla/5.0"}, method="GET")
with urlopen(req, timeout=120) as r:
    body = r.read()
zip_path.write_bytes(body)

# Parse the registry archive and build a table from its 'master' records.
with zipfile.ZipFile(zip_path) as z:
    registrations = read(z)
    df = pd.DataFrame(registrations['master'].values())

# Bring transponder_code_hex to the front and expose it as "icao".
col = "transponder_code_hex"
other_columns = [c for c in df.columns if c != col]
df = df[[col] + other_columns]
df = df.rename(columns={"transponder_code_hex": "icao"})

# Expand the nested registrant records into registrant_* columns.
registrant = pd.json_normalize(df["registrant"]).add_prefix("registrant_")
df = df.drop(columns="registrant").join(registrant)

# Rename the top-level columns first so the expanded aircraft_* / engine_*
# columns cannot collide with them.
df = df.rename(columns={"aircraft_type": "aircraft_type_2"})
aircraft = pd.json_normalize(df["aircraft"]).add_prefix("aircraft_")
df = df.drop(columns="aircraft").join(aircraft)

df = df.rename(columns={"engine_type": "engine_type_2"})
engine_records = df["engine"].where(df["engine"].notna(), {})
engine = pd.json_normalize(engine_records).add_prefix("engine_")
df = df.drop(columns="engine").join(engine)

df = df.sort_values(by=["icao"])
df.to_csv(csv_path, index=False)