fix: handle missing aircraft and engine data in conversion process

feat: add combine_historical_faa.py to process historical FAA data
This commit is contained in:
ggman12
2026-02-01 14:44:27 -05:00
parent 8368bfcbc9
commit efe63743ab
3 changed files with 63 additions and 37 deletions
+14
View File
@@ -0,0 +1,14 @@
#unique_regulatory_id
# 1. read historical data and write output
# 2. read sequentially
# Instead of reading all csvs I can read just the latest release csv to get everything.
from pathlib import Path

# Root containing one sub-directory per historical FAA release day.
hist_root = Path("data/faa_releasable_historical")

# NOTE(review): master_txt_to_releasable_csv is not imported in this snippet —
# confirm it is defined/imported elsewhere in the project.
for daily_dir in sorted(hist_root.glob("2024-02-*")):
    master_txt = daily_dir / "Master.txt"
    if not master_txt.exists():
        continue
    # Convert the day's recombined Master.txt into a releasable CSV.
    csv_path = master_txt_to_releasable_csv(master_txt, out_dir="data/faa_releasable_historical_csv")
    print(daily_dir.name, "->", csv_path)
+1 -1
View File
@@ -14,7 +14,7 @@ def convert_faa_master_txt_to_csv(zip_path: Path, csv_path: Path) -> None:
registrant = pd.json_normalize(df["registrant"]).add_prefix("registrant_")
df = df.drop(columns="registrant").join(registrant)
df = df.rename(columns={"aircraft_type": "aircraft_type_2"})
aircraft = pd.json_normalize(df["aircraft"]).add_prefix("aircraft_")
aircraft = pd.json_normalize(df["aircraft"].where(df["aircraft"].notna(), {})).add_prefix("aircraft_")
df = df.drop(columns="aircraft").join(aircraft)
df = df.rename(columns={"engine_type": "engine_type_2"})
engine = pd.json_normalize(df["engine"].where(df["engine"].notna(), {})).add_prefix("engine_")
+48 -36
View File
@@ -1,75 +1,87 @@
'''Generated with ChatGPT 5.2 prompt
scrape-faa-releasable-aircraft
Every day it creates a new commit that takes ReleasableAircraft zip from FAA takes Master.txt to make these files (it does this so that all files stay under 100mb). For every commit day I want to recombine all the files into one Master.txt again. It has data/commits since 2023.
scrape-faa-releasable-aircraft % ls
ACFTREF.txt DOCINDEX.txt MASTER-1.txt MASTER-3.txt MASTER-5.txt MASTER-7.txt MASTER-9.txt RESERVED.txt
DEALER.txt ENGINE.txt MASTER-2.txt MASTER-4.txt MASTER-6.txt MASTER-8.txt README.md ardata.pdf
'''
"""
For each commit-day in Feb 2024 (last commit per day):
- Write ALL FAA text files from that commit into: data/faa_releasable_historical/YYYY-MM-DD/
ACFTREF.txt, DEALER.txt, DOCINDEX.txt, ENGINE.txt, RESERVED.txt
- Recombine MASTER-*.txt into Master.txt
- Produce Master.csv via convert_faa_master_txt_to_csv
Assumes the non-master files are present in every commit.
"""
import subprocess, re
from pathlib import Path
from collections import OrderedDict
from derive_from_faa_master_txt import convert_faa_master_txt_to_csv
import zipfile
REPO = "/Users/jonahgoode/Documents/PlaneQuery/Other-Code/scrape-faa-releasable-aircraft"
OUT_ROOT = Path("data/faa_releasable_historical")
OUT_ROOT.mkdir(parents=True, exist_ok=True)
def run_git(*args: str, text: bool = True) -> str:
return subprocess.check_output(
["git", "-C", REPO, *args],
text=text
).strip()
def run_git_text(*args: str) -> str:
return subprocess.check_output(["git", "-C", REPO, *args], text=True).strip()
# Commits (oldest -> newest), restricted to master parts
log = run_git(
def run_git_bytes(*args: str) -> bytes:
return subprocess.check_output(["git", "-C", REPO, *args])
# All commits in Feb 2024 (oldest -> newest)
log = run_git_text(
"log",
"--reverse",
"--format=%H %cs",
"--",
"MASTER-1.txt"
"--since=2024-02-01",
"--until=2024-02-29",
)
lines = [ln for ln in log.splitlines() if ln.strip()]
if not lines:
raise SystemExit("No commits found.")
raise SystemExit("No commits found in February 2024.")
# Map date -> last commit SHA on that date (only Feb 2024)
# date -> last SHA that day
date_to_sha = OrderedDict()
for ln in lines:
sha, date = ln.split()
if date.startswith("2024-02"):
date_to_sha[date] = sha
if not date_to_sha:
raise SystemExit("No February 2024 commit-days found.")
date_to_sha[date] = sha
OTHER_FILES = ["ACFTREF.txt", "DEALER.txt", "DOCINDEX.txt", "ENGINE.txt", "RESERVED.txt"]
master_re = re.compile(r"^MASTER-(\d+)\.txt$")
for date, sha in date_to_sha.items():
names = run_git("ls-tree", "--name-only", sha).splitlines()
day_dir = OUT_ROOT / date
day_dir.mkdir(parents=True, exist_ok=True)
# Write auxiliary files (assumed present)
for fname in OTHER_FILES:
(day_dir / fname).write_bytes(run_git_bytes("show", f"{sha}:{fname}"))
# Recombine MASTER parts
names = run_git_text("ls-tree", "--name-only", sha).splitlines()
parts = []
for n in names:
m = master_re.match(n)
if m:
parts.append((int(m.group(1)), n))
parts.sort()
if not parts:
continue
raise RuntimeError(f"{date} {sha[:7]}: no MASTER-*.txt parts found")
day_dir = OUT_ROOT / date
day_dir.mkdir(parents=True, exist_ok=True)
out_path = day_dir / "Master.txt"
with out_path.open("wb") as w:
master_path = day_dir / "MASTER.txt"
with master_path.open("wb") as w:
for _, fname in parts:
data = subprocess.check_output(
["git", "-C", REPO, "show", f"{sha}:{fname}"]
)
data = run_git_bytes("show", f"{sha}:{fname}")
w.write(data)
if data and not data.endswith(b"\n"):
w.write(b"\n")
print(f"{date} {sha[:7]} -> {out_path} ({len(parts)} parts)")
# 3) Zip the day's files
zip_path = day_dir / f"ReleasableAircraft.zip"
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as z:
for p in day_dir.iterdir():
z.write(p, arcname=p.name)
# 4) Convert ZIP -> CSV
out_csv = day_dir / f"ReleasableAircraft_{date}.csv"
convert_faa_master_txt_to_csv(zip_path, out_csv)
print(f"{date} {sha[:7]} -> {day_dir} (master parts: {len(parts)})")
print(f"\nDone. Output root: {OUT_ROOT.resolve()}")