From efe63743ab3da84b0435363d73834ba0acb249b5 Mon Sep 17 00:00:00 2001
From: ggman12
Date: Sun, 1 Feb 2026 14:44:27 -0500
Subject: [PATCH] fix: handle missing aircraft and engine data in conversion
 process feat: add combine_historical_faa.py to process historical FAA data

---
 src/combine_historical_faa.py     | 20 ++++++++
 src/derive_from_faa_master_txt.py |  2 +-
 src/get_historical_faa.py         | 84 +++++++++++++++++++-------------
 3 files changed, 69 insertions(+), 37 deletions(-)
 create mode 100644 src/combine_historical_faa.py

diff --git a/src/combine_historical_faa.py b/src/combine_historical_faa.py
new file mode 100644
index 0000000..c66fec3
--- /dev/null
+++ b/src/combine_historical_faa.py
@@ -0,0 +1,20 @@
+# unique_regulatory_id
+# 1. Read each day directory produced by get_historical_faa.py.
+# 2. Convert its recombined ReleasableAircraft.zip into a CSV.
+#
+# Instead of reading all CSVs I can read just the latest release CSV to get everything.
+
+from pathlib import Path
+
+from derive_from_faa_master_txt import convert_faa_master_txt_to_csv
+
+base = Path("data/faa_releasable_historical")
+out_dir = Path("data/faa_releasable_historical_csv")
+out_dir.mkdir(parents=True, exist_ok=True)
+
+for day_dir in sorted(base.glob("2024-02-*")):
+    zip_path = day_dir / "ReleasableAircraft.zip"
+    if zip_path.exists():
+        out_csv = out_dir / f"ReleasableAircraft_{day_dir.name}.csv"
+        convert_faa_master_txt_to_csv(zip_path, out_csv)
+        print(day_dir.name, "->", out_csv)
diff --git a/src/derive_from_faa_master_txt.py b/src/derive_from_faa_master_txt.py
index c3a02fc..b1464af 100644
--- a/src/derive_from_faa_master_txt.py
+++ b/src/derive_from_faa_master_txt.py
@@ -14,7 +14,7 @@ def convert_faa_master_txt_to_csv(zip_path: Path, csv_path: Path) -> None:
     registrant = pd.json_normalize(df["registrant"]).add_prefix("registrant_")
     df = df.drop(columns="registrant").join(registrant)
     df = df.rename(columns={"aircraft_type": "aircraft_type_2"})
-    aircraft = pd.json_normalize(df["aircraft"]).add_prefix("aircraft_")
+    aircraft = pd.json_normalize(df["aircraft"].where(df["aircraft"].notna(), {})).add_prefix("aircraft_")
     df = df.drop(columns="aircraft").join(aircraft)
     df = df.rename(columns={"engine_type": "engine_type_2"})
     engine = pd.json_normalize(df["engine"].where(df["engine"].notna(), {})).add_prefix("engine_")
diff --git a/src/get_historical_faa.py b/src/get_historical_faa.py
index 2ed7043..064d22a 100644
--- a/src/get_historical_faa.py
+++ b/src/get_historical_faa.py
@@ -1,74 +1,86 @@
-'''Generated with ChatGPT 5.2 prompt
-scrape-faa-releasable-aircraft
-Every day it creates a new commit that takes ReleasableAircraft zip from FAA takes Master.txt to make these files (it does this so that all files stay under 100mb). For every commit day I want to recombine all the files into one Master.txt again. It has data/commits since 2023.
-scrape-faa-releasable-aircraft % ls
-ACFTREF.txt DOCINDEX.txt MASTER-1.txt MASTER-3.txt MASTER-5.txt MASTER-7.txt MASTER-9.txt RESERVED.txt
-DEALER.txt ENGINE.txt MASTER-2.txt MASTER-4.txt MASTER-6.txt MASTER-8.txt README.md ardata.pdf
-'''
+"""
+For each commit-day in Feb 2024 (last commit per day):
+- Write ALL FAA text files from that commit into: data/faa_releasable_historical/YYYY-MM-DD/
+  ACFTREF.txt, DEALER.txt, DOCINDEX.txt, ENGINE.txt, RESERVED.txt
+- Recombine MASTER-*.txt into MASTER.txt
+- Zip the day's files and produce a per-day CSV via convert_faa_master_txt_to_csv
+
+Assumes the non-master files are present in every commit.
+"""
 import subprocess, re
 from pathlib import Path
 from collections import OrderedDict
+from derive_from_faa_master_txt import convert_faa_master_txt_to_csv
+import zipfile
 
 REPO = "/Users/jonahgoode/Documents/PlaneQuery/Other-Code/scrape-faa-releasable-aircraft"
 OUT_ROOT = Path("data/faa_releasable_historical")
 OUT_ROOT.mkdir(parents=True, exist_ok=True)
 
-def run_git(*args: str, text: bool = True) -> str:
-    return subprocess.check_output(
-        ["git", "-C", REPO, *args],
-        text=text
-    ).strip()
+def run_git_text(*args: str) -> str:
+    return subprocess.check_output(["git", "-C", REPO, *args], text=True).strip()
 
-# Commits (oldest -> newest), restricted to master parts
-log = run_git(
+def run_git_bytes(*args: str) -> bytes:
+    return subprocess.check_output(["git", "-C", REPO, *args])
+
+# All commits in Feb 2024 (oldest -> newest)
+log = run_git_text(
     "log",
     "--reverse",
     "--format=%H %cs",
-    "--",
-    "MASTER-1.txt"
+    "--since=2024-02-01",
+    "--until=2024-03-01",
 )
-
 lines = [ln for ln in log.splitlines() if ln.strip()]
 if not lines:
-    raise SystemExit("No commits found.")
+    raise SystemExit("No commits found in February 2024.")
 
-# Map date -> last commit SHA on that date (only Feb 2024)
+# date -> last SHA that day
 date_to_sha = OrderedDict()
 for ln in lines:
     sha, date = ln.split()
-    if date.startswith("2024-02"):
-        date_to_sha[date] = sha
-
-if not date_to_sha:
-    raise SystemExit("No February 2024 commit-days found.")
+    date_to_sha[date] = sha
 
+OTHER_FILES = ["ACFTREF.txt", "DEALER.txt", "DOCINDEX.txt", "ENGINE.txt", "RESERVED.txt"]
 master_re = re.compile(r"^MASTER-(\d+)\.txt$")
 
 for date, sha in date_to_sha.items():
-    names = run_git("ls-tree", "--name-only", sha).splitlines()
+    day_dir = OUT_ROOT / date
+    day_dir.mkdir(parents=True, exist_ok=True)
 
+    # Write auxiliary files (assumed present)
+    for fname in OTHER_FILES:
+        (day_dir / fname).write_bytes(run_git_bytes("show", f"{sha}:{fname}"))
+
+    # Recombine MASTER parts
+    names = run_git_text("ls-tree", "--name-only", sha).splitlines()
     parts = []
     for n in names:
         m = master_re.match(n)
         if m:
             parts.append((int(m.group(1)), n))
     parts.sort()
-    if not parts:
-        continue
+    if not parts:
+        raise RuntimeError(f"{date} {sha[:7]}: no MASTER-*.txt parts found")
 
-    day_dir = OUT_ROOT / date
-    day_dir.mkdir(parents=True, exist_ok=True)
-    out_path = day_dir / "Master.txt"
-
-    with out_path.open("wb") as w:
+    master_path = day_dir / "MASTER.txt"
+    with master_path.open("wb") as w:
         for _, fname in parts:
-            data = subprocess.check_output(
-                ["git", "-C", REPO, "show", f"{sha}:{fname}"]
-            )
+            data = run_git_bytes("show", f"{sha}:{fname}")
             w.write(data)
             if data and not data.endswith(b"\n"):
                 w.write(b"\n")
 
-    print(f"{date} {sha[:7]} -> {out_path} ({len(parts)} parts)")
+    # Zip the day's text files; glob *.txt so the zip/CSV outputs are never zipped into themselves
+    zip_path = day_dir / "ReleasableAircraft.zip"
+    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as z:
+        for p in sorted(day_dir.glob("*.txt")):
+            z.write(p, arcname=p.name)
+
+    # Convert ZIP -> CSV
+    out_csv = day_dir / f"ReleasableAircraft_{date}.csv"
+    convert_faa_master_txt_to_csv(zip_path, out_csv)
+
+    print(f"{date} {sha[:7]} -> {day_dir} (master parts: {len(parts)})")
 
 print(f"\nDone. Output root: {OUT_ROOT.resolve()}")