From efe63743ab3da84b0435363d73834ba0acb249b5 Mon Sep 17 00:00:00 2001
From: ggman12
Date: Sun, 1 Feb 2026 14:44:27 -0500
Subject: [PATCH] fix: handle missing aircraft and engine data in conversion
 process feat: add combine_historical_faa.py to process historical FAA data

---
 src/combine_historical_faa.py     | 20 ++++++++
 src/derive_from_faa_master_txt.py |  2 +-
 src/get_historical_faa.py         | 84 +++++++++++++++++++-------------
 3 files changed, 69 insertions(+), 37 deletions(-)
 create mode 100644 src/combine_historical_faa.py

diff --git a/src/combine_historical_faa.py b/src/combine_historical_faa.py
new file mode 100644
index 0000000..c66fec3
--- /dev/null
+++ b/src/combine_historical_faa.py
@@ -0,0 +1,20 @@
+# unique_regulatory_id
+# 1. Read each day directory produced by get_historical_faa.py.
+# 2. Convert its recombined ReleasableAircraft.zip into a CSV.
+#
+# Instead of reading all CSVs I can read just the latest release CSV to get everything.
+
+from pathlib import Path
+
+from derive_from_faa_master_txt import convert_faa_master_txt_to_csv
+
+base = Path("data/faa_releasable_historical")
+out_dir = Path("data/faa_releasable_historical_csv")
+out_dir.mkdir(parents=True, exist_ok=True)
+
+for day_dir in sorted(base.glob("2024-02-*")):
+    zip_path = day_dir / "ReleasableAircraft.zip"
+    if zip_path.exists():
+        out_csv = out_dir / f"ReleasableAircraft_{day_dir.name}.csv"
+        convert_faa_master_txt_to_csv(zip_path, out_csv)
+        print(day_dir.name, "->", out_csv)
diff --git a/src/derive_from_faa_master_txt.py b/src/derive_from_faa_master_txt.py
index c3a02fc..b1464af 100644
--- a/src/derive_from_faa_master_txt.py
+++ b/src/derive_from_faa_master_txt.py
@@ -14,7 +14,7 @@ def convert_faa_master_txt_to_csv(zip_path: Path, csv_path: Path) -> None:
     registrant = pd.json_normalize(df["registrant"]).add_prefix("registrant_")
     df = df.drop(columns="registrant").join(registrant)
     df = df.rename(columns={"aircraft_type": "aircraft_type_2"})
-    aircraft = pd.json_normalize(df["aircraft"]).add_prefix("aircraft_")
+    aircraft = pd.json_normalize(df["aircraft"].where(df["aircraft"].notna(), {})).add_prefix("aircraft_")
     df = df.drop(columns="aircraft").join(aircraft)
     df = df.rename(columns={"engine_type": "engine_type_2"})
     engine = pd.json_normalize(df["engine"].where(df["engine"].notna(), {})).add_prefix("engine_")
diff --git a/src/get_historical_faa.py b/src/get_historical_faa.py
index 2ed7043..064d22a 100644
--- a/src/get_historical_faa.py
+++ b/src/get_historical_faa.py
@@ -1,74 +1,86 @@
-'''Generated with ChatGPT 5.2 prompt
-scrape-faa-releasable-aircraft
-Every day it creates a new commit that takes ReleasableAircraft zip from FAA takes Master.txt to make these files (it does this so that all files stay under 100mb). For every commit day I want to recombine all the files into one Master.txt again. It has data/commits since 2023.
-scrape-faa-releasable-aircraft % ls
-ACFTREF.txt DOCINDEX.txt MASTER-1.txt MASTER-3.txt MASTER-5.txt MASTER-7.txt MASTER-9.txt RESERVED.txt
-DEALER.txt ENGINE.txt MASTER-2.txt MASTER-4.txt MASTER-6.txt MASTER-8.txt README.md ardata.pdf
-'''
+"""
+For each commit-day in Feb 2024 (last commit per day):
+- Write ALL FAA text files from that commit into: data/faa_releasable_historical/YYYY-MM-DD/
+  ACFTREF.txt, DEALER.txt, DOCINDEX.txt, ENGINE.txt, RESERVED.txt
+- Recombine MASTER-*.txt into MASTER.txt
+- Zip the day's files and produce a per-day CSV via convert_faa_master_txt_to_csv
+
+Assumes the non-master files are present in every commit.
+"""
 import subprocess, re
 from pathlib import Path
 from collections import OrderedDict
+from derive_from_faa_master_txt import convert_faa_master_txt_to_csv
+import zipfile
 
 REPO = "/Users/jonahgoode/Documents/PlaneQuery/Other-Code/scrape-faa-releasable-aircraft"
 OUT_ROOT = Path("data/faa_releasable_historical")
 OUT_ROOT.mkdir(parents=True, exist_ok=True)
 
-def run_git(*args: str, text: bool = True) -> str:
-    return subprocess.check_output(
-        ["git", "-C", REPO, *args],
-        text=text
-    ).strip()
+def run_git_text(*args: str) -> str:
+    return subprocess.check_output(["git", "-C", REPO, *args], text=True).strip()
 
-# Commits (oldest -> newest), restricted to master parts
-log = run_git(
+def run_git_bytes(*args: str) -> bytes:
+    return subprocess.check_output(["git", "-C", REPO, *args])
+
+# All commits in Feb 2024 (oldest -> newest)
+log = run_git_text(
     "log",
     "--reverse",
     "--format=%H %cs",
-    "--",
-    "MASTER-1.txt"
+    "--since=2024-02-01",
+    "--until=2024-03-01",
 )
-
 lines = [ln for ln in log.splitlines() if ln.strip()]
 if not lines:
-    raise SystemExit("No commits found.")
+    raise SystemExit("No commits found in February 2024.")
 
-# Map date -> last commit SHA on that date (only Feb 2024)
+# date -> last SHA that day
 date_to_sha = OrderedDict()
 for ln in lines:
     sha, date = ln.split()
-    if date.startswith("2024-02"):
-        date_to_sha[date] = sha
-
-if not date_to_sha:
-    raise SystemExit("No February 2024 commit-days found.")
+    date_to_sha[date] = sha
 
+OTHER_FILES = ["ACFTREF.txt", "DEALER.txt", "DOCINDEX.txt", "ENGINE.txt", "RESERVED.txt"]
 master_re = re.compile(r"^MASTER-(\d+)\.txt$")
 
 for date, sha in date_to_sha.items():
-    names = run_git("ls-tree", "--name-only", sha).splitlines()
+    day_dir = OUT_ROOT / date
+    day_dir.mkdir(parents=True, exist_ok=True)
 
+    # Write auxiliary files (assumed present)
+    for fname in OTHER_FILES:
+        (day_dir / fname).write_bytes(run_git_bytes("show", f"{sha}:{fname}"))
+
+    # Recombine MASTER parts
+    names = run_git_text("ls-tree", "--name-only", sha).splitlines()
     parts = []
     for n in names:
         m = master_re.match(n)
         if m:
             parts.append((int(m.group(1)), n))
     parts.sort()
-    if not parts:
-        continue
+    if not parts:
+        raise RuntimeError(f"{date} {sha[:7]}: no MASTER-*.txt parts found")
 
-    day_dir = OUT_ROOT / date
-    day_dir.mkdir(parents=True, exist_ok=True)
-    out_path = day_dir / "Master.txt"
-
-    with out_path.open("wb") as w:
+    master_path = day_dir / "MASTER.txt"
+    with master_path.open("wb") as w:
         for _, fname in parts:
-            data = subprocess.check_output(
-                ["git", "-C", REPO, "show", f"{sha}:{fname}"]
-            )
+            data = run_git_bytes("show", f"{sha}:{fname}")
             w.write(data)
             if data and not data.endswith(b"\n"):
                 w.write(b"\n")
 
-    print(f"{date} {sha[:7]} -> {out_path} ({len(parts)} parts)")
+    # Zip the day's text files; glob *.txt so the zip/CSV outputs are never zipped into themselves
+    zip_path = day_dir / "ReleasableAircraft.zip"
+    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as z:
+        for p in sorted(day_dir.glob("*.txt")):
+            z.write(p, arcname=p.name)
+
+    # Convert ZIP -> CSV
+    out_csv = day_dir / f"ReleasableAircraft_{date}.csv"
+    convert_faa_master_txt_to_csv(zip_path, out_csv)
+
+    print(f"{date} {sha[:7]} -> {day_dir} (master parts: {len(parts)})")
 
 print(f"\nDone. Output root: {OUT_ROOT.resolve()}")