mirror of
https://github.com/PlaneQuery/OpenAirframes.git
synced 2026-04-23 19:46:09 +02:00
fix: handle missing aircraft and engine data in conversion process
feat: add combine_historical_faa.py to process historical FAA data
This commit is contained in:
@@ -0,0 +1,14 @@
|
||||
#unique_regulatory_id
|
||||
# 1. read historical and output
|
||||
# 2. read sequentially
|
||||
|
||||
# Instead of reading all csvs I can read just the latest release csv to get everything.
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
# Walk the per-day output directories for February 2024 (sorted by path) and
# convert each day's Master.txt into a CSV.
base = Path("data/faa_releasable_historical")

for day_dir in sorted(base.glob("2024-02-*")):
    master = day_dir / "Master.txt"
    if not master.exists():
        # No Master.txt for this day: nothing to convert.
        continue
    # NOTE(review): master_txt_to_releasable_csv is not defined or imported in
    # the visible part of this file — confirm where it comes from.
    out_csv = master_txt_to_releasable_csv(master, out_dir="data/faa_releasable_historical_csv")
    print(day_dir.name, "->", out_csv)
|
||||
@@ -14,7 +14,7 @@ def convert_faa_master_txt_to_csv(zip_path: Path, csv_path: Path) -> None:
|
||||
registrant = pd.json_normalize(df["registrant"]).add_prefix("registrant_")
|
||||
df = df.drop(columns="registrant").join(registrant)
|
||||
df = df.rename(columns={"aircraft_type": "aircraft_type_2"})
|
||||
aircraft = pd.json_normalize(df["aircraft"]).add_prefix("aircraft_")
|
||||
aircraft = pd.json_normalize(df["aircraft"].where(df["aircraft"].notna(), {})).add_prefix("aircraft_")
|
||||
df = df.drop(columns="aircraft").join(aircraft)
|
||||
df = df.rename(columns={"engine_type": "engine_type_2"})
|
||||
engine = pd.json_normalize(df["engine"].where(df["engine"].notna(), {})).add_prefix("engine_")
|
||||
|
||||
+48
-36
@@ -1,75 +1,87 @@
|
||||
'''Generated with ChatGPT 5.2 prompt
|
||||
scrape-faa-releasable-aircraft
|
||||
Every day it creates a new commit that takes ReleasableAircraft zip from FAA takes Master.txt to make these files (it does this so that all files stay under 100mb). For every commit day I want to recombine all the files into one Master.txt again. It has data/commits since 2023.
|
||||
scrape-faa-releasable-aircraft % ls
|
||||
ACFTREF.txt DOCINDEX.txt MASTER-1.txt MASTER-3.txt MASTER-5.txt MASTER-7.txt MASTER-9.txt RESERVED.txt
|
||||
DEALER.txt ENGINE.txt MASTER-2.txt MASTER-4.txt MASTER-6.txt MASTER-8.txt README.md ardata.pdf
|
||||
'''
|
||||
"""
|
||||
For each commit-day in Feb 2024 (last commit per day):
|
||||
- Write ALL FAA text files from that commit into: data/faa_releasable_historical/YYYY-MM-DD/
|
||||
ACFTREF.txt, DEALER.txt, DOCINDEX.txt, ENGINE.txt, RESERVED.txt
|
||||
- Recombine MASTER-*.txt (in numeric part order) into MASTER.txt
|
||||
- Zip the day's files into ReleasableAircraft.zip and produce ReleasableAircraft_YYYY-MM-DD.csv via convert_faa_master_txt_to_csv
|
||||
|
||||
Assumes the non-master files are present in every commit.
|
||||
"""
|
||||
import subprocess, re
|
||||
from pathlib import Path
|
||||
from collections import OrderedDict
|
||||
from derive_from_faa_master_txt import convert_faa_master_txt_to_csv
|
||||
import zipfile
|
||||
|
||||
REPO = "/Users/jonahgoode/Documents/PlaneQuery/Other-Code/scrape-faa-releasable-aircraft"
|
||||
OUT_ROOT = Path("data/faa_releasable_historical")
|
||||
OUT_ROOT.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def run_git(*args: str, text: bool = True) -> str:
    """Run ``git -C REPO <args>`` and return its stripped standard output.

    Args:
        *args: Arguments appended after ``git -C REPO``.
        text: Forwarded to ``subprocess.check_output``. When true the output
            is decoded to ``str``; when false the output is ``bytes`` (still
            stripped), despite the ``str`` return annotation.

    Raises:
        subprocess.CalledProcessError: If git exits with a non-zero status.
    """
    command = ["git", "-C", REPO, *args]
    output = subprocess.check_output(command, text=text)
    # Strip leading/trailing whitespace, including git's trailing newline.
    return output.strip()
|
||||
def run_git_text(*args: str) -> str:
    """Run ``git -C REPO <args>`` and return its output as stripped text.

    Args:
        *args: Arguments appended after ``git -C REPO``.

    Returns:
        The command's decoded standard output with leading and trailing
        whitespace removed.

    Raises:
        subprocess.CalledProcessError: If git exits with a non-zero status.
    """
    command = ["git", "-C", REPO, *args]
    stdout = subprocess.check_output(command, text=True)
    return stdout.strip()
|
||||
|
||||
# Commits (oldest -> newest), restricted to master parts
|
||||
log = run_git(
|
||||
def run_git_bytes(*args: str) -> bytes:
    """Run ``git -C REPO <args>`` and return its raw standard output.

    The bytes are returned exactly as git emitted them (no decoding, no
    stripping); the script uses this for ``git show <sha>:<file>`` so file
    contents are written out unchanged.

    Args:
        *args: Arguments appended after ``git -C REPO``.

    Raises:
        subprocess.CalledProcessError: If git exits with a non-zero status.
    """
    command = ["git", "-C", REPO, *args]
    return subprocess.check_output(command)
|
||||
|
||||
# All commits in Feb 2024 (oldest -> newest)
|
||||
log = run_git_text(
|
||||
"log",
|
||||
"--reverse",
|
||||
"--format=%H %cs",
|
||||
"--",
|
||||
"MASTER-1.txt"
|
||||
"--since=2024-02-01",
|
||||
"--until=2024-02-29",
|
||||
)
|
||||
|
||||
lines = [ln for ln in log.splitlines() if ln.strip()]
|
||||
if not lines:
|
||||
raise SystemExit("No commits found.")
|
||||
raise SystemExit("No commits found in February 2024.")
|
||||
|
||||
# Map date -> last commit SHA on that date (only Feb 2024)
|
||||
# date -> last SHA that day
|
||||
date_to_sha = OrderedDict()
|
||||
for ln in lines:
|
||||
sha, date = ln.split()
|
||||
if date.startswith("2024-02"):
|
||||
date_to_sha[date] = sha
|
||||
|
||||
if not date_to_sha:
|
||||
raise SystemExit("No February 2024 commit-days found.")
|
||||
date_to_sha[date] = sha
|
||||
|
||||
OTHER_FILES = ["ACFTREF.txt", "DEALER.txt", "DOCINDEX.txt", "ENGINE.txt", "RESERVED.txt"]
|
||||
master_re = re.compile(r"^MASTER-(\d+)\.txt$")
|
||||
|
||||
for date, sha in date_to_sha.items():
|
||||
names = run_git("ls-tree", "--name-only", sha).splitlines()
|
||||
day_dir = OUT_ROOT / date
|
||||
day_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Write auxiliary files (assumed present)
|
||||
for fname in OTHER_FILES:
|
||||
(day_dir / fname).write_bytes(run_git_bytes("show", f"{sha}:{fname}"))
|
||||
|
||||
# Recombine MASTER parts
|
||||
names = run_git_text("ls-tree", "--name-only", sha).splitlines()
|
||||
parts = []
|
||||
for n in names:
|
||||
m = master_re.match(n)
|
||||
if m:
|
||||
parts.append((int(m.group(1)), n))
|
||||
parts.sort()
|
||||
|
||||
if not parts:
|
||||
continue
|
||||
raise RuntimeError(f"{date} {sha[:7]}: no MASTER-*.txt parts found")
|
||||
|
||||
day_dir = OUT_ROOT / date
|
||||
day_dir.mkdir(parents=True, exist_ok=True)
|
||||
out_path = day_dir / "Master.txt"
|
||||
|
||||
with out_path.open("wb") as w:
|
||||
master_path = day_dir / "MASTER.txt"
|
||||
with master_path.open("wb") as w:
|
||||
for _, fname in parts:
|
||||
data = subprocess.check_output(
|
||||
["git", "-C", REPO, "show", f"{sha}:{fname}"]
|
||||
)
|
||||
data = run_git_bytes("show", f"{sha}:{fname}")
|
||||
w.write(data)
|
||||
if data and not data.endswith(b"\n"):
|
||||
w.write(b"\n")
|
||||
|
||||
print(f"{date} {sha[:7]} -> {out_path} ({len(parts)} parts)")
|
||||
# 3) Zip the day's files
|
||||
zip_path = day_dir / f"ReleasableAircraft.zip"
|
||||
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as z:
|
||||
for p in day_dir.iterdir():
|
||||
z.write(p, arcname=p.name)
|
||||
|
||||
# 4) Convert ZIP -> CSV
|
||||
out_csv = day_dir / f"ReleasableAircraft_{date}.csv"
|
||||
convert_faa_master_txt_to_csv(zip_path, out_csv)
|
||||
|
||||
|
||||
print(f"{date} {sha[:7]} -> {day_dir} (master parts: {len(parts)})")
|
||||
|
||||
print(f"\nDone. Output root: {OUT_ROOT.resolve()}")
|
||||
|
||||
Reference in New Issue
Block a user