""" For each commit-day in Feb 2024 (last commit per day): - Write ALL FAA text files from that commit into: data/faa_releasable_historical/YYYY-MM-DD/ ACFTREF.txt, DEALER.txt, DOCINDEX.txt, ENGINE.txt, RESERVED.txt - Recombine MASTER-*.txt into Master.txt - Produce Master.csv via convert_faa_master_txt_to_csv Assumes the non-master files are present in every commit. """ import subprocess, re from pathlib import Path import shutil from collections import OrderedDict from derive_from_faa_master_txt import convert_faa_master_txt_to_df, concat_faa_historical_df import zipfile import pandas as pd import argparse from datetime import datetime, timedelta # Parse command line arguments parser = argparse.ArgumentParser(description="Process historical FAA data from git commits") parser.add_argument("since", help="Start date (YYYY-MM-DD)") parser.add_argument("until", help="End date (YYYY-MM-DD)") args = parser.parse_args() # Clone repository if it doesn't exist REPO = Path("data/scrape-faa-releasable-aircraft") OUT_ROOT = Path("data/faa_releasable_historical") OUT_ROOT.mkdir(parents=True, exist_ok=True) def run_git_text(*args: str) -> str: return subprocess.check_output(["git", "-C", str(REPO), *args], text=True).strip() def run_git_bytes(*args: str) -> bytes: return subprocess.check_output(["git", "-C", str(REPO), *args]) # Parse dates and adjust --since to the day before since_date = datetime.strptime(args.since, "%Y-%m-%d") adjusted_since = (since_date - timedelta(days=1)).strftime("%Y-%m-%d") # All commits in specified date range (oldest -> newest) log = run_git_text( "log", "--reverse", "--format=%H %cs", f"--since={adjusted_since}", f"--until={args.until}", ) lines = [ln for ln in log.splitlines() if ln.strip()] if not lines: raise SystemExit(f"No commits found between {args.since} and {args.until}.") # date -> last SHA that day date_to_sha = OrderedDict() for ln in lines: sha, date = ln.split() date_to_sha[date] = sha OTHER_FILES = ["ACFTREF.txt", "DEALER.txt", "DOCINDEX.txt", "ENGINE.txt", "RESERVED.txt"] master_re = re.compile(r"^MASTER-(\d+)\.txt$") df_base = pd.DataFrame() start_date = None end_date = None for date, sha in date_to_sha.items(): if start_date is None: start_date = date end_date = date day_dir = OUT_ROOT / date day_dir.mkdir(parents=True, exist_ok=True) # Write auxiliary files (assumed present) for fname in OTHER_FILES: (day_dir / fname).write_bytes(run_git_bytes("show", f"{sha}:{fname}")) # Recombine MASTER parts names = run_git_text("ls-tree", "--name-only", sha).splitlines() parts = [] for n in names: m = master_re.match(n) if m: parts.append((int(m.group(1)), n)) parts.sort() if not parts: raise RuntimeError(f"{date} {sha[:7]}: no MASTER-*.txt parts found") master_path = day_dir / "MASTER.txt" with master_path.open("wb") as w: for _, fname in parts: data = run_git_bytes("show", f"{sha}:{fname}") w.write(data) if data and not data.endswith(b"\n"): w.write(b"\n") # 3) Zip the day's files zip_path = day_dir / f"ReleasableAircraft.zip" with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as z: for p in day_dir.iterdir(): z.write(p, arcname=p.name) print(f"{date} {sha[:7]} -> {day_dir} (master parts: {len(parts)})") # 4) Convert ZIP -> CSV df_new = convert_faa_master_txt_to_df(zip_path, date) if df_base.empty: df_base = df_new print(len(df_base), "total entries so far") # Delete all files in the day directory shutil.rmtree(day_dir) continue df_base = concat_faa_historical_df(df_base, df_new) shutil.rmtree(day_dir) print(len(df_base), "total entries so far") assert df_base['download_date'].is_monotonic_increasing, "download_date is not monotonic increasing" df_base.to_csv(OUT_ROOT / f"planequery_aircraft_{start_date}_{end_date}.csv", index=False) # TODO: get average number of new rows per day.