diff --git a/src/derive_from_faa_master_txt.py b/src/derive_from_faa_master_txt.py index e8dcadc..259532c 100644 --- a/src/derive_from_faa_master_txt.py +++ b/src/derive_from_faa_master_txt.py @@ -3,14 +3,13 @@ import zipfile import pandas as pd from faa_aircraft_registry import read -def convert_faa_master_txt_to_csv(zip_path: Path, csv_path: Path, date: str = None): +def convert_faa_master_txt_to_df(zip_path: Path, date: str): with zipfile.ZipFile(zip_path) as z: registrations = read(z) df = pd.DataFrame(registrations['master'].values()) - if date is not None: - df.insert(0, "download_date", date) + df.insert(0, "download_date", date) registrant = pd.json_normalize(df["registrant"]).add_prefix("registrant_") df = df.drop(columns="registrant").join(registrant) @@ -45,8 +44,6 @@ def convert_faa_master_txt_to_csv(zip_path: Path, csv_path: Path, date: str = No reg_idx = cols.index("registration_number") cols.insert(reg_idx + 1, "planequery_airframe_id") df = df[cols] - - df.to_csv(csv_path, index=False) return df diff --git a/src/get_historical_faa.py b/src/get_historical_faa.py index 4683ad8..573942c 100644 --- a/src/get_historical_faa.py +++ b/src/get_historical_faa.py @@ -11,7 +11,7 @@ import subprocess, re from pathlib import Path import shutil from collections import OrderedDict -from derive_from_faa_master_txt import convert_faa_master_txt_to_csv, concat_faa_historical_df +from derive_from_faa_master_txt import convert_faa_master_txt_to_df, concat_faa_historical_df import zipfile import pandas as pd import argparse @@ -106,7 +106,7 @@ for date, sha in date_to_sha.items(): print(f"{date} {sha[:7]} -> {day_dir} (master parts: {len(parts)})") # 4) Convert ZIP -> CSV out_csv = day_dir / f"ReleasableAircraft_{date}.csv" - df_new = convert_faa_master_txt_to_csv(zip_path, out_csv, date) + df_new = convert_faa_master_txt_to_df(zip_path, date) if df_base.empty: df_base = df_new print(len(df_base), "total entries so far") @@ -119,5 +119,5 @@ for date, sha in
date_to_sha.items(): print(len(df_base), "total entries so far") assert df_base['download_date'].is_monotonic_increasing, "download_date is not monotonic increasing" -df_base.to_csv(OUT_ROOT / f"MASTER_{start_date}_{end_date}.csv", index=False) +df_base.to_csv(OUT_ROOT / f"planequery_aircraft_{start_date}_{end_date}.csv", index=False) # TODO: get average number of new rows per day. diff --git a/src/snapshot_faa.py b/src/snapshot_faa.py index 0d7712f..e86528f 100644 --- a/src/snapshot_faa.py +++ b/src/snapshot_faa.py @@ -1,4 +1,3 @@ -import zipfile from pathlib import Path from datetime import datetime, timezone date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d") @@ -6,10 +5,8 @@ date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d") out_dir = Path("data/faa_releasable") out_dir.mkdir(parents=True, exist_ok=True) zip_name = f"ReleasableAircraft_{date_str}.zip" -csv_name = f"Master_{date_str}.csv" zip_path = out_dir / zip_name -csv_path = out_dir / csv_name # URL and paths url = "https://registry.faa.gov/database/ReleasableAircraft.zip" @@ -25,5 +22,12 @@ with urlopen(req, timeout=120) as r: body = r.read() zip_path.write_bytes(body) -from derive_from_faa_master_txt import convert_faa_master_txt_to_csv -convert_faa_master_txt_to_csv(zip_path, csv_path) +OUT_ROOT = Path("data/planequery_aircraft") +OUT_ROOT.mkdir(parents=True, exist_ok=True) +from derive_from_faa_master_txt import convert_faa_master_txt_to_df, concat_faa_historical_df +from get_latest_planequery_aircraft_release import get_latest_aircraft_csv_df +df_new = convert_faa_master_txt_to_df(zip_path, date_str) +df_base, start_date_str = get_latest_aircraft_csv_df() +df_base = concat_faa_historical_df(df_base, df_new) +assert df_base['download_date'].is_monotonic_increasing, "download_date is not monotonic increasing" +df_base.to_csv(OUT_ROOT / f"planequery_aircraft_{start_date_str}_{date_str}.csv", index=False) \ No newline at end of file