Files
OpenAirframes/src/derive_from_faa_master_txt.py
T
2026-02-01 20:33:22 -05:00

84 lines
2.7 KiB
Python

from pathlib import Path
import zipfile
import pandas as pd
from faa_aircraft_registry import read
def convert_faa_master_txt_to_df(zip_path: Path, date: str):
    """Load the FAA registry archive at ``zip_path`` into one flat DataFrame.

    The ``master`` records returned by ``faa_aircraft_registry.read`` become
    the rows. A ``download_date`` column holding ``date`` is placed first,
    the nested ``registrant``, ``aircraft``, ``engine`` and ``certification``
    columns are expanded into ``registrant_*``, ``aircraft_*``, ``engine_*``
    and ``certificate_*`` columns, and a ``planequery_airframe_id`` key is
    built from the normalized manufacturer, model and serial number.

    Args:
        zip_path: Path of the FAA registry zip archive.
        date: Value stored in every row's ``download_date`` column.

    Returns:
        The flattened DataFrame, with ``transponder_code_hex`` at position 1
        and ``planequery_airframe_id`` right after ``registration_number``.
    """
    with zipfile.ZipFile(zip_path) as archive:
        master_records = read(archive)["master"]
        df = pd.DataFrame(master_records.values())

    df.insert(0, "download_date", date)

    registrant_cols = pd.json_normalize(df["registrant"]).add_prefix("registrant_")
    df = df.drop(columns="registrant").join(registrant_cols)

    # Put transponder_code_hex at position 1, i.e. directly behind the
    # download_date column inserted above.
    order = df.columns.tolist()
    order.insert(1, order.pop(order.index("transponder_code_hex")))
    df = df[order]

    # Each entry: (top-level renames applied first, nested column, prefix).
    # The renames presumably keep an existing top-level name from clashing
    # with a prefixed nested field of the same name -- TODO confirm.
    nested_specs = (
        ({"aircraft_type": "aircraft_type_2"}, "aircraft", "aircraft_"),
        ({"engine_type": "engine_type_2"}, "engine", "engine_"),
        (None, "certification", "certificate_"),
    )
    for renames, nested_col, prefix in nested_specs:
        if renames:
            df = df.rename(columns=renames)
        nested = df[nested_col]
        # Missing entries are swapped for {} so they normalize to empty rows.
        expanded = pd.json_normalize(nested.where(nested.notna(), {}))
        df = df.drop(columns=nested_col).join(expanded.add_prefix(prefix))

    # Build planequery_airframe_id as MANUFACTURER|MODEL|SERIAL.
    manufacturer, model, serial = (
        normalize(df[col])
        for col in ("aircraft_manufacturer", "aircraft_model", "serial_number")
    )
    df["planequery_airframe_id"] = manufacturer + "|" + model + "|" + serial

    # Place planequery_airframe_id immediately after registration_number.
    order = df.columns.tolist()
    order.remove("planequery_airframe_id")
    order.insert(order.index("registration_number") + 1, "planequery_airframe_id")
    return df[order]
def normalize(s: pd.Series) -> pd.Series:
    """Return ``s`` as upper-cased strings made of word characters and "-".

    Missing values become empty strings and every value is converted to
    ``str`` before the text clean-up is applied.
    """
    text = s.fillna("").astype(str)
    text = text.str.upper().str.strip()
    # Collapse each run of whitespace into a single space.
    text = text.str.replace(r"\s+", " ", regex=True)
    # Remove characters that cause false mismatches: everything that is
    # neither a word character nor a hyphen (this includes the spaces).
    return text.str.replace(r"[^\w\-]", "", regex=True)
def concat_faa_historical_df(df_base, df_new):
    """Append ``df_new`` to ``df_base`` and drop rows whose content repeats.

    Two rows count as duplicates when every column other than
    ``download_date`` matches after missing values are replaced by ``""``
    and all values are converted to ``str``. The first occurrence is kept,
    so a record keeps the earliest ``download_date`` it appeared with.

    Args:
        df_base: Previously accumulated rows.
        df_new: Rows to append.

    Returns:
        A new DataFrame with the combined, de-duplicated rows. Rows keep the
        positional labels assigned by the concatenation (the index is not
        renumbered after duplicates are removed).
    """
    combined = pd.concat([df_base, df_new], ignore_index=True)
    content_cols = [c for c in combined.columns if c not in {"download_date"}]

    if not content_cols:
        # Nothing to compare on: all rows are equivalent, keep the first one.
        return combined.iloc[:1]

    # Compare the stringified columns directly instead of joining them into
    # one "|"-delimited string per row: the join is ambiguous when a value
    # itself contains "|" (("a|b", "c") vs ("a", "b|c")) and needs a slow
    # Python-level pass over every row.
    comparable = combined[content_cols].fillna("").astype(str)
    is_repeat = comparable.duplicated(keep="first").to_numpy()
    return combined.loc[~is_repeat]