OpenAirframes/src/derive_from_faa_master_txt.py

from pathlib import Path
import zipfile
import pandas as pd
from faa_aircraft_registry import read

def convert_faa_master_txt_to_df(zip_path: Path, date: str):
    with zipfile.ZipFile(zip_path) as z:
        registrations = read(z)

    df = pd.DataFrame(registrations['master'].values())

    df.insert(0, "download_date", date)

    registrant = pd.json_normalize(df["registrant"]).add_prefix("registrant_")
    df = df.drop(columns="registrant").join(registrant)

    # Move transponder_code_hex to second column (after registration_number)
    cols = df.columns.tolist()
    cols.remove("transponder_code_hex")
    cols.insert(1, "transponder_code_hex")
    df = df[cols]

    df = df.rename(columns={"aircraft_type": "aircraft_type_2"})
    aircraft = pd.json_normalize(df["aircraft"].where(df["aircraft"].notna(), {})).add_prefix("aircraft_")
    df = df.drop(columns="aircraft").join(aircraft)
    df = df.rename(columns={"engine_type": "engine_type_2"})
    engine = pd.json_normalize(df["engine"].where(df["engine"].notna(), {})).add_prefix("engine_")
    df = df.drop(columns="engine").join(engine)
    certification = pd.json_normalize(df["certification"].where(df["certification"].notna(), {})).add_prefix("certificate_")
    df = df.drop(columns="certification").join(certification)

    # Create planequery_airframe_id
    df["planequery_airframe_id"] = (
        normalize(df["aircraft_manufacturer"])
        + "|"
        + normalize(df["aircraft_model"])
        + "|"
        + normalize(df["serial_number"])
    )

    # Move planequery_airframe_id to come after registration_number
    cols = df.columns.tolist()
    cols.remove("planequery_airframe_id")
    reg_idx = cols.index("registration_number")
    cols.insert(reg_idx + 1, "planequery_airframe_id")
    df = df[cols]
    return df


def normalize(s: pd.Series) -> pd.Series:
    return (
        s.fillna("")
         .astype(str)
         .str.upper()
         .str.strip()
         # collapse whitespace
         .str.replace(r"\s+", " ", regex=True)
         # remove characters that cause false mismatches
         .str.replace(r"[^\w\-]", "", regex=True)
    )


def concat_faa_historical_df(df_base, df_new):

    df_base = pd.concat([df_base, df_new], ignore_index=True)

    CONTENT_COLS = [
        c for c in df_base.columns
        if c not in {"download_date"}
    ]

    df_base["row_fingerprint"] = (
        df_base[CONTENT_COLS]
        .fillna("")
        .astype(str)
        .apply(lambda row: "|".join(row), axis=1)
    )

    df_base = df_base.drop_duplicates(
              subset=["row_fingerprint"],
              keep="first"
          ).drop(columns=["row_fingerprint"])
    return df_base