From 66a1108ac9399cedccecf7a5319db02cfe2898a0 Mon Sep 17 00:00:00 2001 From: ggman12 Date: Sun, 1 Feb 2026 18:37:59 -0500 Subject: [PATCH] belive this works. --- src/get_historical_faa.py | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/src/get_historical_faa.py b/src/get_historical_faa.py index 0554525..0cf39eb 100644 --- a/src/get_historical_faa.py +++ b/src/get_historical_faa.py @@ -29,8 +29,8 @@ log = run_git_text( "log", "--reverse", "--format=%H %cs", - "--since=2024-01-25", - "--until=2024-02-05", + "--since=2024-01-01", + "--until=2024-08-08", ) lines = [ln for ln in log.splitlines() if ln.strip()] if not lines: @@ -89,20 +89,35 @@ for date, sha in date_to_sha.items(): df_new = convert_faa_master_txt_to_csv(zip_path, out_csv, date) if df_base.empty: df_base = df_new - print(df_base["unique_regulatory_id"].size, "total unique_regulatory_id entries so far") + print(len(df_base), "total entries so far") # Delete all files in the day directory for p in day_dir.iterdir(): p.unlink() day_dir.rmdir() continue - key = "unique_regulatory_id" - df_to_add = df_new[~df_new[key].isin(df_base[key])] - df_base = pd.concat([df_base, df_to_add], ignore_index=True) - print(df_base[key].size, "total unique_regulatory_id entries so far") - - # Delete all files in the day directory - for p in day_dir.iterdir(): - p.unlink() - day_dir.rmdir() + + # Concatenate and deduplicate based on content fingerprint + df_base = pd.concat([df_base, df_new], ignore_index=True) + + CONTENT_COLS = [ + c for c in df_base.columns + if c not in {"download_date"} + ] + + df_base["row_fingerprint"] = ( + df_base[CONTENT_COLS] + .fillna("") + .astype(str) + .apply(lambda row: "|".join(row), axis=1) + ) + + df_base = df_base.drop_duplicates( + subset=["row_fingerprint"], + keep="first" + ).drop(columns=["row_fingerprint"]) + + + print(len(df_base), "total entries so far") df_base.to_csv(OUT_ROOT / f"MASTER_{start_date}_{end_date}.csv", index=False) +# TODO: get average number of new rows per day. \ No newline at end of file