def concat_faa_historical_df(df_base: pd.DataFrame, df_new: pd.DataFrame) -> pd.DataFrame:
    """Append ``df_new`` to ``df_base`` and drop rows whose content repeats.

    Two rows count as duplicates when every column other than
    ``download_date`` matches after missing values are replaced with ``""``
    and all values are converted to ``str``. The first occurrence is kept,
    so a row already present in ``df_base`` wins over an identical row in
    ``df_new``.

    Args:
        df_base: Rows accumulated so far.
        df_new: Rows to append.

    Returns:
        A new DataFrame with the de-duplicated rows. Row labels come from
        the concatenation (``ignore_index=True``) and are not renumbered
        after duplicates are removed. Neither input is modified.
    """
    combined = pd.concat([df_base, df_new], ignore_index=True)

    # download_date is excluded so the same record seen on a later day
    # still counts as a repeat.
    content_cols = [c for c in combined.columns if c != "download_date"]

    # Compare the normalised columns themselves instead of joining them into
    # one "|"-delimited string: a joined fingerprint cannot tell
    # ("a|b", "c") from ("a", "b|c"), and building it needs a slow row-wise
    # apply plus a temporary column that could clobber a real one.
    normalized = combined[content_cols].fillna("").astype(str)
    is_repeat = normalized.duplicated(keep="first")

    # Copy so the caller gets an independent frame, not a slice of `combined`.
    return combined.loc[~is_repeat].copy()
""" import subprocess, re from pathlib import Path +import shutil from collections import OrderedDict -from derive_from_faa_master_txt import convert_faa_master_txt_to_csv +from derive_from_faa_master_txt import convert_faa_master_txt_to_csv, concat_faa_historical_df import zipfile import pandas as pd import argparse @@ -110,33 +111,13 @@ for date, sha in date_to_sha.items(): df_base = df_new print(len(df_base), "total entries so far") # Delete all files in the day directory - for p in day_dir.iterdir(): - p.unlink() - day_dir.rmdir() + shutil.rmtree(day_dir) continue - # Concatenate and deduplicate based on content fingerprint - df_base = pd.concat([df_base, df_new], ignore_index=True) - - CONTENT_COLS = [ - c for c in df_base.columns - if c not in {"download_date"} - ] - - df_base["row_fingerprint"] = ( - df_base[CONTENT_COLS] - .fillna("") - .astype(str) - .apply(lambda row: "|".join(row), axis=1) - ) - - df_base = df_base.drop_duplicates( - subset=["row_fingerprint"], - keep="first" - ).drop(columns=["row_fingerprint"]) - - + df_base = concat_faa_historical_df(df_base, df_new) + shutil.rmtree(day_dir) print(len(df_base), "total entries so far") +assert df_base['download_date'].is_monotonic_increasing, "download_date is not monotonic increasing" df_base.to_csv(OUT_ROOT / f"MASTER_{start_date}_{end_date}.csv", index=False) -# TODO: get average number of new rows per day. \ No newline at end of file +# TODO: get average number of new rows per day.