From 9964ce576b53811875c57080d8d6c8cc7b7c437b Mon Sep 17 00:00:00 2001 From: ggman12 Date: Sun, 15 Feb 2026 20:32:33 -0500 Subject: [PATCH] slight update for compress by day --- src/adsb/combine_chunks_to_csv.py | 3 ++- src/adsb/compress_adsb_to_aircraft_data.py | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/adsb/combine_chunks_to_csv.py b/src/adsb/combine_chunks_to_csv.py index 046becb..b14082c 100644 --- a/src/adsb/combine_chunks_to_csv.py +++ b/src/adsb/combine_chunks_to_csv.py @@ -126,7 +126,8 @@ def download_and_merge_base_release(compressed_df: pl.DataFrame) -> tuple[pl.Dat # Reorder columns to match compressed_df = compressed_df.select(base_df.columns) - # Concat and deduplicate by icao (keep new data - it comes last) + # Concat base (old days) with new data (new days) + # No deduplication needed since they represent different UTC days combined = pl.concat([base_df, compressed_df]) print(f"After concat: {len(combined)} records") diff --git a/src/adsb/compress_adsb_to_aircraft_data.py b/src/adsb/compress_adsb_to_aircraft_data.py index 6252925..6740932 100644 --- a/src/adsb/compress_adsb_to_aircraft_data.py +++ b/src/adsb/compress_adsb_to_aircraft_data.py @@ -123,6 +123,10 @@ def compress_multi_icao_df(df: pl.DataFrame, verbose: bool = True) -> pl.DataFra # partition_by with as_dict=True returns tuple keys: (date, icao) date_val, icao = group_key compressed = compress_df_polars(group_df, str(icao)) + # Set time to start of UTC day for consistent deduplication + compressed = compressed.with_columns( + pl.lit(date_val).cast(pl.Date).cast(pl.Datetime).alias('time') + ) compressed_dfs.append(compressed) if compressed_dfs: