From b0526f0a95d5fc8be186fb53bfb0e18ae754448a Mon Sep 17 00:00:00 2001 From: ggman12 Date: Tue, 24 Feb 2026 03:18:53 -0500 Subject: [PATCH] Fix Community Submission export. Fix CSV concatenation logic to prevent duplicates when there is no new ADSB.lol data. --- src/adsb/concat_parquet_to_final.py | 33 ++++++++++++++----- .../create_daily_community_release.py | 2 +- src/get_latest_release.py | 17 ++++++---- 3 files changed, 37 insertions(+), 15 deletions(-) diff --git a/src/adsb/concat_parquet_to_final.py b/src/adsb/concat_parquet_to_final.py index da88019..a38b796 100644 --- a/src/adsb/concat_parquet_to_final.py +++ b/src/adsb/concat_parquet_to_final.py @@ -37,14 +37,31 @@ def main(): if args.concat_with_latest_csv: print("Loading latest CSV from GitHub releases to concatenate with...") from src.get_latest_release import get_latest_aircraft_adsb_csv_df - df_latest_csv, csv_date = get_latest_aircraft_adsb_csv_df() - # Ensure column order matches before concatenating - df_latest_csv = df_latest_csv.select(CORRECT_ORDER_OF_COLUMNS) - from src.adsb.compress_adsb_to_aircraft_data import concat_compressed_dfs - df_final = concat_compressed_dfs(df_latest_csv, df) - df_final = df_final.select(CORRECT_ORDER_OF_COLUMNS) - final_csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{csv_date}_{args.date}.csv.gz" - df_final.write_csv(final_csv_output_path, compression="gzip") + from datetime import datetime + + df_latest_csv, csv_start_date, csv_end_date = get_latest_aircraft_adsb_csv_df() + + # Compare dates: end_date is inclusive, so if csv_end_date >= args.date, + # the latest CSV already includes this day's data + csv_end_dt = datetime.strptime(csv_end_date, "%Y-%m-%d") + args_dt = datetime.strptime(args.date, "%Y-%m-%d") + + if csv_end_dt >= args_dt: + print(f"Latest CSV already includes data through {args.date} (end_date={csv_end_date} is inclusive)") + print("Writing latest CSV directly without concatenation to avoid duplicates") + final_csv_output_path = OUTPUT_DIR / 
f"openairframes_adsb_{csv_start_date}_{csv_end_date}.csv.gz" + df_latest_csv = df_latest_csv.select(CORRECT_ORDER_OF_COLUMNS) + df_latest_csv.write_csv(final_csv_output_path, compression="gzip") + else: + print(f"Concatenating latest CSV (through {csv_end_date}) with new data ({args.date})") + # Ensure column order matches before concatenating + df_latest_csv = df_latest_csv.select(CORRECT_ORDER_OF_COLUMNS) + from src.adsb.compress_adsb_to_aircraft_data import concat_compressed_dfs + df_final = concat_compressed_dfs(df_latest_csv, df) + df_final = df_final.select(CORRECT_ORDER_OF_COLUMNS) + final_csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{csv_start_date}_{args.date}.csv.gz" + df_final.write_csv(final_csv_output_path, compression="gzip") + print(f"Final CSV written to {final_csv_output_path}") if __name__ == "__main__": main() \ No newline at end of file diff --git a/src/contributions/create_daily_community_release.py b/src/contributions/create_daily_community_release.py index ec4d060..73352a6 100644 --- a/src/contributions/create_daily_community_release.py +++ b/src/contributions/create_daily_community_release.py @@ -24,7 +24,7 @@ def read_all_submissions(community_dir: Path) -> list[dict]: """Read all JSON submissions from the community directory.""" all_submissions = [] - for json_file in sorted(community_dir.glob("*.json")): + for json_file in sorted(community_dir.glob("**/*.json")): try: with open(json_file) as f: data = json.load(f) diff --git a/src/get_latest_release.py b/src/get_latest_release.py index 72d9302..27a2eca 100644 --- a/src/get_latest_release.py +++ b/src/get_latest_release.py @@ -207,7 +207,11 @@ def download_latest_aircraft_adsb_csv( import polars as pl def get_latest_aircraft_adsb_csv_df(): - """Download and load the latest ADS-B CSV from GitHub releases.""" + """Download and load the latest ADS-B CSV from GitHub releases. 
+ + Returns: + tuple: (df, start_date, end_date) where dates are in YYYY-MM-DD format + """ import re csv_path = download_latest_aircraft_adsb_csv() @@ -231,15 +235,16 @@ def get_latest_aircraft_adsb_csv_df(): if df[col].dtype == pl.Utf8: df = df.with_columns(pl.col(col).fill_null("")) - # Extract start date from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv[.gz] - match = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path)) + # Extract start and end dates from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv[.gz] + match = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})\.csv", str(csv_path)) if not match: - raise ValueError(f"Could not extract date from filename: {csv_path.name}") + raise ValueError(f"Could not extract dates from filename: {csv_path.name}") - date_str = match.group(1) + start_date = match.group(1) + end_date = match.group(2) print(df.columns) print(df.dtypes) - return df, date_str + return df, start_date, end_date