Fix community submission export and CSV concatenation logic to prevent duplicate rows when there is no new ADSB.lol data.

This commit is contained in:
ggman12
2026-02-24 03:18:53 -05:00
parent 4b6a043a9d
commit b0526f0a95
3 changed files with 37 additions and 15 deletions
+25 -8
View File
@@ -37,14 +37,31 @@ def main():
if args.concat_with_latest_csv:
print("Loading latest CSV from GitHub releases to concatenate with...")
from src.get_latest_release import get_latest_aircraft_adsb_csv_df
df_latest_csv, csv_date = get_latest_aircraft_adsb_csv_df()
# Ensure column order matches before concatenating
df_latest_csv = df_latest_csv.select(CORRECT_ORDER_OF_COLUMNS)
from src.adsb.compress_adsb_to_aircraft_data import concat_compressed_dfs
df_final = concat_compressed_dfs(df_latest_csv, df)
df_final = df_final.select(CORRECT_ORDER_OF_COLUMNS)
final_csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{csv_date}_{args.date}.csv.gz"
df_final.write_csv(final_csv_output_path, compression="gzip")
from datetime import datetime
df_latest_csv, csv_start_date, csv_end_date = get_latest_aircraft_adsb_csv_df()
# Compare dates: end_date is exclusive, so if csv_end_date > args.date,
# the latest CSV already includes this day's data
csv_end_dt = datetime.strptime(csv_end_date, "%Y-%m-%d")
args_dt = datetime.strptime(args.date, "%Y-%m-%d")
if csv_end_dt >= args_dt:
print(f"Latest CSV already includes data through {args.date} (end_date={csv_end_date} is exclusive)")
print("Writing latest CSV directly without concatenation to avoid duplicates")
final_csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{csv_start_date}_{csv_end_date}.csv.gz"
df_latest_csv = df_latest_csv.select(CORRECT_ORDER_OF_COLUMNS)
df_latest_csv.write_csv(final_csv_output_path, compression="gzip")
else:
print(f"Concatenating latest CSV (through {csv_end_date}) with new data ({args.date})")
# Ensure column order matches before concatenating
df_latest_csv = df_latest_csv.select(CORRECT_ORDER_OF_COLUMNS)
from src.adsb.compress_adsb_to_aircraft_data import concat_compressed_dfs
df_final = concat_compressed_dfs(df_latest_csv, df)
df_final = df_final.select(CORRECT_ORDER_OF_COLUMNS)
final_csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{csv_start_date}_{args.date}.csv.gz"
df_final.write_csv(final_csv_output_path, compression="gzip")
print(f"Final CSV written to {final_csv_output_path}")
if __name__ == "__main__":
main()
@@ -24,7 +24,7 @@ def read_all_submissions(community_dir: Path) -> list[dict]:
"""Read all JSON submissions from the community directory."""
all_submissions = []
for json_file in sorted(community_dir.glob("*.json")):
for json_file in sorted(community_dir.glob("**/*.json")):
try:
with open(json_file) as f:
data = json.load(f)
+11 -6
View File
@@ -207,7 +207,11 @@ def download_latest_aircraft_adsb_csv(
import polars as pl
def get_latest_aircraft_adsb_csv_df():
"""Download and load the latest ADS-B CSV from GitHub releases."""
"""Download and load the latest ADS-B CSV from GitHub releases.
Returns:
tuple: (df, start_date, end_date) where dates are in YYYY-MM-DD format
"""
import re
csv_path = download_latest_aircraft_adsb_csv()
@@ -231,15 +235,16 @@ def get_latest_aircraft_adsb_csv_df():
if df[col].dtype == pl.Utf8:
df = df.with_columns(pl.col(col).fill_null(""))
# Extract start date from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv[.gz]
match = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path))
# Extract start and end dates from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv[.gz]
match = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})\.csv", str(csv_path))
if not match:
raise ValueError(f"Could not extract date from filename: {csv_path.name}")
raise ValueError(f"Could not extract dates from filename: {csv_path.name}")
date_str = match.group(1)
start_date = match.group(1)
end_date = match.group(2)
print(df.columns)
print(df.dtypes)
return df, date_str
return df, start_date, end_date