FEATURE: Add contributions framework. Fix and improve the daily ADS-B release using GitHub Actions for map-reduce.

ggman12
2026-02-11 14:04:27 -05:00
parent 27da93801e
commit 722bcdf791
29 changed files with 2347 additions and 343 deletions
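Context for the diff below: the commit message describes a map-reduce layout on GitHub Actions. The worker script changed in this file takes a half-open date window [START_DATE, END_DATE), builds one deduplicated chunk, and uploads it under intermediate/{run_id}/ in S3; the workflow presumably fans a long backfill range out across many such workers. As a hedged illustration only (the driver is not part of this diff, and split_date_range and chunk_days are invented names), the fan-out might be computed like this:

from datetime import date, timedelta

def split_date_range(start: date, end: date, chunk_days: int = 30) -> list[tuple[str, str]]:
    """Split [start, end) into half-open chunks, one (START_DATE, END_DATE) pair per map worker."""
    chunks = []
    current = start
    while current < end:
        chunk_end = min(current + timedelta(days=chunk_days), end)
        chunks.append((current.isoformat(), chunk_end.isoformat()))
        current = chunk_end
    return chunks

# Example: a 90-day backfill split into three worker windows
print(split_date_range(date(2025, 1, 1), date(2025, 4, 1)))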
+28 -33
@@ -13,18 +13,13 @@ from datetime import datetime, timedelta
 from pathlib import Path
 
 import boto3
-import pandas as pd
+import polars as pl
 
-from compress_adsb_to_aircraft_data import load_historical_for_day, COLUMNS
-
-
-def deduplicate_by_signature(df: pd.DataFrame) -> pd.DataFrame:
-    """For each icao, keep only the earliest row with each unique signature."""
-    df["_signature"] = df[COLUMNS].astype(str).agg("|".join, axis=1)
-    df_deduped = df.groupby(["icao", "_signature"], as_index=False).first()
-    df_deduped = df_deduped.drop(columns=["_signature"])
-    df_deduped = df_deduped.sort_values("time")
-    return df_deduped
+from compress_adsb_to_aircraft_data import (
+    load_historical_for_day,
+    deduplicate_by_signature,
+    COLUMNS,
+)
 
 
 def main():
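The pandas deduplicate_by_signature deleted above is now imported from compress_adsb_to_aircraft_data, but that module's new definition is not shown in this commit hunk. Below is a minimal polars sketch with the deleted function's semantics (signature built by joining the stringified COLUMNS with "|", first row kept per (icao, signature), result sorted by time); the COLUMNS placeholder and the exact expressions are assumptions, not the shared module's actual code.

import polars as pl

COLUMNS = ["time", "lat", "lon", "alt"]  # placeholder; the real list lives in compress_adsb_to_aircraft_data

def deduplicate_by_signature(df: pl.DataFrame) -> pl.DataFrame:
    """For each icao, keep only the earliest row with each unique signature."""
    return (
        df.with_columns(
            pl.concat_str([pl.col(c).cast(pl.Utf8) for c in COLUMNS], separator="|").alias("_signature")
        )
        # keep="first" with maintain_order=True mirrors the pandas groupby(...).first() behaviour
        .unique(subset=["icao", "_signature"], keep="first", maintain_order=True)
        .drop("_signature")
        .sort("time")
    )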
@@ -39,28 +34,20 @@ def main():
     total_days = (end_date - start_date).days
     print(f"Worker: processing {total_days} days [{start_date_str}, {end_date_str})")
 
-    df_accumulated = pd.DataFrame()
+    dfs = []
     current_date = start_date
     while current_date < end_date:
         day_str = current_date.strftime("%Y-%m-%d")
         print(f"  Loading {day_str}...")
 
-        try:
-            df_compressed = load_historical_for_day(current_date)
-        except Exception as e:
-            print(f"  WARNING: Failed to load {day_str}: {e}")
-            current_date += timedelta(days=1)
-            continue
+        df_compressed = load_historical_for_day(current_date)
+        if df_compressed.height == 0:
+            raise RuntimeError(f"No data found for {day_str}")
 
-        if df_accumulated.empty:
-            df_accumulated = df_compressed
-        else:
-            df_accumulated = pd.concat(
-                [df_accumulated, df_compressed], ignore_index=True
-            )
-        print(f"  +{len(df_compressed)} rows (total: {len(df_accumulated)})")
+        dfs.append(df_compressed)
+        total_rows = sum(df.height for df in dfs)
+        print(f"  +{df_compressed.height} rows (total: {total_rows})")
 
         # Delete local cache after each day to save disk in container
         cache_dir = Path("data/adsb")
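The per-day cache cleanup that the comment above refers to falls outside this hunk. The intent is simply to drop the local data/adsb cache before loading the next day so the container's disk does not fill up; a sketch of that step (shutil.rmtree here is an illustration, not necessarily the script's actual call):

import shutil
from pathlib import Path

cache_dir = Path("data/adsb")
if cache_dir.exists():
    shutil.rmtree(cache_dir)  # drop the day's downloads so the container disk stays small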
@@ -70,23 +57,31 @@ def main():
         current_date += timedelta(days=1)
 
-    if df_accumulated.empty:
-        print("No data collected — exiting.")
-        return
+    # Concatenate all days
+    df_accumulated = pl.concat(dfs) if dfs else pl.DataFrame()
 
     # Deduplicate within this chunk
     df_accumulated = deduplicate_by_signature(df_accumulated)
-    print(f"After dedup: {len(df_accumulated)} rows")
+    print(f"After dedup: {df_accumulated.height} rows")
 
     # Write to local file then upload to S3
-    local_path = Path(f"/tmp/chunk_{start_date_str}_{end_date_str}.csv.gz")
-    df_accumulated.to_csv(local_path, index=False, compression="gzip")
+    local_path = Path(f"/tmp/chunk_{start_date_str}_{end_date_str}.csv")
+    df_accumulated.write_csv(local_path)
+
+    # Compress with gzip
+    import gzip
+    import shutil
+    gz_path = Path(f"/tmp/chunk_{start_date_str}_{end_date_str}.csv.gz")
+    with open(local_path, 'rb') as f_in:
+        with gzip.open(gz_path, 'wb') as f_out:
+            shutil.copyfileobj(f_in, f_out)
+    local_path.unlink()  # Remove uncompressed file
 
     s3_key = f"intermediate/{run_id}/chunk_{start_date_str}_{end_date_str}.csv.gz"
     print(f"Uploading to s3://{s3_bucket}/{s3_key}")
     s3 = boto3.client("s3")
-    s3.upload_file(str(local_path), s3_bucket, s3_key)
+    s3.upload_file(str(gz_path), s3_bucket, s3_key)
     print("Done.")