Daily ADSB and Histoircal updates. Update readme.md

2026-06-08 06:03:55 +02:00 · 2026-02-13 11:49:18 -05:00
parent 4015a5fcf1
commit d216ea9329
32 changed files with 1489 additions and 1744 deletions
@@ -1,11 +0,0 @@
-FROM --platform=linux/arm64 python:3.12-slim
-
-WORKDIR /app
-
-COPY requirements.reducer.txt requirements.txt
-RUN pip install --no-cache-dir -r requirements.txt
-
-COPY compress_adsb_to_aircraft_data.py .
-COPY reducer.py .
-
-CMD ["python", "-u", "reducer.py"]
@@ -1,12 +0,0 @@
-FROM --platform=linux/arm64 python:3.12-slim
-
-WORKDIR /app
-
-COPY requirements.worker.txt requirements.txt
-RUN pip install --no-cache-dir -r requirements.txt
-
-COPY compress_adsb_to_aircraft_data.py .
-COPY download_adsb_data_to_parquet.py .
-COPY worker.py .
-
-CMD ["python", "-u", "worker.py"]
@@ -1,250 +0,0 @@
-"""
-Combines chunk parquet files and compresses to final aircraft CSV.
-This is the reduce phase of the map-reduce pipeline.
-
-Supports both single-day (daily) and multi-day (historical) modes.
-
-Memory-efficient: processes each chunk separately, compresses, then combines.
-
-Usage:
-    # Daily mode
-    python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks
-    
-    # Historical mode
-    python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks --start-date 2024-01-01 --end-date 2024-01-07 --skip-base
-"""
-import gc
-import os
-import sys
-import glob
-import argparse
-from datetime import datetime, timedelta
-
-import polars as pl
-
-from src.adsb.download_adsb_data_to_parquet import OUTPUT_DIR, get_resource_usage
-from src.adsb.compress_adsb_to_aircraft_data import compress_multi_icao_df, COLUMNS
-
-
-DEFAULT_CHUNK_DIR = os.path.join(OUTPUT_DIR, "adsb_chunks")
-FINAL_OUTPUT_DIR = "./data/openairframes"
-os.makedirs(FINAL_OUTPUT_DIR, exist_ok=True)
-
-
-def get_target_day() -> datetime:
-    """Get yesterday's date (the day we're processing)."""
-    return datetime.utcnow() - timedelta(days=1)
-
-
-def process_single_chunk(chunk_path: str, delete_after_load: bool = False) -> pl.DataFrame:
-    """Load and compress a single chunk parquet file.
-    
-    Args:
-        chunk_path: Path to parquet file
-        delete_after_load: If True, delete the parquet file after loading to free disk space
-    """
-    print(f"Processing {os.path.basename(chunk_path)}... | {get_resource_usage()}")
-    
-    # Load chunk - only columns we need
-    needed_columns = ['time', 'icao'] + COLUMNS
-    df = pl.read_parquet(chunk_path, columns=needed_columns)
-    print(f"  Loaded {len(df)} rows")
-    
-    # Delete file immediately after loading to free disk space
-    if delete_after_load:
-        try:
-            os.remove(chunk_path)
-            print(f"  Deleted {chunk_path} to free disk space")
-        except Exception as e:
-            print(f"  Warning: Failed to delete {chunk_path}: {e}")
-    
-    # Compress to aircraft records (one per ICAO) using shared function
-    compressed = compress_multi_icao_df(df, verbose=True)
-    print(f"  Compressed to {len(compressed)} aircraft records")
-    
-    del df
-    gc.collect()
-    
-    return compressed
-
-
-def combine_compressed_chunks(compressed_dfs: list[pl.DataFrame]) -> pl.DataFrame:
-    """Combine multiple compressed DataFrames.
-    
-    Since chunks are partitioned by ICAO hash, each ICAO only appears in one chunk.
-    No deduplication needed here - just concatenate.
-    """
-    print(f"Combining {len(compressed_dfs)} compressed chunks... | {get_resource_usage()}")
-    
-    # Concat all
-    combined = pl.concat(compressed_dfs)
-    print(f"Combined: {len(combined)} records")
-    
-    return combined
-
-
-def download_and_merge_base_release(compressed_df: pl.DataFrame) -> pl.DataFrame:
-    """Download base release and merge with new data."""
-    from src.get_latest_release import download_latest_aircraft_adsb_csv
-    
-    print("Downloading base ADS-B release...")
-    try:
-        base_path = download_latest_aircraft_adsb_csv(
-            output_dir="./data/openairframes_base"
-        )
-        print(f"Download returned: {base_path}")
-        
-        if base_path and os.path.exists(str(base_path)):
-            print(f"Loading base release from {base_path}")
-            base_df = pl.read_csv(base_path)
-            print(f"Base release has {len(base_df)} records")
-            
-            # Ensure columns match
-            base_cols = set(base_df.columns)
-            new_cols = set(compressed_df.columns)
-            print(f"Base columns: {sorted(base_cols)}")
-            print(f"New columns: {sorted(new_cols)}")
-            
-            # Add missing columns
-            for col in new_cols - base_cols:
-                base_df = base_df.with_columns(pl.lit(None).alias(col))
-            for col in base_cols - new_cols:
-                compressed_df = compressed_df.with_columns(pl.lit(None).alias(col))
-            
-            # Reorder columns to match
-            compressed_df = compressed_df.select(base_df.columns)
-            
-            # Concat and deduplicate by icao (keep new data - it comes last)
-            combined = pl.concat([base_df, compressed_df])
-            print(f"After concat: {len(combined)} records")
-            
-            deduplicated = combined.unique(subset=["icao"], keep="last")
-            
-            print(f"Combined with base: {len(combined)} -> {len(deduplicated)} after dedup")
-            
-            del base_df, combined
-            gc.collect()
-            
-            return deduplicated
-        else:
-            print(f"No base release found at {base_path}, using only new data")
-            return compressed_df
-    except Exception as e:
-        import traceback
-        print(f"Failed to download base release: {e}")
-        traceback.print_exc()
-        return compressed_df
-
-
-def cleanup_chunks(output_id: str, chunks_dir: str):
-    """Delete chunk parquet files after successful merge."""
-    pattern = os.path.join(chunks_dir, f"chunk_*_{output_id}.parquet")
-    chunk_files = glob.glob(pattern)
-    for f in chunk_files:
-        try:
-            os.remove(f)
-            print(f"Deleted {f}")
-        except Exception as e:
-            print(f"Failed to delete {f}: {e}")
-
-
-def find_chunk_files(chunks_dir: str, output_id: str) -> list[str]:
-    """Find chunk parquet files matching the output ID."""
-    pattern = os.path.join(chunks_dir, f"chunk_*_{output_id}.parquet")
-    chunk_files = sorted(glob.glob(pattern))
-    
-    if not chunk_files:
-        # Try recursive search for historical mode with merged artifacts
-        pattern = os.path.join(chunks_dir, "**", "*.parquet")
-        chunk_files = sorted(glob.glob(pattern, recursive=True))
-    
-    return chunk_files
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Combine chunk parquets to final CSV")
-    parser.add_argument("--date", type=str, help="Single date in YYYY-MM-DD format (default: yesterday)")
-    parser.add_argument("--start-date", type=str, help="Start date for range (YYYY-MM-DD)")
-    parser.add_argument("--end-date", type=str, help="End date for range (YYYY-MM-DD)")
-    parser.add_argument("--chunks-dir", type=str, default=DEFAULT_CHUNK_DIR, help="Directory containing chunk parquet files")
-    parser.add_argument("--skip-base", action="store_true", help="Skip downloading and merging base release")
-    parser.add_argument("--keep-chunks", action="store_true", help="Keep chunk files after merging")
-    parser.add_argument("--stream", action="store_true", help="Delete parquet files immediately after loading to save disk space")
-    args = parser.parse_args()
-    
-    # Determine output ID and filename based on mode
-    if args.start_date and args.end_date:
-        # Historical mode
-        output_id = f"{args.start_date}_{args.end_date}"
-        output_filename = f"openairframes_adsb_{args.start_date}_{args.end_date}.csv"
-        print(f"Combining chunks for date range: {args.start_date} to {args.end_date}")
-    else:
-        # Daily mode - use same date for start and end
-        if args.date:
-            target_day = datetime.strptime(args.date, "%Y-%m-%d")
-        else:
-            target_day = get_target_day()
-        
-        date_str = target_day.strftime("%Y-%m-%d")
-        output_id = date_str
-        output_filename = f"openairframes_adsb_{date_str}_{date_str}.csv"
-        print(f"Combining chunks for {date_str}")
-    
-    chunks_dir = args.chunks_dir
-    print(f"Chunks directory: {chunks_dir}")
-    print(f"Resource usage at start: {get_resource_usage()}")
-    
-    # Find chunk files
-    chunk_files = find_chunk_files(chunks_dir, output_id)
-    
-    if not chunk_files:
-        print(f"No chunk files found in: {chunks_dir}")
-        sys.exit(1)
-    
-    print(f"Found {len(chunk_files)} chunk files")
-    
-    # Process each chunk separately to save memory
-    # With --stream, delete parquet files immediately after loading to save disk space
-    compressed_chunks = []
-    for chunk_path in chunk_files:
-        compressed = process_single_chunk(chunk_path, delete_after_load=args.stream)
-        compressed_chunks.append(compressed)
-        gc.collect()
-    
-    # Combine all compressed chunks
-    combined = combine_compressed_chunks(compressed_chunks)
-    
-    # Free memory from individual chunks
-    del compressed_chunks
-    gc.collect()
-    print(f"After combining: {get_resource_usage()}")
-    
-    # Merge with base release (unless skipped)
-    if not args.skip_base:
-        combined = download_and_merge_base_release(combined)
-    
-    # Convert list columns to strings for CSV compatibility
-    for col in combined.columns:
-        if combined[col].dtype == pl.List:
-            combined = combined.with_columns(
-                pl.col(col).list.join(",").alias(col)
-            )
-    
-    # Sort by time for consistent output
-    if 'time' in combined.columns:
-        combined = combined.sort('time')
-    
-    # Write final CSV
-    output_path = os.path.join(FINAL_OUTPUT_DIR, output_filename)
-    combined.write_csv(output_path)
-    print(f"Wrote {len(combined)} records to {output_path}")
-    
-    # Cleanup
-    if not args.keep_chunks:
-        cleanup_chunks(output_id, chunks_dir)
-    
-    print(f"Done! | {get_resource_usage()}")
-
-
-if __name__ == "__main__":
-    main()
@@ -5,23 +5,6 @@ import polars as pl
 COLUMNS = ['dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category', 'r', 't']


-def deduplicate_by_signature(df: pl.DataFrame) -> pl.DataFrame:
-    """For each icao, keep only the earliest row with each unique signature.
-    
-    This is used for deduplicating across multiple compressed chunks.
-    """
-    # Create signature column
-    df = df.with_columns(
-        pl.concat_str([pl.col(c).cast(pl.Utf8).fill_null("") for c in COLUMNS], separator="|").alias("_signature")
-    )
-    # Group by icao and signature, take first row (earliest due to time sort)
-    df = df.sort("time")
-    df_deduped = df.group_by(["icao", "_signature"]).first()
-    df_deduped = df_deduped.drop("_signature")
-    df_deduped = df_deduped.sort("time")
-    return df_deduped
-
-
 def compress_df_polars(df: pl.DataFrame, icao: str) -> pl.DataFrame:
    """Compress a single ICAO group to its most informative row using Polars."""
    # Create signature string
@@ -99,9 +82,6 @@ def compress_df_polars(df: pl.DataFrame, icao: str) -> pl.DataFrame:
 def compress_multi_icao_df(df: pl.DataFrame, verbose: bool = True) -> pl.DataFrame:
    """Compress a DataFrame with multiple ICAOs to one row per ICAO.
    
-    This is the main entry point for compressing ADS-B data.
-    Used by both daily GitHub Actions runs and historical AWS runs.
-    
    Args:
        df: DataFrame with columns ['time', 'icao'] + COLUMNS
        verbose: Whether to print progress
@@ -120,29 +100,27 @@ def compress_multi_icao_df(df: pl.DataFrame, verbose: bool = True) -> pl.DataFra
        if col in df.columns:
            df = df.with_columns(pl.col(col).cast(pl.Utf8).fill_null(""))
    
-    # First pass: quick deduplication of exact duplicates
+    # Quick deduplication of exact duplicates
    df = df.unique(subset=['icao'] + COLUMNS, keep='first')
    if verbose:
        print(f"After quick dedup: {df.height} records")
    
-    # Second pass: sophisticated compression per ICAO
+    # Compress per ICAO
    if verbose:
        print("Compressing per ICAO...")
    
-    # Process each ICAO group
    icao_groups = df.partition_by('icao', as_dict=True, maintain_order=True)
    compressed_dfs = []
    
    for icao_key, group_df in icao_groups.items():
-        # partition_by with as_dict=True returns tuple keys, extract first element
-        icao = icao_key[0] if isinstance(icao_key, tuple) else icao_key
+        icao = icao_key[0]
        compressed = compress_df_polars(group_df, str(icao))
        compressed_dfs.append(compressed)
    
    if compressed_dfs:
        df_compressed = pl.concat(compressed_dfs)
    else:
-        df_compressed = df.head(0)  # Empty with same schema
+        df_compressed = df.head(0)
    
    if verbose:
        print(f"After compress: {df_compressed.height} records")
@@ -155,45 +133,22 @@ def compress_multi_icao_df(df: pl.DataFrame, verbose: bool = True) -> pl.DataFra
    return df_compressed


-def load_raw_adsb_for_day(day):
-    """Load raw ADS-B data for a day from parquet file."""
-    from datetime import timedelta
+def load_parquet_part(part_id: int, date: str) -> pl.DataFrame:
+    """Load a single parquet part file for a date.
+    
+    Args:
+        part_id: Part ID (e.g., 1, 2, 3)
+        date: Date string in YYYY-MM-DD format
+    
+    Returns:
+        DataFrame with ADS-B data
+    """
    from pathlib import Path
    
-    start_time = day.replace(hour=0, minute=0, second=0, microsecond=0)
-    
-    # Check for parquet file first
-    version_date = f"v{start_time.strftime('%Y.%m.%d')}"
-    parquet_file = Path(f"data/output/parquet_output/{version_date}.parquet")
+    parquet_file = Path(f"data/output/parquet_output/part_{part_id}_{date}.parquet")
    
    if not parquet_file.exists():
-        # Try to generate parquet file by calling the download function
-        print(f"  Parquet file not found: {parquet_file}")
-        print(f"  Attempting to download and generate parquet for {start_time.strftime('%Y-%m-%d')}...")
-        
-        from download_adsb_data_to_parquet import create_parquet_for_day
-        result_path = create_parquet_for_day(start_time, keep_folders=False)
-        
-        if result_path:
-            print(f"  Successfully generated parquet file: {result_path}")
-        else:
-            raise Exception("Failed to generate parquet file")
-    
-    if parquet_file.exists():
-        print(f"  Loading from parquet: {parquet_file}")
-        df = pl.read_parquet(
-            parquet_file, 
-            columns=['time', 'icao', 'r', 't', 'dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category']
-        )
-        
-        # Convert to timezone-naive datetime
-        if df["time"].dtype == pl.Datetime:
-            df = df.with_columns(pl.col("time").dt.replace_time_zone(None))
-        
-        return df
-    else:
-        # Return empty DataFrame if parquet file doesn't exist
-        print(f"  No data available for {start_time.strftime('%Y-%m-%d')}")
+        print(f"Parquet file not found: {parquet_file}")
        return pl.DataFrame(schema={
            'time': pl.Datetime,
            'icao': pl.Utf8,
@@ -205,17 +160,33 @@ def load_raw_adsb_for_day(day):
            'desc': pl.Utf8,
            'aircraft_category': pl.Utf8
        })
+    
+    print(f"Loading from parquet: {parquet_file}")
+    df = pl.read_parquet(
+        parquet_file,
+        columns=['time', 'icao', 'r', 't', 'dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category']
+    )
+    
+    # Convert to timezone-naive datetime
+    if df["time"].dtype == pl.Datetime:
+        df = df.with_columns(pl.col("time").dt.replace_time_zone(None))
+    os.remove(parquet_file)
+    return df


-def load_historical_for_day(day):
-    """Load and compress historical ADS-B data for a day."""
-    df = load_raw_adsb_for_day(day)
+def compress_parquet_part(part_id: int, date: str) -> pl.DataFrame:
+    """Load and compress a single parquet part file."""
+    df = load_parquet_part(part_id, date)
+    
    if df.height == 0:
        return df
+
+    # Filter to rows within the given date (UTC-naive). This is because sometimes adsb.lol export can have rows at 00:00:00 of next day or similar.
+    date_lit = pl.lit(date).str.strptime(pl.Date, "%Y-%m-%d")
+    df = df.filter(pl.col("time").dt.date() == date_lit)
    
-    print(f"Loaded {df.height} raw records for {day.strftime('%Y-%m-%d')}")
+    print(f"Loaded {df.height} raw records for part {part_id}, date {date}")
    
-    # Use shared compression function
    return compress_multi_icao_df(df, verbose=True)


@@ -223,52 +194,4 @@ def concat_compressed_dfs(df_base, df_new):
    """Concatenate base and new compressed dataframes, keeping the most informative row per ICAO."""
    # Combine both dataframes
    df_combined = pl.concat([df_base, df_new])
-    
-    # Sort by ICAO and time
-    df_combined = df_combined.sort(['icao', 'time'])
-    
-    # Fill null values
-    for col in COLUMNS:
-        if col in df_combined.columns:
-            df_combined = df_combined.with_columns(pl.col(col).fill_null(""))
-    
-    # Apply compression logic per ICAO to get the best row
-    icao_groups = df_combined.partition_by('icao', as_dict=True, maintain_order=True)
-    compressed_dfs = []
-    
-    for icao, group_df in icao_groups.items():
-        compressed = compress_df_polars(group_df, icao)
-        compressed_dfs.append(compressed)
-    
-    if compressed_dfs:
-        df_compressed = pl.concat(compressed_dfs)
-    else:
-        df_compressed = df_combined.head(0)
-    
-    # Sort by time
-    df_compressed = df_compressed.sort('time')
-    
-    return df_compressed
-
-
-def get_latest_aircraft_adsb_csv_df():
-    """Download and load the latest ADS-B CSV from GitHub releases."""
-    from get_latest_release import download_latest_aircraft_adsb_csv
-    import re
-    
-    csv_path = download_latest_aircraft_adsb_csv()
-    df = pl.read_csv(csv_path, null_values=[""])
-    
-    # Fill nulls with empty strings
-    for col in df.columns:
-        if df[col].dtype == pl.Utf8:
-            df = df.with_columns(pl.col(col).fill_null(""))
-    
-    # Extract start date from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv
-    match = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path))
-    if not match:
-        raise ValueError(f"Could not extract date from filename: {csv_path.name}")
-    
-    date_str = match.group(1)
-    return df, date_str
-
+    return df_combined
@@ -0,0 +1,50 @@
+from pathlib import Path
+import polars as pl
+import argparse
+
+OUTPUT_DIR = Path("./data/output")
+CORRECT_ORDER_OF_COLUMNS = ["time", "icao", "r", "t", "dbFlags", "ownOp", "year", "desc", "aircraft_category"]
+
+def main():
+    parser = argparse.ArgumentParser(description="Concatenate compressed parquet files for a single day")
+    parser.add_argument("--date", type=str, required=True, help="Date in YYYY-MM-DD format")
+    parser.add_argument("--concat_with_latest_csv", action="store_true", help="Whether to also concatenate with the latest CSV from GitHub releases")
+    args = parser.parse_args()
+
+    compressed_dir = OUTPUT_DIR / "compressed"
+    date_dir = compressed_dir / args.date
+    if not date_dir.is_dir():
+        raise FileNotFoundError(f"No date folder found: {date_dir}")
+
+    parquet_files = sorted(date_dir.glob("*.parquet"))
+    if not parquet_files:
+        raise FileNotFoundError(f"No parquet files found in {date_dir}")
+
+    frames = [pl.read_parquet(p) for p in parquet_files]
+    df = pl.concat(frames, how="vertical", rechunk=True)
+
+    df = df.sort(["time", "icao"])
+    df = df.select(CORRECT_ORDER_OF_COLUMNS)
+    
+    output_path = OUTPUT_DIR / f"openairframes_adsb_{args.date}.parquet"
+    print(f"Writing combined parquet to {output_path} with {df.height} rows")
+    df.write_parquet(output_path)
+
+    csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{args.date}.csv.gz"
+    print(f"Writing combined csv.gz to {csv_output_path} with {df.height} rows")
+    df.write_csv(csv_output_path, compression="gzip")
+
+    if args.concat_with_latest_csv:
+        print("Loading latest CSV from GitHub releases to concatenate with...")
+        from src.get_latest_release import get_latest_aircraft_adsb_csv_df
+        df_latest_csv, csv_date = get_latest_aircraft_adsb_csv_df()
+        # Ensure column order matches before concatenating
+        df_latest_csv = df_latest_csv.select(CORRECT_ORDER_OF_COLUMNS)
+        from src.adsb.compress_adsb_to_aircraft_data import concat_compressed_dfs
+        df_final = concat_compressed_dfs(df_latest_csv, df)
+        df_final = df_final.select(CORRECT_ORDER_OF_COLUMNS)
+        final_csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{csv_date}_{args.date}.csv.gz"
+        df_final.write_csv(final_csv_output_path, compression="gzip")
+
+if __name__ == "__main__":
+    main()
@@ -1,42 +1,33 @@
 """
 Downloads adsb.lol data and writes to Parquet files.

-Usage:
-    python -m src.process_historical_adsb_data.download_to_parquet 2025-01-01 2025-01-02
-
-This will download trace data for the specified date range and output Parquet files.
-
-This file is self-contained and does not import from other project modules.
+This file contains utility functions for downloading and processing adsb.lol trace data.
+Used by the historical ADS-B processing pipeline.
 """
-import gc
-import glob
+import datetime as dt
 import gzip
+import os
+import re
 import resource
 import shutil
-import sys
-import logging
-import time
-import re
 import signal
-import concurrent.futures
 import subprocess
-import os
-import argparse
-import datetime as dt
-from datetime import datetime, timedelta, timezone
-import urllib.request
+import sys
 import urllib.error
-
+import urllib.request
+from datetime import datetime
+import time
 import orjson
 import pyarrow as pa
 import pyarrow.parquet as pq
+from pathlib import Path


 # ============================================================================
 # Configuration
 # ============================================================================

-OUTPUT_DIR = "./data/output"
+OUTPUT_DIR = Path("./data/output")
 os.makedirs(OUTPUT_DIR, exist_ok=True)

 PARQUET_DIR = os.path.join(OUTPUT_DIR, "parquet_output")
@@ -76,20 +67,16 @@ def timeout_handler(signum, frame):
    raise DownloadTimeoutException("Download timed out after 40 seconds")


-def fetch_releases(version_date: str) -> list:
-    """Fetch GitHub releases for a given version date from adsblol."""
-    year = version_date.split('.')[0][1:]
-    if version_date == "v2024.12.31":
-        year = "2025"
+def _fetch_releases_from_repo(year: str, version_date: str) -> list:
+    """Fetch GitHub releases for a given version date from a specific year's adsblol repo."""
    BASE_URL = f"https://api.github.com/repos/adsblol/globe_history_{year}/releases"
-    # Match exact release name, exclude tmp releases
-    PATTERN = rf"^{re.escape(version_date)}-planes-readsb-prod-\d+$"
+    PATTERN = rf"^{re.escape(version_date)}-planes-readsb-prod-\d+(tmp)?$"
    releases = []
    page = 1
    
    while True:
        max_retries = 10
-        retry_delay = 60
+        retry_delay = 60*5
        
        for attempt in range(1, max_retries + 1):
            try:
@@ -101,7 +88,7 @@ def fetch_releases(version_date: str) -> list:
                    else:
                        print(f"Failed to fetch releases (attempt {attempt}/{max_retries}): {response.status} {response.reason}")
                        if attempt < max_retries:
-                            print(f"Waiting {retry_delay} seconds before retry...")
+                            print(f"Waiting {retry_delay} seconds before retry")
                            time.sleep(retry_delay)
                        else:
                            print(f"Giving up after {max_retries} attempts")
@@ -109,7 +96,7 @@ def fetch_releases(version_date: str) -> list:
            except Exception as e:
                print(f"Request exception (attempt {attempt}/{max_retries}): {e}")
                if attempt < max_retries:
-                    print(f"Waiting {retry_delay} seconds before retry...")
+                    print(f"Waiting {retry_delay} seconds before retry")
                    time.sleep(retry_delay)
                else:
                    print(f"Giving up after {max_retries} attempts")
@@ -123,6 +110,25 @@ def fetch_releases(version_date: str) -> list:
    return releases


+def fetch_releases(version_date: str) -> list:
+    """Fetch GitHub releases for a given version date from adsblol.
+    
+    For Dec 31 dates, if no releases are found in the current year's repo,
+    also checks the next year's repo (adsblol sometimes publishes Dec 31
+    data in the following year's repository).
+    """
+    year = version_date.split('.')[0][1:]
+    releases = _fetch_releases_from_repo(year, version_date)
+    
+    # For last day of year, also check next year's repo if nothing found
+    if not releases and version_date.endswith(".12.31"):
+        next_year = str(int(year) + 1)
+        print(f"No releases found for {version_date} in {year} repo, checking {next_year} repo")
+        releases = _fetch_releases_from_repo(next_year, version_date)
+    
+    return releases
+
+
 def download_asset(asset_url: str, file_path: str) -> bool:
    """Download a single release asset."""
    os.makedirs(os.path.dirname(file_path) or OUTPUT_DIR, exist_ok=True)
@@ -131,33 +137,58 @@ def download_asset(asset_url: str, file_path: str) -> bool:
        print(f"[SKIP] {file_path} already downloaded.")
        return True
    
-    print(f"Downloading {asset_url}...")
-    try:
-        signal.signal(signal.SIGALRM, timeout_handler)
-        signal.alarm(40)  # 40-second timeout
-        
-        req = urllib.request.Request(asset_url, headers=HEADERS)
-        with urllib.request.urlopen(req) as response:
-            signal.alarm(0)
-            
-            if response.status == 200:
-                with open(file_path, "wb") as file:
-                    while True:
-                        chunk = response.read(8192)
-                        if not chunk:
-                            break
-                        file.write(chunk)
-                print(f"Saved {file_path}")
-                return True
+    max_retries = 2
+    retry_delay = 30
+    timeout_seconds = 140
+    
+    for attempt in range(1, max_retries + 1):
+        print(f"Downloading {asset_url} (attempt {attempt}/{max_retries})")
+        try:
+            req = urllib.request.Request(asset_url, headers=HEADERS)
+            with urllib.request.urlopen(req, timeout=timeout_seconds) as response:
+                if response.status == 200:
+                    with open(file_path, "wb") as file:
+                        while True:
+                            chunk = response.read(8192)
+                            if not chunk:
+                                break
+                            file.write(chunk)
+                    print(f"Saved {file_path}")
+                    return True
+                else:
+                    print(f"Failed to download {asset_url}: {response.status} {response.msg}")
+                    if attempt < max_retries:
+                        print(f"Waiting {retry_delay} seconds before retry")
+                        time.sleep(retry_delay)
+                    else:
+                        return False
+        except urllib.error.HTTPError as e:
+            if e.code == 404:
+                print(f"404 Not Found: {asset_url}")
+                raise Exception(f"Asset not found (404): {asset_url}")
+            else:
+                print(f"HTTP error occurred (attempt {attempt}/{max_retries}): {e.code} {e.reason}")
+                if attempt < max_retries:
+                    print(f"Waiting {retry_delay} seconds before retry")
+                    time.sleep(retry_delay)
+                else:
+                    return False
+        except urllib.error.URLError as e:
+            print(f"URL/Timeout error (attempt {attempt}/{max_retries}): {e}")
+            if attempt < max_retries:
+                print(f"Waiting {retry_delay} seconds before retry")
+                time.sleep(retry_delay)
            else:
-                print(f"Failed to download {asset_url}: {response.status} {response.msg}")
                return False
-    except DownloadTimeoutException as e:
-        print(f"Download aborted for {asset_url}: {e}")
-        return False
-    except Exception as e:
-        print(f"An error occurred while downloading {asset_url}: {e}")
-        return False
+        except Exception as e:
+            print(f"An error occurred (attempt {attempt}/{max_retries}): {e}")
+            if attempt < max_retries:
+                print(f"Waiting {retry_delay} seconds before retry")
+                time.sleep(retry_delay)
+            else:
+                return False
+    
+    return False


 def extract_split_archive(file_paths: list, extract_dir: str) -> bool:
@@ -389,8 +420,6 @@ COLUMNS = [

 OS_CPU_COUNT = os.cpu_count() or 1
 MAX_WORKERS = OS_CPU_COUNT if OS_CPU_COUNT > 4 else 1
-CHUNK_SIZE = MAX_WORKERS * 500  # Reduced for lower RAM usage
-BATCH_SIZE = 250_000  # Fixed size for predictable memory usage (~500MB per batch)

 # PyArrow schema for efficient Parquet writing
 PARQUET_SCHEMA = pa.schema([
@@ -478,211 +507,6 @@ def collect_trace_files_with_find(root_dir):
    return trace_dict


-def generate_version_dates(start_date: str, end_date: str) -> list:
-    """Generate a list of dates from start_date to end_date inclusive."""
-    start = datetime.strptime(start_date, "%Y-%m-%d")
-    end = datetime.strptime(end_date, "%Y-%m-%d")
-    delta = end - start
-    return [start + timedelta(days=i) for i in range(delta.days + 1)]
-
-
-def safe_process(fp):
-    """Safely process a file, returning empty list on error."""
-    try:
-        return process_file(fp)
-    except Exception as e:
-        logging.error(f"Error processing {fp}: {e}")
-        return []
-
-
-def rows_to_arrow_table(rows: list) -> pa.Table:
-    """Convert list of rows to a PyArrow Table directly (no pandas)."""
-    # Transpose rows into columns
-    columns = list(zip(*rows))
-    
-    # Build arrays for each column according to schema
-    arrays = []
-    for i, field in enumerate(PARQUET_SCHEMA):
-        col_data = list(columns[i]) if i < len(columns) else [None] * len(rows)
-        arrays.append(pa.array(col_data, type=field.type))
-    
-    return pa.Table.from_arrays(arrays, schema=PARQUET_SCHEMA)
-
-
-def write_batch_to_parquet(rows: list, version_date: str, batch_idx: int):
-    """Write a batch of rows to a Parquet file."""
-    if not rows:
-        return
-    
-    table = rows_to_arrow_table(rows)
-    
-    parquet_path = os.path.join(PARQUET_DIR, f"{version_date}_batch_{batch_idx:04d}.parquet")
-    
-    pq.write_table(table, parquet_path, compression='snappy')
-    
-    print(f"Written parquet batch {batch_idx} ({len(rows)} rows) | {get_resource_usage()}")
-
-
-def merge_parquet_files(version_date: str, delete_batches: bool = True):
-    """Merge all batch parquet files for a version_date into a single file using streaming."""
-    pattern = os.path.join(PARQUET_DIR, f"{version_date}_batch_*.parquet")
-    batch_files = sorted(glob.glob(pattern))
-    
-    if not batch_files:
-        print(f"No batch files found for {version_date}")
-        return None
-    
-    print(f"Merging {len(batch_files)} batch files for {version_date} (streaming)...")
-    
-    merged_path = os.path.join(PARQUET_DIR, f"{version_date}.parquet")
-    total_rows = 0
-    
-    # Stream write: read one batch at a time to minimize RAM usage
-    writer = None
-    try:
-        for i, f in enumerate(batch_files):
-            table = pq.read_table(f)
-            total_rows += table.num_rows
-            
-            if writer is None:
-                writer = pq.ParquetWriter(merged_path, table.schema, compression='snappy')
-            
-            writer.write_table(table)
-            
-            # Delete batch file immediately after reading to free disk space
-            if delete_batches:
-                os.remove(f)
-            
-            # Free memory
-            del table
-            if (i + 1) % 10 == 0:
-                gc.collect()
-                print(f"  Merged {i + 1}/{len(batch_files)} batches... | {get_resource_usage()}")
-    finally:
-        if writer is not None:
-            writer.close()
-    
-    print(f"Merged parquet file written to {merged_path} ({total_rows} total rows) | {get_resource_usage()}")
-    
-    if delete_batches:
-        print(f"Deleted {len(batch_files)} batch files during merge")
-    
-    gc.collect()
-    return merged_path
-
-
-def process_version_date(version_date: str, keep_folders: bool = False):
-    """Download, extract, and process trace files for a single version date."""
-    print(f"\nProcessing version_date: {version_date}")
-    extract_dir = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0.tar_0")
-    
-    def collect_trace_files_for_version_date(vd):
-        releases = fetch_releases(vd)
-        if len(releases) == 0:
-            print(f"No releases found for {vd}.")
-            return None
-        
-        downloaded_files = []
-        for release in releases:
-            tag_name = release["tag_name"]
-            print(f"Processing release: {tag_name}")
-
-            # Only download prod-0 if available, else prod-0tmp
-            assets = release.get("assets", [])
-            normal_assets = [
-                a for a in assets
-                if "planes-readsb-prod-0." in a["name"] and "tmp" not in a["name"]
-            ]
-            tmp_assets = [
-                a for a in assets
-                if "planes-readsb-prod-0tmp" in a["name"]
-            ]
-            use_assets = normal_assets if normal_assets else tmp_assets
-
-            for asset in use_assets:
-                asset_name = asset["name"]
-                asset_url = asset["browser_download_url"]
-                file_path = os.path.join(OUTPUT_DIR, asset_name)
-                result = download_asset(asset_url, file_path)
-                if result:
-                    downloaded_files.append(file_path)
-
-        extract_split_archive(downloaded_files, extract_dir)
-        return collect_trace_files_with_find(extract_dir)
-
-    # Check if files already exist
-    pattern = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0*")
-    matches = [p for p in glob.glob(pattern) if os.path.isfile(p)]
-    
-    if matches:
-        print(f"Found existing files for {version_date}:")
-        # Prefer non-tmp slices when reusing existing files
-        normal_matches = [
-            p for p in matches
-            if "-planes-readsb-prod-0." in os.path.basename(p)
-            and "tmp" not in os.path.basename(p)
-        ]
-        downloaded_files = normal_matches if normal_matches else matches
-        
-        extract_split_archive(downloaded_files, extract_dir)
-        trace_files = collect_trace_files_with_find(extract_dir)
-    else:
-        trace_files = collect_trace_files_for_version_date(version_date)
-    
-    if trace_files is None or len(trace_files) == 0:
-        print(f"No trace files found for version_date: {version_date}")
-        return 0
-    
-    file_list = list(trace_files.values())
-    
-    start_time = time.perf_counter()
-    total_num_rows = 0
-    batch_rows = []
-    batch_idx = 0
-    
-    # Process files in chunks
-    for offset in range(0, len(file_list), CHUNK_SIZE):
-        chunk = file_list[offset:offset + CHUNK_SIZE]
-        with concurrent.futures.ProcessPoolExecutor(max_workers=MAX_WORKERS) as process_executor:
-            for rows in process_executor.map(safe_process, chunk):
-                if not rows:
-                    continue
-                batch_rows.extend(rows)
-                
-                if len(batch_rows) >= BATCH_SIZE:
-                    total_num_rows += len(batch_rows)
-                    write_batch_to_parquet(batch_rows, version_date, batch_idx)
-                    batch_idx += 1
-                    batch_rows = []
-                    
-                    elapsed = time.perf_counter() - start_time
-                    speed = total_num_rows / elapsed if elapsed > 0 else 0
-                    print(f"[{version_date}] processed {total_num_rows} rows in {elapsed:.2f}s ({speed:.2f} rows/s)")
-        
-        gc.collect()
-    
-    # Final batch
-    if batch_rows:
-        total_num_rows += len(batch_rows)
-        write_batch_to_parquet(batch_rows, version_date, batch_idx)
-        elapsed = time.perf_counter() - start_time
-        speed = total_num_rows / elapsed if elapsed > 0 else 0
-        print(f"[{version_date}] processed {total_num_rows} rows in {elapsed:.2f}s ({speed:.2f} rows/s)")
-    
-    print(f"Total rows processed for version_date {version_date}: {total_num_rows}")
-    
-    # Clean up extracted directory immediately after processing (before merging parquet files)
-    if not keep_folders and os.path.isdir(extract_dir):
-        print(f"Deleting extraction directory with 100,000+ files: {extract_dir}")
-        shutil.rmtree(extract_dir)
-        print(f"Successfully deleted extraction directory: {extract_dir} | {get_resource_usage()}")
-    
-    # Merge batch files into a single parquet file
-    merge_parquet_files(version_date, delete_batches=True)
-    
-    return total_num_rows
-
-
 def create_parquet_for_day(day, keep_folders: bool = False):
    """Create parquet file for a single day.
    
@@ -706,42 +530,10 @@ def create_parquet_for_day(day, keep_folders: bool = False):
        print(f"Parquet file already exists: {parquet_path}")
        return parquet_path
    
-    print(f"Creating parquet for {version_date}...")
+    print(f"Creating parquet for {version_date}")
    rows_processed = process_version_date(version_date, keep_folders)
    
    if rows_processed > 0 and parquet_path.exists():
        return parquet_path
    else:
        return None
-
-
-def main(start_date: str, end_date: str, keep_folders: bool = False):
-    """Main function to download and convert adsb.lol data to Parquet."""
-    version_dates = [f"v{date.strftime('%Y.%m.%d')}" for date in generate_version_dates(start_date, end_date)]
-    print(f"Processing dates: {version_dates}")
-    
-    total_rows_all = 0
-    for version_date in version_dates:
-        rows_processed = process_version_date(version_date, keep_folders)
-        total_rows_all += rows_processed
-    
-    print(f"\n=== Summary ===")
-    print(f"Total dates processed: {len(version_dates)}")
-    print(f"Total rows written to Parquet: {total_rows_all}")
-    print(f"Parquet files location: {PARQUET_DIR}")
-
-
-if __name__ == "__main__":
-    logging.basicConfig(level=logging.INFO, stream=sys.stdout, force=True)
-    
-    parser = argparse.ArgumentParser(
-        description="Download adsb.lol data and write to Parquet files"
-    )
-    parser.add_argument("start_date", type=str, help="Start date in YYYY-MM-DD format")
-    parser.add_argument("end_date", type=str, help="End date in YYYY-MM-DD format")
-    parser.add_argument("--keep-folders", action="store_true", 
-                        help="Keep extracted folders after processing")
-    
-    args = parser.parse_args()
-    
-    main(args.start_date, args.end_date, args.keep_folders)
@@ -1,9 +1,7 @@
 """
-Downloads and extracts adsb.lol tar files, then lists all ICAO folders.
+Downloads and extracts adsb.lol tar files for a single day, then lists all ICAO folders.
 This is the first step of the map-reduce pipeline.

-Supports both single-day (daily) and multi-day (historical) modes.
-
 Outputs:
 - Extracted trace files in data/output/{version_date}-planes-readsb-prod-0.tar_0/
 - ICAO manifest at data/output/icao_manifest_{date}.txt
@@ -25,11 +23,6 @@ from src.adsb.download_adsb_data_to_parquet import (
 )


-def get_target_day() -> datetime:
-    """Get yesterday's date (the day we're processing)."""
-    return datetime.utcnow() - timedelta(days=1)
-
-
 def download_and_extract(version_date: str) -> str | None:
    """Download and extract tar files, return extract directory path."""
    extract_dir = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0.tar_0")
@@ -59,6 +52,12 @@ def download_and_extract(version_date: str) -> str | None:
            print(f"No releases found for {version_date}")
            return None
        
+        # Prefer non-tmp releases; only use tmp if no normal releases exist
+        normal_releases = [r for r in releases if "tmp" not in r["tag_name"]]
+        tmp_releases = [r for r in releases if "tmp" in r["tag_name"]]
+        releases = normal_releases if normal_releases else tmp_releases
+        print(f"Using {'normal' if normal_releases else 'tmp'} releases ({len(releases)} found)")
+        
        downloaded_files = []
        for release in releases:
            tag_name = release["tag_name"]
@@ -100,21 +99,6 @@ def list_icao_folders(extract_dir: str) -> list[str]:
    return icaos


-def write_manifest(icaos: list[str], manifest_id: str) -> str:
-    """Write ICAO list to manifest file.
-    
-    Args:
-        icaos: List of ICAO codes
-        manifest_id: Identifier for manifest file (date or date range)
-    """
-    manifest_path = os.path.join(OUTPUT_DIR, f"icao_manifest_{manifest_id}.txt")
-    with open(manifest_path, "w") as f:
-        for icao in sorted(icaos):
-            f.write(f"{icao}\n")
-    print(f"Wrote manifest with {len(icaos)} ICAOs to {manifest_path}")
-    return manifest_path
-
-
 def process_single_day(target_day: datetime) -> tuple[str | None, list[str]]:
    """Process a single day: download, extract, list ICAOs.
    
@@ -129,82 +113,50 @@ def process_single_day(target_day: datetime) -> tuple[str | None, list[str]]:
    extract_dir = download_and_extract(version_date)
    if not extract_dir:
        print(f"Failed to download/extract data for {date_str}")
-        return None, []
+        raise Exception(f"No data available for {date_str}")
    
    icaos = list_icao_folders(extract_dir)
    print(f"Found {len(icaos)} ICAOs for {date_str}")
    
    return extract_dir, icaos

-
-def process_date_range(start_date: datetime, end_date: datetime) -> set[str]:
-    """Process multiple days: download, extract, combine ICAO lists.
-    
-    Args:
-        start_date: Start date (inclusive)
-        end_date: End date (inclusive)
-    
-    Returns:
-        Combined set of all ICAOs across the date range
-    """
-    all_icaos: set[str] = set()
-    current = start_date
-    
-    # Both start and end are inclusive
-    while current <= end_date:
-        _, icaos = process_single_day(current)
-        all_icaos.update(icaos)
-        current += timedelta(days=1)
-    
-    return all_icaos
+from pathlib import Path
+import tarfile
+NUMBER_PARTS = 4
+def split_folders_into_gzip_archives(extract_dir: Path, tar_output_dir: Path, icaos: list[str], parts = NUMBER_PARTS) -> list[str]:
+    traces_dir = extract_dir / "traces"
+    buckets = sorted(traces_dir.iterdir())
+    tars = []
+    for i in range(parts):
+        tar_path = tar_output_dir / f"{tar_output_dir.name}_part_{i}.tar.gz"
+        tars.append(tarfile.open(tar_path, "w:gz"))
+    for idx, bucket_path in enumerate(buckets):
+        tar_idx = idx % parts
+        tars[tar_idx].add(bucket_path, arcname=bucket_path.name)
+    for tar in tars:
+        tar.close()


 def main():
-    parser = argparse.ArgumentParser(description="Download and list ICAOs from adsb.lol data")
+    parser = argparse.ArgumentParser(description="Download and list ICAOs from adsb.lol data for a single day")
    parser.add_argument("--date", type=str, help="Single date in YYYY-MM-DD format (default: yesterday)")
-    parser.add_argument("--start-date", type=str, help="Start date for range (YYYY-MM-DD)")
-    parser.add_argument("--end-date", type=str, help="End date for range (YYYY-MM-DD)")
    args = parser.parse_args()
    
-    # Determine mode: single day or date range
-    if args.start_date and args.end_date:
-        # Historical mode: process date range
-        start_date = datetime.strptime(args.start_date, "%Y-%m-%d")
-        end_date = datetime.strptime(args.end_date, "%Y-%m-%d")
-        
-        print(f"Processing date range: {args.start_date} to {args.end_date}")
-        
-        all_icaos = process_date_range(start_date, end_date)
-        
-        if not all_icaos:
-            print("No ICAOs found in date range")
-            sys.exit(1)
-        
-        # Write combined manifest with range identifier
-        manifest_id = f"{args.start_date}_{args.end_date}"
-        write_manifest(list(all_icaos), manifest_id)
-        
-        print(f"\nDone! Total ICAOs: {len(all_icaos)}")
-        
-    else:
-        # Daily mode: single day
-        if args.date:
-            target_day = datetime.strptime(args.date, "%Y-%m-%d")
-        else:
-            target_day = get_target_day()
-        
-        date_str = target_day.strftime("%Y-%m-%d")
-        
-        extract_dir, icaos = process_single_day(target_day)
-        
-        if not icaos:
-            print("No ICAOs found")
-            sys.exit(1)
-        
-        write_manifest(icaos, date_str)
-        
-        print(f"\nDone! Extract dir: {extract_dir}")
-        print(f"Total ICAOs: {len(icaos)}")
+    target_day = datetime.strptime(args.date, "%Y-%m-%d")
+    date_str = target_day.strftime("%Y-%m-%d")
+    tar_output_dir = Path(f"./data/output/adsb_archives/{date_str}")
+    
+    extract_dir, icaos = process_single_day(target_day)
+    extract_dir = Path(extract_dir)
+    print(extract_dir)
+    tar_output_dir.mkdir(parents=True, exist_ok=True)
+    split_folders_into_gzip_archives(extract_dir, tar_output_dir, icaos)
+    if not icaos:
+        print("No ICAOs found")
+        sys.exit(1)
+    
+    print(f"\nDone! Extract dir: {extract_dir}")
+    print(f"Total ICAOs: {len(icaos)}")


 if __name__ == "__main__":
@@ -41,7 +41,7 @@ def main() -> None:
    """Main entry point for GitHub Actions."""
    start_date = os.environ.get("INPUT_START_DATE")
    end_date = os.environ.get("INPUT_END_DATE")
-    chunk_days = int(os.environ.get("INPUT_CHUNK_DAYS", "7"))
+    chunk_days = int(os.environ.get("INPUT_CHUNK_DAYS", "1"))
    
    if not start_date or not end_date:
        print("ERROR: INPUT_START_DATE and INPUT_END_DATE must be set", file=sys.stderr)
@@ -0,0 +1,78 @@
+"""
+Main pipeline for processing ADS-B data from adsb.lol.
+
+Usage:
+    python -m src.adsb.main --date 2026-01-01
+    python -m src.adsb.main --start_date 2026-01-01 --end_date 2026-01-03
+"""
+import argparse
+import subprocess
+import sys
+from datetime import datetime, timedelta
+
+import polars as pl
+
+from src.adsb.download_and_list_icaos import NUMBER_PARTS
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Process ADS-B data for a single day or date range")
+    parser.add_argument("--date", type=str, help="Single date in YYYY-MM-DD format")
+    parser.add_argument("--start_date", type=str, help="Start date (inclusive, YYYY-MM-DD)")
+    parser.add_argument("--end_date", type=str, help="End date (exclusive, YYYY-MM-DD)")
+    parser.add_argument("--concat_with_latest_csv", action="store_true", help="Also concatenate with latest CSV from GitHub releases")
+    args = parser.parse_args()
+
+    if args.date and (args.start_date or args.end_date):
+        raise SystemExit("Use --date or --start_date/--end_date, not both.")
+
+    if args.date:
+        start_date = datetime.strptime(args.date, "%Y-%m-%d")
+        end_date = start_date + timedelta(days=1)
+    else:
+        if not args.start_date or not args.end_date:
+            raise SystemExit("Provide --start_date and --end_date, or use --date.")
+        start_date = datetime.strptime(args.start_date, "%Y-%m-%d")
+        end_date = datetime.strptime(args.end_date, "%Y-%m-%d")
+
+    current = start_date
+    while current < end_date:
+        date_str = current.strftime("%Y-%m-%d")
+        print(f"Processing day: {date_str}")
+
+        # Download and split
+        subprocess.run([sys.executable, "-m", "src.adsb.download_and_list_icaos", "--date", date_str], check=True)
+
+        # Process parts
+        for part_id in range(NUMBER_PARTS):
+            subprocess.run([sys.executable, "-m", "src.adsb.process_icao_chunk", "--part-id", str(part_id), "--date", date_str], check=True)
+
+        # Concatenate
+        concat_cmd = [sys.executable, "-m", "src.adsb.concat_parquet_to_final", "--date", date_str]
+        if args.concat_with_latest_csv:
+            concat_cmd.append("--concat_with_latest_csv")
+        subprocess.run(concat_cmd, check=True)
+
+        current += timedelta(days=1)
+
+    if end_date - start_date > timedelta(days=1):
+        dates = []
+        cur = start_date
+        while cur < end_date:
+            dates.append(cur.strftime("%Y-%m-%d"))
+            cur += timedelta(days=1)
+        csv_files = [
+            f"data/outputs/openairframes_adsb_{d}_{d}.csv"
+            for d in dates
+        ]
+        frames = [pl.read_csv(p) for p in csv_files]
+        df = pl.concat(frames, how="vertical", rechunk=True)
+        output_path = f"data/outputs/openairframes_adsb_{start_date.strftime('%Y-%m-%d')}_{end_date.strftime('%Y-%m-%d')}.csv"
+        df.write_csv(output_path)
+        print(f"Wrote combined CSV: {output_path}")
+
+    print("Done")
+
+
+if __name__ == "__main__":
+    main()
@@ -1,18 +1,9 @@
 """
-Processes a chunk of ICAOs from pre-extracted trace files.
+Processes trace files from a single archive part for a single day.
 This is the map phase of the map-reduce pipeline.

-Supports both single-day (daily) and multi-day (historical) modes.
-
-Expects extract_dir to already exist with trace files.
-Reads ICAO manifest to determine which ICAOs to process based on chunk-id.
-
 Usage:
-    # Daily mode (single day)
-    python -m src.adsb.process_icao_chunk --chunk-id 0 --total-chunks 4
-    
-    # Historical mode (date range)
-    python -m src.adsb.process_icao_chunk --chunk-id 0 --total-chunks 4 --start-date 2024-01-01 --end-date 2024-01-07
+    python -m src.adsb.process_icao_chunk --part-id 1 --date 2026-01-01
 """
 import gc
 import os
@@ -21,6 +12,9 @@ import argparse
 import time
 import concurrent.futures
 from datetime import datetime, timedelta
+import tarfile
+import tempfile
+import shutil

 import pyarrow as pa
 import pyarrow.parquet as pq
@@ -37,72 +31,21 @@ from src.adsb.download_adsb_data_to_parquet import (
 )


-CHUNK_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "adsb_chunks")
-os.makedirs(CHUNK_OUTPUT_DIR, exist_ok=True)
-
 # Smaller batch size for memory efficiency
 BATCH_SIZE = 100_000

-
-def get_target_day() -> datetime:
-    """Get yesterday's date (the day we're processing)."""
-    return datetime.utcnow() - timedelta(days=1)
-
-
-def read_manifest(manifest_id: str) -> list[str]:
-    """Read ICAO manifest file.
+def build_trace_file_map(archive_path: str) -> dict[str, str]:
+    """Build a map of ICAO -> trace file path by extracting tar.gz archive."""
+    print(f"Extracting {archive_path}...")
    
-    Args:
-        manifest_id: Either a date string (YYYY-MM-DD) or range string (YYYY-MM-DD_YYYY-MM-DD)
-    """
-    manifest_path = os.path.join(OUTPUT_DIR, f"icao_manifest_{manifest_id}.txt")
-    if not os.path.exists(manifest_path):
-        raise FileNotFoundError(f"Manifest not found: {manifest_path}")
+    temp_dir = tempfile.mkdtemp(prefix="adsb_extract_")
    
-    with open(manifest_path, "r") as f:
-        icaos = [line.strip() for line in f if line.strip()]
-    return icaos
-
-
-def deterministic_hash(s: str) -> int:
-    """Return a deterministic hash for a string (unlike Python's hash() which is randomized)."""
-    # Use sum of byte values - simple but deterministic
-    return sum(ord(c) for c in s)
-
-
-def get_chunk_icaos(icaos: list[str], chunk_id: int, total_chunks: int) -> list[str]:
-    """Get the subset of ICAOs for this chunk based on deterministic hash partitioning."""
-    return [icao for icao in icaos if deterministic_hash(icao) % total_chunks == chunk_id]
-
-
-def build_trace_file_map(extract_dir: str) -> dict[str, str]:
-    """Build a map of ICAO -> trace file path using find command."""
-    print(f"Building trace file map from {extract_dir}...")
+    with tarfile.open(archive_path, 'r:gz') as tar:
+        tar.extractall(path=temp_dir, filter='data')
    
-    # Debug: check what's in extract_dir
-    if os.path.isdir(extract_dir):
-        items = os.listdir(extract_dir)[:10]
-        print(f"First 10 items in extract_dir: {items}")
-        # Check if there are subdirectories
-        for item in items[:3]:
-            subpath = os.path.join(extract_dir, item)
-            if os.path.isdir(subpath):
-                subitems = os.listdir(subpath)[:5]
-                print(f"  Contents of {item}/: {subitems}")
-    
-    trace_map = collect_trace_files_with_find(extract_dir)
+    trace_map = collect_trace_files_with_find(temp_dir)
    print(f"Found {len(trace_map)} trace files")
    
-    if len(trace_map) == 0:
-        # Debug: try manual find
-        import subprocess
-        result = subprocess.run(
-            ['find', extract_dir, '-type', 'f', '-name', 'trace_full_*'],
-            capture_output=True, text=True
-        )
-        print(f"Manual find output (first 500 chars): {result.stdout[:500]}")
-        print(f"Manual find stderr: {result.stderr[:200]}")
-    
    return trace_map


@@ -125,42 +68,13 @@ def rows_to_table(rows: list) -> pa.Table:


 def process_chunk(
-    chunk_id: int,
-    total_chunks: int,
-    trace_map: dict[str, str],
-    icaos: list[str],
-    output_id: str,
+    trace_files: list[str],
+    part_id: int,
+    date_str: str,
 ) -> str | None:
-    """Process a chunk of ICAOs and write to parquet.
+    """Process trace files and write to a single parquet file."""
    
-    Args:
-        chunk_id: This chunk's ID (0-indexed)
-        total_chunks: Total number of chunks
-        trace_map: Map of ICAO -> trace file path
-        icaos: Full list of ICAOs from manifest
-        output_id: Identifier for output file (date or date range)
-    """
-    chunk_icaos = get_chunk_icaos(icaos, chunk_id, total_chunks)
-    print(f"Chunk {chunk_id}/{total_chunks}: Processing {len(chunk_icaos)} ICAOs")
-    
-    if not chunk_icaos:
-        print(f"Chunk {chunk_id}: No ICAOs to process")
-        return None
-    
-    # Get trace file paths from the map
-    trace_files = []
-    for icao in chunk_icaos:
-        if icao in trace_map:
-            trace_files.append(trace_map[icao])
-    
-    print(f"Chunk {chunk_id}: Found {len(trace_files)} trace files")
-    
-    if not trace_files:
-        print(f"Chunk {chunk_id}: No trace files found")
-        return None
-    
-    # Process files and write parquet in batches
-    output_path = os.path.join(CHUNK_OUTPUT_DIR, f"chunk_{chunk_id}_{output_id}.parquet")
+    output_path = os.path.join(PARQUET_DIR, f"part_{part_id}_{date_str}.parquet")
    
    start_time = time.perf_counter()
    total_rows = 0
@@ -168,7 +82,8 @@ def process_chunk(
    writer = None
    
    try:
-        # Process in parallel batches
+        writer = pq.ParquetWriter(output_path, PARQUET_SCHEMA, compression='snappy')
+        
        files_per_batch = MAX_WORKERS * 100
        for offset in range(0, len(trace_files), files_per_batch):
            batch_files = trace_files[offset:offset + files_per_batch]
@@ -178,166 +93,63 @@ def process_chunk(
                    if rows:
                        batch_rows.extend(rows)
                        
-                        # Write when batch is full
                        if len(batch_rows) >= BATCH_SIZE:
-                            table = rows_to_table(batch_rows)
+                            writer.write_table(rows_to_table(batch_rows))
                            total_rows += len(batch_rows)
-                            
-                            if writer is None:
-                                writer = pq.ParquetWriter(output_path, PARQUET_SCHEMA, compression='snappy')
-                            writer.write_table(table)
-                            
                            batch_rows = []
-                            del table
                            gc.collect()
-                            
-                            elapsed = time.perf_counter() - start_time
-                            print(f"Chunk {chunk_id}: {total_rows} rows, {elapsed:.1f}s | {get_resource_usage()}")
-            
            gc.collect()
        
-        # Write remaining rows
        if batch_rows:
-            table = rows_to_table(batch_rows)
+            writer.write_table(rows_to_table(batch_rows))
            total_rows += len(batch_rows)
-            
-            if writer is None:
-                writer = pq.ParquetWriter(output_path, PARQUET_SCHEMA, compression='snappy')
-            writer.write_table(table)
-            del table
    
    finally:
        if writer:
            writer.close()
    
-    elapsed = time.perf_counter() - start_time
-    print(f"Chunk {chunk_id}: Done! {total_rows} rows in {elapsed:.1f}s | {get_resource_usage()}")
+    print(f"Part {part_id}: Done! {total_rows} rows in {time.perf_counter() - start_time:.1f}s | {get_resource_usage()}")
    
-    if total_rows > 0:
-        return output_path
-    return None
-
-
-def process_single_day(
-    chunk_id: int,
-    total_chunks: int,
-    target_day: datetime,
-) -> str | None:
-    """Process a single day for this chunk."""
-    date_str = target_day.strftime("%Y-%m-%d")
-    version_date = f"v{target_day.strftime('%Y.%m.%d')}"
-    
-    extract_dir = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0.tar_0")
-    
-    if not os.path.isdir(extract_dir):
-        print(f"Extract directory not found: {extract_dir}")
-        return None
-    
-    trace_map = build_trace_file_map(extract_dir)
-    if not trace_map:
-        print("No trace files found")
-        return None
-    
-    icaos = read_manifest(date_str)
-    print(f"Total ICAOs in manifest: {len(icaos)}")
-    
-    return process_chunk(chunk_id, total_chunks, trace_map, icaos, date_str)
-
-
-def process_date_range(
-    chunk_id: int,
-    total_chunks: int,
-    start_date: datetime,
-    end_date: datetime,
-) -> str | None:
-    """Process a date range for this chunk.
-    
-    Combines trace files from all days in the range.
-    
-    Args:
-        chunk_id: This chunk's ID (0-indexed)
-        total_chunks: Total number of chunks
-        start_date: Start date (inclusive)
-        end_date: End date (inclusive)
-    """
-    start_str = start_date.strftime("%Y-%m-%d")
-    end_str = end_date.strftime("%Y-%m-%d")
-    manifest_id = f"{start_str}_{end_str}"
-    
-    print(f"Processing date range: {start_str} to {end_str}")
-    
-    # Build combined trace map from all days
-    combined_trace_map: dict[str, str] = {}
-    current = start_date
-    
-    # Both start and end are inclusive
-    while current <= end_date:
-        version_date = f"v{current.strftime('%Y.%m.%d')}"
-        extract_dir = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0.tar_0")
-        
-        if os.path.isdir(extract_dir):
-            trace_map = build_trace_file_map(extract_dir)
-            # Later days override earlier days (use most recent trace file)
-            combined_trace_map.update(trace_map)
-            print(f"  {current.strftime('%Y-%m-%d')}: {len(trace_map)} trace files")
-        else:
-            print(f"  {current.strftime('%Y-%m-%d')}: no extract directory")
-        
-        current += timedelta(days=1)
-    
-    if not combined_trace_map:
-        print("No trace files found in date range")
-        return None
-    
-    print(f"Combined trace map: {len(combined_trace_map)} ICAOs")
-    
-    icaos = read_manifest(manifest_id)
-    print(f"Total ICAOs in manifest: {len(icaos)}")
-    
-    return process_chunk(chunk_id, total_chunks, combined_trace_map, icaos, manifest_id)
+    return output_path if total_rows > 0 else None

+from pathlib import Path

 def main():
-    parser = argparse.ArgumentParser(description="Process a chunk of ICAOs")
-    parser.add_argument("--chunk-id", type=int, required=True, help="Chunk ID (0-indexed)")
-    parser.add_argument("--total-chunks", type=int, required=True, help="Total number of chunks")
-    parser.add_argument("--date", type=str, help="Single date in YYYY-MM-DD format (default: yesterday)")
-    parser.add_argument("--start-date", type=str, help="Start date for range (YYYY-MM-DD)")
-    parser.add_argument("--end-date", type=str, help="End date for range (YYYY-MM-DD)")
+    parser = argparse.ArgumentParser(description="Process a single archive part for a day")
+    parser.add_argument("--part-id", type=int, required=True, help="Part ID (1-indexed)")
+    parser.add_argument("--date", type=str, required=True, help="Date in YYYY-MM-DD format")
    args = parser.parse_args()
    
-    print(f"Processing chunk {args.chunk_id}/{args.total_chunks}")
-    print(f"OUTPUT_DIR: {OUTPUT_DIR}")
-    print(f"CHUNK_OUTPUT_DIR: {CHUNK_OUTPUT_DIR}")
-    print(f"Resource usage at start: {get_resource_usage()}")
+    print(f"Processing part {args.part_id} for {args.date}")
    
-    # Debug: List what's in OUTPUT_DIR
-    print(f"\nContents of {OUTPUT_DIR}:")
-    if os.path.isdir(OUTPUT_DIR):
-        for item in os.listdir(OUTPUT_DIR)[:20]:
-            print(f"  - {item}")
-    else:
-        print(f"  Directory does not exist!")
+    # Get specific archive file for this part
+    archive_path = os.path.join(OUTPUT_DIR, "adsb_archives", args.date, f"{args.date}_part_{args.part_id}.tar.gz")
    
-    # Determine mode: single day or date range
-    if args.start_date and args.end_date:
-        # Historical mode
-        start_date = datetime.strptime(args.start_date, "%Y-%m-%d")
-        end_date = datetime.strptime(args.end_date, "%Y-%m-%d")
-        output_path = process_date_range(args.chunk_id, args.total_chunks, start_date, end_date)
-    else:
-        # Daily mode
-        if args.date:
-            target_day = datetime.strptime(args.date, "%Y-%m-%d")
-        else:
-            target_day = get_target_day()
-        output_path = process_single_day(args.chunk_id, args.total_chunks, target_day)
+    # Extract and collect trace files
+    trace_map = build_trace_file_map(archive_path)
+    all_trace_files = list(trace_map.values())
    
-    if output_path:
-        print(f"Output: {output_path}")
-    else:
-        print("No output generated")
+    print(f"Total trace files: {len(all_trace_files)}")
+    
+    # Process and write output
+    output_path = process_chunk(all_trace_files, args.part_id, args.date)
+    
+    from src.adsb.compress_adsb_to_aircraft_data import compress_parquet_part
+    df_compressed = compress_parquet_part(args.part_id, args.date)
+    
+    # Write parquet
+    df_compressed_output = OUTPUT_DIR / "compressed" / args.date/ f"part_{args.part_id}_{args.date}.parquet"
+    os.makedirs(df_compressed_output.parent, exist_ok=True)
+    df_compressed.write_parquet(df_compressed_output, compression='snappy')
+    
+    # Write CSV
+    csv_output = OUTPUT_DIR / "compressed" / args.date / f"part_{args.part_id}_{args.date}.csv"
+    df_compressed.write_csv(csv_output)
+    
+    print(f"Raw output: {output_path}" if output_path else "No raw output generated")
+    print(f"Compressed parquet: {df_compressed_output}")
+    print(f"Compressed CSV: {csv_output}")


 if __name__ == "__main__":
-    main()
+    main()
@@ -1,97 +0,0 @@
-"""
-Reduce step: downloads all chunk CSVs from S3, combines them,
-deduplicates across the full dataset, and uploads the final result.
-
-Environment variables:
-  S3_BUCKET         — bucket with intermediate results
-  RUN_ID            — run identifier matching the map workers
-  GLOBAL_START_DATE — overall start date for output filename
-  GLOBAL_END_DATE   — overall end date for output filename
-"""
-import gzip
-import os
-import shutil
-from pathlib import Path
-
-import boto3
-import polars as pl
-
-from compress_adsb_to_aircraft_data import COLUMNS, deduplicate_by_signature
-
-
-def main():
-    s3_bucket = os.environ["S3_BUCKET"]
-    run_id = os.environ.get("RUN_ID", "default")
-    global_start = os.environ["GLOBAL_START_DATE"]
-    global_end = os.environ["GLOBAL_END_DATE"]
-
-    s3 = boto3.client("s3")
-    prefix = f"intermediate/{run_id}/"
-
-    # List all chunk files for this run
-    paginator = s3.get_paginator("list_objects_v2")
-    chunk_keys = []
-    for page in paginator.paginate(Bucket=s3_bucket, Prefix=prefix):
-        for obj in page.get("Contents", []):
-            if obj["Key"].endswith(".csv.gz"):
-                chunk_keys.append(obj["Key"])
-
-    chunk_keys.sort()
-    print(f"Found {len(chunk_keys)} chunks to combine")
-
-    if not chunk_keys:
-        print("No chunks found — nothing to reduce.")
-        return
-
-    # Download and concatenate all chunks
-    download_dir = Path("/tmp/chunks")
-    download_dir.mkdir(parents=True, exist_ok=True)
-
-    dfs = []
-
-    for key in chunk_keys:
-        gz_path = download_dir / Path(key).name
-        csv_path = gz_path.with_suffix("")  # Remove .gz
-        print(f"Downloading {key}...")
-        s3.download_file(s3_bucket, key, str(gz_path))
-
-        # Decompress
-        with gzip.open(gz_path, 'rb') as f_in:
-            with open(csv_path, 'wb') as f_out:
-                shutil.copyfileobj(f_in, f_out)
-        gz_path.unlink()
-
-        df_chunk = pl.read_csv(csv_path)
-        print(f"  Loaded {df_chunk.height} rows from {csv_path.name}")
-        dfs.append(df_chunk)
-
-        # Free disk space after loading
-        csv_path.unlink()
-
-    df_accumulated = pl.concat(dfs) if dfs else pl.DataFrame()
-    print(f"Combined: {df_accumulated.height} rows before dedup")
-
-    # Final global deduplication
-    df_accumulated = deduplicate_by_signature(df_accumulated)
-    print(f"After dedup: {df_accumulated.height} rows")
-
-    # Write and upload final result
-    output_name = f"openairframes_adsb_{global_start}_{global_end}.csv.gz"
-    csv_output = Path(f"/tmp/openairframes_adsb_{global_start}_{global_end}.csv")
-    gz_output = Path(f"/tmp/{output_name}")
-    
-    df_accumulated.write_csv(csv_output)
-    with open(csv_output, 'rb') as f_in:
-        with gzip.open(gz_output, 'wb') as f_out:
-            shutil.copyfileobj(f_in, f_out)
-    csv_output.unlink()
-
-    final_key = f"final/{output_name}"
-    print(f"Uploading to s3://{s3_bucket}/{final_key}")
-    s3.upload_file(str(gz_output), s3_bucket, final_key)
-
-    print(f"Final output: {df_accumulated.height} records -> {final_key}")
-
-
-if __name__ == "__main__":
-    main()
@@ -1,2 +0,0 @@
-polars>=1.0
-boto3>=1.34
@@ -1,5 +0,0 @@
-polars>=1.0
-pyarrow>=14.0
-orjson>=3.9
-boto3>=1.34
-zstandard>=0.22
@@ -1,89 +0,0 @@
-"""
-Map worker: processes a date range chunk, uploads result to S3.
-
-Environment variables:
-  START_DATE  — inclusive, YYYY-MM-DD
-  END_DATE    — exclusive, YYYY-MM-DD
-  S3_BUCKET   — bucket for intermediate results
-  RUN_ID      — unique run identifier for namespacing S3 keys
-"""
-import os
-import sys
-from datetime import datetime, timedelta
-from pathlib import Path
-
-import boto3
-import polars as pl
-
-from compress_adsb_to_aircraft_data import (
-    load_historical_for_day,
-    deduplicate_by_signature,
-    COLUMNS,
-)
-
-
-def main():
-    start_date_str = os.environ["START_DATE"]
-    end_date_str = os.environ["END_DATE"]
-    s3_bucket = os.environ["S3_BUCKET"]
-    run_id = os.environ.get("RUN_ID", "default")
-
-    start_date = datetime.strptime(start_date_str, "%Y-%m-%d")
-    end_date = datetime.strptime(end_date_str, "%Y-%m-%d")
-
-    total_days = (end_date - start_date).days
-    print(f"Worker: processing {total_days} days [{start_date_str}, {end_date_str})")
-
-    dfs = []
-    current_date = start_date
-
-    while current_date < end_date:
-        day_str = current_date.strftime("%Y-%m-%d")
-        print(f"  Loading {day_str}...")
-
-        df_compressed = load_historical_for_day(current_date)
-        if df_compressed.height == 0:
-            raise RuntimeError(f"No data found for {day_str}")
-
-        dfs.append(df_compressed)
-        total_rows = sum(df.height for df in dfs)
-        print(f"  +{df_compressed.height} rows (total: {total_rows})")
-
-        # Delete local cache after each day to save disk in container
-        cache_dir = Path("data/adsb")
-        if cache_dir.exists():
-            import shutil
-            shutil.rmtree(cache_dir)
-
-        current_date += timedelta(days=1)
-
-    # Concatenate all days
-    df_accumulated = pl.concat(dfs) if dfs else pl.DataFrame()
-
-    # Deduplicate within this chunk
-    df_accumulated = deduplicate_by_signature(df_accumulated)
-    print(f"After dedup: {df_accumulated.height} rows")
-
-    # Write to local file then upload to S3
-    local_path = Path(f"/tmp/chunk_{start_date_str}_{end_date_str}.csv")
-    df_accumulated.write_csv(local_path)
-    
-    # Compress with gzip
-    import gzip
-    import shutil
-    gz_path = Path(f"/tmp/chunk_{start_date_str}_{end_date_str}.csv.gz")
-    with open(local_path, 'rb') as f_in:
-        with gzip.open(gz_path, 'wb') as f_out:
-            shutil.copyfileobj(f_in, f_out)
-    local_path.unlink()  # Remove uncompressed file
-
-    s3_key = f"intermediate/{run_id}/chunk_{start_date_str}_{end_date_str}.csv.gz"
-    print(f"Uploading to s3://{s3_bucket}/{s3_key}")
-
-    s3 = boto3.client("s3")
-    s3.upload_file(str(gz_path), s3_bucket, s3_key)
-    print("Done.")
-
-
-if __name__ == "__main__":
-    main()
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+"""
+Download ADS-B Exchange basic-ac-db.json.gz.
+
+Usage:
+    python -m src.contributions.create_daily_adsbexchange_release [--date YYYY-MM-DD]
+"""
+from __future__ import annotations
+
+import argparse
+import shutil
+from datetime import datetime, timezone
+from pathlib import Path
+from urllib.request import Request, urlopen
+
+URL = "https://downloads.adsbexchange.com/downloads/basic-ac-db.json.gz"
+OUT_ROOT = Path("data/openairframes")
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Create daily ADS-B Exchange JSON release")
+    parser.add_argument("--date", type=str, help="Date to process (YYYY-MM-DD format, default: today UTC)")
+    args = parser.parse_args()
+
+    date_str = args.date or datetime.now(timezone.utc).strftime("%Y-%m-%d")
+
+    OUT_ROOT.mkdir(parents=True, exist_ok=True)
+
+    gz_path = OUT_ROOT / f"basic-ac-db_{date_str}.json.gz"
+
+    print(f"Downloading {URL}...")
+    req = Request(URL, headers={"User-Agent": "openairframes-downloader/1.0"}, method="GET")
+    with urlopen(req, timeout=300) as r, gz_path.open("wb") as f:
+        shutil.copyfileobj(r, f)
+
+    print(f"Wrote: {gz_path}")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+"""
+Download Mictronics aircraft database zip.
+
+Usage:
+    python -m src.contributions.create_daily_microtonics_release [--date YYYY-MM-DD]
+"""
+from __future__ import annotations
+
+import argparse
+import shutil
+import sys
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+from urllib.error import URLError
+from urllib.request import Request, urlopen
+
+URL = "https://www.mictronics.de/aircraft-database/indexedDB_old.php"
+OUT_ROOT = Path("data/openairframes")
+MAX_RETRIES = 3
+RETRY_DELAY = 30  # seconds
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Create daily Mictronics database release")
+    parser.add_argument("--date", type=str, help="Date to process (YYYY-MM-DD format, default: today UTC)")
+    args = parser.parse_args()
+
+    date_str = args.date or datetime.now(timezone.utc).strftime("%Y-%m-%d")
+
+    OUT_ROOT.mkdir(parents=True, exist_ok=True)
+
+    zip_path = OUT_ROOT / f"mictronics-db_{date_str}.zip"
+
+    for attempt in range(1, MAX_RETRIES + 1):
+        try:
+            print(f"Downloading {URL} (attempt {attempt}/{MAX_RETRIES})...")
+            req = Request(URL, headers={"User-Agent": "Mozilla/5.0 (compatible; openairframes-downloader/1.0)"}, method="GET")
+            with urlopen(req, timeout=120) as r, zip_path.open("wb") as f:
+                shutil.copyfileobj(r, f)
+            print(f"Wrote: {zip_path}")
+            return
+        except (URLError, TimeoutError) as e:
+            print(f"Attempt {attempt} failed: {e}")
+            if attempt < MAX_RETRIES:
+                print(f"Retrying in {RETRY_DELAY} seconds...")
+                time.sleep(RETRY_DELAY)
+            else:
+                print("All retries exhausted. Mictronics download failed.")
+                sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
@@ -1,84 +0,0 @@
-from pathlib import Path
-from datetime import datetime, timezone, timedelta
-import sys
-
-import polars as pl
-
-# Add adsb directory to path
-sys.path.insert(0, str(Path(__file__).parent / "adsb")) # TODO: Fix this hacky path manipulation
-
-from adsb.compress_adsb_to_aircraft_data import (
-    load_historical_for_day,
-    concat_compressed_dfs,
-    get_latest_aircraft_adsb_csv_df,
-)
-
-if __name__ == '__main__':
-    # Get yesterday's date (data for the previous day)
-    day = datetime.now(timezone.utc) - timedelta(days=1)
-
-    # Find a day with complete data
-    max_attempts = 2  # Don't look back more than a week
-    for attempt in range(max_attempts):
-        date_str = day.strftime("%Y-%m-%d")
-        print(f"Processing ADS-B data for {date_str}")
-        
-        print("Loading new ADS-B data...")
-        df_new = load_historical_for_day(day)
-        if df_new.height == 0:
-            day = day - timedelta(days=1)
-            continue
-        max_time = df_new['time'].max()
-        if max_time is not None:
-            # Handle timezone
-            max_time_dt = max_time
-            if hasattr(max_time_dt, 'replace'):
-                max_time_dt = max_time_dt.replace(tzinfo=timezone.utc)
-            
-            end_of_day = day.replace(hour=23, minute=59, second=59, tzinfo=timezone.utc) - timedelta(minutes=5)
-            
-            # Convert polars datetime to python datetime if needed
-            if isinstance(max_time_dt, datetime):
-                if max_time_dt.replace(tzinfo=timezone.utc) >= end_of_day:
-                    break
-            else:
-                # Polars returns python datetime already
-                if max_time >= day.replace(hour=23, minute=54, second=59):
-                    break
-        
-        print(f"WARNING: Latest data time is {max_time}, which is more than 5 minutes before end of day.")
-        day = day - timedelta(days=1)
-    else:
-        raise RuntimeError(f"Could not find complete data in the last {max_attempts} days")
-
-    try:
-        # Get the latest release data
-        print("Downloading latest ADS-B release...")
-        df_base, start_date_str = get_latest_aircraft_adsb_csv_df()
-        # Combine with historical data
-        print("Combining with historical data...")
-        df_combined = concat_compressed_dfs(df_base, df_new)
-    except Exception as e:
-        print(f"Error downloading latest ADS-B release: {e}")
-        df_combined = df_new
-        start_date_str = date_str
-
-    # Sort by time for consistent ordering
-    df_combined = df_combined.sort('time')
-    
-    # Convert any list columns to strings for CSV compatibility
-    for col in df_combined.columns:
-        if df_combined[col].dtype == pl.List:
-            df_combined = df_combined.with_columns(
-                pl.col(col).list.join(",").alias(col)
-            )
-
-    # Save the result
-    OUT_ROOT = Path("data/openairframes")
-    OUT_ROOT.mkdir(parents=True, exist_ok=True)
-
-    output_file = OUT_ROOT / f"openairframes_adsb_{start_date_str}_{date_str}.csv"
-    df_combined.write_csv(output_file)
-
-    print(f"Saved: {output_file}")
-    print(f"Total aircraft: {df_combined.height}")
@@ -47,6 +47,9 @@ def convert_faa_master_txt_to_df(zip_path: Path, date: str):
    
    # Convert all NaN to empty strings
    df = df.fillna("")
+    # The FAA parser can produce the literal string "None" for missing values;
+    # replace those so they match the empty-string convention used everywhere else.
+    df = df.replace("None", "")
    
    return df

@@ -84,8 +87,8 @@ def concat_faa_historical_df(df_base, df_new):
            # Convert to string
            val_str = str(val).strip()
            
-            # Handle empty strings
-            if val_str == "" or val_str == "nan":
+            # Handle empty strings and null-like literals
+            if val_str == "" or val_str == "nan" or val_str == "None":
                return ""
            
            # Check if it looks like a list representation (starts with [ )
@@ -119,6 +119,7 @@ def download_latest_aircraft_csv(
    Returns:
        Path to the downloaded file
    """
+    output_dir = Path(output_dir)
    assets = get_latest_release_assets(repo, github_token=github_token)
    try:
        asset = pick_asset(assets, name_regex=r"^openairframes_faa_.*\.csv$")
@@ -164,26 +165,50 @@ def download_latest_aircraft_adsb_csv(
    Returns:
        Path to the downloaded file
    """
+    output_dir = Path(output_dir)
    assets = get_latest_release_assets(repo, github_token=github_token)
-    asset = pick_asset(assets, name_regex=r"^openairframes_adsb_.*\.csv$")
+    asset = pick_asset(assets, name_regex=r"^openairframes_adsb_.*\.csv(\.gz)?$")
    saved_to = download_asset(asset, output_dir / asset.name, github_token=github_token)
    print(f"Downloaded: {asset.name} ({asset.size} bytes) -> {saved_to}")
    return saved_to

-
+import polars as pl
 def get_latest_aircraft_adsb_csv_df():
+    """Download and load the latest ADS-B CSV from GitHub releases."""
+    import re
+    
    csv_path = download_latest_aircraft_adsb_csv()
-    import pandas as pd
-    df = pd.read_csv(csv_path)
-    df = df.fillna("")
-    # Extract start date from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv
+    df = pl.read_csv(csv_path, null_values=[""])
+    
+    # Parse time column: values like "2025-12-31T00:00:00.040" or "2025-05-11T15:15:50.540+0000"
+    # Try with timezone first (convert to naive), then without timezone
+    df = df.with_columns(
+        pl.col("time").str.strptime(pl.Datetime("ms"), "%Y-%m-%dT%H:%M:%S%.f%z", strict=False)
+            .dt.replace_time_zone(None)  # Convert to naive datetime first
+            .fill_null(pl.col("time").str.strptime(pl.Datetime("ms"), "%Y-%m-%dT%H:%M:%S%.f", strict=False))
+    )
+
+    # Cast dbFlags and year to strings to match the schema used in compress functions
+    for col in ['dbFlags', 'year']:
+        if col in df.columns:
+            df = df.with_columns(pl.col(col).cast(pl.Utf8))
+    
+    # Fill nulls with empty strings for string columns
+    for col in df.columns:
+        if df[col].dtype == pl.Utf8:
+            df = df.with_columns(pl.col(col).fill_null(""))
+    
+    # Extract start date from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv[.gz]
    match = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path))
    if not match:
        raise ValueError(f"Could not extract date from filename: {csv_path.name}")
    
    date_str = match.group(1)
+    print(df.columns)
+    print(df.dtypes)
    return df, date_str


+
 if __name__ == "__main__":
    download_latest_aircraft_csv()