OpenAirframes/src/adsb/compress_adsb_to_aircraft_data.py

# Shared compression logic for ADS-B aircraft data
import os
import polars as pl

COLUMNS = ['dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category', 'r', 't']


def deduplicate_by_signature(df: pl.DataFrame) -> pl.DataFrame:
    """For each icao, keep only the earliest row with each unique signature.

    This is used for deduplicating across multiple compressed chunks.
    """
    # Create signature column
    df = df.with_columns(
        pl.concat_str([pl.col(c).cast(pl.Utf8).fill_null("") for c in COLUMNS], separator="|").alias("_signature")
    )
    # Group by icao and signature, take first row (earliest due to time sort)
    df = df.sort("time")
    df_deduped = df.group_by(["icao", "_signature"]).first()
    df_deduped = df_deduped.drop("_signature")
    df_deduped = df_deduped.sort("time")
    return df_deduped


def compress_df_polars(df: pl.DataFrame, icao: str) -> pl.DataFrame:
    """Compress a single ICAO group to its most informative row using Polars."""
    # Create signature string
    df = df.with_columns(
        pl.concat_str([pl.col(c).cast(pl.Utf8) for c in COLUMNS], separator="|").alias("_signature")
    )

    # Compute signature counts
    signature_counts = df.group_by("_signature").len().rename({"len": "_sig_count"})

    # Group by signature and take first row
    df = df.group_by("_signature").first()

    if df.height == 1:
        # Only one unique signature, return it
        result = df.drop("_signature").with_columns(pl.lit(icao).alias("icao"))
        return result

    # For each row, create dict of non-empty column values and check subsets
    # Convert to list of dicts for subset checking (same logic as pandas version)
    rows_data = []
    for row in df.iter_rows(named=True):
        non_empty = {col: row[col] for col in COLUMNS if row[col] != '' and row[col] is not None}
        rows_data.append({
            'signature': row['_signature'],
            'non_empty_dict': non_empty,
            'non_empty_count': len(non_empty),
            'row_data': row
        })

    # Check if row i's non-empty values are a subset of row j's non-empty values
    def is_subset_of_any(idx):
        row_dict = rows_data[idx]['non_empty_dict']
        row_count = rows_data[idx]['non_empty_count']

        for other_idx, other_data in enumerate(rows_data):
            if idx == other_idx:
                continue
            other_dict = other_data['non_empty_dict']
            other_count = other_data['non_empty_count']

            # Check if all non-empty values in current row match those in other row
            if all(row_dict.get(k) == other_dict.get(k) for k in row_dict.keys()):
                # If they match and other has more defined columns, current row is redundant
                if other_count > row_count:
                    return True
        return False

    # Keep rows that are not subsets of any other row
    keep_indices = [i for i in range(len(rows_data)) if not is_subset_of_any(i)]

    if len(keep_indices) == 0:
        keep_indices = [0]  # Fallback: keep first row

    remaining_signatures = [rows_data[i]['signature'] for i in keep_indices]
    df = df.filter(pl.col("_signature").is_in(remaining_signatures))

    if df.height > 1:
        # Use signature counts to pick the most frequent one
        df = df.join(signature_counts, on="_signature", how="left")
        max_count = df["_sig_count"].max()
        df = df.filter(pl.col("_sig_count") == max_count).head(1)
        df = df.drop("_sig_count")

    result = df.drop("_signature").with_columns(pl.lit(icao).alias("icao"))

    # Ensure empty strings are preserved
    for col in COLUMNS:
        if col in result.columns:
            result = result.with_columns(pl.col(col).fill_null(""))

    return result


def compress_multi_icao_df(df: pl.DataFrame, verbose: bool = True) -> pl.DataFrame:
    """Compress a DataFrame with multiple ICAOs to one row per ICAO.

    This is the main entry point for compressing ADS-B data.
    Used by both daily GitHub Actions runs and historical AWS runs.

    Args:
        df: DataFrame with columns ['time', 'icao'] + COLUMNS
        verbose: Whether to print progress

    Returns:
        Compressed DataFrame with one row per ICAO
    """
    if df.height == 0:
        return df

    # Sort by icao and time
    df = df.sort(['icao', 'time'])

    # Fill null values with empty strings for COLUMNS
    for col in COLUMNS:
        if col in df.columns:
            df = df.with_columns(pl.col(col).cast(pl.Utf8).fill_null(""))

    # First pass: quick deduplication of exact duplicates
    df = df.unique(subset=['icao'] + COLUMNS, keep='first')
    if verbose:
        print(f"After quick dedup: {df.height} records")

    # Second pass: sophisticated compression per ICAO
    if verbose:
        print("Compressing per ICAO...")

    # Process each ICAO group
    icao_groups = df.partition_by('icao', as_dict=True, maintain_order=True)
    compressed_dfs = []

    for icao_key, group_df in icao_groups.items():
        # partition_by with as_dict=True returns tuple keys, extract first element
        icao = icao_key[0] if isinstance(icao_key, tuple) else icao_key
        compressed = compress_df_polars(group_df, str(icao))
        compressed_dfs.append(compressed)

    if compressed_dfs:
        df_compressed = pl.concat(compressed_dfs)
    else:
        df_compressed = df.head(0)  # Empty with same schema

    if verbose:
        print(f"After compress: {df_compressed.height} records")

    # Reorder columns: time first, then icao
    cols = df_compressed.columns
    ordered_cols = ['time', 'icao'] + [c for c in cols if c not in ['time', 'icao']]
    df_compressed = df_compressed.select(ordered_cols)

    return df_compressed


def load_raw_adsb_for_day(day):
    """Load raw ADS-B data for a day from parquet file."""
    from datetime import timedelta
    from pathlib import Path

    start_time = day.replace(hour=0, minute=0, second=0, microsecond=0)

    # Check for parquet file first
    version_date = f"v{start_time.strftime('%Y.%m.%d')}"
    parquet_file = Path(f"data/output/parquet_output/{version_date}.parquet")

    if not parquet_file.exists():
        # Try to generate parquet file by calling the download function
        print(f"  Parquet file not found: {parquet_file}")
        print(f"  Attempting to download and generate parquet for {start_time.strftime('%Y-%m-%d')}...")

        from download_adsb_data_to_parquet import create_parquet_for_day
        result_path = create_parquet_for_day(start_time, keep_folders=False)

        if result_path:
            print(f"  Successfully generated parquet file: {result_path}")
        else:
            raise Exception("Failed to generate parquet file")

    if parquet_file.exists():
        print(f"  Loading from parquet: {parquet_file}")
        df = pl.read_parquet(
            parquet_file,
            columns=['time', 'icao', 'r', 't', 'dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category']
        )

        # Convert to timezone-naive datetime
        if df["time"].dtype == pl.Datetime:
            df = df.with_columns(pl.col("time").dt.replace_time_zone(None))

        return df
    else:
        # Return empty DataFrame if parquet file doesn't exist
        print(f"  No data available for {start_time.strftime('%Y-%m-%d')}")
        return pl.DataFrame(schema={
            'time': pl.Datetime,
            'icao': pl.Utf8,
            'r': pl.Utf8,
            't': pl.Utf8,
            'dbFlags': pl.Int64,
            'ownOp': pl.Utf8,
            'year': pl.Int64,
            'desc': pl.Utf8,
            'aircraft_category': pl.Utf8
        })


def load_historical_for_day(day):
    """Load and compress historical ADS-B data for a day."""
    df = load_raw_adsb_for_day(day)
    if df.height == 0:
        return df

    print(f"Loaded {df.height} raw records for {day.strftime('%Y-%m-%d')}")

    # Use shared compression function
    return compress_multi_icao_df(df, verbose=True)


def concat_compressed_dfs(df_base, df_new):
    """Concatenate base and new compressed dataframes, keeping the most informative row per ICAO."""
    # Combine both dataframes
    df_combined = pl.concat([df_base, df_new])

    # Sort by ICAO and time
    df_combined = df_combined.sort(['icao', 'time'])

    # Fill null values
    for col in COLUMNS:
        if col in df_combined.columns:
            df_combined = df_combined.with_columns(pl.col(col).fill_null(""))

    # Apply compression logic per ICAO to get the best row
    icao_groups = df_combined.partition_by('icao', as_dict=True, maintain_order=True)
    compressed_dfs = []

    for icao, group_df in icao_groups.items():
        compressed = compress_df_polars(group_df, icao)
        compressed_dfs.append(compressed)

    if compressed_dfs:
        df_compressed = pl.concat(compressed_dfs)
    else:
        df_compressed = df_combined.head(0)

    # Sort by time
    df_compressed = df_compressed.sort('time')

    return df_compressed


def get_latest_aircraft_adsb_csv_df():
    """Download and load the latest ADS-B CSV from GitHub releases."""
    from get_latest_planequery_aircraft_release import download_latest_aircraft_adsb_csv
    import re

    csv_path = download_latest_aircraft_adsb_csv()
    df = pl.read_csv(csv_path, null_values=[""])

    # Fill nulls with empty strings
    for col in df.columns:
        if df[col].dtype == pl.Utf8:
            df = df.with_columns(pl.col(col).fill_null(""))

    # Extract start date from filename pattern: planequery_aircraft_adsb_{start_date}_{end_date}.csv
    match = re.search(r"planequery_aircraft_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path))
    if not match:
        raise ValueError(f"Could not extract date from filename: {csv_path.name}")

    date_str = match.group(1)
    return df, date_str