mirror of
https://github.com/PlaneQuery/OpenAirframes.git
synced 2026-06-10 23:17:47 +02:00
275 lines
9.8 KiB
Python
275 lines
9.8 KiB
Python
# Shared compression logic for ADS-B aircraft data
|
|
import os
|
|
import polars as pl
|
|
|
|
COLUMNS = ['dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category', 'r', 't']
|
|
|
|
|
|
def deduplicate_by_signature(df: pl.DataFrame) -> pl.DataFrame:
|
|
"""For each icao, keep only the earliest row with each unique signature.
|
|
|
|
This is used for deduplicating across multiple compressed chunks.
|
|
"""
|
|
# Create signature column
|
|
df = df.with_columns(
|
|
pl.concat_str([pl.col(c).cast(pl.Utf8).fill_null("") for c in COLUMNS], separator="|").alias("_signature")
|
|
)
|
|
# Group by icao and signature, take first row (earliest due to time sort)
|
|
df = df.sort("time")
|
|
df_deduped = df.group_by(["icao", "_signature"]).first()
|
|
df_deduped = df_deduped.drop("_signature")
|
|
df_deduped = df_deduped.sort("time")
|
|
return df_deduped
|
|
|
|
|
|
def compress_df_polars(df: pl.DataFrame, icao: str) -> pl.DataFrame:
|
|
"""Compress a single ICAO group to its most informative row using Polars."""
|
|
# Create signature string
|
|
df = df.with_columns(
|
|
pl.concat_str([pl.col(c).cast(pl.Utf8) for c in COLUMNS], separator="|").alias("_signature")
|
|
)
|
|
|
|
# Compute signature counts
|
|
signature_counts = df.group_by("_signature").len().rename({"len": "_sig_count"})
|
|
|
|
# Group by signature and take first row
|
|
df = df.group_by("_signature").first()
|
|
|
|
if df.height == 1:
|
|
# Only one unique signature, return it
|
|
result = df.drop("_signature").with_columns(pl.lit(icao).alias("icao"))
|
|
return result
|
|
|
|
# For each row, create dict of non-empty column values and check subsets
|
|
# Convert to list of dicts for subset checking (same logic as pandas version)
|
|
rows_data = []
|
|
for row in df.iter_rows(named=True):
|
|
non_empty = {col: row[col] for col in COLUMNS if row[col] != '' and row[col] is not None}
|
|
rows_data.append({
|
|
'signature': row['_signature'],
|
|
'non_empty_dict': non_empty,
|
|
'non_empty_count': len(non_empty),
|
|
'row_data': row
|
|
})
|
|
|
|
# Check if row i's non-empty values are a subset of row j's non-empty values
|
|
def is_subset_of_any(idx):
|
|
row_dict = rows_data[idx]['non_empty_dict']
|
|
row_count = rows_data[idx]['non_empty_count']
|
|
|
|
for other_idx, other_data in enumerate(rows_data):
|
|
if idx == other_idx:
|
|
continue
|
|
other_dict = other_data['non_empty_dict']
|
|
other_count = other_data['non_empty_count']
|
|
|
|
# Check if all non-empty values in current row match those in other row
|
|
if all(row_dict.get(k) == other_dict.get(k) for k in row_dict.keys()):
|
|
# If they match and other has more defined columns, current row is redundant
|
|
if other_count > row_count:
|
|
return True
|
|
return False
|
|
|
|
# Keep rows that are not subsets of any other row
|
|
keep_indices = [i for i in range(len(rows_data)) if not is_subset_of_any(i)]
|
|
|
|
if len(keep_indices) == 0:
|
|
keep_indices = [0] # Fallback: keep first row
|
|
|
|
remaining_signatures = [rows_data[i]['signature'] for i in keep_indices]
|
|
df = df.filter(pl.col("_signature").is_in(remaining_signatures))
|
|
|
|
if df.height > 1:
|
|
# Use signature counts to pick the most frequent one
|
|
df = df.join(signature_counts, on="_signature", how="left")
|
|
max_count = df["_sig_count"].max()
|
|
df = df.filter(pl.col("_sig_count") == max_count).head(1)
|
|
df = df.drop("_sig_count")
|
|
|
|
result = df.drop("_signature").with_columns(pl.lit(icao).alias("icao"))
|
|
|
|
# Ensure empty strings are preserved
|
|
for col in COLUMNS:
|
|
if col in result.columns:
|
|
result = result.with_columns(pl.col(col).fill_null(""))
|
|
|
|
return result
|
|
|
|
|
|
def compress_multi_icao_df(df: pl.DataFrame, verbose: bool = True) -> pl.DataFrame:
|
|
"""Compress a DataFrame with multiple ICAOs to one row per ICAO.
|
|
|
|
This is the main entry point for compressing ADS-B data.
|
|
Used by both daily GitHub Actions runs and historical AWS runs.
|
|
|
|
Args:
|
|
df: DataFrame with columns ['time', 'icao'] + COLUMNS
|
|
verbose: Whether to print progress
|
|
|
|
Returns:
|
|
Compressed DataFrame with one row per ICAO
|
|
"""
|
|
if df.height == 0:
|
|
return df
|
|
|
|
# Sort by icao and time
|
|
df = df.sort(['icao', 'time'])
|
|
|
|
# Fill null values with empty strings for COLUMNS
|
|
for col in COLUMNS:
|
|
if col in df.columns:
|
|
df = df.with_columns(pl.col(col).cast(pl.Utf8).fill_null(""))
|
|
|
|
# First pass: quick deduplication of exact duplicates
|
|
df = df.unique(subset=['icao'] + COLUMNS, keep='first')
|
|
if verbose:
|
|
print(f"After quick dedup: {df.height} records")
|
|
|
|
# Second pass: sophisticated compression per ICAO
|
|
if verbose:
|
|
print("Compressing per ICAO...")
|
|
|
|
# Process each ICAO group
|
|
icao_groups = df.partition_by('icao', as_dict=True, maintain_order=True)
|
|
compressed_dfs = []
|
|
|
|
for icao_key, group_df in icao_groups.items():
|
|
# partition_by with as_dict=True returns tuple keys, extract first element
|
|
icao = icao_key[0] if isinstance(icao_key, tuple) else icao_key
|
|
compressed = compress_df_polars(group_df, str(icao))
|
|
compressed_dfs.append(compressed)
|
|
|
|
if compressed_dfs:
|
|
df_compressed = pl.concat(compressed_dfs)
|
|
else:
|
|
df_compressed = df.head(0) # Empty with same schema
|
|
|
|
if verbose:
|
|
print(f"After compress: {df_compressed.height} records")
|
|
|
|
# Reorder columns: time first, then icao
|
|
cols = df_compressed.columns
|
|
ordered_cols = ['time', 'icao'] + [c for c in cols if c not in ['time', 'icao']]
|
|
df_compressed = df_compressed.select(ordered_cols)
|
|
|
|
return df_compressed
|
|
|
|
|
|
def load_raw_adsb_for_day(day):
|
|
"""Load raw ADS-B data for a day from parquet file."""
|
|
from datetime import timedelta
|
|
from pathlib import Path
|
|
|
|
start_time = day.replace(hour=0, minute=0, second=0, microsecond=0)
|
|
|
|
# Check for parquet file first
|
|
version_date = f"v{start_time.strftime('%Y.%m.%d')}"
|
|
parquet_file = Path(f"data/output/parquet_output/{version_date}.parquet")
|
|
|
|
if not parquet_file.exists():
|
|
# Try to generate parquet file by calling the download function
|
|
print(f" Parquet file not found: {parquet_file}")
|
|
print(f" Attempting to download and generate parquet for {start_time.strftime('%Y-%m-%d')}...")
|
|
|
|
from download_adsb_data_to_parquet import create_parquet_for_day
|
|
result_path = create_parquet_for_day(start_time, keep_folders=False)
|
|
|
|
if result_path:
|
|
print(f" Successfully generated parquet file: {result_path}")
|
|
else:
|
|
raise Exception("Failed to generate parquet file")
|
|
|
|
if parquet_file.exists():
|
|
print(f" Loading from parquet: {parquet_file}")
|
|
df = pl.read_parquet(
|
|
parquet_file,
|
|
columns=['time', 'icao', 'r', 't', 'dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category']
|
|
)
|
|
|
|
# Convert to timezone-naive datetime
|
|
if df["time"].dtype == pl.Datetime:
|
|
df = df.with_columns(pl.col("time").dt.replace_time_zone(None))
|
|
|
|
return df
|
|
else:
|
|
# Return empty DataFrame if parquet file doesn't exist
|
|
print(f" No data available for {start_time.strftime('%Y-%m-%d')}")
|
|
return pl.DataFrame(schema={
|
|
'time': pl.Datetime,
|
|
'icao': pl.Utf8,
|
|
'r': pl.Utf8,
|
|
't': pl.Utf8,
|
|
'dbFlags': pl.Int64,
|
|
'ownOp': pl.Utf8,
|
|
'year': pl.Int64,
|
|
'desc': pl.Utf8,
|
|
'aircraft_category': pl.Utf8
|
|
})
|
|
|
|
|
|
def load_historical_for_day(day):
|
|
"""Load and compress historical ADS-B data for a day."""
|
|
df = load_raw_adsb_for_day(day)
|
|
if df.height == 0:
|
|
return df
|
|
|
|
print(f"Loaded {df.height} raw records for {day.strftime('%Y-%m-%d')}")
|
|
|
|
# Use shared compression function
|
|
return compress_multi_icao_df(df, verbose=True)
|
|
|
|
|
|
def concat_compressed_dfs(df_base, df_new):
|
|
"""Concatenate base and new compressed dataframes, keeping the most informative row per ICAO."""
|
|
# Combine both dataframes
|
|
df_combined = pl.concat([df_base, df_new])
|
|
|
|
# Sort by ICAO and time
|
|
df_combined = df_combined.sort(['icao', 'time'])
|
|
|
|
# Fill null values
|
|
for col in COLUMNS:
|
|
if col in df_combined.columns:
|
|
df_combined = df_combined.with_columns(pl.col(col).fill_null(""))
|
|
|
|
# Apply compression logic per ICAO to get the best row
|
|
icao_groups = df_combined.partition_by('icao', as_dict=True, maintain_order=True)
|
|
compressed_dfs = []
|
|
|
|
for icao, group_df in icao_groups.items():
|
|
compressed = compress_df_polars(group_df, icao)
|
|
compressed_dfs.append(compressed)
|
|
|
|
if compressed_dfs:
|
|
df_compressed = pl.concat(compressed_dfs)
|
|
else:
|
|
df_compressed = df_combined.head(0)
|
|
|
|
# Sort by time
|
|
df_compressed = df_compressed.sort('time')
|
|
|
|
return df_compressed
|
|
|
|
|
|
def get_latest_aircraft_adsb_csv_df():
|
|
"""Download and load the latest ADS-B CSV from GitHub releases."""
|
|
from get_latest_planequery_aircraft_release import download_latest_aircraft_adsb_csv
|
|
import re
|
|
|
|
csv_path = download_latest_aircraft_adsb_csv()
|
|
df = pl.read_csv(csv_path, null_values=[""])
|
|
|
|
# Fill nulls with empty strings
|
|
for col in df.columns:
|
|
if df[col].dtype == pl.Utf8:
|
|
df = df.with_columns(pl.col(col).fill_null(""))
|
|
|
|
# Extract start date from filename pattern: planequery_aircraft_adsb_{start_date}_{end_date}.csv
|
|
match = re.search(r"planequery_aircraft_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path))
|
|
if not match:
|
|
raise ValueError(f"Could not extract date from filename: {csv_path.name}")
|
|
|
|
date_str = match.group(1)
|
|
return df, date_str
|
|
|