Mirror of https://github.com/PlaneQuery/OpenAirframes.git, synced 2026-04-30 06:48:05 +02:00
FEATURE: Add contributions framework. Fix and improve daily ADS-B release using GitHub Actions for map-reduce.
+28 -33
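A note on the "map-reduce" wording in the commit title: the script changed below is the map/worker half. Each worker processes one date range, deduplicates it, and uploads a chunk_<start>_<end>.csv.gz object under intermediate/{run_id}/ in S3. The reduce half is not part of this diff; the following is only a rough sketch of what a merge step consuming those chunks could look like, and every identifier in it (reduce_chunks, the pagination details) is an assumption rather than code from the repository:

import gzip
from io import BytesIO

import boto3
import polars as pl


def reduce_chunks(s3_bucket: str, run_id: str) -> pl.DataFrame:
    """Hypothetical reducer: merge the per-worker chunks uploaded by the worker script."""
    s3 = boto3.client("s3")
    prefix = f"intermediate/{run_id}/"
    frames = []
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=s3_bucket, Prefix=prefix):
        for obj in page.get("Contents", []):
            # Each object is a gzip-compressed CSV written by one worker.
            body = s3.get_object(Bucket=s3_bucket, Key=obj["Key"])["Body"].read()
            frames.append(pl.read_csv(BytesIO(gzip.decompress(body))))
    # Workers deduplicate only within their own chunk, so a final
    # cross-chunk pass (e.g. the same deduplicate_by_signature) would
    # still be needed on the concatenated result.
    return pl.concat(frames)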
@@ -13,18 +13,13 @@ from datetime import datetime, timedelta
 from pathlib import Path
 
 import boto3
-import pandas as pd
+import polars as pl
 
-from compress_adsb_to_aircraft_data import load_historical_for_day, COLUMNS
-
-
-def deduplicate_by_signature(df: pd.DataFrame) -> pd.DataFrame:
-    """For each icao, keep only the earliest row with each unique signature."""
-    df["_signature"] = df[COLUMNS].astype(str).agg("|".join, axis=1)
-    df_deduped = df.groupby(["icao", "_signature"], as_index=False).first()
-    df_deduped = df_deduped.drop(columns=["_signature"])
-    df_deduped = df_deduped.sort_values("time")
-    return df_deduped
+from compress_adsb_to_aircraft_data import (
+    load_historical_for_day,
+    deduplicate_by_signature,
+    COLUMNS,
+)
 
 
 def main():
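The pandas deduplicate_by_signature defined above is removed, and the worker now imports a deduplicate_by_signature from compress_adsb_to_aircraft_data instead. That module is not shown in this diff; assuming the imported function keeps the semantics of the removed pandas code (one row per icao and signature, keeping the earliest by time), a polars equivalent might look roughly like the sketch below. The COLUMNS list here is a placeholder only; the real definition lives in compress_adsb_to_aircraft_data.

import polars as pl

# Placeholder only; in the repository COLUMNS comes from compress_adsb_to_aircraft_data.
COLUMNS = ["time", "lat", "lon"]


def deduplicate_by_signature(df: pl.DataFrame) -> pl.DataFrame:
    """For each icao, keep only the earliest row with each unique signature."""
    return (
        df.with_columns(
            # Same "join the tracked columns with |" signature as the pandas version.
            pl.concat_str([pl.col(c).cast(pl.Utf8) for c in COLUMNS], separator="|")
            .alias("_signature")
        )
        .sort("time")
        .unique(subset=["icao", "_signature"], keep="first", maintain_order=True)
        .drop("_signature")
    )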
@@ -39,28 +34,20 @@ def main():
     total_days = (end_date - start_date).days
     print(f"Worker: processing {total_days} days [{start_date_str}, {end_date_str})")
 
-    df_accumulated = pd.DataFrame()
+    dfs = []
     current_date = start_date
 
     while current_date < end_date:
         day_str = current_date.strftime("%Y-%m-%d")
         print(f" Loading {day_str}...")
 
-        try:
-            df_compressed = load_historical_for_day(current_date)
-        except Exception as e:
-            print(f" WARNING: Failed to load {day_str}: {e}")
-            current_date += timedelta(days=1)
-            continue
+        df_compressed = load_historical_for_day(current_date)
+        if df_compressed.height == 0:
+            raise RuntimeError(f"No data found for {day_str}")
 
-        if df_accumulated.empty:
-            df_accumulated = df_compressed
-        else:
-            df_accumulated = pd.concat(
-                [df_accumulated, df_compressed], ignore_index=True
-            )
-
-        print(f" +{len(df_compressed)} rows (total: {len(df_accumulated)})")
+        dfs.append(df_compressed)
+        total_rows = sum(df.height for df in dfs)
+        print(f" +{df_compressed.height} rows (total: {total_rows})")
 
         # Delete local cache after each day to save disk in container
         cache_dir = Path("data/adsb")
@@ -70,23 +57,31 @@ def main():
 
         current_date += timedelta(days=1)
 
-    if df_accumulated.empty:
-        print("No data collected — exiting.")
-        return
+    # Concatenate all days
+    df_accumulated = pl.concat(dfs) if dfs else pl.DataFrame()
 
     # Deduplicate within this chunk
     df_accumulated = deduplicate_by_signature(df_accumulated)
-    print(f"After dedup: {len(df_accumulated)} rows")
+    print(f"After dedup: {df_accumulated.height} rows")
 
     # Write to local file then upload to S3
-    local_path = Path(f"/tmp/chunk_{start_date_str}_{end_date_str}.csv.gz")
-    df_accumulated.to_csv(local_path, index=False, compression="gzip")
+    local_path = Path(f"/tmp/chunk_{start_date_str}_{end_date_str}.csv")
+    df_accumulated.write_csv(local_path)
+
+    # Compress with gzip
+    import gzip
+    import shutil
+    gz_path = Path(f"/tmp/chunk_{start_date_str}_{end_date_str}.csv.gz")
+    with open(local_path, 'rb') as f_in:
+        with gzip.open(gz_path, 'wb') as f_out:
+            shutil.copyfileobj(f_in, f_out)
+    local_path.unlink() # Remove uncompressed file
 
     s3_key = f"intermediate/{run_id}/chunk_{start_date_str}_{end_date_str}.csv.gz"
     print(f"Uploading to s3://{s3_bucket}/{s3_key}")
 
     s3 = boto3.client("s3")
-    s3.upload_file(str(local_path), s3_bucket, s3_key)
+    s3.upload_file(str(gz_path), s3_bucket, s3_key)
 
     print("Done.")
 
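One more note on the write-then-compress step above: as far as I can tell, polars' write_csv has no compression argument (unlike pandas' to_csv with compression="gzip"), which is presumably why the new code writes an uncompressed CSV and then gzips it with shutil.copyfileobj. If the temporary uncompressed file ever becomes a concern, an untested variant, assuming a polars version whose write_csv accepts a binary file-like object, could stream the CSV straight into gzip:

import gzip
from pathlib import Path

import polars as pl


def write_csv_gz(df: pl.DataFrame, gz_path: Path) -> None:
    """Hypothetical helper: write a DataFrame as a gzip-compressed CSV in one pass."""
    # gzip.open in "wb" mode returns a binary file object; recent polars
    # releases accept file-like objects in write_csv.
    with gzip.open(gz_path, "wb") as f_out:
        df.write_csv(f_out)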