Files
OpenAirframes/scripts/concat_downloads.py

49 lines
1.5 KiB
Python

#!/usr/bin/env python3
import re
from pathlib import Path
import polars as pl
# Find all CSV.gz files in the downloaded artifacts
artifacts_dir = Path("downloads/adsb_artifacts")
files = sorted(artifacts_dir.glob("*/openairframes_adsb_*.csv.gz"))
if not files:
raise SystemExit("No CSV.gz files found in downloads/adsb_artifacts/")
print(f"Found {len(files)} files to concatenate")
# Extract dates from filenames to determine range
def extract_dates(path: Path) -> tuple[str, str]:
"""Extract start and end dates from filename"""
m = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})\.csv\.gz", path.name)
if m:
return m.group(1), m.group(2)
return None, None
# Collect all dates
all_dates = []
for f in files:
start, end = extract_dates(f)
if start and end:
all_dates.extend([start, end])
print(f" {f.name}: {start} to {end}")
if not all_dates:
raise SystemExit("Could not extract dates from filenames")
# Find earliest and latest dates
earliest = min(all_dates)
latest = max(all_dates)
print(f"\nDate range: {earliest} to {latest}")
# Read and concatenate all files
print("\nReading and concatenating files...")
frames = [pl.read_csv(f) for f in files]
df = pl.concat(frames, how="vertical", rechunk=True)
# Write output
output_path = Path("downloads") / f"openairframes_adsb_{earliest}_{latest}.csv.gz"
output_path.parent.mkdir(parents=True, exist_ok=True)
df.write_csv(output_path, compression="gzip")
print(f"\nWrote {output_path} with {df.height:,} rows")