mirror of
https://github.com/PlaneQuery/OpenAirframes.git
synced 2026-05-04 16:45:11 +02:00
Compare commits
6 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 9c744b0baf | |||
| ebda04767f | |||
| 3fdf443894 | |||
| 24313603c5 | |||
| 2bb0a5eac3 | |||
| b54f33aa56 |
@@ -101,6 +101,51 @@ jobs:
|
|||||||
date: ${{ needs.resolve-dates.outputs.adsb_date }}
|
date: ${{ needs.resolve-dates.outputs.adsb_date }}
|
||||||
concat_with_latest_csv: true
|
concat_with_latest_csv: true
|
||||||
|
|
||||||
|
adsb-reduce:
|
||||||
|
needs: [resolve-dates, adsb-to-aircraft]
|
||||||
|
if: always() && github.event_name != 'schedule' && needs.adsb-to-aircraft.result == 'failure'
|
||||||
|
runs-on: ubuntu-24.04-arm
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v6
|
||||||
|
|
||||||
|
- name: Setup Python
|
||||||
|
uses: actions/setup-python@v6
|
||||||
|
with:
|
||||||
|
python-version: '3.12'
|
||||||
|
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
python -m pip install --upgrade pip
|
||||||
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
- name: Download compressed outputs
|
||||||
|
uses: actions/download-artifact@v4
|
||||||
|
with:
|
||||||
|
pattern: adsb-compressed-${{ needs.resolve-dates.outputs.adsb_date }}-part-*
|
||||||
|
path: data/output/compressed/${{ needs.resolve-dates.outputs.adsb_date }}
|
||||||
|
merge-multiple: true
|
||||||
|
|
||||||
|
- name: Concatenate final outputs
|
||||||
|
env:
|
||||||
|
DATE: ${{ needs.resolve-dates.outputs.adsb_date }}
|
||||||
|
CONCAT_WITH_LATEST_CSV: true
|
||||||
|
run: |
|
||||||
|
EXTRA=""
|
||||||
|
if [ "$CONCAT_WITH_LATEST_CSV" = "true" ]; then
|
||||||
|
EXTRA="--concat_with_latest_csv"
|
||||||
|
fi
|
||||||
|
python -m src.adsb.concat_parquet_to_final --date "$DATE" $EXTRA
|
||||||
|
ls -lah data/output/ || true
|
||||||
|
|
||||||
|
- name: Upload final artifacts
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: openairframes_adsb-${{ needs.resolve-dates.outputs.adsb_date }}
|
||||||
|
path: data/output/openairframes_adsb_*
|
||||||
|
retention-days: 30
|
||||||
|
if-no-files-found: error
|
||||||
|
|
||||||
build-community:
|
build-community:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
if: github.event_name != 'schedule'
|
if: github.event_name != 'schedule'
|
||||||
@@ -188,13 +233,13 @@ jobs:
|
|||||||
|
|
||||||
create-release:
|
create-release:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
needs: [resolve-dates, build-faa, adsb-to-aircraft, build-community, build-adsbexchange-json, build-mictronics-db]
|
needs: [resolve-dates, build-faa, adsb-to-aircraft, adsb-reduce, build-community, build-adsbexchange-json, build-mictronics-db]
|
||||||
if: github.event_name != 'schedule' && !cancelled()
|
if: github.event_name != 'schedule' && !cancelled()
|
||||||
steps:
|
steps:
|
||||||
- name: Check adsb-to-aircraft status
|
- name: Check ADS-B workflow status
|
||||||
if: needs.adsb-to-aircraft.result != 'success'
|
if: needs.adsb-to-aircraft.result != 'success' && needs.adsb-reduce.result != 'success'
|
||||||
run: |
|
run: |
|
||||||
echo "WARNING: adsb-to-aircraft result was '${{ needs.adsb-to-aircraft.result }}', will continue without ADS-B artifacts"
|
echo "WARNING: ADS-B workflow failed (adsb-to-aircraft='${{ needs.adsb-to-aircraft.result }}', adsb-reduce='${{ needs.adsb-reduce.result }}'), will continue without ADS-B artifacts"
|
||||||
|
|
||||||
- name: Checkout for gh CLI
|
- name: Checkout for gh CLI
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
@@ -211,7 +256,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Download ADS-B artifacts
|
- name: Download ADS-B artifacts
|
||||||
uses: actions/download-artifact@v5
|
uses: actions/download-artifact@v5
|
||||||
if: needs.adsb-to-aircraft.result == 'success'
|
if: needs.adsb-to-aircraft.result == 'success' || needs.adsb-reduce.result == 'success'
|
||||||
continue-on-error: true
|
continue-on-error: true
|
||||||
with:
|
with:
|
||||||
name: openairframes_adsb-${{ needs.resolve-dates.outputs.adsb_date }}
|
name: openairframes_adsb-${{ needs.resolve-dates.outputs.adsb_date }}
|
||||||
|
|||||||
@@ -16,11 +16,19 @@ A daily release is created at **06:00 UTC** and includes:
|
|||||||
- **openairframes_community.csv**
|
- **openairframes_community.csv**
|
||||||
All community submissions
|
All community submissions
|
||||||
|
|
||||||
|
- **openairframes_adsb.csv**
|
||||||
|
Airframes dataset derived from ADSB.lol network data. For each UTC day, a row is created for every ICAO address observed in that day’s ADS-B messages, using registration data from [tar1090-db](https://github.com/wiedehopf/tar1090-db) (ADSBExchange & Mictronics).
|
||||||
|
Example Usage:
|
||||||
|
```python
|
||||||
|
import pandas as pd
|
||||||
|
url = "https://github.com/PlaneQuery/OpenAirframes/releases/download/openairframes-2026-03-18-main/openairframes_adsb_2024-01-01_2026-03-17.csv.gz" # 1GB
|
||||||
|
df = pd.read_csv(url)
|
||||||
|
df
|
||||||
|
```
|
||||||
|

|
||||||
- **openairframes_faa.csv**
|
- **openairframes_faa.csv**
|
||||||
All [FAA registration data](https://www.faa.gov/licenses_certificates/aircraft_certification/aircraft_registry/releasable_aircraft_download) from 2023-08-16 to present (~260 MB)
|
All [FAA registration data](https://www.faa.gov/licenses_certificates/aircraft_certification/aircraft_registry/releasable_aircraft_download) from 2023-08-16 to present (~260 MB)
|
||||||
|
|
||||||
- **openairframes_adsb.csv**
|
|
||||||
Airframe information derived from ADS-B messages on the [ADSB.lol](https://www.adsb.lol/) network, from 2026-02-12 to present (will be from 2024-01-01 soon). The airframe information originates from [mictronics aircraft database](https://www.mictronics.de/aircraft-database/) (~5 MB).
|
|
||||||
|
|
||||||
- **ReleasableAircraft_{date}.zip**
|
- **ReleasableAircraft_{date}.zip**
|
||||||
A daily snapshot of the FAA database, which updates at **05:30 UTC**
|
A daily snapshot of the FAA database, which updates at **05:30 UTC**
|
||||||
|
|||||||
Binary file not shown.
|
After Width: | Height: | Size: 99 KiB |
@@ -0,0 +1,242 @@
|
|||||||
|
#!/usr/bin/env python3
"""
Parse TheAirTraffic Database CSV and produce community_submission.v1 JSON.

Source: "TheAirTraffic Database - Aircraft 2.csv"
Output: community/YYYY-MM-DD/theairtraffic_<date>.json

Categories in the spreadsheet columns (paired: name, registrations, separator):
Col 1-3: Business
Col 4-6: Government
Col 7-9: People
Col 10-12: Sports
Col 13-15: Celebrity
Col 16-18: State Govt./Law
Col 19-21: Other
Col 22-24: Test Aircraft
Col 25-27: YouTubers
Col 28-30: Formula 1 VIP's
Col 31-33: Active GII's and GIII's (test/demo aircraft)
Col 34-37: Russia & Ukraine (extra col for old/new)
Col 38-40: Helicopters & Blimps
Col 41-43: Unique Reg's
Col 44-46: Saudi & UAE
Col 47-49: Schools
Col 50-52: Special Charter
Col 53-55: Unknown Owners
Col 56-59: Frequent Flyers (extra cols: name, aircraft, logged, hours)
"""

import csv
import json
import hashlib  # NOTE(review): unused — a filename hash appears to have been dropped; confirm before removing
import re
import sys
import uuid
from datetime import datetime, timezone
from pathlib import Path

# ── Category mapping ────────────────────────────────────────────────────────
# Each entry: (name_col, reg_col, owner_category_tags)
# owner_category_tags is a dict of tag keys to add beyond "owner"
CATEGORY_COLUMNS = [
    # (name_col, reg_col, {tag_key: tag_value, ...})
    (1, 2, {"owner_category_0": "business"}),
    (4, 5, {"owner_category_0": "government"}),
    # NOTE(review): spreadsheet cols 7-9 are labelled "People" but are tagged
    # "celebrity" here — confirm this mapping is intended.
    (7, 8, {"owner_category_0": "celebrity"}),
    (10, 11, {"owner_category_0": "sports"}),
    (13, 14, {"owner_category_0": "celebrity"}),
    (16, 17, {"owner_category_0": "government", "owner_category_1": "law_enforcement"}),
    (19, 20, {"owner_category_0": "other"}),
    (22, 23, {"owner_category_0": "test_aircraft"}),
    (25, 26, {"owner_category_0": "youtuber", "owner_category_1": "celebrity"}),
    (28, 29, {"owner_category_0": "celebrity", "owner_category_1": "motorsport"}),
    (31, 32, {"owner_category_0": "test_aircraft"}),
    # Russia & Ukraine: col 34=name, col 35 or 36 may have reg
    (34, 35, {"owner_category_0": "russia_ukraine"}),
    (38, 39, {"owner_category_0": "celebrity", "category": "helicopter_or_blimp"}),
    (41, 42, {"owner_category_0": "other"}),
    (44, 45, {"owner_category_0": "government", "owner_category_1": "royal_family"}),
    (47, 48, {"owner_category_0": "education"}),
    (50, 51, {"owner_category_0": "charter"}),
    (53, 54, {"owner_category_0": "unknown"}),
    (56, 57, {"owner_category_0": "celebrity"}),  # Frequent Flyers name col, aircraft col
]

# First data row index (0-based) in the CSV
DATA_START_ROW = 4

# ── Contributor info ────────────────────────────────────────────────────────
CONTRIBUTOR_NAME = "TheAirTraffic"
# Deterministic UUID v5 from contributor name
CONTRIBUTOR_UUID = str(uuid.uuid5(uuid.NAMESPACE_URL, "https://theairtraffic.com"))

# Citation
CITATION = "https://docs.google.com/spreadsheets/d/1JHhfJBnJPNBA6TgiSHjkXFkHBdVTTz_nXxaUDRWcHpk"
|
||||||
|
|
||||||
|
|
||||||
|
def looks_like_military_serial(reg: str) -> bool:
    """
    Heuristic for military-style serials like 92-9000, 82-8000, 98-0001
    and bare numeric identifiers like 929000, 828000, 980001, "01", "676".

    Such strings are not standard civil registrations, so callers file
    them under openairframes_id instead of registration_number.
    """
    serial_patterns = (
        r'^\d{2}-\d{4}$',   # NN-NNNN serial format
        r'^\d{6}$',         # six digits: likely ICAO hex or military mode-S
        r'^\d{1,5}$',       # short numeric-only IDs such as "01" or "676"
    )
    return any(re.match(pattern, reg) for pattern in serial_patterns)
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_reg(raw: str) -> str:
    """Clean a raw registration cell: trim whitespace, drop a trailing
    comma, and strip embedded CR/LF characters."""
    cleaned = raw.strip().rstrip(',').strip()
    # Drop any carriage returns / newlines that survived the trim above.
    cleaned = ''.join(ch for ch in cleaned if ch not in '\r\n')
    return cleaned.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def parse_regs(cell_value: str) -> list[str]:
    """
    Extract zero or more registrations from a spreadsheet cell.

    A cell may hold a single registration or a comma-separated list;
    junk values (URLs, link labels, dots, section headers) are dropped.
    """
    stripped = (cell_value or '').strip()
    if not stripped:
        return []

    # Cells containing ADS-B Exchange links carry no registrations.
    if 'globe.adsbexchange.com' in cell_value:
        return []
    if stripped in ('.', ','):
        return []

    regs: list[str] = []
    for chunk in cell_value.split(','):
        candidate = normalize_reg(chunk)
        if not candidate:
            continue
        # Filter URLs, link labels, and section headers.
        if candidate.startswith(('http', 'Link')) or candidate == 'Section 1':
            continue
        # Placeholder dot runs are not registrations.
        if candidate in ('.', '..', '...'):
            continue
        regs.append(candidate)
    return regs
|
||||||
|
|
||||||
|
|
||||||
|
def make_submission(
    reg: str,
    owner: str,
    category_tags: dict[str, str],
) -> dict:
    """Build a single community_submission.v1 object for (reg, owner)."""
    # Military-style serials are filed under openairframes_id; everything
    # else is treated as a civil registration number.
    id_field = (
        "openairframes_id"
        if looks_like_military_serial(reg)
        else "registration_number"
    )
    entry: dict = {id_field: reg}

    # Tags: citation first, then owner, then the category-specific tags.
    tags: dict = {"citation_0": CITATION}
    if owner:
        tags["owner"] = owner.strip()
    tags.update(category_tags)
    entry["tags"] = tags

    return entry
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Parse the spreadsheet CSV and emit community_submission.v1 JSON.

    Reads the CSV at argv[1] (or a hard-coded fallback path), walks every
    data row over each (name_col, reg_col) pair in CATEGORY_COLUMNS,
    dedupes (registration, owner) pairs, and writes the submissions to
    community/<today>/theairtraffic_<today>.json under the project root.
    """
    # NOTE(review): the fallback path is developer-specific; consider
    # making the CSV argument mandatory.
    csv_path = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(
        "/Users/jonahgoode/Downloads/TheAirTraffic Database - Aircraft 2.csv"
    )

    if not csv_path.exists():
        print(f"ERROR: CSV not found at {csv_path}", file=sys.stderr)
        sys.exit(1)

    # Read CSV (utf-8-sig transparently strips a BOM if present)
    with open(csv_path, 'r', encoding='utf-8-sig') as f:
        reader = csv.reader(f)
        rows = list(reader)

    print(f"Read {len(rows)} rows from {csv_path.name}")

    # Output folder is keyed by today's UTC date.
    date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    submissions: list[dict] = []
    seen: set[tuple] = set()  # (reg, owner) dedup

    for row_idx in range(DATA_START_ROW, len(rows)):
        row = rows[row_idx]
        if len(row) < 3:
            continue

        for name_col, reg_col, cat_tags in CATEGORY_COLUMNS:
            # Rows may be ragged; skip categories whose columns are absent.
            if reg_col >= len(row) or name_col >= len(row):
                continue

            owner_raw = row[name_col].strip().rstrip(',').strip()
            reg_raw = row[reg_col]

            # Clean owner name
            owner = owner_raw.replace('\r', '').replace('\n', '').strip()
            if not owner or owner in ('.', ',', 'Section 1'):
                continue
            # Skip header-like values
            if owner.startswith('http') or owner.startswith('Link '):
                continue

            regs = parse_regs(reg_raw)
            if not regs:
                # For Russia & Ukraine, try the next column too (col 35 might have old reg, col 36 new)
                if name_col == 34 and reg_col + 1 < len(row):
                    regs = parse_regs(row[reg_col + 1])

            for reg in regs:
                key = (reg, owner)
                if key in seen:
                    continue
                seen.add(key)
                submissions.append(make_submission(reg, owner, cat_tags))

    print(f"Generated {len(submissions)} submissions")

    # Write output
    proj_root = Path(__file__).resolve().parent.parent
    out_dir = proj_root / "community" / date_str
    out_dir.mkdir(parents=True, exist_ok=True)

    out_file = out_dir / f"theairtraffic_{date_str}.json"

    with open(out_file, 'w', encoding='utf-8') as f:
        json.dump(submissions, f, indent=2, ensure_ascii=False)

    print(f"Written to {out_file}")
    # NOTE(review): raises IndexError when zero submissions were generated.
    print(f"Sample entry:\n{json.dumps(submissions[0], indent=2)}")

    # Quick stats
    cats = {}
    for s in submissions:
        c = s['tags'].get('owner_category_0', 'NONE')
        cats[c] = cats.get(c, 0) + 1
    print("\nCategory breakdown:")
    for c, n in sorted(cats.items(), key=lambda x: -x[1]):
        print(f"  {c}: {n}")
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point.
if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,69 @@
|
|||||||
|
#!/usr/bin/env python3
"""Validate the generated theairtraffic JSON output.

Loads the most recent community/<date>/theairtraffic_*.json file,
prints spot-check samples (military serials, YouTubers, USA 747/757),
and runs a light schema check over every entry.
"""
import glob
import json
import re
import sys

# Tag keys must be snake_case, 1-64 chars (community_submission.v1 schema).
# Hoisted out of the per-entry loop and precompiled (was `import re` +
# re.match inside the loop body).
TAG_KEY_RE = re.compile(r"^[a-z][a-z0-9_]{0,63}$")

# Find the latest output
# NOTE(review): glob is pinned to 2026-02-* — widen once new months exist.
files = sorted(glob.glob("community/2026-02-*/theairtraffic_*.json"))
if not files:
    print("No output files found!")
    sys.exit(1)

path = files[-1]
print(f"Validating: {path}")

with open(path) as f:
    data = json.load(f)

print(f"Total entries: {len(data)}")

# Check military serial handling
mil = [d for d in data if "openairframes_id" in d]
print(f"\nEntries using openairframes_id: {len(mil)}")
for m in mil[:10]:
    print(f"  {m['openairframes_id']} -> owner: {m['tags'].get('owner','?')}")

# Check youtuber entries
yt = [d for d in data if d["tags"].get("owner_category_0") == "youtuber"]
print(f"\nYouTuber entries: {len(yt)}")
for y in yt[:5]:
    reg = y.get("registration_number", y.get("openairframes_id"))
    c0 = y["tags"].get("owner_category_0")
    c1 = y["tags"].get("owner_category_1")
    # .get keeps this robust when an entry has no owner tag (consistent
    # with the openairframes_id listing above).
    print(f"  {reg} -> owner: {y['tags'].get('owner', '?')}, cat0: {c0}, cat1: {c1}")

# Check US Govt / military
gov = [d for d in data if d["tags"].get("owner") == "United States of America 747/757"]
print(f"\nUSA 747/757 entries: {len(gov)}")
for g in gov:
    oid = g.get("openairframes_id", g.get("registration_number"))
    print(f"  {oid}")

# Schema validation: every entry needs at least one identifier field and
# a tags dict, and every tag key must match the schema key pattern.
issues = 0
for i, d in enumerate(data):
    has_id = any(k in d for k in ["registration_number", "transponder_code_hex", "openairframes_id"])
    if not has_id:
        print(f"  Entry {i}: no identifier!")
        issues += 1
    if "tags" not in d:
        print(f"  Entry {i}: no tags!")
        issues += 1
    # Check tag key format
    for k in d.get("tags", {}):
        if not TAG_KEY_RE.match(k):
            print(f"  Entry {i}: invalid tag key '{k}'")
            issues += 1

print(f"\nSchema issues: {issues}")

# Category breakdown
cats = {}
for s in data:
    c = s["tags"].get("owner_category_0", "NONE")
    cats[c] = cats.get(c, 0) + 1
print("\nCategory breakdown:")
for c, n in sorted(cats.items(), key=lambda x: -x[1]):
    print(f"  {c}: {n}")
|
||||||
@@ -1,7 +1,7 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import polars as pl
|
import polars as pl
|
||||||
import argparse
|
import argparse
|
||||||
|
import os
|
||||||
OUTPUT_DIR = Path("./data/output")
|
OUTPUT_DIR = Path("./data/output")
|
||||||
CORRECT_ORDER_OF_COLUMNS = ["time", "icao", "r", "t", "dbFlags", "ownOp", "year", "desc", "aircraft_category"]
|
CORRECT_ORDER_OF_COLUMNS = ["time", "icao", "r", "t", "dbFlags", "ownOp", "year", "desc", "aircraft_category"]
|
||||||
|
|
||||||
@@ -13,26 +13,25 @@ def main():
|
|||||||
|
|
||||||
compressed_dir = OUTPUT_DIR / "compressed"
|
compressed_dir = OUTPUT_DIR / "compressed"
|
||||||
date_dir = compressed_dir / args.date
|
date_dir = compressed_dir / args.date
|
||||||
if not date_dir.is_dir():
|
|
||||||
raise FileNotFoundError(f"No date folder found: {date_dir}")
|
|
||||||
|
|
||||||
parquet_files = sorted(date_dir.glob("*.parquet"))
|
parquet_files = sorted(date_dir.glob("*.parquet"))
|
||||||
if not parquet_files:
|
df = None
|
||||||
raise FileNotFoundError(f"No parquet files found in {date_dir}")
|
if parquet_files: # TODO: This logic could be updated slightly.
|
||||||
|
print(f"No parquet files found in {date_dir}")
|
||||||
|
|
||||||
frames = [pl.read_parquet(p) for p in parquet_files]
|
frames = [pl.read_parquet(p) for p in parquet_files]
|
||||||
df = pl.concat(frames, how="vertical", rechunk=True)
|
df = pl.concat(frames, how="vertical", rechunk=True)
|
||||||
|
|
||||||
df = df.sort(["time", "icao"])
|
df = df.sort(["time", "icao"])
|
||||||
df = df.select(CORRECT_ORDER_OF_COLUMNS)
|
df = df.select(CORRECT_ORDER_OF_COLUMNS)
|
||||||
|
|
||||||
output_path = OUTPUT_DIR / f"openairframes_adsb_{args.date}.parquet"
|
output_path = OUTPUT_DIR / f"openairframes_adsb_{args.date}.parquet"
|
||||||
print(f"Writing combined parquet to {output_path} with {df.height} rows")
|
print(f"Writing combined parquet to {output_path} with {df.height} rows")
|
||||||
df.write_parquet(output_path)
|
df.write_parquet(output_path)
|
||||||
|
|
||||||
csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{args.date}.csv.gz"
|
csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{args.date}.csv.gz"
|
||||||
print(f"Writing combined csv.gz to {csv_output_path} with {df.height} rows")
|
print(f"Writing combined csv.gz to {csv_output_path} with {df.height} rows")
|
||||||
df.write_csv(csv_output_path, compression="gzip")
|
df.write_csv(csv_output_path, compression="gzip")
|
||||||
|
|
||||||
if args.concat_with_latest_csv:
|
if args.concat_with_latest_csv:
|
||||||
print("Loading latest CSV from GitHub releases to concatenate with...")
|
print("Loading latest CSV from GitHub releases to concatenate with...")
|
||||||
@@ -46,9 +45,10 @@ def main():
|
|||||||
csv_end_dt = datetime.strptime(csv_end_date, "%Y-%m-%d")
|
csv_end_dt = datetime.strptime(csv_end_date, "%Y-%m-%d")
|
||||||
args_dt = datetime.strptime(args.date, "%Y-%m-%d")
|
args_dt = datetime.strptime(args.date, "%Y-%m-%d")
|
||||||
|
|
||||||
if csv_end_dt >= args_dt:
|
if df is None or csv_end_dt >= args_dt:
|
||||||
print(f"Latest CSV already includes data through {args.date} (end_date={csv_end_date} is exclusive)")
|
print(f"Latest CSV already includes data through {args.date} (end_date={csv_end_date} is exclusive)")
|
||||||
print("Writing latest CSV directly without concatenation to avoid duplicates")
|
print("Writing latest CSV directly without concatenation to avoid duplicates")
|
||||||
|
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
||||||
final_csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{csv_start_date}_{csv_end_date}.csv.gz"
|
final_csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{csv_start_date}_{csv_end_date}.csv.gz"
|
||||||
df_latest_csv = df_latest_csv.select(CORRECT_ORDER_OF_COLUMNS)
|
df_latest_csv = df_latest_csv.select(CORRECT_ORDER_OF_COLUMNS)
|
||||||
df_latest_csv.write_csv(final_csv_output_path, compression="gzip")
|
df_latest_csv.write_csv(final_csv_output_path, compression="gzip")
|
||||||
|
|||||||
Reference in New Issue
Block a user