Compare commits

...

14 Commits

Author SHA1 Message Date
JG 68cc4d89b3 Merge pull request #46 psimpson routes
add psimpson routes
2026-05-25 18:59:28 -04:00
ggman12 b653c3a844 add psimpson routes 2026-05-25 18:54:57 -04:00
JG 2829e5fb6e Merge pull request #35 from PlaneQuery/develop
update readme.md
2026-03-18 14:31:29 -04:00
ggman12 9c744b0baf update readme.md 2026-03-18 14:29:13 -04:00
JG ebda04767f Merge pull request #34 from PlaneQuery/develop
Develop to main: theairtraffic google sheet
2026-03-10 05:12:11 -04:00
ggman12 3fdf443894 add russia_ukraine 2026-03-10 05:08:19 -04:00
ggman12 24313603c5 works 2026-03-10 05:08:19 -04:00
JG 2bb0a5eac3 Merge pull request #33 from PlaneQuery/develop
Develop to Main: Handle ADSB when ADSB.lol has not released any data for day. Just rerelease latest adsb
2026-02-26 15:32:59 -05:00
ggman12 b54f33aa56 Handle ADSB when ADSB.lol has not released any data for day. Just rerelease latest adsb 2026-02-26 15:31:47 -05:00
JG 2dda3d341c Merge pull request #32 from PlaneQuery/develop
Develop to Main: Fix Community Submission export. Fix CSV concatenation logic to prevent duplicates when there is no new ADSB.lol data.
2026-02-24 15:37:54 -05:00
ggman12 b0526f0a95 Fix Community Submission export. Fix CSV concatenation logic to prevent duplicates when there is no new ADSB.lol data. 2026-02-24 15:36:10 -05:00
JG 4b6a043a9d Merge pull request #31 from PlaneQuery/develop
Develop to Main Fix adsb asset retrival to be more fault tolerant. Fix download issue
2026-02-24 02:17:08 -05:00
ggman12 55c464aad7 Fix adsb asset retrival to be more fault tolerant. Fix download issue for 2024-07-03 2026-02-24 02:12:55 -05:00
ggman12 aa509e8560 attempt to fix download issue for 2024-07-03 2026-02-19 17:51:49 -05:00
20 changed files with 827 additions and 63 deletions
@@ -49,11 +49,38 @@ jobs:
python -m src.adsb.download_and_list_icaos --date "$DATE" python -m src.adsb.download_and_list_icaos --date "$DATE"
ls -lah data/output/adsb_archives/"$DATE" || true ls -lah data/output/adsb_archives/"$DATE" || true
- name: Upload archives - name: Upload archive part 0
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
with: with:
name: adsb-archives-${{ inputs.date }} name: adsb-archive-${{ inputs.date }}-part-0
path: data/output/adsb_archives/${{ inputs.date }} path: data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_0.tar.gz
retention-days: 1
compression-level: 0
if-no-files-found: error
- name: Upload archive part 1
uses: actions/upload-artifact@v4
with:
name: adsb-archive-${{ inputs.date }}-part-1
path: data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_1.tar.gz
retention-days: 1
compression-level: 0
if-no-files-found: error
- name: Upload archive part 2
uses: actions/upload-artifact@v4
with:
name: adsb-archive-${{ inputs.date }}-part-2
path: data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_2.tar.gz
retention-days: 1
compression-level: 0
if-no-files-found: error
- name: Upload archive part 3
uses: actions/upload-artifact@v4
with:
name: adsb-archive-${{ inputs.date }}-part-3
path: data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_3.tar.gz
retention-days: 1 retention-days: 1
compression-level: 0 compression-level: 0
if-no-files-found: error if-no-files-found: error
@@ -79,12 +106,22 @@ jobs:
python -m pip install --upgrade pip python -m pip install --upgrade pip
pip install -r requirements.txt pip install -r requirements.txt
- name: Download archives - name: Download archive part
uses: actions/download-artifact@v4 uses: actions/download-artifact@v4
with: with:
name: adsb-archives-${{ inputs.date }} name: adsb-archive-${{ inputs.date }}-part-${{ matrix.part_id }}
path: data/output/adsb_archives/${{ inputs.date }} path: data/output/adsb_archives/${{ inputs.date }}
- name: Verify archive
run: |
FILE="data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_${{ matrix.part_id }}.tar.gz"
ls -lah data/output/adsb_archives/${{ inputs.date }}/
if [ ! -f "$FILE" ]; then
echo "::error::Archive not found: $FILE"
exit 1
fi
echo "Verified: $(du -h "$FILE")"
- name: Process part - name: Process part
env: env:
DATE: ${{ inputs.date }} DATE: ${{ inputs.date }}
@@ -140,6 +177,6 @@ jobs:
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
with: with:
name: openairframes_adsb-${{ inputs.date }} name: openairframes_adsb-${{ inputs.date }}
path: data/output/openairframes_adsb_${{ inputs.date }}* path: data/output/openairframes_adsb_*
retention-days: 30 retention-days: 30
if-no-files-found: error if-no-files-found: error
@@ -101,6 +101,51 @@ jobs:
date: ${{ needs.resolve-dates.outputs.adsb_date }} date: ${{ needs.resolve-dates.outputs.adsb_date }}
concat_with_latest_csv: true concat_with_latest_csv: true
adsb-reduce:
needs: [resolve-dates, adsb-to-aircraft]
if: always() && github.event_name != 'schedule' && needs.adsb-to-aircraft.result == 'failure'
runs-on: ubuntu-24.04-arm
steps:
- name: Checkout
uses: actions/checkout@v6
- name: Setup Python
uses: actions/setup-python@v6
with:
python-version: '3.12'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Download compressed outputs
uses: actions/download-artifact@v4
with:
pattern: adsb-compressed-${{ needs.resolve-dates.outputs.adsb_date }}-part-*
path: data/output/compressed/${{ needs.resolve-dates.outputs.adsb_date }}
merge-multiple: true
- name: Concatenate final outputs
env:
DATE: ${{ needs.resolve-dates.outputs.adsb_date }}
CONCAT_WITH_LATEST_CSV: true
run: |
EXTRA=""
if [ "$CONCAT_WITH_LATEST_CSV" = "true" ]; then
EXTRA="--concat_with_latest_csv"
fi
python -m src.adsb.concat_parquet_to_final --date "$DATE" $EXTRA
ls -lah data/output/ || true
- name: Upload final artifacts
uses: actions/upload-artifact@v4
with:
name: openairframes_adsb-${{ needs.resolve-dates.outputs.adsb_date }}
path: data/output/openairframes_adsb_*
retention-days: 30
if-no-files-found: error
build-community: build-community:
runs-on: ubuntu-latest runs-on: ubuntu-latest
if: github.event_name != 'schedule' if: github.event_name != 'schedule'
@@ -188,13 +233,13 @@ jobs:
create-release: create-release:
runs-on: ubuntu-latest runs-on: ubuntu-latest
needs: [resolve-dates, build-faa, adsb-to-aircraft, build-community, build-adsbexchange-json, build-mictronics-db] needs: [resolve-dates, build-faa, adsb-to-aircraft, adsb-reduce, build-community, build-adsbexchange-json, build-mictronics-db]
if: github.event_name != 'schedule' && !cancelled() if: github.event_name != 'schedule' && !cancelled()
steps: steps:
- name: Check adsb-to-aircraft status - name: Check ADS-B workflow status
if: needs.adsb-to-aircraft.result != 'success' if: needs.adsb-to-aircraft.result != 'success' && needs.adsb-reduce.result != 'success'
run: | run: |
echo "WARNING: adsb-to-aircraft result was '${{ needs.adsb-to-aircraft.result }}', will continue without ADS-B artifacts" echo "WARNING: ADS-B workflow failed (adsb-to-aircraft='${{ needs.adsb-to-aircraft.result }}', adsb-reduce='${{ needs.adsb-reduce.result }}'), will continue without ADS-B artifacts"
- name: Checkout for gh CLI - name: Checkout for gh CLI
uses: actions/checkout@v4 uses: actions/checkout@v4
@@ -211,7 +256,7 @@ jobs:
- name: Download ADS-B artifacts - name: Download ADS-B artifacts
uses: actions/download-artifact@v5 uses: actions/download-artifact@v5
if: needs.adsb-to-aircraft.result == 'success' if: needs.adsb-to-aircraft.result == 'success' || needs.adsb-reduce.result == 'success'
continue-on-error: true continue-on-error: true
with: with:
name: openairframes_adsb-${{ needs.resolve-dates.outputs.adsb_date }} name: openairframes_adsb-${{ needs.resolve-dates.outputs.adsb_date }}
@@ -266,7 +311,11 @@ jobs:
# Find files from artifacts using find (handles nested structures) # Find files from artifacts using find (handles nested structures)
CSV_FILE_FAA=$(find artifacts/faa -name "openairframes_faa_*.csv" -type f 2>/dev/null | head -1) CSV_FILE_FAA=$(find artifacts/faa -name "openairframes_faa_*.csv" -type f 2>/dev/null | head -1)
CSV_FILE_ADSB=$(find artifacts/adsb -name "openairframes_adsb_*.csv.gz" -type f 2>/dev/null | head -1) # Prefer concatenated file (with date range) over single-day file
CSV_FILE_ADSB=$(find artifacts/adsb -name "openairframes_adsb_*_*.csv.gz" -type f 2>/dev/null | head -1)
if [ -z "$CSV_FILE_ADSB" ]; then
CSV_FILE_ADSB=$(find artifacts/adsb -name "openairframes_adsb_*.csv.gz" -type f 2>/dev/null | head -1)
fi
CSV_FILE_COMMUNITY=$(find artifacts/community -name "openairframes_community_*.csv" -type f 2>/dev/null | head -1) CSV_FILE_COMMUNITY=$(find artifacts/community -name "openairframes_community_*.csv" -type f 2>/dev/null | head -1)
ZIP_FILE=$(find artifacts/faa -name "ReleasableAircraft_*.zip" -type f 2>/dev/null | head -1) ZIP_FILE=$(find artifacts/faa -name "ReleasableAircraft_*.zip" -type f 2>/dev/null | head -1)
JSON_FILE_ADSBX=$(find artifacts/adsbexchange -name "basic-ac-db_*.json.gz" -type f 2>/dev/null | head -1) JSON_FILE_ADSBX=$(find artifacts/adsbexchange -name "basic-ac-db_*.json.gz" -type f 2>/dev/null | head -1)
+10 -2
View File
@@ -16,11 +16,19 @@ A daily release is created at **06:00 UTC** and includes:
- **openairframes_community.csv** - **openairframes_community.csv**
All community submissions All community submissions
- **openairframes_adsb.csv**
Airframes dataset derived from ADSB.lol network data. For each UTC day, a row is created for every ICAO address observed in that day's ADS-B messages, using registration data from [tar1090-db](https://github.com/wiedehopf/tar1090-db) (ADSBExchange & Mictronics).
Example Usage:
```python
import pandas as pd
url = "https://github.com/PlaneQuery/OpenAirframes/releases/download/openairframes-2026-03-18-main/openairframes_adsb_2024-01-01_2026-03-17.csv.gz" # 1GB
df = pd.read_csv(url)
df
```
![](docs/images/df_adsb_example_0.png)
- **openairframes_faa.csv** - **openairframes_faa.csv**
All [FAA registration data](https://www.faa.gov/licenses_certificates/aircraft_certification/aircraft_registry/releasable_aircraft_download) from 2023-08-16 to present (~260 MB) All [FAA registration data](https://www.faa.gov/licenses_certificates/aircraft_certification/aircraft_registry/releasable_aircraft_download) from 2023-08-16 to present (~260 MB)
- **openairframes_adsb.csv**
Airframe information derived from ADS-B messages on the [ADSB.lol](https://www.adsb.lol/) network, from 2026-02-12 to present (will be from 2024-01-01 soon). The airframe information originates from [mictronics aircraft database](https://www.mictronics.de/aircraft-database/) (~5 MB).
- **ReleasableAircraft_{date}.zip** - **ReleasableAircraft_{date}.zip**
A daily snapshot of the FAA database, which updates at **05:30 UTC** A daily snapshot of the FAA database, which updates at **05:30 UTC**
+36
View File
@@ -0,0 +1,36 @@
TAP50Y lis lhr
EXS96WT man ibz
baw837 dbv lhr
exs6yr nce lba
tom1lx ncl ibz
exs62vc edi pmi
tom35j boj lgw
tom509 dlm lgw
afr902 cdg ndj nsi cdg
tom71a spc man
tom8ke man her
nsz3868 bll opo
exs95wl mah ncl
exs18rk stn reu
tom9db mah bhx
tom2bw reu bhx
kac113 kwi man
tom18e ibz gla
ocn8k snn fra
tfl365 ams cur bon ams
exs29y zth bhx
exs79cf olb man
asl508 beg yyz
tom4nw pmi man
exs3uq zth ema
exs23ml her man
gfa003 bah lhr
baw703 bjv lhr
tom2fb mme pmi
tom7el ibz lgw
tom7bd lba pmi
ual967 nap ewr
ein4ec dub cfu
tom78v lgw lca
eva067 tpe bkk lhr
ezy85xv nce lpl
+31
View File
@@ -0,0 +1,31 @@
efw979y klx lgw
ezy74wg ayt lgw
ezy95yg ibz sen
exs65lg kgs bhx
tom5ky her lgw
tom213 dlm man
jbu1990 sju ewr
exs68pv pmi stn
ice48p kef cdg
exs45ra man spu
klm741 ams bog ctg ams
exs42nu man olb
ein55g lys dub
baw538 lhr bds
uae74w lgw dxb
ely312 ltn tlv
tfl757 ams puj cur ams
wja41 lgw yhx
tom7pj reu man
ryr817l bzr stn
ein429 psa dub
exs3lf olb bhx
ezy38en lrh lgw
ezy85wd rmu man
apo7579 lgw los
tom13a mah man
baw2279 lgw yvr
exs406p gro edi
tom5jl pmi stn
ein42m dub vce
+30
View File
@@ -0,0 +1,30 @@
klm1045 ams bhx
dhk591 hkg del ema
sht22a lhr gci
etd75f auh lhr
tom92g pmi ema
tom767 nbe brs
qtr28u doh lhr
tom56m pmi ncl
aca883 nap yul
tsc691 ath yul
srr902 hgh nvi bhx bll
kac109 kwi lhr
cfe4ed ibz lcy
exs628 dbv ema
tom581 nbe ema
exs86j ema puy
exs67am skg lgw
tom37d kva bhx
tom9dy pmi bhx
qtr72b doh stn
exs52cj efl brs
ezy2816 pvk brs
tom2bk mah ncl
exs86pf jsi bhx
exs39yr jsi brs
exs17j mah lpl
qtr2c doh dub
cfe979 pmi lcy
sht21b gci lhr
exs12lf spu bhx
+32
View File
@@ -0,0 +1,32 @@
tom8ax kva man
tom6nk cfu man
gfa003 bah lhr
sxs7by adb dub
tom5gk ext kgs
tom62w efl brs
exs77j stn zth
tom4lw boh her
exs916 spu man
tom54y zth man
tom34g brs pfo
exs3th nap gla
exs9dw nap man
tom24m kgs lgw
tom748 ema sid
exs718d nte edi
exs53ru brs kgs
exs9eh pvk brs
etd71m lhr auh
exs29wk zth bhx
exs46qw kgs stn
wuk369 ltn pmi
tom5dc her brs
wuk9768 ltn jmk
tom5gl lba pmi
exs21dw bhx klx
tom5ka cwl lca
ein46p dub cta
tom73e efl stn
ely316 lhr tlv
efw26pp lgw mah
qtr47y lhr doh
+30
View File
@@ -0,0 +1,30 @@
kmm3118 mla lgw
baw539 bds lhr
tom29k zth brs
tom32x rho bhx
ezy71zj lgw pvk
baw536 lhr bds
ent429 lgw pvk
tom7cl brs cfu
qtr1f doh lhr
sxs5mq man ayt
klm767 ams aua bon ams
vlg5ml lcg lhr
exs92se puy ema
tom850 man nbe
exs5sq ncl pmi
apo7576 abv lhr
tom1an stn her
exs1kp gla pmi
ryr1794 ibz stn
kmm3119 lgw mla
isr116 ltn tlv
sht9f edi lhr
baw9cj lhr bru
ezy93wm brs pmo
ezy42eu ltn bsl
bbc201 dac zyl lhr
bbc202 lhr zyl dac
tom3lw rho lgw
sxs7fz ayt stn
exs71mf stn pmi
+33
View File
@@ -0,0 +1,33 @@
qtr33w doh lhr
tom86d cwl her
ezy49zc lgw bjv
cpa008 lhr cdg hkg
ely317 tlv lhr
tom4ej bhx cfu
tom93j gla ibz
sva117 jed lhr
ely313 tlv ltn
qtr67h lhr doh
box442 fra yyz ord
baw710c lhr lca
wuk784 pmi ltn
tom23m efl man
baw455 ibz lhr
baw595 olb lhr
baw621 peg lhr
azg394 bhx gyd
tom47x zth bhx
ezy45rl bsl ltn
tom10y ibz man
baw663 zth lhr
tom6en bhx pmi
efw74v kgs lgw
sva118 lhr jed
ezy38xg bod bhx
ein463 cta dub
baw537 bds lhr
exs689l pmi ncl
tom43j kgs ext
tom9gx pmi lgw
apo7577 lhr abv
ely318 lhr tlv
+33
View File
@@ -0,0 +1,33 @@
exs42m pmi stn
exs51nw ibz man
tom68h pfo brs
qtr61c doh lgg ord
wja51 lgw yyt
exs93pk gro ema
uae34y dxb man
cfe38z lcy fao
tom33j pmi ema
tom82k mah stn
tom8ya pmi bhx
exs1386 puy bhx
sva119 jed lhr
tom25a mah man
etd75f auh lhr
tom6ev mah bhx
efw16yk cag lgw
ajt8620 bru mia
uae9j dxb stn
tom429 nbe cwl
sxs9gg ayt bhx
baw947l spu lhr
tom7dm cfu ema
tom3nh skg brs
tom5hy ibz man
tom7hk pmi gla
tom9jw boj cwl
tom8be bud bhx
exs32y brs zth
tom7an spu man
tom84y pmi ncl
exs5qd efl lba
tom58h bhx zth
+38
View File
@@ -0,0 +1,38 @@
exs3uq zth ema
efw67a lgw ayt
tom657 nbe gla
tom7cd cfu gla
exs79ue pmi man
tom3lk nap ema
exs732 zth edi
cfe12g olb lcy
tom2xj jsi lgw
gfa003 bah lhr
gfa006 lhr bah
tfl4mh ams lpa
exs9dw nap man
tom6ym cwl cfu
cfe316 ibz lcy
qtr2c doh dub
exs48rz jsi man
afr018 cdg lax ppt
ewg8gj str lgw str
exs6yr nce lba
ely313 tlv ltn
ely317 tlv lhr
wuk13gw ltn tia
baw58xp mxp lhr
noz38w aes lgw
exs98dm ema fao
eju15uv lpl mxp
tom15x cfu man
ezy81qh ltn ibz
wuk784 pmi ltn
exs1898 zth brs
baw841 dbv lhr
sht6d lhr gla
tom2bg cfu cwl
exs45yk stn pmi
ezy36ep pmi lgw
tom9yg zth bhx
tom30w spu lgw
+31
View File
@@ -0,0 +1,31 @@
baw693 jtr lhr
baw58xp mxp lhr
uae9393 dwc lgg ord
exs91au ncl pmi
baw699w her lhr
cfe91g mah gla
vlg49uc lhr lcg
tom2nh pmi man
ezy56rd spu ltn
tom3fa reu bhx
eag8sb bhd sou
cfe31y pmi gla
cfe92y pmi edi
ibs18my lgw mad
exs1418 spu stn
exs41m vrn stn
tom2wt ibz nwi
baw661 efl lhr
wuk2818 zth ltn
eag9st sou bhd
tom2ga kgs brs
exs25db pmi edi
dhk812 bah lej ema
baw15 lhr sin syd
baw16 syd sin lhr
tom59a jtr man
exs45yk stn pmi
apo7577 lhr abv
tom6aw man pmi
baw675 pvk lhr
Binary file not shown.

After

Width:  |  Height:  |  Size: 99 KiB

+1 -1
View File
@@ -194,7 +194,7 @@ def main():
if triggered_runs and not args.dry_run: if triggered_runs and not args.dry_run:
import json import json
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
runs_file = f"./triggered_runs_{timestamp}.json" runs_file = f"./output/triggered_runs_{timestamp}.json"
with open(runs_file, 'w') as f: with open(runs_file, 'w') as f:
json.dump({ json.dump({
'start_date': args.start_date, 'start_date': args.start_date,
+242
View File
@@ -0,0 +1,242 @@
#!/usr/bin/env python3
"""
Parse TheAirTraffic Database CSV and produce community_submission.v1 JSON.
Source: "TheAirTraffic Database - Aircraft 2.csv"
Output: community/YYYY-MM-DD/theairtraffic_<date>.json
Categories in the spreadsheet columns (paired: name, registrations, separator):
Col 1-3: Business
Col 4-6: Government
Col 7-9: People
Col 10-12: Sports
Col 13-15: Celebrity
Col 16-18: State Govt./Law
Col 19-21: Other
Col 22-24: Test Aircraft
Col 25-27: YouTubers
Col 28-30: Formula 1 VIP's
Col 31-33: Active GII's and GIII's (test/demo aircraft)
Col 34-37: Russia & Ukraine (extra col for old/new)
Col 38-40: Helicopters & Blimps
Col 41-43: Unique Reg's
Col 44-46: Saudi & UAE
Col 47-49: Schools
Col 50-52: Special Charter
Col 53-55: Unknown Owners
Col 56-59: Frequent Flyers (extra cols: name, aircraft, logged, hours)
"""
import csv
import json
import hashlib
import re
import sys
import uuid
from datetime import datetime, timezone
from pathlib import Path
# ── Category mapping ────────────────────────────────────────────────────────
# Each entry: (name_col, reg_col, tags)
#   name_col  index into a CSV row of the cell holding the owner name
#   reg_col   index into a CSV row of the cell holding the registration(s)
#   tags      extra tag key/value pairs merged into every submission built
#             from that column pair (on top of "citation_0" and "owner")
# Indexes are used directly as row[...] subscripts, i.e. they are 0-based.
# NOTE(review): column 0 is never referenced here — presumably a label or
# spacer column in the spreadsheet export; confirm against the CSV.
CATEGORY_COLUMNS = [
    # (name_col, reg_col, {tag_key: tag_value, ...})
    (1, 2, {"owner_category_0": "business"}),
    (4, 5, {"owner_category_0": "government"}),
    # The "People" section is tagged as celebrity.
    (7, 8, {"owner_category_0": "celebrity"}),
    (10, 11, {"owner_category_0": "sports"}),
    (13, 14, {"owner_category_0": "celebrity"}),
    (16, 17, {"owner_category_0": "government", "owner_category_1": "law_enforcement"}),
    (19, 20, {"owner_category_0": "other"}),
    (22, 23, {"owner_category_0": "test_aircraft"}),
    (25, 26, {"owner_category_0": "youtuber", "owner_category_1": "celebrity"}),
    (28, 29, {"owner_category_0": "celebrity", "owner_category_1": "motorsport"}),
    (31, 32, {"owner_category_0": "test_aircraft"}),
    # Russia & Ukraine: col 34=name, col 35 or 36 may have reg.
    # main() falls back to reg_col + 1 for this entry when reg_col is empty.
    (34, 35, {"owner_category_0": "russia_ukraine"}),
    # Uses a plain "category" tag in addition to owner_category_0.
    (38, 39, {"owner_category_0": "celebrity", "category": "helicopter_or_blimp"}),
    (41, 42, {"owner_category_0": "other"}),
    (44, 45, {"owner_category_0": "government", "owner_category_1": "royal_family"}),
    (47, 48, {"owner_category_0": "education"}),
    (50, 51, {"owner_category_0": "charter"}),
    (53, 54, {"owner_category_0": "unknown"}),
    (56, 57, {"owner_category_0": "celebrity"}),  # Frequent Flyers name col, aircraft col
]
# First data row index (0-based) in the CSV; rows before it are skipped.
DATA_START_ROW = 4
# ── Contributor info ────────────────────────────────────────────────────────
CONTRIBUTOR_NAME = "TheAirTraffic"
# Deterministic UUID v5 in the URL namespace, derived from the URL string
# below (not from CONTRIBUTOR_NAME), so it is the same on every run.
CONTRIBUTOR_UUID = str(uuid.uuid5(uuid.NAMESPACE_URL, "https://theairtraffic.com"))
# Source spreadsheet URL; attached to every submission as the "citation_0" tag.
CITATION = "https://docs.google.com/spreadsheets/d/1JHhfJBnJPNBA6TgiSHjkXFkHBdVTTz_nXxaUDRWcHpk"
def looks_like_military_serial(reg: str) -> bool:
    """
    Report whether *reg* looks like a military serial or a bare numeric ID.

    Recognised shapes:
      * ``NN-NNNN`` such as 92-9000, 82-8000, 98-0001
      * exactly six digits such as 929000, 828000, 980001
      * one to five digits such as "01", "02", "676"

    Values like these are not standard civil registrations; use
    openairframes_id for them.
    """
    serial_shapes = (
        r'^\d{2}-\d{4}$',  # dashed serial: NN-NNNN
        r'^\d{6}$',        # pure 6-digit number
        r'^\d{1,5}$',      # short numeric-only value (1-5 digits)
    )
    return any(re.match(shape, reg) is not None for shape in serial_shapes)
def normalize_reg(raw: str) -> str:
    """Return *raw* tidied into a bare registration string."""
    # Trim surrounding whitespace and any trailing commas first.
    tidied = raw.strip().rstrip(',').strip()
    # Then drop embedded CR/LF characters and trim once more.
    for line_break in ('\r', '\n'):
        tidied = tidied.replace(line_break, '')
    return tidied.strip()
def parse_regs(cell_value: str) -> list[str]:
    """
    Split one spreadsheet cell into its individual registrations.

    The cell may hold a single registration or a comma-separated list.
    Returns an empty list for blank cells, cells that are only "." or ",",
    and cells containing an ADS-B Exchange globe URL. Individual pieces that
    are empty, look like URLs or "Link..." labels, equal "Section 1", or
    consist only of dots are dropped.
    """
    stripped_cell = cell_value.strip() if cell_value else ''
    if not stripped_cell:
        return []
    # Some cells hold ADS-B Exchange URLs instead of registrations; skip those.
    if 'globe.adsbexchange.com' in cell_value:
        return []
    if stripped_cell in ('.', ','):
        return []

    dot_placeholders = ('.', '..', '...')
    cleaned_pieces = (normalize_reg(piece) for piece in cell_value.split(','))
    return [
        reg
        for reg in cleaned_pieces
        if reg
        and not reg.startswith('http')
        and not reg.startswith('Link')
        and reg != 'Section 1'
        and reg not in dot_placeholders
    ]
def make_submission(
    reg: str,
    owner: str,
    category_tags: dict[str, str],
) -> dict:
    """
    Build a single community_submission.v1 object.

    The identifier is stored under "openairframes_id" when *reg* looks like a
    military serial, otherwise under "registration_number". The "tags" dict
    carries the citation, the stripped owner name (when non-empty) and the
    given category tags.
    """
    if looks_like_military_serial(reg):
        id_field = "openairframes_id"
    else:
        id_field = "registration_number"

    tags: dict = {"citation_0": CITATION}
    if owner:
        tags["owner"] = owner.strip()
    # Category tags are merged last, so they win on any key clash.
    tags.update(category_tags)

    return {id_field: reg, "tags": tags}
def main():
    """
    Convert the TheAirTraffic spreadsheet export into a submissions JSON file.

    The CSV path is taken from the first command-line argument, falling back
    to a hard-coded local download location. Rows from DATA_START_ROW onward
    are scanned once per CATEGORY_COLUMNS entry; every distinct
    (registration, owner) pair becomes one submission. The list is written to
    ``community/<UTC date>/theairtraffic_<UTC date>.json`` two directories
    above this script, and a sample entry plus a per-category count are
    printed.

    Exits with status 1 (message on stderr) when the CSV file does not exist.
    """
    from collections import Counter

    csv_path = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(
        "/Users/jonahgoode/Downloads/TheAirTraffic Database - Aircraft 2.csv"
    )
    if not csv_path.exists():
        print(f"ERROR: CSV not found at {csv_path}", file=sys.stderr)
        sys.exit(1)

    # Read CSV. utf-8-sig drops a leading byte-order mark if present;
    # newline='' lets the csv module handle line breaks inside quoted cells.
    with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
        rows = list(csv.reader(f))
    print(f"Read {len(rows)} rows from {csv_path.name}")

    date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    submissions: list[dict] = []
    seen: set[tuple[str, str]] = set()  # (reg, owner) dedup

    for row in rows[DATA_START_ROW:]:
        if len(row) < 3:
            continue
        for name_col, reg_col, cat_tags in CATEGORY_COLUMNS:
            if reg_col >= len(row) or name_col >= len(row):
                continue

            # Clean owner name
            owner = row[name_col].strip().rstrip(',').strip()
            owner = owner.replace('\r', '').replace('\n', '').strip()
            if not owner or owner in ('.', ',', 'Section 1'):
                continue
            # Skip header-like values
            if owner.startswith(('http', 'Link ')):
                continue

            regs = parse_regs(row[reg_col])
            # For Russia & Ukraine, try the next column too
            # (col 35 might have old reg, col 36 new).
            if not regs and name_col == 34 and reg_col + 1 < len(row):
                regs = parse_regs(row[reg_col + 1])

            for reg in regs:
                key = (reg, owner)
                if key in seen:
                    continue
                seen.add(key)
                submissions.append(make_submission(reg, owner, cat_tags))

    print(f"Generated {len(submissions)} submissions")

    # Write output
    proj_root = Path(__file__).resolve().parent.parent
    out_dir = proj_root / "community" / date_str
    out_dir.mkdir(parents=True, exist_ok=True)
    out_file = out_dir / f"theairtraffic_{date_str}.json"
    with open(out_file, 'w', encoding='utf-8') as f:
        json.dump(submissions, f, indent=2, ensure_ascii=False)
    print(f"Written to {out_file}")

    # Guard the sample: indexing an empty list would raise IndexError after
    # the file has already been written.
    if submissions:
        print(f"Sample entry:\n{json.dumps(submissions[0], indent=2)}")
    else:
        print("No submissions generated; no sample entry to show")

    # Quick stats. most_common() orders by descending count and keeps
    # first-seen order among ties.
    category_counts = Counter(
        s['tags'].get('owner_category_0', 'NONE') for s in submissions
    )
    print("\nCategory breakdown:")
    for category, count in category_counts.most_common():
        print(f" {category}: {count}")
if __name__ == "__main__":
main()
+69
View File
@@ -0,0 +1,69 @@
#!/usr/bin/env python3
"""Validate the generated theairtraffic JSON output."""
import json
import glob
import sys
import re

# Allowed tag-key shape, compiled once rather than per tag of every entry.
TAG_KEY_RE = re.compile(r"^[a-z][a-z0-9_]{0,63}$")
# An entry must carry at least one of these identifier fields.
ID_FIELDS = ("registration_number", "transponder_code_hex", "openairframes_id")

# Find the latest output. The generator writes into community/<YYYY-MM-DD>/,
# so match any date folder; ISO dates sort chronologically as plain strings.
files = sorted(glob.glob("community/*/theairtraffic_*.json"))
if not files:
    print("No output files found!")
    sys.exit(1)

path = files[-1]
print(f"Validating: {path}")

# The generator writes UTF-8 with ensure_ascii=False, so read it back as UTF-8
# instead of relying on the platform default encoding.
with open(path, encoding="utf-8") as f:
    data = json.load(f)
print(f"Total entries: {len(data)}")

# The report sections below read tags via .get("tags", {}) so that an entry
# without tags reaches the schema validation instead of raising KeyError here.

# Check military serial handling
mil = [d for d in data if "openairframes_id" in d]
print(f"\nEntries using openairframes_id: {len(mil)}")
for m in mil[:10]:
    print(f" {m['openairframes_id']} -> owner: {m.get('tags', {}).get('owner','?')}")

# Check youtuber entries
yt = [d for d in data if d.get("tags", {}).get("owner_category_0") == "youtuber"]
print(f"\nYouTuber entries: {len(yt)}")
for y in yt[:5]:
    reg = y.get("registration_number", y.get("openairframes_id"))
    c0 = y["tags"].get("owner_category_0")
    c1 = y["tags"].get("owner_category_1")
    print(f" {reg} -> owner: {y['tags'].get('owner', '?')}, cat0: {c0}, cat1: {c1}")

# Check US Govt / military
gov = [d for d in data if d.get("tags", {}).get("owner") == "United States of America 747/757"]
print(f"\nUSA 747/757 entries: {len(gov)}")
for g in gov:
    oid = g.get("openairframes_id", g.get("registration_number"))
    print(f" {oid}")

# Schema validation
issues = 0
for i, d in enumerate(data):
    if not any(k in d for k in ID_FIELDS):
        print(f" Entry {i}: no identifier!")
        issues += 1
    if "tags" not in d:
        print(f" Entry {i}: no tags!")
        issues += 1
    # Check tag key format
    for k in d.get("tags", {}):
        if not TAG_KEY_RE.match(k):
            print(f" Entry {i}: invalid tag key '{k}'")
            issues += 1
print(f"\nSchema issues: {issues}")

# Category breakdown
cats = {}
for s in data:
    c = s.get("tags", {}).get("owner_category_0", "NONE")
    cats[c] = cats.get(c, 0) + 1
print("\nCategory breakdown:")
for c, n in sorted(cats.items(), key=lambda x: -x[1]):
    print(f" {c}: {n}")
+40 -23
View File
@@ -1,7 +1,7 @@
from pathlib import Path from pathlib import Path
import polars as pl import polars as pl
import argparse import argparse
import os
OUTPUT_DIR = Path("./data/output") OUTPUT_DIR = Path("./data/output")
CORRECT_ORDER_OF_COLUMNS = ["time", "icao", "r", "t", "dbFlags", "ownOp", "year", "desc", "aircraft_category"] CORRECT_ORDER_OF_COLUMNS = ["time", "icao", "r", "t", "dbFlags", "ownOp", "year", "desc", "aircraft_category"]
@@ -13,38 +13,55 @@ def main():
compressed_dir = OUTPUT_DIR / "compressed" compressed_dir = OUTPUT_DIR / "compressed"
date_dir = compressed_dir / args.date date_dir = compressed_dir / args.date
if not date_dir.is_dir():
raise FileNotFoundError(f"No date folder found: {date_dir}")
parquet_files = sorted(date_dir.glob("*.parquet")) parquet_files = sorted(date_dir.glob("*.parquet"))
if not parquet_files: df = None
raise FileNotFoundError(f"No parquet files found in {date_dir}") if parquet_files: # TODO: This logic could be updated slightly.
print(f"No parquet files found in {date_dir}")
frames = [pl.read_parquet(p) for p in parquet_files] frames = [pl.read_parquet(p) for p in parquet_files]
df = pl.concat(frames, how="vertical", rechunk=True) df = pl.concat(frames, how="vertical", rechunk=True)
df = df.sort(["time", "icao"]) df = df.sort(["time", "icao"])
df = df.select(CORRECT_ORDER_OF_COLUMNS) df = df.select(CORRECT_ORDER_OF_COLUMNS)
output_path = OUTPUT_DIR / f"openairframes_adsb_{args.date}.parquet" output_path = OUTPUT_DIR / f"openairframes_adsb_{args.date}.parquet"
print(f"Writing combined parquet to {output_path} with {df.height} rows") print(f"Writing combined parquet to {output_path} with {df.height} rows")
df.write_parquet(output_path) df.write_parquet(output_path)
csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{args.date}.csv.gz" csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{args.date}.csv.gz"
print(f"Writing combined csv.gz to {csv_output_path} with {df.height} rows") print(f"Writing combined csv.gz to {csv_output_path} with {df.height} rows")
df.write_csv(csv_output_path, compression="gzip") df.write_csv(csv_output_path, compression="gzip")
if args.concat_with_latest_csv: if args.concat_with_latest_csv:
print("Loading latest CSV from GitHub releases to concatenate with...") print("Loading latest CSV from GitHub releases to concatenate with...")
from src.get_latest_release import get_latest_aircraft_adsb_csv_df from src.get_latest_release import get_latest_aircraft_adsb_csv_df
df_latest_csv, csv_date = get_latest_aircraft_adsb_csv_df() from datetime import datetime
# Ensure column order matches before concatenating
df_latest_csv = df_latest_csv.select(CORRECT_ORDER_OF_COLUMNS) df_latest_csv, csv_start_date, csv_end_date = get_latest_aircraft_adsb_csv_df()
from src.adsb.compress_adsb_to_aircraft_data import concat_compressed_dfs
df_final = concat_compressed_dfs(df_latest_csv, df) # Compare dates: end_date is exclusive, so if csv_end_date > args.date,
df_final = df_final.select(CORRECT_ORDER_OF_COLUMNS) # the latest CSV already includes this day's data
final_csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{csv_date}_{args.date}.csv.gz" csv_end_dt = datetime.strptime(csv_end_date, "%Y-%m-%d")
df_final.write_csv(final_csv_output_path, compression="gzip") args_dt = datetime.strptime(args.date, "%Y-%m-%d")
if df is None or csv_end_dt >= args_dt:
print(f"Latest CSV already includes data through {args.date} (end_date={csv_end_date} is exclusive)")
print("Writing latest CSV directly without concatenation to avoid duplicates")
os.makedirs(OUTPUT_DIR, exist_ok=True)
final_csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{csv_start_date}_{csv_end_date}.csv.gz"
df_latest_csv = df_latest_csv.select(CORRECT_ORDER_OF_COLUMNS)
df_latest_csv.write_csv(final_csv_output_path, compression="gzip")
else:
print(f"Concatenating latest CSV (through {csv_end_date}) with new data ({args.date})")
# Ensure column order matches before concatenating
df_latest_csv = df_latest_csv.select(CORRECT_ORDER_OF_COLUMNS)
from src.adsb.compress_adsb_to_aircraft_data import concat_compressed_dfs
df_final = concat_compressed_dfs(df_latest_csv, df)
df_final = df_final.select(CORRECT_ORDER_OF_COLUMNS)
final_csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{csv_start_date}_{args.date}.csv.gz"
df_final.write_csv(final_csv_output_path, compression="gzip")
print(f"Final CSV written to {final_csv_output_path}")
if __name__ == "__main__": if __name__ == "__main__":
main() main()
+10 -1
View File
@@ -123,7 +123,16 @@ def main():
print(f"Processing part {args.part_id} for {args.date}") print(f"Processing part {args.part_id} for {args.date}")
# Get specific archive file for this part # Get specific archive file for this part
archive_path = os.path.join(OUTPUT_DIR, "adsb_archives", args.date, f"{args.date}_part_{args.part_id}.tar.gz") archive_dir = os.path.join(OUTPUT_DIR, "adsb_archives", args.date)
archive_path = os.path.join(archive_dir, f"{args.date}_part_{args.part_id}.tar.gz")
if not os.path.isfile(archive_path):
print(f"ERROR: Archive not found: {archive_path}")
if os.path.isdir(archive_dir):
print(f"Files in {archive_dir}: {os.listdir(archive_dir)}")
else:
print(f"Directory does not exist: {archive_dir}")
sys.exit(1)
# Extract and collect trace files # Extract and collect trace files
trace_map = build_trace_file_map(archive_path) trace_map = build_trace_file_map(archive_path)
@@ -24,7 +24,7 @@ def read_all_submissions(community_dir: Path) -> list[dict]:
"""Read all JSON submissions from the community directory.""" """Read all JSON submissions from the community directory."""
all_submissions = [] all_submissions = []
for json_file in sorted(community_dir.glob("*.json")): for json_file in sorted(community_dir.glob("**/*.json")):
try: try:
with open(json_file) as f: with open(json_file) as f:
data = json.load(f) data = json.load(f)
+61 -22
View File
@@ -27,6 +27,33 @@ def _http_get_json(url: str, headers: dict[str, str]) -> dict:
return json.loads(data.decode("utf-8")) return json.loads(data.decode("utf-8"))
def get_releases(repo: str = REPO, github_token: Optional[str] = None, per_page: int = 30) -> list[dict]:
    """
    Fetch one page of releases for *repo* from the GitHub REST API.

    Args:
        repo: Repository in "owner/name" form, interpolated into the API URL.
        github_token: Optional token sent as a Bearer Authorization header.
        per_page: Page size passed as the ``per_page`` query parameter.

    Returns:
        The decoded JSON response from the releases endpoint.
    """
    request_headers = {
        "Accept": "application/vnd.github+json",
        "User-Agent": "openairframes-downloader/1.0",
    }
    if github_token:
        request_headers["Authorization"] = f"Bearer {github_token}"

    releases_url = f"https://api.github.com/repos/{repo}/releases?per_page={per_page}"
    return _http_get_json(releases_url, headers=request_headers)
def get_release_assets_from_release_data(release_data: dict) -> list[ReleaseAsset]:
    """
    Convert the "assets" entries of a release payload into ReleaseAsset objects.

    A payload without an "assets" key yields an empty list. A missing "size"
    on an asset is treated as 0; "name" and "browser_download_url" are
    required on every asset.
    """
    return [
        ReleaseAsset(
            name=raw_asset["name"],
            download_url=raw_asset["browser_download_url"],
            size=int(raw_asset.get("size", 0)),
        )
        for raw_asset in release_data.get("assets", [])
    ]
def get_latest_release_assets(repo: str = REPO, github_token: Optional[str] = None) -> list[ReleaseAsset]: def get_latest_release_assets(repo: str = REPO, github_token: Optional[str] = None) -> list[ReleaseAsset]:
url = f"https://api.github.com/repos/{repo}/releases/latest" url = f"https://api.github.com/repos/{repo}/releases/latest"
headers = { headers = {
@@ -37,16 +64,7 @@ def get_latest_release_assets(repo: str = REPO, github_token: Optional[str] = No
headers["Authorization"] = f"Bearer {github_token}" headers["Authorization"] = f"Bearer {github_token}"
payload = _http_get_json(url, headers=headers) payload = _http_get_json(url, headers=headers)
assets = [] return get_release_assets_from_release_data(payload)
for a in payload.get("assets", []):
assets.append(
ReleaseAsset(
name=a["name"],
download_url=a["browser_download_url"],
size=int(a.get("size", 0)),
)
)
return assets
def pick_asset( def pick_asset(
@@ -155,7 +173,8 @@ def download_latest_aircraft_adsb_csv(
repo: str = REPO, repo: str = REPO,
) -> Path: ) -> Path:
""" """
Download the latest openairframes_adsb_*.csv file from the latest GitHub release. Download the latest openairframes_adsb_*.csv file from GitHub releases.
If the latest release doesn't have the file, searches previous releases.
Args: Args:
output_dir: Directory to save the downloaded file (default: "downloads") output_dir: Directory to save the downloaded file (default: "downloads")
@@ -166,15 +185,33 @@ def download_latest_aircraft_adsb_csv(
Path to the downloaded file Path to the downloaded file
""" """
output_dir = Path(output_dir) output_dir = Path(output_dir)
assets = get_latest_release_assets(repo, github_token=github_token)
asset = pick_asset(assets, name_regex=r"^openairframes_adsb_.*\.csv(\.gz)?$") # Get multiple releases
saved_to = download_asset(asset, output_dir / asset.name, github_token=github_token) releases = get_releases(repo, github_token=github_token, per_page=30)
print(f"Downloaded: {asset.name} ({asset.size} bytes) -> {saved_to}")
return saved_to # Try each release until we find one with the matching asset
for release in releases:
assets = get_release_assets_from_release_data(release)
try:
asset = pick_asset(assets, name_regex=r"^openairframes_adsb_.*\.csv(\.gz)?$")
saved_to = download_asset(asset, output_dir / asset.name, github_token=github_token)
print(f"Downloaded: {asset.name} ({asset.size} bytes) -> {saved_to}")
return saved_to
except FileNotFoundError:
# This release doesn't have the matching asset, try the next one
continue
raise FileNotFoundError(
f"No release in the last 30 releases has an asset matching 'openairframes_adsb_.*\\.csv(\\.gz)?$'"
)
import polars as pl import polars as pl
def get_latest_aircraft_adsb_csv_df(): def get_latest_aircraft_adsb_csv_df():
"""Download and load the latest ADS-B CSV from GitHub releases.""" """Download and load the latest ADS-B CSV from GitHub releases.
Returns:
tuple: (df, start_date, end_date) where dates are in YYYY-MM-DD format
"""
import re import re
csv_path = download_latest_aircraft_adsb_csv() csv_path = download_latest_aircraft_adsb_csv()
@@ -198,17 +235,19 @@ def get_latest_aircraft_adsb_csv_df():
if df[col].dtype == pl.Utf8: if df[col].dtype == pl.Utf8:
df = df.with_columns(pl.col(col).fill_null("")) df = df.with_columns(pl.col(col).fill_null(""))
# Extract start date from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv[.gz] # Extract start and end dates from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv[.gz]
match = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path)) match = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})\.csv", str(csv_path))
if not match: if not match:
raise ValueError(f"Could not extract date from filename: {csv_path.name}") raise ValueError(f"Could not extract dates from filename: {csv_path.name}")
date_str = match.group(1) start_date = match.group(1)
end_date = match.group(2)
print(df.columns) print(df.columns)
print(df.dtypes) print(df.dtypes)
return df, date_str return df, start_date, end_date
if __name__ == "__main__": if __name__ == "__main__":
download_latest_aircraft_csv() download_latest_aircraft_csv()
download_latest_aircraft_adsb_csv()