Compare commits

..

25 Commits

Author SHA1 Message Date
JG 68cc4d89b3 Merge pull request #46 psimpson routes
add psimpson routes
2026-05-25 18:59:28 -04:00
ggman12 b653c3a844 add psimpson routes 2026-05-25 18:54:57 -04:00
JG 2829e5fb6e Merge pull request #35 from PlaneQuery/develop
update readme.md
2026-03-18 14:31:29 -04:00
ggman12 9c744b0baf update readme.md 2026-03-18 14:29:13 -04:00
JG ebda04767f Merge pull request #34 from PlaneQuery/develop
Develop to main: theairtraffic google sheet
2026-03-10 05:12:11 -04:00
ggman12 3fdf443894 add russia_ukraine 2026-03-10 05:08:19 -04:00
ggman12 24313603c5 works 2026-03-10 05:08:19 -04:00
JG 2bb0a5eac3 Merge pull request #33 from PlaneQuery/develop
Develop to Main: Handle ADSB when ADSB.lol has not released any data for day. Just rerelease latest adsb
2026-02-26 15:32:59 -05:00
ggman12 b54f33aa56 Handle ADSB when ADSB.lol has not released any data for day. Just rerelease latest adsb 2026-02-26 15:31:47 -05:00
JG 2dda3d341c Merge pull request #32 from PlaneQuery/develop
Develop to Main: Fix Community Submission export. Fix CSV concatenation logic to prevent duplicates when there is no new ADSB.lol data.
2026-02-24 15:37:54 -05:00
ggman12 b0526f0a95 Fix Community Submission export. Fix CSV concatenation logic to prevent duplicates when there is no new ADSB.lol data. 2026-02-24 15:36:10 -05:00
JG 4b6a043a9d Merge pull request #31 from PlaneQuery/develop
Develop to Main Fix adsb asset retrival to be more fault tolerant. Fix download issue
2026-02-24 02:17:08 -05:00
ggman12 55c464aad7 Fix adsb asset retrival to be more fault tolerant. Fix download issue for 2024-07-03 2026-02-24 02:12:55 -05:00
ggman12 aa509e8560 attempt to fix download issue for 2024-07-03 2026-02-19 17:51:49 -05:00
ggman12 82d11d8d24 try less strict tar extract for 2025-10-15 and other days that fail 2026-02-19 00:20:03 -05:00
ggman12 76a217ad14 src/contributions/approve_submission.py handle big json files 2026-02-18 23:18:19 -05:00
ggman12 ec2d1a1291 update download.sh 2026-02-18 23:18:19 -05:00
ggman12 97284c69a9 verify downlaod asssets 2026-02-18 23:18:19 -05:00
JG 892ffa78af Merge pull request #28 from PlaneQuery/community-submission-27
Community submission: ggman12_2026-02-18_5ddbb8bd.json
2026-02-18 17:18:49 -05:00
github-actions[bot] f77a91db2c Update schema with new tags: manufacturer_icao, manufacturer_name, model, type_code, serial_number, icao_aircraft_type, operator, operator_callsign, operator_icao, citation_0 2026-02-18 22:18:12 +00:00
github-actions[bot] b3bd654998 Add community submission from @ggman12 (closes #27) 2026-02-18 22:18:12 +00:00
ggman12 302be8b8dc update checker for arrays issue 2026-02-18 17:11:14 -05:00
ggman12 b61dc0f5e5 provide more error 2026-02-18 17:08:43 -05:00
ggman12 1ff17cc6a8 allow adsb to fail for when adsb.lol hasen't uploaded file yet. 2026-02-18 16:49:02 -05:00
ggman12 d216ea9329 Daily ADSB and Histoircal updates. Update readme.md 2026-02-18 16:34:06 -05:00
47 changed files with 2435 additions and 1746 deletions
@@ -0,0 +1,182 @@
name: Historical ADS-B Processing
on:
workflow_dispatch:
inputs:
date:
description: 'YYYY-MM-DD'
required: true
type: string
concat_with_latest_csv:
description: 'Also concatenate with latest CSV from GitHub releases'
required: false
type: boolean
default: false
workflow_call:
inputs:
date:
description: 'YYYY-MM-DD'
required: true
type: string
concat_with_latest_csv:
description: 'Also concatenate with latest CSV from GitHub releases'
required: false
type: boolean
default: false
jobs:
adsb-extract:
runs-on: ubuntu-24.04-arm
steps:
- name: Checkout
uses: actions/checkout@v6
- name: Setup Python
uses: actions/setup-python@v6
with:
python-version: '3.12'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Download and split ADS-B data
env:
DATE: ${{ inputs.date }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
python -m src.adsb.download_and_list_icaos --date "$DATE"
ls -lah data/output/adsb_archives/"$DATE" || true
- name: Upload archive part 0
uses: actions/upload-artifact@v4
with:
name: adsb-archive-${{ inputs.date }}-part-0
path: data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_0.tar.gz
retention-days: 1
compression-level: 0
if-no-files-found: error
- name: Upload archive part 1
uses: actions/upload-artifact@v4
with:
name: adsb-archive-${{ inputs.date }}-part-1
path: data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_1.tar.gz
retention-days: 1
compression-level: 0
if-no-files-found: error
- name: Upload archive part 2
uses: actions/upload-artifact@v4
with:
name: adsb-archive-${{ inputs.date }}-part-2
path: data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_2.tar.gz
retention-days: 1
compression-level: 0
if-no-files-found: error
- name: Upload archive part 3
uses: actions/upload-artifact@v4
with:
name: adsb-archive-${{ inputs.date }}-part-3
path: data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_3.tar.gz
retention-days: 1
compression-level: 0
if-no-files-found: error
adsb-map:
needs: adsb-extract
runs-on: ubuntu-24.04-arm
strategy:
fail-fast: true
matrix:
part_id: [0, 1, 2, 3]
steps:
- name: Checkout
uses: actions/checkout@v6
- name: Setup Python
uses: actions/setup-python@v6
with:
python-version: '3.12'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Download archive part
uses: actions/download-artifact@v4
with:
name: adsb-archive-${{ inputs.date }}-part-${{ matrix.part_id }}
path: data/output/adsb_archives/${{ inputs.date }}
- name: Verify archive
run: |
FILE="data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_${{ matrix.part_id }}.tar.gz"
ls -lah data/output/adsb_archives/${{ inputs.date }}/
if [ ! -f "$FILE" ]; then
echo "::error::Archive not found: $FILE"
exit 1
fi
echo "Verified: $(du -h "$FILE")"
- name: Process part
env:
DATE: ${{ inputs.date }}
run: |
python -m src.adsb.process_icao_chunk --part-id ${{ matrix.part_id }} --date "$DATE"
- name: Upload compressed outputs
uses: actions/upload-artifact@v4
with:
name: adsb-compressed-${{ inputs.date }}-part-${{ matrix.part_id }}
path: data/output/compressed/${{ inputs.date }}
retention-days: 1
compression-level: 0
if-no-files-found: error
adsb-reduce:
needs: adsb-map
runs-on: ubuntu-24.04-arm
steps:
- name: Checkout
uses: actions/checkout@v6
- name: Setup Python
uses: actions/setup-python@v6
with:
python-version: '3.12'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Download compressed outputs
uses: actions/download-artifact@v4
with:
pattern: adsb-compressed-${{ inputs.date }}-part-*
path: data/output/compressed/${{ inputs.date }}
merge-multiple: true
- name: Concatenate final outputs
env:
DATE: ${{ inputs.date }}
CONCAT_WITH_LATEST_CSV: ${{ inputs.concat_with_latest_csv }}
run: |
EXTRA=""
if [ "$CONCAT_WITH_LATEST_CSV" = "true" ]; then
EXTRA="--concat_with_latest_csv"
fi
python -m src.adsb.concat_parquet_to_final --date "$DATE" $EXTRA
ls -lah data/output/ || true
- name: Upload final artifacts
uses: actions/upload-artifact@v4
with:
name: openairframes_adsb-${{ inputs.date }}
path: data/output/openairframes_adsb_*
retention-days: 30
if-no-files-found: error
@@ -0,0 +1,118 @@
name: adsb-to-aircraft-multiple-day-run
on:
workflow_dispatch:
inputs:
start_date:
description: 'YYYY-MM-DD (inclusive)'
required: true
type: string
end_date:
description: 'YYYY-MM-DD (exclusive)'
required: true
type: string
jobs:
generate-dates:
runs-on: ubuntu-24.04-arm
outputs:
dates: ${{ steps.generate.outputs.dates }}
steps:
- name: Generate date list
id: generate
env:
START_DATE: ${{ inputs.start_date }}
END_DATE: ${{ inputs.end_date }}
run: |
python - <<'PY'
import json
import os
from datetime import datetime, timedelta
start = datetime.strptime(os.environ["START_DATE"], "%Y-%m-%d")
end = datetime.strptime(os.environ["END_DATE"], "%Y-%m-%d")
if end <= start:
raise SystemExit("end_date must be after start_date")
dates = []
cur = start
while cur < end:
dates.append(cur.strftime("%Y-%m-%d"))
cur += timedelta(days=1)
with open(os.environ["GITHUB_OUTPUT"], "a") as f:
f.write(f"dates={json.dumps(dates)}\n")
PY
adsb-day:
needs: generate-dates
strategy:
fail-fast: true
matrix:
date: ${{ fromJson(needs.generate-dates.outputs.dates) }}
uses: ./.github/workflows/adsb-to-aircraft-for-day.yaml
with:
date: ${{ matrix.date }}
adsb-final:
needs: adsb-day
runs-on: ubuntu-24.04-arm
steps:
- name: Checkout
uses: actions/checkout@v6
- name: Setup Python
uses: actions/setup-python@v6
with:
python-version: '3.12'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Download daily CSVs
uses: actions/download-artifact@v4
with:
pattern: openairframes_adsb-*
path: outputs/daily/
merge-multiple: true
- name: Concatenate all days to final CSV
env:
START_DATE: ${{ inputs.start_date }}
END_DATE: ${{ inputs.end_date }}
run: |
python - <<'PY'
import os
import re
from pathlib import Path
import polars as pl
start = os.environ["START_DATE"]
end = os.environ["END_DATE"]
daily_dir = Path("outputs/daily")
files = sorted(daily_dir.glob("openairframes_adsb_*.csv.gz"))
if not files:
raise SystemExit("No daily CSVs found")
def date_key(path: Path) -> str:
m = re.match(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_", path.name)
return m.group(1) if m else path.name
files = sorted(files, key=date_key)
frames = [pl.read_csv(p) for p in files]
df = pl.concat(frames, how="vertical", rechunk=True)
output_path = Path("outputs") / f"openairframes_adsb_{start}_{end}.csv.gz"
df.write_csv(output_path, compression="gzip")
print(f"Wrote {output_path} with {df.height} rows")
PY
- name: Upload final CSV
uses: actions/upload-artifact@v4
with:
name: openairframes_adsb-${{ inputs.start_date }}-${{ inputs.end_date }}
path: outputs/openairframes_adsb_${{ inputs.start_date }}_${{ inputs.end_date }}.csv.gz
retention-days: 30
# gh workflow run adsb-to-aircraft-multiple-day-run.yaml --repo ggman12/OpenAirframes --ref jonah/fix-historical-proper -f start_date=2025-12-31 -f end_date=2026-01-02
-268
View File
@@ -1,268 +0,0 @@
name: Historical ADS-B Processing
on:
workflow_dispatch:
inputs:
start_date:
description: 'Start date (YYYY-MM-DD, inclusive)'
required: true
type: string
end_date:
description: 'End date (YYYY-MM-DD, exclusive)'
required: true
type: string
chunk_days:
description: 'Days per job chunk (default: 7)'
required: false
type: number
default: 7
jobs:
generate-matrix:
runs-on: ubuntu-latest
outputs:
chunks: ${{ steps.generate.outputs.chunks }}
global_start: ${{ inputs.start_date }}
global_end: ${{ inputs.end_date }}
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.12'
- name: Generate date chunks
id: generate
env:
INPUT_START_DATE: ${{ inputs.start_date }}
INPUT_END_DATE: ${{ inputs.end_date }}
INPUT_CHUNK_DAYS: ${{ inputs.chunk_days }}
run: python src/adsb/historical_generate_matrix.py
adsb-extract:
needs: generate-matrix
runs-on: ubuntu-24.04-arm
strategy:
matrix:
chunk: ${{ fromJson(needs.generate-matrix.outputs.chunks) }}
max-parallel: 3
fail-fast: true
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.12'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Free disk space
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf /usr/local/share/boost
df -h
- name: Download and extract ADS-B data
env:
START_DATE: ${{ matrix.chunk.start_date }}
END_DATE: ${{ matrix.chunk.end_date }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
python -m src.adsb.download_and_list_icaos --start-date "$START_DATE" --end-date "$END_DATE"
ls -lah data/output/
- name: Create tar of extracted data and split into chunks
run: |
cd data/output
echo "=== Disk space before tar ==="
df -h .
echo "=== Files to tar ==="
ls -lah *-planes-readsb-prod-0.tar_0 icao_manifest_*.txt 2>/dev/null || echo "No files found"
# Create tar with explicit error checking
if ls *-planes-readsb-prod-0.tar_0 1>/dev/null 2>&1; then
tar -cvf extracted_data.tar *-planes-readsb-prod-0.tar_0 icao_manifest_*.txt
echo "=== Tar file created ==="
ls -lah extracted_data.tar
# Verify tar integrity
tar -tf extracted_data.tar > /dev/null && echo "Tar integrity check passed" || { echo "Tar integrity check FAILED"; exit 1; }
# Create checksum of the FULL tar before splitting (for verification after reassembly)
echo "=== Creating checksum of full tar ==="
sha256sum extracted_data.tar > full_tar.sha256
cat full_tar.sha256
# Split into 500MB chunks to avoid artifact upload issues
echo "=== Splitting tar into 500MB chunks ==="
mkdir -p tar_chunks
split -b 500M extracted_data.tar tar_chunks/extracted_data.tar.part_
rm extracted_data.tar
mv full_tar.sha256 tar_chunks/
echo "=== Chunks created ==="
ls -lah tar_chunks/
else
echo "ERROR: No extracted directories found, cannot create tar"
exit 1
fi
- name: Upload extracted data chunks
uses: actions/upload-artifact@v4
with:
name: adsb-extracted-${{ matrix.chunk.start_date }}-${{ matrix.chunk.end_date }}
path: data/output/tar_chunks/
retention-days: 1
compression-level: 0
if-no-files-found: warn
adsb-map:
needs: [generate-matrix, adsb-extract]
runs-on: ubuntu-24.04-arm
strategy:
fail-fast: true
matrix:
chunk: ${{ fromJson(needs.generate-matrix.outputs.chunks) }}
icao_chunk: [0, 1, 2, 3]
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.12'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Free disk space
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf /usr/local/share/boost
df -h
- name: Download extracted data
uses: actions/download-artifact@v4
with:
name: adsb-extracted-${{ matrix.chunk.start_date }}-${{ matrix.chunk.end_date }}
path: data/output/tar_chunks/
- name: Reassemble and extract tar
id: extract
run: |
cd data/output
if [ -d tar_chunks ] && ls tar_chunks/extracted_data.tar.part_* 1>/dev/null 2>&1; then
echo "=== Chunk files info ==="
ls -lah tar_chunks/
cd tar_chunks
# Reassemble tar with explicit sorting
echo "=== Reassembling tar file ==="
ls -1 extracted_data.tar.part_?? | sort | while read part; do
echo "Appending $part..."
cat "$part" >> ../extracted_data.tar
done
cd ..
echo "=== Reassembled tar file info ==="
ls -lah extracted_data.tar
# Verify checksum of reassembled tar matches original
echo "=== Verifying reassembled tar checksum ==="
echo "Original checksum:"
cat tar_chunks/full_tar.sha256
echo "Reassembled checksum:"
sha256sum extracted_data.tar
sha256sum -c tar_chunks/full_tar.sha256 || { echo "ERROR: Reassembled tar checksum mismatch - data corrupted during transfer"; exit 1; }
echo "Checksum verified - data integrity confirmed"
rm -rf tar_chunks
echo "=== Extracting ==="
tar -xvf extracted_data.tar
rm extracted_data.tar
echo "has_data=true" >> "$GITHUB_OUTPUT"
echo "=== Contents of data/output ==="
ls -lah
else
echo "No tar chunks found"
echo "has_data=false" >> "$GITHUB_OUTPUT"
fi
- name: Process ICAO chunk
if: steps.extract.outputs.has_data == 'true'
env:
START_DATE: ${{ matrix.chunk.start_date }}
END_DATE: ${{ matrix.chunk.end_date }}
run: |
python -m src.adsb.process_icao_chunk --chunk-id ${{ matrix.icao_chunk }} --total-chunks 4 --start-date "$START_DATE" --end-date "$END_DATE"
ls -lah data/output/adsb_chunks/ || echo "No chunks created"
- name: Upload chunk artifacts
if: steps.extract.outputs.has_data == 'true'
uses: actions/upload-artifact@v4
with:
name: adsb-map-${{ matrix.chunk.start_date }}-${{ matrix.chunk.end_date }}-chunk-${{ matrix.icao_chunk }}
path: data/output/adsb_chunks/
retention-days: 1
if-no-files-found: ignore
adsb-reduce:
needs: [generate-matrix, adsb-map]
runs-on: ubuntu-24.04-arm
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.12'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Download all chunk artifacts
uses: actions/download-artifact@v4
with:
pattern: adsb-map-*
path: data/output/adsb_chunks/
merge-multiple: true
- name: Debug downloaded files
run: |
echo "=== Disk space before processing ==="
df -h
echo "=== Listing data/output/adsb_chunks/ ==="
find data/output/adsb_chunks/ -type f 2>/dev/null | wc -l
echo "=== Total parquet size ==="
du -sh data/output/adsb_chunks/ || echo "No chunks dir"
- name: Combine chunks to CSV
env:
START_DATE: ${{ needs.generate-matrix.outputs.global_start }}
END_DATE: ${{ needs.generate-matrix.outputs.global_end }}
run: |
python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks --start-date "$START_DATE" --end-date "$END_DATE" --skip-base --stream
ls -lah data/openairframes/
- name: Upload final artifact
uses: actions/upload-artifact@v4
with:
name: openairframes_adsb-${{ needs.generate-matrix.outputs.global_start }}-${{ needs.generate-matrix.outputs.global_end }}
path: data/openairframes/*.csv
retention-days: 30
+171 -136
View File
@@ -1,4 +1,4 @@
name: OpenAirframes Daily Release name: openairframes-daily-release
on: on:
schedule: schedule:
@@ -76,159 +76,75 @@ jobs:
data/faa_releasable/ReleasableAircraft_*.zip data/faa_releasable/ReleasableAircraft_*.zip
retention-days: 1 retention-days: 1
adsb-extract: resolve-dates:
runs-on: ubuntu-24.04-arm runs-on: ubuntu-latest
if: github.event_name != 'schedule' if: github.event_name != 'schedule'
outputs: outputs:
manifest-exists: ${{ steps.check.outputs.exists }} date: ${{ steps.out.outputs.date }}
adsb_date: ${{ steps.out.outputs.adsb_date }}
steps: steps:
- name: Checkout - id: out
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Setup Python
uses: actions/setup-python@v6
with:
python-version: "3.14"
- name: Install dependencies
run: | run: |
python -m pip install --upgrade pip if [ -n "${{ inputs.date }}" ]; then
pip install -r requirements.txt echo "date=${{ inputs.date }}" >> "$GITHUB_OUTPUT"
echo "adsb_date=${{ inputs.date }}" >> "$GITHUB_OUTPUT"
- name: Download and extract ADS-B data
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
python -m src.adsb.download_and_list_icaos ${{ inputs.date && format('--date {0}', inputs.date) || '' }}
ls -lah data/output/
- name: Check manifest exists
id: check
run: |
if ls data/output/icao_manifest_*.txt 1>/dev/null 2>&1; then
echo "exists=true" >> "$GITHUB_OUTPUT"
else else
echo "exists=false" >> "$GITHUB_OUTPUT" echo "date=$(date -u -d 'yesterday' +%Y-%m-%d)" >> "$GITHUB_OUTPUT"
echo "adsb_date=$(date -u -d 'yesterday' +%Y-%m-%d)" >> "$GITHUB_OUTPUT"
fi fi
- name: Create tar of extracted data adsb-to-aircraft:
run: | needs: resolve-dates
cd data/output if: github.event_name != 'schedule'
tar -cf extracted_data.tar *-planes-readsb-prod-0.tar_0 icao_manifest_*.txt uses: ./.github/workflows/adsb-to-aircraft-for-day.yaml
ls -lah extracted_data.tar with:
date: ${{ needs.resolve-dates.outputs.adsb_date }}
- name: Upload extracted data concat_with_latest_csv: true
uses: actions/upload-artifact@v4
with:
name: adsb-extracted
path: data/output/extracted_data.tar
retention-days: 1
compression-level: 0 # Already compressed trace files
adsb-map:
runs-on: ubuntu-24.04-arm
needs: adsb-extract
if: github.event_name != 'schedule' && needs.adsb-extract.outputs.manifest-exists == 'true'
strategy:
fail-fast: false
matrix:
chunk: [0, 1, 2, 3]
steps:
- name: Checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Setup Python
uses: actions/setup-python@v6
with:
python-version: "3.14"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Download extracted data
uses: actions/download-artifact@v4
with:
name: adsb-extracted
path: data/output/
- name: Extract tar
run: |
cd data/output
tar -xf extracted_data.tar
rm extracted_data.tar
echo "=== Contents of data/output ==="
ls -lah
echo "=== Looking for manifest ==="
cat icao_manifest_*.txt | head -20 || echo "No manifest found"
echo "=== Looking for extracted dirs ==="
ls -d *-planes-readsb-prod-0* 2>/dev/null || echo "No extracted dirs"
- name: Process chunk ${{ matrix.chunk }}
run: |
python -m src.adsb.process_icao_chunk --chunk-id ${{ matrix.chunk }} --total-chunks 4 ${{ inputs.date && format('--date {0}', inputs.date) || '' }}
mkdir -p data/output/adsb_chunks
ls -lah data/output/adsb_chunks/ || echo "No chunks created"
- name: Upload chunk artifacts
uses: actions/upload-artifact@v4
with:
name: adsb-chunk-${{ matrix.chunk }}
path: data/output/adsb_chunks/
retention-days: 1
adsb-reduce: adsb-reduce:
needs: [resolve-dates, adsb-to-aircraft]
if: always() && github.event_name != 'schedule' && needs.adsb-to-aircraft.result == 'failure'
runs-on: ubuntu-24.04-arm runs-on: ubuntu-24.04-arm
needs: adsb-map
if: github.event_name != 'schedule'
steps: steps:
- name: Checkout - name: Checkout
uses: actions/checkout@v6 uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Setup Python - name: Setup Python
uses: actions/setup-python@v6 uses: actions/setup-python@v6
with: with:
python-version: "3.14" python-version: '3.12'
- name: Install dependencies - name: Install dependencies
run: | run: |
python -m pip install --upgrade pip python -m pip install --upgrade pip
pip install -r requirements.txt pip install -r requirements.txt
- name: Download all chunk artifacts - name: Download compressed outputs
uses: actions/download-artifact@v4 uses: actions/download-artifact@v4
with: with:
pattern: adsb-chunk-* pattern: adsb-compressed-${{ needs.resolve-dates.outputs.adsb_date }}-part-*
path: data/output/adsb_chunks/ path: data/output/compressed/${{ needs.resolve-dates.outputs.adsb_date }}
merge-multiple: true merge-multiple: true
- name: Debug downloaded files - name: Concatenate final outputs
env:
DATE: ${{ needs.resolve-dates.outputs.adsb_date }}
CONCAT_WITH_LATEST_CSV: true
run: | run: |
echo "=== Listing data/ ===" EXTRA=""
find data/ -type f 2>/dev/null | head -50 || echo "No files in data/" if [ "$CONCAT_WITH_LATEST_CSV" = "true" ]; then
echo "=== Looking for parquet files ===" EXTRA="--concat_with_latest_csv"
find . -name "*.parquet" 2>/dev/null | head -20 || echo "No parquet files found" fi
python -m src.adsb.concat_parquet_to_final --date "$DATE" $EXTRA
ls -lah data/output/ || true
- name: Combine chunks to CSV - name: Upload final artifacts
run: |
mkdir -p data/output/adsb_chunks
ls -lah data/output/adsb_chunks/ || echo "Directory empty or does not exist"
python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks ${{ inputs.date && format('--date {0}', inputs.date) || '' }}
ls -lah data/openairframes/
- name: Upload ADS-B artifacts
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
with: with:
name: adsb-release name: openairframes_adsb-${{ needs.resolve-dates.outputs.adsb_date }}
path: data/openairframes/openairframes_adsb_*.csv path: data/output/openairframes_adsb_*
retention-days: 1 retention-days: 30
if-no-files-found: error
build-community: build-community:
runs-on: ubuntu-latest runs-on: ubuntu-latest
@@ -261,11 +177,70 @@ jobs:
path: data/openairframes/openairframes_community_*.csv path: data/openairframes/openairframes_community_*.csv
retention-days: 1 retention-days: 1
create-release: build-adsbexchange-json:
runs-on: ubuntu-latest runs-on: ubuntu-latest
needs: [build-faa, adsb-reduce, build-community]
if: github.event_name != 'schedule' if: github.event_name != 'schedule'
steps: steps:
- name: Checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Setup Python
uses: actions/setup-python@v6
with:
python-version: "3.14"
- name: Run ADS-B Exchange JSON release script
run: |
python -m src.contributions.create_daily_adsbexchange_release ${{ inputs.date && format('--date {0}', inputs.date) || '' }}
ls -lah data/openairframes
- name: Upload ADS-B Exchange JSON artifact
uses: actions/upload-artifact@v4
with:
name: adsbexchange-json
path: data/openairframes/basic-ac-db_*.json.gz
retention-days: 1
build-mictronics-db:
runs-on: ubuntu-latest
if: github.event_name != 'schedule'
steps:
- name: Checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Setup Python
uses: actions/setup-python@v6
with:
python-version: "3.14"
- name: Run Mictronics DB release script
continue-on-error: true
run: |
python -m src.contributions.create_daily_microtonics_release ${{ inputs.date && format('--date {0}', inputs.date) || '' }}
ls -lah data/openairframes
- name: Upload Mictronics DB artifact
uses: actions/upload-artifact@v4
with:
name: mictronics-db
path: data/openairframes/mictronics-db_*.zip
retention-days: 1
if-no-files-found: ignore
create-release:
runs-on: ubuntu-latest
needs: [resolve-dates, build-faa, adsb-to-aircraft, adsb-reduce, build-community, build-adsbexchange-json, build-mictronics-db]
if: github.event_name != 'schedule' && !cancelled()
steps:
- name: Check ADS-B workflow status
if: needs.adsb-to-aircraft.result != 'success' && needs.adsb-reduce.result != 'success'
run: |
echo "WARNING: ADS-B workflow failed (adsb-to-aircraft='${{ needs.adsb-to-aircraft.result }}', adsb-reduce='${{ needs.adsb-reduce.result }}'), will continue without ADS-B artifacts"
- name: Checkout for gh CLI - name: Checkout for gh CLI
uses: actions/checkout@v4 uses: actions/checkout@v4
with: with:
@@ -274,23 +249,38 @@ jobs:
sparse-checkout-cone-mode: false sparse-checkout-cone-mode: false
- name: Download FAA artifacts - name: Download FAA artifacts
uses: actions/download-artifact@v4 uses: actions/download-artifact@v5
with: with:
name: faa-release name: faa-release
path: artifacts/faa path: artifacts/faa
- name: Download ADS-B artifacts - name: Download ADS-B artifacts
uses: actions/download-artifact@v4 uses: actions/download-artifact@v5
if: needs.adsb-to-aircraft.result == 'success' || needs.adsb-reduce.result == 'success'
continue-on-error: true
with: with:
name: adsb-release name: openairframes_adsb-${{ needs.resolve-dates.outputs.adsb_date }}
path: artifacts/adsb path: artifacts/adsb
- name: Download Community artifacts - name: Download Community artifacts
uses: actions/download-artifact@v4 uses: actions/download-artifact@v5
with: with:
name: community-release name: community-release
path: artifacts/community path: artifacts/community
- name: Download ADS-B Exchange JSON artifact
uses: actions/download-artifact@v5
with:
name: adsbexchange-json
path: artifacts/adsbexchange
- name: Download Mictronics DB artifact
uses: actions/download-artifact@v5
continue-on-error: true
with:
name: mictronics-db
path: artifacts/mictronics
- name: Debug artifact structure - name: Debug artifact structure
run: | run: |
echo "=== Full artifacts tree ===" echo "=== Full artifacts tree ==="
@@ -301,6 +291,10 @@ jobs:
find artifacts/adsb -type f 2>/dev/null || echo "No files found in artifacts/adsb" find artifacts/adsb -type f 2>/dev/null || echo "No files found in artifacts/adsb"
echo "=== Community artifacts ===" echo "=== Community artifacts ==="
find artifacts/community -type f 2>/dev/null || echo "No files found in artifacts/community" find artifacts/community -type f 2>/dev/null || echo "No files found in artifacts/community"
echo "=== ADS-B Exchange JSON artifacts ==="
find artifacts/adsbexchange -type f 2>/dev/null || echo "No files found in artifacts/adsbexchange"
echo "=== Mictronics DB artifacts ==="
find artifacts/mictronics -type f 2>/dev/null || echo "No files found in artifacts/mictronics"
- name: Prepare release metadata - name: Prepare release metadata
id: meta id: meta
@@ -317,35 +311,66 @@ jobs:
# Find files from artifacts using find (handles nested structures) # Find files from artifacts using find (handles nested structures)
CSV_FILE_FAA=$(find artifacts/faa -name "openairframes_faa_*.csv" -type f 2>/dev/null | head -1) CSV_FILE_FAA=$(find artifacts/faa -name "openairframes_faa_*.csv" -type f 2>/dev/null | head -1)
CSV_FILE_ADSB=$(find artifacts/adsb -name "openairframes_adsb_*.csv" -type f 2>/dev/null | head -1) # Prefer concatenated file (with date range) over single-day file
CSV_FILE_ADSB=$(find artifacts/adsb -name "openairframes_adsb_*_*.csv.gz" -type f 2>/dev/null | head -1)
if [ -z "$CSV_FILE_ADSB" ]; then
CSV_FILE_ADSB=$(find artifacts/adsb -name "openairframes_adsb_*.csv.gz" -type f 2>/dev/null | head -1)
fi
CSV_FILE_COMMUNITY=$(find artifacts/community -name "openairframes_community_*.csv" -type f 2>/dev/null | head -1) CSV_FILE_COMMUNITY=$(find artifacts/community -name "openairframes_community_*.csv" -type f 2>/dev/null | head -1)
ZIP_FILE=$(find artifacts/faa -name "ReleasableAircraft_*.zip" -type f 2>/dev/null | head -1) ZIP_FILE=$(find artifacts/faa -name "ReleasableAircraft_*.zip" -type f 2>/dev/null | head -1)
JSON_FILE_ADSBX=$(find artifacts/adsbexchange -name "basic-ac-db_*.json.gz" -type f 2>/dev/null | head -1)
ZIP_FILE_MICTRONICS=$(find artifacts/mictronics -name "mictronics-db_*.zip" -type f 2>/dev/null | head -1)
# Validate required files exist # Validate required files exist
MISSING_FILES="" MISSING_FILES=""
if [ -z "$CSV_FILE_FAA" ] || [ ! -f "$CSV_FILE_FAA" ]; then if [ -z "$CSV_FILE_FAA" ] || [ ! -f "$CSV_FILE_FAA" ]; then
MISSING_FILES="$MISSING_FILES FAA_CSV" MISSING_FILES="$MISSING_FILES FAA_CSV"
fi fi
if [ -z "$CSV_FILE_ADSB" ] || [ ! -f "$CSV_FILE_ADSB" ]; then
MISSING_FILES="$MISSING_FILES ADSB_CSV"
fi
if [ -z "$ZIP_FILE" ] || [ ! -f "$ZIP_FILE" ]; then if [ -z "$ZIP_FILE" ] || [ ! -f "$ZIP_FILE" ]; then
MISSING_FILES="$MISSING_FILES FAA_ZIP" MISSING_FILES="$MISSING_FILES FAA_ZIP"
fi fi
if [ -z "$JSON_FILE_ADSBX" ] || [ ! -f "$JSON_FILE_ADSBX" ]; then
MISSING_FILES="$MISSING_FILES ADSBX_JSON"
fi
# Optional files - warn but don't fail
OPTIONAL_MISSING=""
if [ -z "$CSV_FILE_ADSB" ] || [ ! -f "$CSV_FILE_ADSB" ]; then
OPTIONAL_MISSING="$OPTIONAL_MISSING ADSB_CSV"
CSV_FILE_ADSB=""
CSV_BASENAME_ADSB=""
fi
if [ -z "$ZIP_FILE_MICTRONICS" ] || [ ! -f "$ZIP_FILE_MICTRONICS" ]; then
OPTIONAL_MISSING="$OPTIONAL_MISSING MICTRONICS_ZIP"
ZIP_FILE_MICTRONICS=""
fi
if [ -n "$MISSING_FILES" ]; then if [ -n "$MISSING_FILES" ]; then
echo "ERROR: Missing required release files:$MISSING_FILES" echo "ERROR: Missing required release files:$MISSING_FILES"
echo "FAA CSV: $CSV_FILE_FAA" echo "FAA CSV: $CSV_FILE_FAA"
echo "ADSB CSV: $CSV_FILE_ADSB" echo "ADSB CSV: $CSV_FILE_ADSB"
echo "ZIP: $ZIP_FILE" echo "ZIP: $ZIP_FILE"
echo "ADSBX JSON: $JSON_FILE_ADSBX"
echo "MICTRONICS ZIP: $ZIP_FILE_MICTRONICS"
exit 1 exit 1
fi fi
# Get basenames for display # Get basenames for display
CSV_BASENAME_FAA=$(basename "$CSV_FILE_FAA") CSV_BASENAME_FAA=$(basename "$CSV_FILE_FAA")
CSV_BASENAME_ADSB=$(basename "$CSV_FILE_ADSB") if [ -n "$CSV_FILE_ADSB" ]; then
CSV_BASENAME_ADSB=$(basename "$CSV_FILE_ADSB")
fi
CSV_BASENAME_COMMUNITY=$(basename "$CSV_FILE_COMMUNITY" 2>/dev/null || echo "") CSV_BASENAME_COMMUNITY=$(basename "$CSV_FILE_COMMUNITY" 2>/dev/null || echo "")
ZIP_BASENAME=$(basename "$ZIP_FILE") ZIP_BASENAME=$(basename "$ZIP_FILE")
JSON_BASENAME_ADSBX=$(basename "$JSON_FILE_ADSBX")
ZIP_BASENAME_MICTRONICS=""
if [ -n "$ZIP_FILE_MICTRONICS" ]; then
ZIP_BASENAME_MICTRONICS=$(basename "$ZIP_FILE_MICTRONICS")
fi
if [ -n "$OPTIONAL_MISSING" ]; then
echo "WARNING: Optional files missing:$OPTIONAL_MISSING (will continue without them)"
fi
echo "date=$DATE" >> "$GITHUB_OUTPUT" echo "date=$DATE" >> "$GITHUB_OUTPUT"
echo "tag=$TAG" >> "$GITHUB_OUTPUT" echo "tag=$TAG" >> "$GITHUB_OUTPUT"
@@ -357,6 +382,10 @@ jobs:
echo "csv_basename_community=$CSV_BASENAME_COMMUNITY" >> "$GITHUB_OUTPUT" echo "csv_basename_community=$CSV_BASENAME_COMMUNITY" >> "$GITHUB_OUTPUT"
echo "zip_file=$ZIP_FILE" >> "$GITHUB_OUTPUT" echo "zip_file=$ZIP_FILE" >> "$GITHUB_OUTPUT"
echo "zip_basename=$ZIP_BASENAME" >> "$GITHUB_OUTPUT" echo "zip_basename=$ZIP_BASENAME" >> "$GITHUB_OUTPUT"
echo "json_file_adsbx=$JSON_FILE_ADSBX" >> "$GITHUB_OUTPUT"
echo "json_basename_adsbx=$JSON_BASENAME_ADSBX" >> "$GITHUB_OUTPUT"
echo "zip_file_mictronics=$ZIP_FILE_MICTRONICS" >> "$GITHUB_OUTPUT"
echo "zip_basename_mictronics=$ZIP_BASENAME_MICTRONICS" >> "$GITHUB_OUTPUT"
echo "name=OpenAirframes snapshot ($DATE)${BRANCH_SUFFIX}" >> "$GITHUB_OUTPUT" echo "name=OpenAirframes snapshot ($DATE)${BRANCH_SUFFIX}" >> "$GITHUB_OUTPUT"
echo "Found files:" echo "Found files:"
@@ -364,6 +393,8 @@ jobs:
echo " ADSB CSV: $CSV_FILE_ADSB" echo " ADSB CSV: $CSV_FILE_ADSB"
echo " Community CSV: $CSV_FILE_COMMUNITY" echo " Community CSV: $CSV_FILE_COMMUNITY"
echo " ZIP: $ZIP_FILE" echo " ZIP: $ZIP_FILE"
echo " ADSBX JSON: $JSON_FILE_ADSBX"
echo " MICTRONICS ZIP: $ZIP_FILE_MICTRONICS"
- name: Delete existing release if exists - name: Delete existing release if exists
run: | run: |
@@ -377,19 +408,23 @@ jobs:
with: with:
tag_name: ${{ steps.meta.outputs.tag }} tag_name: ${{ steps.meta.outputs.tag }}
name: ${{ steps.meta.outputs.name }} name: ${{ steps.meta.outputs.name }}
fail_on_unmatched_files: true fail_on_unmatched_files: false
body: | body: |
Automated daily snapshot generated at 06:00 UTC for ${{ steps.meta.outputs.date }}. Automated daily snapshot generated at 06:00 UTC for ${{ steps.meta.outputs.date }}.
Assets: Assets:
- ${{ steps.meta.outputs.csv_basename_faa }} - ${{ steps.meta.outputs.csv_basename_faa }}
- ${{ steps.meta.outputs.csv_basename_adsb }} ${{ steps.meta.outputs.csv_basename_adsb && format('- {0}', steps.meta.outputs.csv_basename_adsb) || '' }}
- ${{ steps.meta.outputs.csv_basename_community }} - ${{ steps.meta.outputs.csv_basename_community }}
- ${{ steps.meta.outputs.zip_basename }} - ${{ steps.meta.outputs.zip_basename }}
- ${{ steps.meta.outputs.json_basename_adsbx }}
${{ steps.meta.outputs.zip_basename_mictronics && format('- {0}', steps.meta.outputs.zip_basename_mictronics) || '' }}
files: | files: |
${{ steps.meta.outputs.csv_file_faa }} ${{ steps.meta.outputs.csv_file_faa }}
${{ steps.meta.outputs.csv_file_adsb }} ${{ steps.meta.outputs.csv_file_adsb }}
${{ steps.meta.outputs.csv_file_community }} ${{ steps.meta.outputs.csv_file_community }}
${{ steps.meta.outputs.zip_file }} ${{ steps.meta.outputs.zip_file }}
${{ steps.meta.outputs.json_file_adsbx }}
${{ steps.meta.outputs.zip_file_mictronics }}
env: env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+4 -1
View File
@@ -281,4 +281,7 @@ read*lock
.nx/ .nx/
# jsii-rosetta files # jsii-rosetta files
type-fingerprints.txt type-fingerprints.txt
notebooks/whatever.ipynb
.snapshots/
+10 -2
View File
@@ -16,11 +16,19 @@ A daily release is created at **06:00 UTC** and includes:
- **openairframes_community.csv** - **openairframes_community.csv**
All community submissions All community submissions
- **openairframes_adsb.csv**
Airframes dataset derived from ADSB.lol network data. For each UTC day, a row is created for every icao observed in that days ADS-B messages, using registration data from [tar1090-db](https://github.com/wiedehopf/tar1090-db) (ADSBExchange & Mictronics).
Example Usage:
```python
import pandas as pd
url = "https://github.com/PlaneQuery/OpenAirframes/releases/download/openairframes-2026-03-18-main/openairframes_adsb_2024-01-01_2026-03-17.csv.gz" # 1GB
df = pd.read_csv(url)
df
```
![](docs/images/df_adsb_example_0.png)
- **openairframes_faa.csv** - **openairframes_faa.csv**
All [FAA registration data](https://www.faa.gov/licenses_certificates/aircraft_certification/aircraft_registry/releasable_aircraft_download) from 2023-08-16 to present (~260 MB) All [FAA registration data](https://www.faa.gov/licenses_certificates/aircraft_certification/aircraft_registry/releasable_aircraft_download) from 2023-08-16 to present (~260 MB)
- **openairframes_adsb.csv**
Airframe information derived from ADS-B messages on the [ADSB.lol](https://www.adsb.lol/) network, from 2026-02-12 to present (will be from 2024-01-01 soon). The airframe information originates from [mictronics aircraft database](https://www.mictronics.de/aircraft-database/) (~5 MB).
- **ReleasableAircraft_{date}.zip** - **ReleasableAircraft_{date}.zip**
A daily snapshot of the FAA database, which updates at **05:30 UTC** A daily snapshot of the FAA database, which updates at **05:30 UTC**
+36
View File
@@ -0,0 +1,36 @@
TAP50Y lis lhr
EXS96WT man ibz
baw837 dbv lhr
exs6yr nce lba
tom1lx ncl ibz
exs62vc edi pmi
tom35j boj lgw
tom509 dlm lgw
afr902 cdg ndj nsi cdg
tom71a spc man
tom8ke man her
nsz3868 bll opo
exs95wl mah ncl
exs18rk stn reu
tom9db mah bhx
tom2bw reu bhx
kac113 kwi man
tom18e ibz gla
ocn8k snn fra
tfl365 ams cur bon ams
exs29y zth bhx
exs79cf olb man
asl508 beg yyz
tom4nw pmi man
exs3uq zth ema
exs23ml her man
gfa003 bah lhr
baw703 bjv lhr
tom2fb mme pmi
tom7el ibz lgw
tom7bd lba pmi
ual967 nap ewr
ein4ec dub cfu
tom78v lgw lca
eva067 tpe bkk lhr
ezy85xv nce lpl
+31
View File
@@ -0,0 +1,31 @@
efw979y klx lgw
ezy74wg ayt lgw
ezy95yg ibz sen
exs65lg kgs bhx
tom5ky her lgw
tom213 dlm man
jbu1990 sju ewr
exs68pv pmi stn
ice48p kef cdg
exs45ra man spu
klm741 ams bog ctg ams
exs42nu man olb
ein55g lys dub
baw538 lhr bds
uae74w lgw dxb
ely312 ltn tlv
tfl757 ams puj cur ams
wja41 lgw yhx
tom7pj reu man
ryr817l bzr stn
ein429 psa dub
exs3lf olb bhx
ezy38en lrh lgw
ezy85wd rmu man
apo7579 lgw los
tom13a mah man
baw2279 lgw yvr
exs406p gro edi
tom5jl pmi stn
ein42m dub vce
+30
View File
@@ -0,0 +1,30 @@
klm1045 ams bhx
dhk591 hkg del ema
sht22a lhr gci
etd75f auh lhr
tom92g pmi ema
tom767 nbe brs
qtr28u doh lhr
tom56m pmi ncl
aca883 nap yul
tsc691 ath yul
srr902 hgh nvi bhx bll
kac109 kwi lhr
cfe4ed ibz lcy
exs628 dbv ema
tom581 nbe ema
exs86j ema puy
exs67am skg lgw
tom37d kva bhx
tom9dy pmi bhx
qtr72b doh stn
exs52cj efl brs
ezy2816 pvk brs
tom2bk mah ncl
exs86pf jsi bhx
exs39yr jsi brs
exs17j mah lpl
qtr2c doh dub
cfe979 pmi lcy
sht21b gci lhr
exs12lf spu bhx
+32
View File
@@ -0,0 +1,32 @@
tom8ax kva man
tom6nk cfu man
gfa003 bah lhr
sxs7by adb dub
tom5gk ext kgs
tom62w efl brs
exs77j stn zth
tom4lw boh her
exs916 spu man
tom54y zth man
tom34g brs pfo
exs3th nap gla
exs9dw nap man
tom24m kgs lgw
tom748 ema sid
exs718d nte edi
exs53ru brs kgs
exs9eh pvk brs
etd71m lhr auh
exs29wk zth bhx
exs46qw kgs stn
wuk369 ltn pmi
tom5dc her brs
wuk9768 ltn jmk
tom5gl lba pmi
exs21dw bhx klx
tom5ka cwl lca
ein46p dub cta
tom73e efl stn
ely316 lhr tlv
efw26pp lgw mah
qtr47y lhr doh
+30
View File
@@ -0,0 +1,30 @@
kmm3118 mla lgw
baw539 bds lhr
tom29k zth brs
tom32x rho bhx
ezy71zj lgw pvk
baw536 lhr bds
ent429 lgw pvk
tom7cl brs cfu
qtr1f doh lhr
sxs5mq man ayt
klm767 ams aua bon ams
vlg5ml lcg lhr
exs92se puy ema
tom850 man nbe
exs5sq ncl pmi
apo7576 abv lhr
tom1an stn her
exs1kp gla pmi
ryr1794 ibz stn
kmm3119 lgw mla
isr116 ltn tlv
sht9f edi lhr
baw9cj lhr bru
ezy93wm brs pmo
ezy42eu ltn bsl
bbc201 dac zyl lhr
bbc202 lhr zyl dac
tom3lw rho lgw
sxs7fz ayt stn
exs71mf stn pmi
+33
View File
@@ -0,0 +1,33 @@
qtr33w doh lhr
tom86d cwl her
ezy49zc lgw bjv
cpa008 lhr cdg hkg
ely317 tlv lhr
tom4ej bhx cfu
tom93j gla ibz
sva117 jed lhr
ely313 tlv ltn
qtr67h lhr doh
box442 fra yyz ord
baw710c lhr lca
wuk784 pmi ltn
tom23m efl man
baw455 ibz lhr
baw595 olb lhr
baw621 peg lhr
azg394 bhx gyd
tom47x zth bhx
ezy45rl bsl ltn
tom10y ibz man
baw663 zth lhr
tom6en bhx pmi
efw74v kgs lgw
sva118 lhr jed
ezy38xg bod bhx
ein463 cta dub
baw537 bds lhr
exs689l pmi ncl
tom43j kgs ext
tom9gx pmi lgw
apo7577 lhr abv
ely318 lhr tlv
+33
View File
@@ -0,0 +1,33 @@
exs42m pmi stn
exs51nw ibz man
tom68h pfo brs
qtr61c doh lgg ord
wja51 lgw yyt
exs93pk gro ema
uae34y dxb man
cfe38z lcy fao
tom33j pmi ema
tom82k mah stn
tom8ya pmi bhx
exs1386 puy bhx
sva119 jed lhr
tom25a mah man
etd75f auh lhr
tom6ev mah bhx
efw16yk cag lgw
ajt8620 bru mia
uae9j dxb stn
tom429 nbe cwl
sxs9gg ayt bhx
baw947l spu lhr
tom7dm cfu ema
tom3nh skg brs
tom5hy ibz man
tom7hk pmi gla
tom9jw boj cwl
tom8be bud bhx
exs32y brs zth
tom7an spu man
tom84y pmi ncl
exs5qd efl lba
tom58h bhx zth
+38
View File
@@ -0,0 +1,38 @@
exs3uq zth ema
efw67a lgw ayt
tom657 nbe gla
tom7cd cfu gla
exs79ue pmi man
tom3lk nap ema
exs732 zth edi
cfe12g olb lcy
tom2xj jsi lgw
gfa003 bah lhr
gfa006 lhr bah
tfl4mh ams lpa
exs9dw nap man
tom6ym cwl cfu
cfe316 ibz lcy
qtr2c doh dub
exs48rz jsi man
afr018 cdg lax ppt
ewg8gj str lgw str
exs6yr nce lba
ely313 tlv ltn
ely317 tlv lhr
wuk13gw ltn tia
baw58xp mxp lhr
noz38w aes lgw
exs98dm ema fao
eju15uv lpl mxp
tom15x cfu man
ezy81qh ltn ibz
wuk784 pmi ltn
exs1898 zth brs
baw841 dbv lhr
sht6d lhr gla
tom2bg cfu cwl
exs45yk stn pmi
ezy36ep pmi lgw
tom9yg zth bhx
tom30w spu lgw
+31
View File
@@ -0,0 +1,31 @@
baw693 jtr lhr
baw58xp mxp lhr
uae9393 dwc lgg ord
exs91au ncl pmi
baw699w her lhr
cfe91g mah gla
vlg49uc lhr lcg
tom2nh pmi man
ezy56rd spu ltn
tom3fa reu bhx
eag8sb bhd sou
cfe31y pmi gla
cfe92y pmi edi
ibs18my lgw mad
exs1418 spu stn
exs41m vrn stn
tom2wt ibz nwi
baw661 efl lhr
wuk2818 zth ltn
eag9st sou bhd
tom2ga kgs brs
exs25db pmi edi
dhk812 bah lej ema
baw15 lhr sin syd
baw16 syd sin lhr
tom59a jtr man
exs45yk stn pmi
apo7577 lhr abv
tom6aw man pmi
baw675 pvk lhr
@@ -0,0 +1,40 @@
[
{
"contributor_name": "JohnSmith.com",
"contributor_uuid": "2981c3ee-8712-5f96-84bf-732eda515a3f",
"creation_timestamp": "2026-02-18T22:18:11.349009+00:00",
"registration_number": "ZM146",
"tags": {
"citation_0": "https://assets.publishing.service.gov.uk/media/5c07a65f40f0b6705f11cf37/10389.pdf",
"icao_aircraft_type": "L1J",
"manufacturer_icao": "LOCKHEED MARTIN",
"manufacturer_name": "Lockheed-martin",
"model": "F-35B Lightning II",
"operator": "Royal Air Force",
"operator_callsign": "RAFAIR",
"operator_icao": "RFR",
"serial_number": "BK-12",
"type_code": "VF35"
},
"transponder_code_hex": "43C81C"
},
{
"contributor_name": "JohnSmith.com",
"contributor_uuid": "2981c3ee-8712-5f96-84bf-732eda515a3f",
"creation_timestamp": "2026-02-18T22:18:11.349009+00:00",
"registration_number": "ZM148",
"tags": {
"citation_0": "https://assets.publishing.service.gov.uk/media/5c07a65f40f0b6705f11cf37/10389.pdf",
"icao_aircraft_type": "L1J",
"manufacturer_icao": "LOCKHEED MARTIN",
"manufacturer_name": "Lockheed-martin",
"model": "F-35B Lightning II",
"operator": "Royal Air Force",
"operator_callsign": "RAFAIR",
"operator_icao": "RFR",
"serial_number": "BK-14",
"type_code": "VF35"
},
"transponder_code_hex": "43C811"
}
]
Binary file not shown.

After

Width:  |  Height:  |  Size: 99 KiB

+32 -1
View File
@@ -54,7 +54,38 @@
"additionalProperties": { "additionalProperties": {
"$ref": "#/$defs/tagValue" "$ref": "#/$defs/tagValue"
}, },
"properties": {} "properties": {
"citation_0": {
"type": "string"
},
"icao_aircraft_type": {
"type": "string"
},
"manufacturer_icao": {
"type": "string"
},
"manufacturer_name": {
"type": "string"
},
"model": {
"type": "string"
},
"operator": {
"type": "string"
},
"operator_callsign": {
"type": "string"
},
"operator_icao": {
"type": "string"
},
"serial_number": {
"type": "string"
},
"type_code": {
"type": "string"
}
}
} }
}, },
"allOf": [ "allOf": [
+49
View File
@@ -0,0 +1,49 @@
#!/usr/bin/env python3
import re
from pathlib import Path
import polars as pl
# Find all CSV.gz files in the downloaded artifacts
artifacts_dir = Path("downloads/adsb_artifacts")
files = sorted(artifacts_dir.glob("*/openairframes_adsb_*.csv.gz"))
if not files:
raise SystemExit("No CSV.gz files found in downloads/adsb_artifacts/")
print(f"Found {len(files)} files to concatenate")
# Extract dates from filenames to determine range
def extract_dates(path: Path) -> tuple[str, str]:
"""Extract start and end dates from filename"""
m = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})\.csv\.gz", path.name)
if m:
return m.group(1), m.group(2)
return None, None
# Collect all dates
all_dates = []
for f in files:
start, end = extract_dates(f)
if start and end:
all_dates.extend([start, end])
print(f" {f.name}: {start} to {end}")
if not all_dates:
raise SystemExit("Could not extract dates from filenames")
# Find earliest and latest dates
earliest = min(all_dates)
latest = max(all_dates)
print(f"\nDate range: {earliest} to {latest}")
# Read and concatenate all files
print("\nReading and concatenating files...")
frames = [pl.read_csv(f) for f in files]
df = pl.concat(frames, how="vertical", rechunk=True)
# Write output
output_path = Path("downloads") / f"openairframes_adsb_{earliest}_{latest}.csv.gz"
output_path.parent.mkdir(parents=True, exist_ok=True)
df.write_csv(output_path, compression="gzip")
print(f"\nWrote {output_path} with {df.height:,} rows")
+40
View File
@@ -0,0 +1,40 @@
#!/bin/bash
# Create download directory
mkdir -p downloads/adsb_artifacts
# Repository from the workflow comment
REPO="ggman12/OpenAirframes"
# Get last 15 runs of the workflow and download matching artifacts
gh run list \
--repo "$REPO" \
--workflow adsb-to-aircraft-multiple-day-run.yaml \
--limit 15 \
--json databaseId \
--jq '.[].databaseId' | while read -r run_id; do
echo "Checking run ID: $run_id"
# List artifacts for this run using the API
# Match pattern: openairframes_adsb-YYYY-MM-DD-YYYY-MM-DD (with second date)
gh api \
--paginate \
"repos/$REPO/actions/runs/$run_id/artifacts" \
--jq '.artifacts[] | select(.name | test("^openairframes_adsb-[0-9]{4}-[0-9]{2}-[0-9]{2}-[0-9]{4}-[0-9]{2}-[0-9]{2}$")) | .name' | while read -r artifact_name; do
# Check if artifact directory already exists and has files
if [ -d "downloads/adsb_artifacts/$artifact_name" ] && [ -n "$(ls -A "downloads/adsb_artifacts/$artifact_name" 2>/dev/null)" ]; then
echo " Skipping (already exists): $artifact_name"
continue
fi
echo " Downloading: $artifact_name"
gh run download "$run_id" \
--repo "$REPO" \
--name "$artifact_name" \
--dir "downloads/adsb_artifacts/$artifact_name"
done
done
echo "Download complete! Files saved to downloads/adsb_artifacts/"
+182
View File
@@ -0,0 +1,182 @@
#!/usr/bin/env python3
"""
Download and concatenate artifacts from a specific set of workflow runs.
Usage:
python scripts/download_and_concat_runs.py triggered_runs_20260216_123456.json
"""
import argparse
import json
import os
import subprocess
import sys
from pathlib import Path
def download_run_artifact(run_id, output_dir):
"""Download artifact from a specific workflow run."""
print(f" Downloading artifacts from run {run_id}...")
cmd = [
'gh', 'run', 'download', str(run_id),
'--pattern', 'openairframes_adsb-*',
'--dir', output_dir
]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode == 0:
print(f" ✓ Downloaded")
return True
else:
if "no artifacts" in result.stderr.lower():
print(f" ⚠ No artifacts found (workflow may still be running)")
else:
print(f" ✗ Failed: {result.stderr}")
return False
def find_csv_files(download_dir):
"""Find all CSV.gz files in the download directory."""
csv_files = []
for root, dirs, files in os.walk(download_dir):
for file in files:
if file.endswith('.csv.gz'):
csv_files.append(os.path.join(root, file))
return sorted(csv_files)
def concatenate_csv_files(csv_files, output_file):
"""Concatenate CSV files in order, preserving headers."""
import gzip
print(f"\nConcatenating {len(csv_files)} CSV files...")
with gzip.open(output_file, 'wt') as outf:
header_written = False
for i, csv_file in enumerate(csv_files, 1):
print(f" [{i}/{len(csv_files)}] Processing {os.path.basename(csv_file)}")
with gzip.open(csv_file, 'rt') as inf:
lines = inf.readlines()
if not header_written:
# Write header from first file
outf.writelines(lines)
header_written = True
else:
# Skip header for subsequent files
outf.writelines(lines[1:])
print(f"\n✓ Concatenated CSV saved to: {output_file}")
# Show file size
size_mb = os.path.getsize(output_file) / (1024 * 1024)
print(f" Size: {size_mb:.1f} MB")
def main():
parser = argparse.ArgumentParser(
description='Download and concatenate artifacts from workflow runs'
)
parser.add_argument(
'runs_file',
help='JSON file containing run IDs (from run_historical_adsb_action.py)'
)
parser.add_argument(
'--output-dir',
default='./downloads/historical_concat',
help='Directory for downloads (default: ./downloads/historical_concat)'
)
parser.add_argument(
'--wait',
action='store_true',
help='Wait for workflows to complete before downloading'
)
args = parser.parse_args()
# Load run IDs
if not os.path.exists(args.runs_file):
print(f"Error: File not found: {args.runs_file}")
sys.exit(1)
with open(args.runs_file, 'r') as f:
data = json.load(f)
runs = data['runs']
start_date = data['start_date']
end_date = data['end_date']
print("=" * 60)
print("Download and Concatenate Historical Artifacts")
print("=" * 60)
print(f"Date range: {start_date} to {end_date}")
print(f"Workflow runs: {len(runs)}")
print(f"Output directory: {args.output_dir}")
print("=" * 60)
# Create output directory
os.makedirs(args.output_dir, exist_ok=True)
# Wait for workflows to complete if requested
if args.wait:
print("\nWaiting for workflows to complete...")
for run_info in runs:
run_id = run_info['run_id']
print(f" Checking run {run_id}...")
cmd = ['gh', 'run', 'watch', str(run_id)]
subprocess.run(cmd)
# Download artifacts
print("\nDownloading artifacts...")
successful_downloads = 0
for i, run_info in enumerate(runs, 1):
run_id = run_info['run_id']
print(f"\n[{i}/{len(runs)}] Run {run_id} ({run_info['start']} to {run_info['end']})")
if download_run_artifact(run_id, args.output_dir):
successful_downloads += 1
print(f"\n\nDownload Summary: {successful_downloads}/{len(runs)} artifacts downloaded")
if successful_downloads == 0:
print("\nNo artifacts downloaded. Workflows may still be running.")
print("Use --wait to wait for completion, or try again later.")
sys.exit(1)
# Find all CSV files
csv_files = find_csv_files(args.output_dir)
if not csv_files:
print("\nError: No CSV files found in download directory")
sys.exit(1)
print(f"\nFound {len(csv_files)} CSV file(s):")
for csv_file in csv_files:
print(f" - {os.path.basename(csv_file)}")
# Concatenate
# Calculate actual end date for filename (end_date - 1 day since it's exclusive)
from datetime import datetime, timedelta
end_dt = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=1)
actual_end = end_dt.strftime('%Y-%m-%d')
output_file = os.path.join(
args.output_dir,
f"openairframes_adsb_{start_date}_{actual_end}.csv.gz"
)
concatenate_csv_files(csv_files, output_file)
print("\n" + "=" * 60)
print("Done!")
print("=" * 60)
if __name__ == '__main__':
main()
+215
View File
@@ -0,0 +1,215 @@
#!/usr/bin/env python3
"""
Script to trigger adsb-to-aircraft-multiple-day-run workflow runs in monthly chunks.
Usage:
python scripts/run_historical_adsb_action.py --start-date 2025-01-01 --end-date 2025-06-01
"""
import argparse
import subprocess
import sys
from datetime import datetime, timedelta
from calendar import monthrange
def generate_monthly_chunks(start_date_str, end_date_str):
"""Generate date ranges in monthly chunks from start to end date.
End dates are exclusive (e.g., to process Jan 1-31, end_date should be Feb 1).
"""
start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
end_date = datetime.strptime(end_date_str, '%Y-%m-%d')
chunks = []
current = start_date
while current < end_date:
# Get the first day of the next month (exclusive end)
_, days_in_month = monthrange(current.year, current.month)
month_end = current.replace(day=days_in_month)
next_month_start = month_end + timedelta(days=1)
# Don't go past the global end date
chunk_end = min(next_month_start, end_date)
chunks.append({
'start': current.strftime('%Y-%m-%d'),
'end': chunk_end.strftime('%Y-%m-%d')
})
# Move to first day of next month
if next_month_start >= end_date:
break
current = next_month_start
return chunks
def trigger_workflow(start_date, end_date, repo='ggman12/OpenAirframes', branch='main', dry_run=False):
"""Trigger the adsb-to-aircraft-multiple-day-run workflow via GitHub CLI."""
cmd = [
'gh', 'workflow', 'run', 'adsb-to-aircraft-multiple-day-run.yaml',
'--repo', repo,
'--ref', branch,
'-f', f'start_date={start_date}',
'-f', f'end_date={end_date}'
]
if dry_run:
print(f"[DRY RUN] Would run: {' '.join(cmd)}")
return True, None
print(f"Triggering workflow: {start_date} to {end_date} (on {branch})")
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode == 0:
print(f"✓ Successfully triggered workflow for {start_date} to {end_date}")
# Get the run ID of the workflow we just triggered
# Wait a moment for it to appear
import time
time.sleep(2)
# Get the most recent run (should be the one we just triggered)
list_cmd = [
'gh', 'run', 'list',
'--repo', repo,
'--workflow', 'adsb-to-aircraft-multiple-day-run.yaml',
'--branch', branch,
'--limit', '1',
'--json', 'databaseId',
'--jq', '.[0].databaseId'
]
list_result = subprocess.run(list_cmd, capture_output=True, text=True)
run_id = list_result.stdout.strip() if list_result.returncode == 0 else None
return True, run_id
else:
print(f"✗ Failed to trigger workflow for {start_date} to {end_date}")
print(f"Error: {result.stderr}")
return False, None
def main():
parser = argparse.ArgumentParser(
description='Trigger adsb-to-aircraft-multiple-day-run workflow runs in monthly chunks'
)
parser.add_argument(
'--start-date', '--start_date',
dest='start_date',
required=True,
help='Start date in YYYY-MM-DD format (inclusive)'
)
parser.add_argument(
'--end-date', '--end_date',
dest='end_date',
required=True,
help='End date in YYYY-MM-DD format (exclusive)'
)
parser.add_argument(
'--repo',
type=str,
default='ggman12/OpenAirframes',
help='GitHub repository (default: ggman12/OpenAirframes)'
)
parser.add_argument(
'--branch',
type=str,
default='main',
help='Branch to run the workflow on (default: main)'
)
parser.add_argument(
'--dry-run',
action='store_true',
help='Print commands without executing them'
)
parser.add_argument(
'--delay',
type=int,
default=5,
help='Delay in seconds between workflow triggers (default: 5)'
)
args = parser.parse_args()
# Validate dates
try:
start = datetime.strptime(args.start_date, '%Y-%m-%d')
end = datetime.strptime(args.end_date, '%Y-%m-%d')
if start > end:
print("Error: start_date must be before or equal to end_date")
sys.exit(1)
except ValueError as e:
print(f"Error: Invalid date format - {e}")
sys.exit(1)
# Generate monthly chunks
chunks = generate_monthly_chunks(args.start_date, args.end_date)
print(f"\nGenerating {len(chunks)} monthly workflow runs on branch '{args.branch}' (repo: {args.repo}):")
for i, chunk in enumerate(chunks, 1):
print(f" {i}. {chunk['start']} to {chunk['end']}")
if not args.dry_run:
response = input(f"\nProceed with triggering {len(chunks)} workflows on '{args.branch}'? [y/N]: ")
if response.lower() != 'y':
print("Cancelled.")
sys.exit(0)
print()
# Trigger workflows
import time
success_count = 0
triggered_runs = []
for i, chunk in enumerate(chunks, 1):
print(f"\n[{i}/{len(chunks)}] ", end='')
success, run_id = trigger_workflow(
chunk['start'],
chunk['end'],
repo=args.repo,
branch=args.branch,
dry_run=args.dry_run
)
if success:
success_count += 1
if run_id:
triggered_runs.append({
'run_id': run_id,
'start': chunk['start'],
'end': chunk['end']
})
# Add delay between triggers (except for last one)
if i < len(chunks) and not args.dry_run:
time.sleep(args.delay)
print(f"\n\nSummary: {success_count}/{len(chunks)} workflows triggered successfully")
# Save triggered run IDs to a file
if triggered_runs and not args.dry_run:
import json
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
runs_file = f"./output/triggered_runs_{timestamp}.json"
with open(runs_file, 'w') as f:
json.dump({
'start_date': args.start_date,
'end_date': args.end_date,
'repo': args.repo,
'branch': args.branch,
'runs': triggered_runs
}, f, indent=2)
print(f"\nRun IDs saved to: {runs_file}")
print(f"\nTo download and concatenate these artifacts, run:")
print(f" python scripts/download_and_concat_runs.py {runs_file}")
if success_count < len(chunks):
sys.exit(1)
if __name__ == '__main__':
main()
+82
View File
@@ -0,0 +1,82 @@
#!/usr/bin/env python3
"""
Run src.adsb.main in an isolated git worktree so edits in the main
working tree won't affect subprocess imports during the run.
Usage:
python scripts/run_main_isolated.py 2026-01-01
python scripts/run_main_isolated.py --start_date 2026-01-01 --end_date 2026-01-03
"""
import argparse
import os
import shutil
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
def run(
cmd: list[str],
*,
cwd: Path | None = None,
check: bool = True,
) -> subprocess.CompletedProcess:
print(f"\n>>> {' '.join(cmd)}")
return subprocess.run(cmd, cwd=cwd, check=check)
def main() -> int:
parser = argparse.ArgumentParser(description="Run src.adsb.main in an isolated worktree")
parser.add_argument("date", nargs="?", help="Single date to process (YYYY-MM-DD)")
parser.add_argument("--start_date", help="Start date (inclusive, YYYY-MM-DD)")
parser.add_argument("--end_date", help="End date (exclusive, YYYY-MM-DD)")
parser.add_argument("--concat_with_latest_csv", action="store_true", help="Also concatenate with latest CSV from GitHub releases")
args = parser.parse_args()
if args.date and (args.start_date or args.end_date):
raise SystemExit("Use a single date or --start_date/--end_date, not both.")
if args.date:
datetime.strptime(args.date, "%Y-%m-%d")
main_args = ["--date", args.date]
else:
if not args.start_date or not args.end_date:
raise SystemExit("Provide --start_date and --end_date, or a single date.")
datetime.strptime(args.start_date, "%Y-%m-%d")
datetime.strptime(args.end_date, "%Y-%m-%d")
main_args = ["--start_date", args.start_date, "--end_date", args.end_date]
if args.concat_with_latest_csv:
main_args.append("--concat_with_latest_csv")
repo_root = Path(__file__).resolve().parents[1]
snapshots_root = repo_root / ".snapshots"
snapshots_root.mkdir(exist_ok=True)
timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
snapshot_root = snapshots_root / f"run_{timestamp}"
snapshot_src = snapshot_root / "src"
exit_code = 0
try:
shutil.copytree(repo_root / "src", snapshot_src)
runner = (
"import sys, runpy; "
f"sys.path.insert(0, {repr(str(snapshot_root))}); "
f"sys.argv = ['src.adsb.main'] + {main_args!r}; "
"runpy.run_module('src.adsb.main', run_name='__main__')"
)
cmd = [sys.executable, "-c", runner]
run(cmd, cwd=repo_root)
except subprocess.CalledProcessError as exc:
exit_code = exc.returncode
finally:
shutil.rmtree(snapshot_root, ignore_errors=True)
return exit_code
if __name__ == "__main__":
raise SystemExit(main())
+242
View File
@@ -0,0 +1,242 @@
#!/usr/bin/env python3
"""
Parse TheAirTraffic Database CSV and produce community_submission.v1 JSON.
Source: "TheAirTraffic Database - Aircraft 2.csv"
Output: community/YYYY-MM-DD/theairtraffic_<date>_<hash>.json
Categories in the spreadsheet columns (paired: name, registrations, separator):
Col 1-3: Business
Col 4-6: Government
Col 7-9: People
Col 10-12: Sports
Col 13-15: Celebrity
Col 16-18: State Govt./Law
Col 19-21: Other
Col 22-24: Test Aircraft
Col 25-27: YouTubers
Col 28-30: Formula 1 VIP's
Col 31-33: Active GII's and GIII's (test/demo aircraft)
Col 34-37: Russia & Ukraine (extra col for old/new)
Col 38-40: Helicopters & Blimps
Col 41-43: Unique Reg's
Col 44-46: Saudi & UAE
Col 47-49: Schools
Col 50-52: Special Charter
Col 53-55: Unknown Owners
Col 56-59: Frequent Flyers (extra cols: name, aircraft, logged, hours)
"""
import csv
import json
import hashlib
import re
import sys
import uuid
from datetime import datetime, timezone
from pathlib import Path
# ── Category mapping ────────────────────────────────────────────────────────
# Each entry: (name_col, reg_col, owner_category_tags)
# owner_category_tags is a dict of tag keys to add beyond "owner"
CATEGORY_COLUMNS = [
# (name_col, reg_col, {tag_key: tag_value, ...})
(1, 2, {"owner_category_0": "business"}),
(4, 5, {"owner_category_0": "government"}),
(7, 8, {"owner_category_0": "celebrity"}),
(10, 11, {"owner_category_0": "sports"}),
(13, 14, {"owner_category_0": "celebrity"}),
(16, 17, {"owner_category_0": "government", "owner_category_1": "law_enforcement"}),
(19, 20, {"owner_category_0": "other"}),
(22, 23, {"owner_category_0": "test_aircraft"}),
(25, 26, {"owner_category_0": "youtuber", "owner_category_1": "celebrity"}),
(28, 29, {"owner_category_0": "celebrity", "owner_category_1": "motorsport"}),
(31, 32, {"owner_category_0": "test_aircraft"}),
# Russia & Ukraine: col 34=name, col 35 or 36 may have reg
(34, 35, {"owner_category_0": "russia_ukraine"}),
(38, 39, {"owner_category_0": "celebrity", "category": "helicopter_or_blimp"}),
(41, 42, {"owner_category_0": "other"}),
(44, 45, {"owner_category_0": "government", "owner_category_1": "royal_family"}),
(47, 48, {"owner_category_0": "education"}),
(50, 51, {"owner_category_0": "charter"}),
(53, 54, {"owner_category_0": "unknown"}),
(56, 57, {"owner_category_0": "celebrity"}), # Frequent Flyers name col, aircraft col
]
# First data row index (0-based) in the CSV
DATA_START_ROW = 4
# ── Contributor info ────────────────────────────────────────────────────────
CONTRIBUTOR_NAME = "TheAirTraffic"
# Deterministic UUID v5 from contributor name
CONTRIBUTOR_UUID = str(uuid.uuid5(uuid.NAMESPACE_URL, "https://theairtraffic.com"))
# Citation
CITATION = "https://docs.google.com/spreadsheets/d/1JHhfJBnJPNBA6TgiSHjkXFkHBdVTTz_nXxaUDRWcHpk"
def looks_like_military_serial(reg: str) -> bool:
"""
Detect military-style serials like 92-9000, 82-8000, 98-0001
or pure numeric IDs like 929000, 828000, 980001.
These aren't standard civil registrations; use openairframes_id.
"""
# Pattern: NN-NNNN
if re.match(r'^\d{2}-\d{4}$', reg):
return True
# Pure 6-digit numbers (likely ICAO hex or military mode-S)
if re.match(r'^\d{6}$', reg):
return True
# Short numeric-only (1-5 digits) like "01", "02", "676"
if re.match(r'^\d{1,5}$', reg):
return True
return False
def normalize_reg(raw: str) -> str:
"""Clean up a registration string."""
reg = raw.strip().rstrip(',').strip()
# Remove carriage returns and other whitespace
reg = reg.replace('\r', '').replace('\n', '').strip()
return reg
def parse_regs(cell_value: str) -> list[str]:
"""
Parse a cell that may contain one or many registrations,
separated by commas, possibly wrapped in quotes.
"""
if not cell_value or not cell_value.strip():
return []
# Some cells have ADS-B exchange URLs skip those
if 'globe.adsbexchange.com' in cell_value:
return []
if cell_value.strip() in ('.', ',', ''):
return []
results = []
# Split on comma
parts = cell_value.split(',')
for part in parts:
reg = normalize_reg(part)
if not reg:
continue
# Skip URLs, section labels, etc.
if reg.startswith('http') or reg.startswith('Link') or reg == 'Section 1':
continue
# Skip if it's just whitespace or dots
if reg in ('.', '..', '...'):
continue
results.append(reg)
return results
def make_submission(
reg: str,
owner: str,
category_tags: dict[str, str],
) -> dict:
"""Build a single community_submission.v1 object."""
entry: dict = {}
# Decide identifier field
if looks_like_military_serial(reg):
entry["openairframes_id"] = reg
else:
entry["registration_number"] = reg
# Tags
tags: dict = {
"citation_0": CITATION,
}
if owner:
tags["owner"] = owner.strip()
tags.update(category_tags)
entry["tags"] = tags
return entry
def main():
csv_path = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(
"/Users/jonahgoode/Downloads/TheAirTraffic Database - Aircraft 2.csv"
)
if not csv_path.exists():
print(f"ERROR: CSV not found at {csv_path}", file=sys.stderr)
sys.exit(1)
# Read CSV
with open(csv_path, 'r', encoding='utf-8-sig') as f:
reader = csv.reader(f)
rows = list(reader)
print(f"Read {len(rows)} rows from {csv_path.name}")
date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
submissions: list[dict] = []
seen: set[tuple] = set() # (reg, owner) dedup
for row_idx in range(DATA_START_ROW, len(rows)):
row = rows[row_idx]
if len(row) < 3:
continue
for name_col, reg_col, cat_tags in CATEGORY_COLUMNS:
if reg_col >= len(row) or name_col >= len(row):
continue
owner_raw = row[name_col].strip().rstrip(',').strip()
reg_raw = row[reg_col]
# Clean owner name
owner = owner_raw.replace('\r', '').replace('\n', '').strip()
if not owner or owner in ('.', ',', 'Section 1'):
continue
# Skip header-like values
if owner.startswith('http') or owner.startswith('Link '):
continue
regs = parse_regs(reg_raw)
if not regs:
# For Russia & Ukraine, try the next column too (col 35 might have old reg, col 36 new)
if name_col == 34 and reg_col + 1 < len(row):
regs = parse_regs(row[reg_col + 1])
for reg in regs:
key = (reg, owner)
if key in seen:
continue
seen.add(key)
submissions.append(make_submission(reg, owner, cat_tags))
print(f"Generated {len(submissions)} submissions")
# Write output
proj_root = Path(__file__).resolve().parent.parent
out_dir = proj_root / "community" / date_str
out_dir.mkdir(parents=True, exist_ok=True)
out_file = out_dir / f"theairtraffic_{date_str}.json"
with open(out_file, 'w', encoding='utf-8') as f:
json.dump(submissions, f, indent=2, ensure_ascii=False)
print(f"Written to {out_file}")
print(f"Sample entry:\n{json.dumps(submissions[0], indent=2)}")
# Quick stats
cats = {}
for s in submissions:
c = s['tags'].get('owner_category_0', 'NONE')
cats[c] = cats.get(c, 0) + 1
print("\nCategory breakdown:")
for c, n in sorted(cats.items(), key=lambda x: -x[1]):
print(f" {c}: {n}")
if __name__ == "__main__":
main()
+69
View File
@@ -0,0 +1,69 @@
#!/usr/bin/env python3
"""Validate the generated theairtraffic JSON output."""
import json
import glob
import sys
# Find the latest output
files = sorted(glob.glob("community/2026-02-*/theairtraffic_*.json"))
if not files:
print("No output files found!")
sys.exit(1)
path = files[-1]
print(f"Validating: {path}")
with open(path) as f:
data = json.load(f)
print(f"Total entries: {len(data)}")
# Check military serial handling
mil = [d for d in data if "openairframes_id" in d]
print(f"\nEntries using openairframes_id: {len(mil)}")
for m in mil[:10]:
print(f" {m['openairframes_id']} -> owner: {m['tags'].get('owner','?')}")
# Check youtuber entries
yt = [d for d in data if d["tags"].get("owner_category_0") == "youtuber"]
print(f"\nYouTuber entries: {len(yt)}")
for y in yt[:5]:
reg = y.get("registration_number", y.get("openairframes_id"))
c0 = y["tags"].get("owner_category_0")
c1 = y["tags"].get("owner_category_1")
print(f" {reg} -> owner: {y['tags']['owner']}, cat0: {c0}, cat1: {c1}")
# Check US Govt / military
gov = [d for d in data if d["tags"].get("owner") == "United States of America 747/757"]
print(f"\nUSA 747/757 entries: {len(gov)}")
for g in gov:
oid = g.get("openairframes_id", g.get("registration_number"))
print(f" {oid}")
# Schema validation
issues = 0
for i, d in enumerate(data):
has_id = any(k in d for k in ["registration_number", "transponder_code_hex", "openairframes_id"])
if not has_id:
print(f" Entry {i}: no identifier!")
issues += 1
if "tags" not in d:
print(f" Entry {i}: no tags!")
issues += 1
# Check tag key format
for k in d.get("tags", {}):
import re
if not re.match(r"^[a-z][a-z0-9_]{0,63}$", k):
print(f" Entry {i}: invalid tag key '{k}'")
issues += 1
print(f"\nSchema issues: {issues}")
# Category breakdown
cats = {}
for s in data:
c = s["tags"].get("owner_category_0", "NONE")
cats[c] = cats.get(c, 0) + 1
print("\nCategory breakdown:")
for c, n in sorted(cats.items(), key=lambda x: -x[1]):
print(f" {c}: {n}")
-11
View File
@@ -1,11 +0,0 @@
FROM --platform=linux/arm64 python:3.12-slim
WORKDIR /app
COPY requirements.reducer.txt requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
COPY compress_adsb_to_aircraft_data.py .
COPY reducer.py .
CMD ["python", "-u", "reducer.py"]
-12
View File
@@ -1,12 +0,0 @@
FROM --platform=linux/arm64 python:3.12-slim
WORKDIR /app
COPY requirements.worker.txt requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
COPY compress_adsb_to_aircraft_data.py .
COPY download_adsb_data_to_parquet.py .
COPY worker.py .
CMD ["python", "-u", "worker.py"]
-250
View File
@@ -1,250 +0,0 @@
"""
Combines chunk parquet files and compresses to final aircraft CSV.
This is the reduce phase of the map-reduce pipeline.
Supports both single-day (daily) and multi-day (historical) modes.
Memory-efficient: processes each chunk separately, compresses, then combines.
Usage:
# Daily mode
python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks
# Historical mode
python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks --start-date 2024-01-01 --end-date 2024-01-07 --skip-base
"""
import gc
import os
import sys
import glob
import argparse
from datetime import datetime, timedelta
import polars as pl
from src.adsb.download_adsb_data_to_parquet import OUTPUT_DIR, get_resource_usage
from src.adsb.compress_adsb_to_aircraft_data import compress_multi_icao_df, COLUMNS
DEFAULT_CHUNK_DIR = os.path.join(OUTPUT_DIR, "adsb_chunks")
FINAL_OUTPUT_DIR = "./data/openairframes"
os.makedirs(FINAL_OUTPUT_DIR, exist_ok=True)
def get_target_day() -> datetime:
"""Get yesterday's date (the day we're processing)."""
return datetime.utcnow() - timedelta(days=1)
def process_single_chunk(chunk_path: str, delete_after_load: bool = False) -> pl.DataFrame:
"""Load and compress a single chunk parquet file.
Args:
chunk_path: Path to parquet file
delete_after_load: If True, delete the parquet file after loading to free disk space
"""
print(f"Processing {os.path.basename(chunk_path)}... | {get_resource_usage()}")
# Load chunk - only columns we need
needed_columns = ['time', 'icao'] + COLUMNS
df = pl.read_parquet(chunk_path, columns=needed_columns)
print(f" Loaded {len(df)} rows")
# Delete file immediately after loading to free disk space
if delete_after_load:
try:
os.remove(chunk_path)
print(f" Deleted {chunk_path} to free disk space")
except Exception as e:
print(f" Warning: Failed to delete {chunk_path}: {e}")
# Compress to aircraft records (one per ICAO) using shared function
compressed = compress_multi_icao_df(df, verbose=True)
print(f" Compressed to {len(compressed)} aircraft records")
del df
gc.collect()
return compressed
def combine_compressed_chunks(compressed_dfs: list[pl.DataFrame]) -> pl.DataFrame:
"""Combine multiple compressed DataFrames.
Since chunks are partitioned by ICAO hash, each ICAO only appears in one chunk.
No deduplication needed here - just concatenate.
"""
print(f"Combining {len(compressed_dfs)} compressed chunks... | {get_resource_usage()}")
# Concat all
combined = pl.concat(compressed_dfs)
print(f"Combined: {len(combined)} records")
return combined
def download_and_merge_base_release(compressed_df: pl.DataFrame) -> pl.DataFrame:
"""Download base release and merge with new data."""
from src.get_latest_release import download_latest_aircraft_adsb_csv
print("Downloading base ADS-B release...")
try:
base_path = download_latest_aircraft_adsb_csv(
output_dir="./data/openairframes_base"
)
print(f"Download returned: {base_path}")
if base_path and os.path.exists(str(base_path)):
print(f"Loading base release from {base_path}")
base_df = pl.read_csv(base_path)
print(f"Base release has {len(base_df)} records")
# Ensure columns match
base_cols = set(base_df.columns)
new_cols = set(compressed_df.columns)
print(f"Base columns: {sorted(base_cols)}")
print(f"New columns: {sorted(new_cols)}")
# Add missing columns
for col in new_cols - base_cols:
base_df = base_df.with_columns(pl.lit(None).alias(col))
for col in base_cols - new_cols:
compressed_df = compressed_df.with_columns(pl.lit(None).alias(col))
# Reorder columns to match
compressed_df = compressed_df.select(base_df.columns)
# Concat and deduplicate by icao (keep new data - it comes last)
combined = pl.concat([base_df, compressed_df])
print(f"After concat: {len(combined)} records")
deduplicated = combined.unique(subset=["icao"], keep="last")
print(f"Combined with base: {len(combined)} -> {len(deduplicated)} after dedup")
del base_df, combined
gc.collect()
return deduplicated
else:
print(f"No base release found at {base_path}, using only new data")
return compressed_df
except Exception as e:
import traceback
print(f"Failed to download base release: {e}")
traceback.print_exc()
return compressed_df
def cleanup_chunks(output_id: str, chunks_dir: str):
"""Delete chunk parquet files after successful merge."""
pattern = os.path.join(chunks_dir, f"chunk_*_{output_id}.parquet")
chunk_files = glob.glob(pattern)
for f in chunk_files:
try:
os.remove(f)
print(f"Deleted {f}")
except Exception as e:
print(f"Failed to delete {f}: {e}")
def find_chunk_files(chunks_dir: str, output_id: str) -> list[str]:
"""Find chunk parquet files matching the output ID."""
pattern = os.path.join(chunks_dir, f"chunk_*_{output_id}.parquet")
chunk_files = sorted(glob.glob(pattern))
if not chunk_files:
# Try recursive search for historical mode with merged artifacts
pattern = os.path.join(chunks_dir, "**", "*.parquet")
chunk_files = sorted(glob.glob(pattern, recursive=True))
return chunk_files
def main():
parser = argparse.ArgumentParser(description="Combine chunk parquets to final CSV")
parser.add_argument("--date", type=str, help="Single date in YYYY-MM-DD format (default: yesterday)")
parser.add_argument("--start-date", type=str, help="Start date for range (YYYY-MM-DD)")
parser.add_argument("--end-date", type=str, help="End date for range (YYYY-MM-DD)")
parser.add_argument("--chunks-dir", type=str, default=DEFAULT_CHUNK_DIR, help="Directory containing chunk parquet files")
parser.add_argument("--skip-base", action="store_true", help="Skip downloading and merging base release")
parser.add_argument("--keep-chunks", action="store_true", help="Keep chunk files after merging")
parser.add_argument("--stream", action="store_true", help="Delete parquet files immediately after loading to save disk space")
args = parser.parse_args()
# Determine output ID and filename based on mode
if args.start_date and args.end_date:
# Historical mode
output_id = f"{args.start_date}_{args.end_date}"
output_filename = f"openairframes_adsb_{args.start_date}_{args.end_date}.csv"
print(f"Combining chunks for date range: {args.start_date} to {args.end_date}")
else:
# Daily mode - use same date for start and end
if args.date:
target_day = datetime.strptime(args.date, "%Y-%m-%d")
else:
target_day = get_target_day()
date_str = target_day.strftime("%Y-%m-%d")
output_id = date_str
output_filename = f"openairframes_adsb_{date_str}_{date_str}.csv"
print(f"Combining chunks for {date_str}")
chunks_dir = args.chunks_dir
print(f"Chunks directory: {chunks_dir}")
print(f"Resource usage at start: {get_resource_usage()}")
# Find chunk files
chunk_files = find_chunk_files(chunks_dir, output_id)
if not chunk_files:
print(f"No chunk files found in: {chunks_dir}")
sys.exit(1)
print(f"Found {len(chunk_files)} chunk files")
# Process each chunk separately to save memory
# With --stream, delete parquet files immediately after loading to save disk space
compressed_chunks = []
for chunk_path in chunk_files:
compressed = process_single_chunk(chunk_path, delete_after_load=args.stream)
compressed_chunks.append(compressed)
gc.collect()
# Combine all compressed chunks
combined = combine_compressed_chunks(compressed_chunks)
# Free memory from individual chunks
del compressed_chunks
gc.collect()
print(f"After combining: {get_resource_usage()}")
# Merge with base release (unless skipped)
if not args.skip_base:
combined = download_and_merge_base_release(combined)
# Convert list columns to strings for CSV compatibility
for col in combined.columns:
if combined[col].dtype == pl.List:
combined = combined.with_columns(
pl.col(col).list.join(",").alias(col)
)
# Sort by time for consistent output
if 'time' in combined.columns:
combined = combined.sort('time')
# Write final CSV
output_path = os.path.join(FINAL_OUTPUT_DIR, output_filename)
combined.write_csv(output_path)
print(f"Wrote {len(combined)} records to {output_path}")
# Cleanup
if not args.keep_chunks:
cleanup_chunks(output_id, chunks_dir)
print(f"Done! | {get_resource_usage()}")
if __name__ == "__main__":
main()
+38 -115
View File
@@ -5,23 +5,6 @@ import polars as pl
COLUMNS = ['dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category', 'r', 't'] COLUMNS = ['dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category', 'r', 't']
def deduplicate_by_signature(df: pl.DataFrame) -> pl.DataFrame:
"""For each icao, keep only the earliest row with each unique signature.
This is used for deduplicating across multiple compressed chunks.
"""
# Create signature column
df = df.with_columns(
pl.concat_str([pl.col(c).cast(pl.Utf8).fill_null("") for c in COLUMNS], separator="|").alias("_signature")
)
# Group by icao and signature, take first row (earliest due to time sort)
df = df.sort("time")
df_deduped = df.group_by(["icao", "_signature"]).first()
df_deduped = df_deduped.drop("_signature")
df_deduped = df_deduped.sort("time")
return df_deduped
def compress_df_polars(df: pl.DataFrame, icao: str) -> pl.DataFrame: def compress_df_polars(df: pl.DataFrame, icao: str) -> pl.DataFrame:
"""Compress a single ICAO group to its most informative row using Polars.""" """Compress a single ICAO group to its most informative row using Polars."""
# Create signature string # Create signature string
@@ -99,9 +82,6 @@ def compress_df_polars(df: pl.DataFrame, icao: str) -> pl.DataFrame:
def compress_multi_icao_df(df: pl.DataFrame, verbose: bool = True) -> pl.DataFrame: def compress_multi_icao_df(df: pl.DataFrame, verbose: bool = True) -> pl.DataFrame:
"""Compress a DataFrame with multiple ICAOs to one row per ICAO. """Compress a DataFrame with multiple ICAOs to one row per ICAO.
This is the main entry point for compressing ADS-B data.
Used by both daily GitHub Actions runs and historical AWS runs.
Args: Args:
df: DataFrame with columns ['time', 'icao'] + COLUMNS df: DataFrame with columns ['time', 'icao'] + COLUMNS
verbose: Whether to print progress verbose: Whether to print progress
@@ -120,29 +100,27 @@ def compress_multi_icao_df(df: pl.DataFrame, verbose: bool = True) -> pl.DataFra
if col in df.columns: if col in df.columns:
df = df.with_columns(pl.col(col).cast(pl.Utf8).fill_null("")) df = df.with_columns(pl.col(col).cast(pl.Utf8).fill_null(""))
# First pass: quick deduplication of exact duplicates # Quick deduplication of exact duplicates
df = df.unique(subset=['icao'] + COLUMNS, keep='first') df = df.unique(subset=['icao'] + COLUMNS, keep='first')
if verbose: if verbose:
print(f"After quick dedup: {df.height} records") print(f"After quick dedup: {df.height} records")
# Second pass: sophisticated compression per ICAO # Compress per ICAO
if verbose: if verbose:
print("Compressing per ICAO...") print("Compressing per ICAO...")
# Process each ICAO group
icao_groups = df.partition_by('icao', as_dict=True, maintain_order=True) icao_groups = df.partition_by('icao', as_dict=True, maintain_order=True)
compressed_dfs = [] compressed_dfs = []
for icao_key, group_df in icao_groups.items(): for icao_key, group_df in icao_groups.items():
# partition_by with as_dict=True returns tuple keys, extract first element icao = icao_key[0]
icao = icao_key[0] if isinstance(icao_key, tuple) else icao_key
compressed = compress_df_polars(group_df, str(icao)) compressed = compress_df_polars(group_df, str(icao))
compressed_dfs.append(compressed) compressed_dfs.append(compressed)
if compressed_dfs: if compressed_dfs:
df_compressed = pl.concat(compressed_dfs) df_compressed = pl.concat(compressed_dfs)
else: else:
df_compressed = df.head(0) # Empty with same schema df_compressed = df.head(0)
if verbose: if verbose:
print(f"After compress: {df_compressed.height} records") print(f"After compress: {df_compressed.height} records")
@@ -155,45 +133,22 @@ def compress_multi_icao_df(df: pl.DataFrame, verbose: bool = True) -> pl.DataFra
return df_compressed return df_compressed
def load_raw_adsb_for_day(day): def load_parquet_part(part_id: int, date: str) -> pl.DataFrame:
"""Load raw ADS-B data for a day from parquet file.""" """Load a single parquet part file for a date.
from datetime import timedelta
Args:
part_id: Part ID (e.g., 1, 2, 3)
date: Date string in YYYY-MM-DD format
Returns:
DataFrame with ADS-B data
"""
from pathlib import Path from pathlib import Path
start_time = day.replace(hour=0, minute=0, second=0, microsecond=0) parquet_file = Path(f"data/output/parquet_output/part_{part_id}_{date}.parquet")
# Check for parquet file first
version_date = f"v{start_time.strftime('%Y.%m.%d')}"
parquet_file = Path(f"data/output/parquet_output/{version_date}.parquet")
if not parquet_file.exists(): if not parquet_file.exists():
# Try to generate parquet file by calling the download function print(f"Parquet file not found: {parquet_file}")
print(f" Parquet file not found: {parquet_file}")
print(f" Attempting to download and generate parquet for {start_time.strftime('%Y-%m-%d')}...")
from download_adsb_data_to_parquet import create_parquet_for_day
result_path = create_parquet_for_day(start_time, keep_folders=False)
if result_path:
print(f" Successfully generated parquet file: {result_path}")
else:
raise Exception("Failed to generate parquet file")
if parquet_file.exists():
print(f" Loading from parquet: {parquet_file}")
df = pl.read_parquet(
parquet_file,
columns=['time', 'icao', 'r', 't', 'dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category']
)
# Convert to timezone-naive datetime
if df["time"].dtype == pl.Datetime:
df = df.with_columns(pl.col("time").dt.replace_time_zone(None))
return df
else:
# Return empty DataFrame if parquet file doesn't exist
print(f" No data available for {start_time.strftime('%Y-%m-%d')}")
return pl.DataFrame(schema={ return pl.DataFrame(schema={
'time': pl.Datetime, 'time': pl.Datetime,
'icao': pl.Utf8, 'icao': pl.Utf8,
@@ -205,17 +160,33 @@ def load_raw_adsb_for_day(day):
'desc': pl.Utf8, 'desc': pl.Utf8,
'aircraft_category': pl.Utf8 'aircraft_category': pl.Utf8
}) })
print(f"Loading from parquet: {parquet_file}")
df = pl.read_parquet(
parquet_file,
columns=['time', 'icao', 'r', 't', 'dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category']
)
# Convert to timezone-naive datetime
if df["time"].dtype == pl.Datetime:
df = df.with_columns(pl.col("time").dt.replace_time_zone(None))
os.remove(parquet_file)
return df
def load_historical_for_day(day): def compress_parquet_part(part_id: int, date: str) -> pl.DataFrame:
"""Load and compress historical ADS-B data for a day.""" """Load and compress a single parquet part file."""
df = load_raw_adsb_for_day(day) df = load_parquet_part(part_id, date)
if df.height == 0: if df.height == 0:
return df return df
# Filter to rows within the given date (UTC-naive). This is because sometimes adsb.lol export can have rows at 00:00:00 of next day or similar.
date_lit = pl.lit(date).str.strptime(pl.Date, "%Y-%m-%d")
df = df.filter(pl.col("time").dt.date() == date_lit)
print(f"Loaded {df.height} raw records for {day.strftime('%Y-%m-%d')}") print(f"Loaded {df.height} raw records for part {part_id}, date {date}")
# Use shared compression function
return compress_multi_icao_df(df, verbose=True) return compress_multi_icao_df(df, verbose=True)
@@ -223,52 +194,4 @@ def concat_compressed_dfs(df_base, df_new):
"""Concatenate base and new compressed dataframes, keeping the most informative row per ICAO.""" """Concatenate base and new compressed dataframes, keeping the most informative row per ICAO."""
# Combine both dataframes # Combine both dataframes
df_combined = pl.concat([df_base, df_new]) df_combined = pl.concat([df_base, df_new])
return df_combined
# Sort by ICAO and time
df_combined = df_combined.sort(['icao', 'time'])
# Fill null values
for col in COLUMNS:
if col in df_combined.columns:
df_combined = df_combined.with_columns(pl.col(col).fill_null(""))
# Apply compression logic per ICAO to get the best row
icao_groups = df_combined.partition_by('icao', as_dict=True, maintain_order=True)
compressed_dfs = []
for icao, group_df in icao_groups.items():
compressed = compress_df_polars(group_df, icao)
compressed_dfs.append(compressed)
if compressed_dfs:
df_compressed = pl.concat(compressed_dfs)
else:
df_compressed = df_combined.head(0)
# Sort by time
df_compressed = df_compressed.sort('time')
return df_compressed
def get_latest_aircraft_adsb_csv_df():
"""Download and load the latest ADS-B CSV from GitHub releases."""
from get_latest_release import download_latest_aircraft_adsb_csv
import re
csv_path = download_latest_aircraft_adsb_csv()
df = pl.read_csv(csv_path, null_values=[""])
# Fill nulls with empty strings
for col in df.columns:
if df[col].dtype == pl.Utf8:
df = df.with_columns(pl.col(col).fill_null(""))
# Extract start date from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv
match = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path))
if not match:
raise ValueError(f"Could not extract date from filename: {csv_path.name}")
date_str = match.group(1)
return df, date_str
+67
View File
@@ -0,0 +1,67 @@
from pathlib import Path
import polars as pl
import argparse
import os
OUTPUT_DIR = Path("./data/output")
CORRECT_ORDER_OF_COLUMNS = ["time", "icao", "r", "t", "dbFlags", "ownOp", "year", "desc", "aircraft_category"]
def main():
parser = argparse.ArgumentParser(description="Concatenate compressed parquet files for a single day")
parser.add_argument("--date", type=str, required=True, help="Date in YYYY-MM-DD format")
parser.add_argument("--concat_with_latest_csv", action="store_true", help="Whether to also concatenate with the latest CSV from GitHub releases")
args = parser.parse_args()
compressed_dir = OUTPUT_DIR / "compressed"
date_dir = compressed_dir / args.date
parquet_files = sorted(date_dir.glob("*.parquet"))
df = None
if parquet_files: # TODO: This logic could be updated slightly.
print(f"No parquet files found in {date_dir}")
frames = [pl.read_parquet(p) for p in parquet_files]
df = pl.concat(frames, how="vertical", rechunk=True)
df = df.sort(["time", "icao"])
df = df.select(CORRECT_ORDER_OF_COLUMNS)
output_path = OUTPUT_DIR / f"openairframes_adsb_{args.date}.parquet"
print(f"Writing combined parquet to {output_path} with {df.height} rows")
df.write_parquet(output_path)
csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{args.date}.csv.gz"
print(f"Writing combined csv.gz to {csv_output_path} with {df.height} rows")
df.write_csv(csv_output_path, compression="gzip")
if args.concat_with_latest_csv:
print("Loading latest CSV from GitHub releases to concatenate with...")
from src.get_latest_release import get_latest_aircraft_adsb_csv_df
from datetime import datetime
df_latest_csv, csv_start_date, csv_end_date = get_latest_aircraft_adsb_csv_df()
# Compare dates: end_date is exclusive, so if csv_end_date > args.date,
# the latest CSV already includes this day's data
csv_end_dt = datetime.strptime(csv_end_date, "%Y-%m-%d")
args_dt = datetime.strptime(args.date, "%Y-%m-%d")
if df is None or csv_end_dt >= args_dt:
print(f"Latest CSV already includes data through {args.date} (end_date={csv_end_date} is exclusive)")
print("Writing latest CSV directly without concatenation to avoid duplicates")
os.makedirs(OUTPUT_DIR, exist_ok=True)
final_csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{csv_start_date}_{csv_end_date}.csv.gz"
df_latest_csv = df_latest_csv.select(CORRECT_ORDER_OF_COLUMNS)
df_latest_csv.write_csv(final_csv_output_path, compression="gzip")
else:
print(f"Concatenating latest CSV (through {csv_end_date}) with new data ({args.date})")
# Ensure column order matches before concatenating
df_latest_csv = df_latest_csv.select(CORRECT_ORDER_OF_COLUMNS)
from src.adsb.compress_adsb_to_aircraft_data import concat_compressed_dfs
df_final = concat_compressed_dfs(df_latest_csv, df)
df_final = df_final.select(CORRECT_ORDER_OF_COLUMNS)
final_csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{csv_start_date}_{args.date}.csv.gz"
df_final.write_csv(final_csv_output_path, compression="gzip")
print(f"Final CSV written to {final_csv_output_path}")
if __name__ == "__main__":
main()
+144 -304
View File
@@ -1,42 +1,33 @@
""" """
Downloads adsb.lol data and writes to Parquet files. Downloads adsb.lol data and writes to Parquet files.
Usage: This file contains utility functions for downloading and processing adsb.lol trace data.
python -m src.process_historical_adsb_data.download_to_parquet 2025-01-01 2025-01-02 Used by the historical ADS-B processing pipeline.
This will download trace data for the specified date range and output Parquet files.
This file is self-contained and does not import from other project modules.
""" """
import gc import datetime as dt
import glob
import gzip import gzip
import os
import re
import resource import resource
import shutil import shutil
import sys
import logging
import time
import re
import signal import signal
import concurrent.futures
import subprocess import subprocess
import os import sys
import argparse
import datetime as dt
from datetime import datetime, timedelta, timezone
import urllib.request
import urllib.error import urllib.error
import urllib.request
from datetime import datetime
import time
import orjson import orjson
import pyarrow as pa import pyarrow as pa
import pyarrow.parquet as pq import pyarrow.parquet as pq
from pathlib import Path
# ============================================================================ # ============================================================================
# Configuration # Configuration
# ============================================================================ # ============================================================================
OUTPUT_DIR = "./data/output" OUTPUT_DIR = Path("./data/output")
os.makedirs(OUTPUT_DIR, exist_ok=True) os.makedirs(OUTPUT_DIR, exist_ok=True)
PARQUET_DIR = os.path.join(OUTPUT_DIR, "parquet_output") PARQUET_DIR = os.path.join(OUTPUT_DIR, "parquet_output")
@@ -76,20 +67,16 @@ def timeout_handler(signum, frame):
raise DownloadTimeoutException("Download timed out after 40 seconds") raise DownloadTimeoutException("Download timed out after 40 seconds")
def fetch_releases(version_date: str) -> list: def _fetch_releases_from_repo(year: str, version_date: str) -> list:
"""Fetch GitHub releases for a given version date from adsblol.""" """Fetch GitHub releases for a given version date from a specific year's adsblol repo."""
year = version_date.split('.')[0][1:]
if version_date == "v2024.12.31":
year = "2025"
BASE_URL = f"https://api.github.com/repos/adsblol/globe_history_{year}/releases" BASE_URL = f"https://api.github.com/repos/adsblol/globe_history_{year}/releases"
# Match exact release name, exclude tmp releases PATTERN = rf"^{re.escape(version_date)}-planes-readsb-prod-\d+(tmp)?$"
PATTERN = rf"^{re.escape(version_date)}-planes-readsb-prod-\d+$"
releases = [] releases = []
page = 1 page = 1
while True: while True:
max_retries = 10 max_retries = 10
retry_delay = 60 retry_delay = 60*5
for attempt in range(1, max_retries + 1): for attempt in range(1, max_retries + 1):
try: try:
@@ -101,7 +88,7 @@ def fetch_releases(version_date: str) -> list:
else: else:
print(f"Failed to fetch releases (attempt {attempt}/{max_retries}): {response.status} {response.reason}") print(f"Failed to fetch releases (attempt {attempt}/{max_retries}): {response.status} {response.reason}")
if attempt < max_retries: if attempt < max_retries:
print(f"Waiting {retry_delay} seconds before retry...") print(f"Waiting {retry_delay} seconds before retry")
time.sleep(retry_delay) time.sleep(retry_delay)
else: else:
print(f"Giving up after {max_retries} attempts") print(f"Giving up after {max_retries} attempts")
@@ -109,7 +96,7 @@ def fetch_releases(version_date: str) -> list:
except Exception as e: except Exception as e:
print(f"Request exception (attempt {attempt}/{max_retries}): {e}") print(f"Request exception (attempt {attempt}/{max_retries}): {e}")
if attempt < max_retries: if attempt < max_retries:
print(f"Waiting {retry_delay} seconds before retry...") print(f"Waiting {retry_delay} seconds before retry")
time.sleep(retry_delay) time.sleep(retry_delay)
else: else:
print(f"Giving up after {max_retries} attempts") print(f"Giving up after {max_retries} attempts")
@@ -123,41 +110,118 @@ def fetch_releases(version_date: str) -> list:
return releases return releases
def download_asset(asset_url: str, file_path: str) -> bool: def fetch_releases(version_date: str) -> list:
"""Download a single release asset.""" """Fetch GitHub releases for a given version date from adsblol.
For Dec 31 dates, if no releases are found in the current year's repo,
also checks the next year's repo (adsblol sometimes publishes Dec 31
data in the following year's repository).
"""
year = version_date.split('.')[0][1:]
releases = _fetch_releases_from_repo(year, version_date)
# For last day of year, also check next year's repo if nothing found
if not releases and version_date.endswith(".12.31"):
next_year = str(int(year) + 1)
print(f"No releases found for {version_date} in {year} repo, checking {next_year} repo")
releases = _fetch_releases_from_repo(next_year, version_date)
return releases
def download_asset(asset_url: str, file_path: str, expected_size: int | None = None) -> bool:
"""Download a single release asset with size verification.
Args:
asset_url: URL to download from
file_path: Local path to save to
expected_size: Expected file size in bytes (for verification)
Returns:
True if download succeeded and size matches (if provided), False otherwise
"""
os.makedirs(os.path.dirname(file_path) or OUTPUT_DIR, exist_ok=True) os.makedirs(os.path.dirname(file_path) or OUTPUT_DIR, exist_ok=True)
# Check if file exists and has correct size
if os.path.exists(file_path): if os.path.exists(file_path):
print(f"[SKIP] {file_path} already downloaded.") if expected_size is not None:
return True actual_size = os.path.getsize(file_path)
if actual_size == expected_size:
print(f"Downloading {asset_url}...") print(f"[SKIP] {file_path} already downloaded and verified ({actual_size} bytes).")
try:
signal.signal(signal.SIGALRM, timeout_handler)
signal.alarm(40) # 40-second timeout
req = urllib.request.Request(asset_url, headers=HEADERS)
with urllib.request.urlopen(req) as response:
signal.alarm(0)
if response.status == 200:
with open(file_path, "wb") as file:
while True:
chunk = response.read(8192)
if not chunk:
break
file.write(chunk)
print(f"Saved {file_path}")
return True return True
else: else:
print(f"Failed to download {asset_url}: {response.status} {response.msg}") print(f"[WARN] {file_path} exists but size mismatch (expected {expected_size}, got {actual_size}). Re-downloading.")
os.remove(file_path)
else:
print(f"[SKIP] {file_path} already downloaded.")
return True
max_retries = 2
retry_delay = 30
timeout_seconds = 140
for attempt in range(1, max_retries + 1):
print(f"Downloading {asset_url} (attempt {attempt}/{max_retries})")
try:
req = urllib.request.Request(asset_url, headers=HEADERS)
with urllib.request.urlopen(req, timeout=timeout_seconds) as response:
if response.status == 200:
with open(file_path, "wb") as file:
while True:
chunk = response.read(8192)
if not chunk:
break
file.write(chunk)
# Verify file size if expected_size was provided
if expected_size is not None:
actual_size = os.path.getsize(file_path)
if actual_size != expected_size:
print(f"[ERROR] Size mismatch for {file_path}: expected {expected_size} bytes, got {actual_size} bytes")
os.remove(file_path)
if attempt < max_retries:
print(f"Waiting {retry_delay} seconds before retry")
time.sleep(retry_delay)
continue
return False
print(f"Saved {file_path} ({actual_size} bytes, verified)")
else:
print(f"Saved {file_path}")
return True
else:
print(f"Failed to download {asset_url}: {response.status} {response.msg}")
if attempt < max_retries:
print(f"Waiting {retry_delay} seconds before retry")
time.sleep(retry_delay)
else:
return False
except urllib.error.HTTPError as e:
if e.code == 404:
print(f"404 Not Found: {asset_url}")
raise Exception(f"Asset not found (404): {asset_url}")
else:
print(f"HTTP error occurred (attempt {attempt}/{max_retries}): {e.code} {e.reason}")
if attempt < max_retries:
print(f"Waiting {retry_delay} seconds before retry")
time.sleep(retry_delay)
else:
return False
except urllib.error.URLError as e:
print(f"URL/Timeout error (attempt {attempt}/{max_retries}): {e}")
if attempt < max_retries:
print(f"Waiting {retry_delay} seconds before retry")
time.sleep(retry_delay)
else:
return False return False
except DownloadTimeoutException as e: except Exception as e:
print(f"Download aborted for {asset_url}: {e}") print(f"An error occurred (attempt {attempt}/{max_retries}): {e}")
return False if attempt < max_retries:
except Exception as e: print(f"Waiting {retry_delay} seconds before retry")
print(f"An error occurred while downloading {asset_url}: {e}") time.sleep(retry_delay)
return False else:
return False
return False
def extract_split_archive(file_paths: list, extract_dir: str) -> bool: def extract_split_archive(file_paths: list, extract_dir: str) -> bool:
@@ -196,7 +260,6 @@ def extract_split_archive(file_paths: list, extract_dir: str) -> bool:
stdin=cat_proc.stdout, stdin=cat_proc.stdout,
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, stderr=subprocess.PIPE,
check=True
) )
cat_proc.stdout.close() cat_proc.stdout.close()
cat_stderr = cat_proc.stderr.read().decode() if cat_proc.stderr else "" cat_stderr = cat_proc.stderr.read().decode() if cat_proc.stderr else ""
@@ -205,6 +268,24 @@ def extract_split_archive(file_paths: list, extract_dir: str) -> bool:
if cat_stderr: if cat_stderr:
print(f"cat stderr: {cat_stderr}") print(f"cat stderr: {cat_stderr}")
tar_stderr = result.stderr.decode() if result.stderr else ""
if result.returncode != 0:
# GNU tar exits non-zero for format issues that BSD tar silently
# tolerates (e.g. trailing junk after the last valid entry).
# Check whether files were actually extracted before giving up.
extracted_items = os.listdir(extract_dir)
if extracted_items:
print(f"[WARN] tar exited {result.returncode} but extracted "
f"{len(extracted_items)} items — treating as success")
if tar_stderr:
print(f"tar stderr: {tar_stderr}")
else:
print(f"Failed to extract split archive (tar exit {result.returncode})")
if tar_stderr:
print(f"tar stderr: {tar_stderr}")
shutil.rmtree(extract_dir, ignore_errors=True)
return False
print(f"Successfully extracted archive to {extract_dir}") print(f"Successfully extracted archive to {extract_dir}")
# Delete tar files immediately after extraction # Delete tar files immediately after extraction
@@ -221,11 +302,9 @@ def extract_split_archive(file_paths: list, extract_dir: str) -> bool:
print(f"Disk space after tar deletion: {free_gb:.1f}GB free") print(f"Disk space after tar deletion: {free_gb:.1f}GB free")
return True return True
except subprocess.CalledProcessError as e: except Exception as e:
stderr_output = e.stderr.decode() if e.stderr else ""
print(f"Failed to extract split archive: {e}") print(f"Failed to extract split archive: {e}")
if stderr_output: shutil.rmtree(extract_dir, ignore_errors=True)
print(f"tar stderr: {stderr_output}")
return False return False
@@ -389,8 +468,6 @@ COLUMNS = [
OS_CPU_COUNT = os.cpu_count() or 1 OS_CPU_COUNT = os.cpu_count() or 1
MAX_WORKERS = OS_CPU_COUNT if OS_CPU_COUNT > 4 else 1 MAX_WORKERS = OS_CPU_COUNT if OS_CPU_COUNT > 4 else 1
CHUNK_SIZE = MAX_WORKERS * 500 # Reduced for lower RAM usage
BATCH_SIZE = 250_000 # Fixed size for predictable memory usage (~500MB per batch)
# PyArrow schema for efficient Parquet writing # PyArrow schema for efficient Parquet writing
PARQUET_SCHEMA = pa.schema([ PARQUET_SCHEMA = pa.schema([
@@ -478,211 +555,6 @@ def collect_trace_files_with_find(root_dir):
return trace_dict return trace_dict
def generate_version_dates(start_date: str, end_date: str) -> list:
"""Generate a list of dates from start_date to end_date inclusive."""
start = datetime.strptime(start_date, "%Y-%m-%d")
end = datetime.strptime(end_date, "%Y-%m-%d")
delta = end - start
return [start + timedelta(days=i) for i in range(delta.days + 1)]
def safe_process(fp):
"""Safely process a file, returning empty list on error."""
try:
return process_file(fp)
except Exception as e:
logging.error(f"Error processing {fp}: {e}")
return []
def rows_to_arrow_table(rows: list) -> pa.Table:
"""Convert list of rows to a PyArrow Table directly (no pandas)."""
# Transpose rows into columns
columns = list(zip(*rows))
# Build arrays for each column according to schema
arrays = []
for i, field in enumerate(PARQUET_SCHEMA):
col_data = list(columns[i]) if i < len(columns) else [None] * len(rows)
arrays.append(pa.array(col_data, type=field.type))
return pa.Table.from_arrays(arrays, schema=PARQUET_SCHEMA)
def write_batch_to_parquet(rows: list, version_date: str, batch_idx: int):
"""Write a batch of rows to a Parquet file."""
if not rows:
return
table = rows_to_arrow_table(rows)
parquet_path = os.path.join(PARQUET_DIR, f"{version_date}_batch_{batch_idx:04d}.parquet")
pq.write_table(table, parquet_path, compression='snappy')
print(f"Written parquet batch {batch_idx} ({len(rows)} rows) | {get_resource_usage()}")
def merge_parquet_files(version_date: str, delete_batches: bool = True):
"""Merge all batch parquet files for a version_date into a single file using streaming."""
pattern = os.path.join(PARQUET_DIR, f"{version_date}_batch_*.parquet")
batch_files = sorted(glob.glob(pattern))
if not batch_files:
print(f"No batch files found for {version_date}")
return None
print(f"Merging {len(batch_files)} batch files for {version_date} (streaming)...")
merged_path = os.path.join(PARQUET_DIR, f"{version_date}.parquet")
total_rows = 0
# Stream write: read one batch at a time to minimize RAM usage
writer = None
try:
for i, f in enumerate(batch_files):
table = pq.read_table(f)
total_rows += table.num_rows
if writer is None:
writer = pq.ParquetWriter(merged_path, table.schema, compression='snappy')
writer.write_table(table)
# Delete batch file immediately after reading to free disk space
if delete_batches:
os.remove(f)
# Free memory
del table
if (i + 1) % 10 == 0:
gc.collect()
print(f" Merged {i + 1}/{len(batch_files)} batches... | {get_resource_usage()}")
finally:
if writer is not None:
writer.close()
print(f"Merged parquet file written to {merged_path} ({total_rows} total rows) | {get_resource_usage()}")
if delete_batches:
print(f"Deleted {len(batch_files)} batch files during merge")
gc.collect()
return merged_path
def process_version_date(version_date: str, keep_folders: bool = False):
"""Download, extract, and process trace files for a single version date."""
print(f"\nProcessing version_date: {version_date}")
extract_dir = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0.tar_0")
def collect_trace_files_for_version_date(vd):
releases = fetch_releases(vd)
if len(releases) == 0:
print(f"No releases found for {vd}.")
return None
downloaded_files = []
for release in releases:
tag_name = release["tag_name"]
print(f"Processing release: {tag_name}")
# Only download prod-0 if available, else prod-0tmp
assets = release.get("assets", [])
normal_assets = [
a for a in assets
if "planes-readsb-prod-0." in a["name"] and "tmp" not in a["name"]
]
tmp_assets = [
a for a in assets
if "planes-readsb-prod-0tmp" in a["name"]
]
use_assets = normal_assets if normal_assets else tmp_assets
for asset in use_assets:
asset_name = asset["name"]
asset_url = asset["browser_download_url"]
file_path = os.path.join(OUTPUT_DIR, asset_name)
result = download_asset(asset_url, file_path)
if result:
downloaded_files.append(file_path)
extract_split_archive(downloaded_files, extract_dir)
return collect_trace_files_with_find(extract_dir)
# Check if files already exist
pattern = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0*")
matches = [p for p in glob.glob(pattern) if os.path.isfile(p)]
if matches:
print(f"Found existing files for {version_date}:")
# Prefer non-tmp slices when reusing existing files
normal_matches = [
p for p in matches
if "-planes-readsb-prod-0." in os.path.basename(p)
and "tmp" not in os.path.basename(p)
]
downloaded_files = normal_matches if normal_matches else matches
extract_split_archive(downloaded_files, extract_dir)
trace_files = collect_trace_files_with_find(extract_dir)
else:
trace_files = collect_trace_files_for_version_date(version_date)
if trace_files is None or len(trace_files) == 0:
print(f"No trace files found for version_date: {version_date}")
return 0
file_list = list(trace_files.values())
start_time = time.perf_counter()
total_num_rows = 0
batch_rows = []
batch_idx = 0
# Process files in chunks
for offset in range(0, len(file_list), CHUNK_SIZE):
chunk = file_list[offset:offset + CHUNK_SIZE]
with concurrent.futures.ProcessPoolExecutor(max_workers=MAX_WORKERS) as process_executor:
for rows in process_executor.map(safe_process, chunk):
if not rows:
continue
batch_rows.extend(rows)
if len(batch_rows) >= BATCH_SIZE:
total_num_rows += len(batch_rows)
write_batch_to_parquet(batch_rows, version_date, batch_idx)
batch_idx += 1
batch_rows = []
elapsed = time.perf_counter() - start_time
speed = total_num_rows / elapsed if elapsed > 0 else 0
print(f"[{version_date}] processed {total_num_rows} rows in {elapsed:.2f}s ({speed:.2f} rows/s)")
gc.collect()
# Final batch
if batch_rows:
total_num_rows += len(batch_rows)
write_batch_to_parquet(batch_rows, version_date, batch_idx)
elapsed = time.perf_counter() - start_time
speed = total_num_rows / elapsed if elapsed > 0 else 0
print(f"[{version_date}] processed {total_num_rows} rows in {elapsed:.2f}s ({speed:.2f} rows/s)")
print(f"Total rows processed for version_date {version_date}: {total_num_rows}")
# Clean up extracted directory immediately after processing (before merging parquet files)
if not keep_folders and os.path.isdir(extract_dir):
print(f"Deleting extraction directory with 100,000+ files: {extract_dir}")
shutil.rmtree(extract_dir)
print(f"Successfully deleted extraction directory: {extract_dir} | {get_resource_usage()}")
# Merge batch files into a single parquet file
merge_parquet_files(version_date, delete_batches=True)
return total_num_rows
def create_parquet_for_day(day, keep_folders: bool = False): def create_parquet_for_day(day, keep_folders: bool = False):
"""Create parquet file for a single day. """Create parquet file for a single day.
@@ -706,42 +578,10 @@ def create_parquet_for_day(day, keep_folders: bool = False):
print(f"Parquet file already exists: {parquet_path}") print(f"Parquet file already exists: {parquet_path}")
return parquet_path return parquet_path
print(f"Creating parquet for {version_date}...") print(f"Creating parquet for {version_date}")
rows_processed = process_version_date(version_date, keep_folders) rows_processed = process_version_date(version_date, keep_folders)
if rows_processed > 0 and parquet_path.exists(): if rows_processed > 0 and parquet_path.exists():
return parquet_path return parquet_path
else: else:
return None return None
def main(start_date: str, end_date: str, keep_folders: bool = False):
"""Main function to download and convert adsb.lol data to Parquet."""
version_dates = [f"v{date.strftime('%Y.%m.%d')}" for date in generate_version_dates(start_date, end_date)]
print(f"Processing dates: {version_dates}")
total_rows_all = 0
for version_date in version_dates:
rows_processed = process_version_date(version_date, keep_folders)
total_rows_all += rows_processed
print(f"\n=== Summary ===")
print(f"Total dates processed: {len(version_dates)}")
print(f"Total rows written to Parquet: {total_rows_all}")
print(f"Parquet files location: {PARQUET_DIR}")
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO, stream=sys.stdout, force=True)
parser = argparse.ArgumentParser(
description="Download adsb.lol data and write to Parquet files"
)
parser.add_argument("start_date", type=str, help="Start date in YYYY-MM-DD format")
parser.add_argument("end_date", type=str, help="End date in YYYY-MM-DD format")
parser.add_argument("--keep-folders", action="store_true",
help="Keep extracted folders after processing")
args = parser.parse_args()
main(args.start_date, args.end_date, args.keep_folders)
+41 -88
View File
@@ -1,9 +1,7 @@
""" """
Downloads and extracts adsb.lol tar files, then lists all ICAO folders. Downloads and extracts adsb.lol tar files for a single day, then lists all ICAO folders.
This is the first step of the map-reduce pipeline. This is the first step of the map-reduce pipeline.
Supports both single-day (daily) and multi-day (historical) modes.
Outputs: Outputs:
- Extracted trace files in data/output/{version_date}-planes-readsb-prod-0.tar_0/ - Extracted trace files in data/output/{version_date}-planes-readsb-prod-0.tar_0/
- ICAO manifest at data/output/icao_manifest_{date}.txt - ICAO manifest at data/output/icao_manifest_{date}.txt
@@ -25,11 +23,6 @@ from src.adsb.download_adsb_data_to_parquet import (
) )
def get_target_day() -> datetime:
"""Get yesterday's date (the day we're processing)."""
return datetime.utcnow() - timedelta(days=1)
def download_and_extract(version_date: str) -> str | None: def download_and_extract(version_date: str) -> str | None:
"""Download and extract tar files, return extract directory path.""" """Download and extract tar files, return extract directory path."""
extract_dir = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0.tar_0") extract_dir = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0.tar_0")
@@ -59,6 +52,12 @@ def download_and_extract(version_date: str) -> str | None:
print(f"No releases found for {version_date}") print(f"No releases found for {version_date}")
return None return None
# Prefer non-tmp releases; only use tmp if no normal releases exist
normal_releases = [r for r in releases if "tmp" not in r["tag_name"]]
tmp_releases = [r for r in releases if "tmp" in r["tag_name"]]
releases = normal_releases if normal_releases else tmp_releases
print(f"Using {'normal' if normal_releases else 'tmp'} releases ({len(releases)} found)")
downloaded_files = [] downloaded_files = []
for release in releases: for release in releases:
tag_name = release["tag_name"] tag_name = release["tag_name"]
@@ -78,8 +77,9 @@ def download_and_extract(version_date: str) -> str | None:
for asset in use_assets: for asset in use_assets:
asset_name = asset["name"] asset_name = asset["name"]
asset_url = asset["browser_download_url"] asset_url = asset["browser_download_url"]
asset_size = asset.get("size") # Get expected file size
file_path = os.path.join(OUTPUT_DIR, asset_name) file_path = os.path.join(OUTPUT_DIR, asset_name)
if download_asset(asset_url, file_path): if download_asset(asset_url, file_path, expected_size=asset_size):
downloaded_files.append(file_path) downloaded_files.append(file_path)
if not downloaded_files: if not downloaded_files:
@@ -100,21 +100,6 @@ def list_icao_folders(extract_dir: str) -> list[str]:
return icaos return icaos
def write_manifest(icaos: list[str], manifest_id: str) -> str:
"""Write ICAO list to manifest file.
Args:
icaos: List of ICAO codes
manifest_id: Identifier for manifest file (date or date range)
"""
manifest_path = os.path.join(OUTPUT_DIR, f"icao_manifest_{manifest_id}.txt")
with open(manifest_path, "w") as f:
for icao in sorted(icaos):
f.write(f"{icao}\n")
print(f"Wrote manifest with {len(icaos)} ICAOs to {manifest_path}")
return manifest_path
def process_single_day(target_day: datetime) -> tuple[str | None, list[str]]: def process_single_day(target_day: datetime) -> tuple[str | None, list[str]]:
"""Process a single day: download, extract, list ICAOs. """Process a single day: download, extract, list ICAOs.
@@ -129,82 +114,50 @@ def process_single_day(target_day: datetime) -> tuple[str | None, list[str]]:
extract_dir = download_and_extract(version_date) extract_dir = download_and_extract(version_date)
if not extract_dir: if not extract_dir:
print(f"Failed to download/extract data for {date_str}") print(f"Failed to download/extract data for {date_str}")
return None, [] raise Exception(f"No data available for {date_str}")
icaos = list_icao_folders(extract_dir) icaos = list_icao_folders(extract_dir)
print(f"Found {len(icaos)} ICAOs for {date_str}") print(f"Found {len(icaos)} ICAOs for {date_str}")
return extract_dir, icaos return extract_dir, icaos
from pathlib import Path
def process_date_range(start_date: datetime, end_date: datetime) -> set[str]: import tarfile
"""Process multiple days: download, extract, combine ICAO lists. NUMBER_PARTS = 4
def split_folders_into_gzip_archives(extract_dir: Path, tar_output_dir: Path, icaos: list[str], parts = NUMBER_PARTS) -> list[str]:
Args: traces_dir = extract_dir / "traces"
start_date: Start date (inclusive) buckets = sorted(traces_dir.iterdir())
end_date: End date (inclusive) tars = []
for i in range(parts):
Returns: tar_path = tar_output_dir / f"{tar_output_dir.name}_part_{i}.tar.gz"
Combined set of all ICAOs across the date range tars.append(tarfile.open(tar_path, "w:gz"))
""" for idx, bucket_path in enumerate(buckets):
all_icaos: set[str] = set() tar_idx = idx % parts
current = start_date tars[tar_idx].add(bucket_path, arcname=bucket_path.name)
for tar in tars:
# Both start and end are inclusive tar.close()
while current <= end_date:
_, icaos = process_single_day(current)
all_icaos.update(icaos)
current += timedelta(days=1)
return all_icaos
def main(): def main():
parser = argparse.ArgumentParser(description="Download and list ICAOs from adsb.lol data") parser = argparse.ArgumentParser(description="Download and list ICAOs from adsb.lol data for a single day")
parser.add_argument("--date", type=str, help="Single date in YYYY-MM-DD format (default: yesterday)") parser.add_argument("--date", type=str, help="Single date in YYYY-MM-DD format (default: yesterday)")
parser.add_argument("--start-date", type=str, help="Start date for range (YYYY-MM-DD)")
parser.add_argument("--end-date", type=str, help="End date for range (YYYY-MM-DD)")
args = parser.parse_args() args = parser.parse_args()
# Determine mode: single day or date range target_day = datetime.strptime(args.date, "%Y-%m-%d")
if args.start_date and args.end_date: date_str = target_day.strftime("%Y-%m-%d")
# Historical mode: process date range tar_output_dir = Path(f"./data/output/adsb_archives/{date_str}")
start_date = datetime.strptime(args.start_date, "%Y-%m-%d")
end_date = datetime.strptime(args.end_date, "%Y-%m-%d") extract_dir, icaos = process_single_day(target_day)
extract_dir = Path(extract_dir)
print(f"Processing date range: {args.start_date} to {args.end_date}") print(extract_dir)
tar_output_dir.mkdir(parents=True, exist_ok=True)
all_icaos = process_date_range(start_date, end_date) split_folders_into_gzip_archives(extract_dir, tar_output_dir, icaos)
if not icaos:
if not all_icaos: print("No ICAOs found")
print("No ICAOs found in date range") sys.exit(1)
sys.exit(1)
print(f"\nDone! Extract dir: {extract_dir}")
# Write combined manifest with range identifier print(f"Total ICAOs: {len(icaos)}")
manifest_id = f"{args.start_date}_{args.end_date}"
write_manifest(list(all_icaos), manifest_id)
print(f"\nDone! Total ICAOs: {len(all_icaos)}")
else:
# Daily mode: single day
if args.date:
target_day = datetime.strptime(args.date, "%Y-%m-%d")
else:
target_day = get_target_day()
date_str = target_day.strftime("%Y-%m-%d")
extract_dir, icaos = process_single_day(target_day)
if not icaos:
print("No ICAOs found")
sys.exit(1)
write_manifest(icaos, date_str)
print(f"\nDone! Extract dir: {extract_dir}")
print(f"Total ICAOs: {len(icaos)}")
if __name__ == "__main__": if __name__ == "__main__":
+1 -1
View File
@@ -41,7 +41,7 @@ def main() -> None:
"""Main entry point for GitHub Actions.""" """Main entry point for GitHub Actions."""
start_date = os.environ.get("INPUT_START_DATE") start_date = os.environ.get("INPUT_START_DATE")
end_date = os.environ.get("INPUT_END_DATE") end_date = os.environ.get("INPUT_END_DATE")
chunk_days = int(os.environ.get("INPUT_CHUNK_DAYS", "7")) chunk_days = int(os.environ.get("INPUT_CHUNK_DAYS", "1"))
if not start_date or not end_date: if not start_date or not end_date:
print("ERROR: INPUT_START_DATE and INPUT_END_DATE must be set", file=sys.stderr) print("ERROR: INPUT_START_DATE and INPUT_END_DATE must be set", file=sys.stderr)
+78
View File
@@ -0,0 +1,78 @@
"""
Main pipeline for processing ADS-B data from adsb.lol.
Usage:
python -m src.adsb.main --date 2026-01-01
python -m src.adsb.main --start_date 2026-01-01 --end_date 2026-01-03
"""
import argparse
import subprocess
import sys
from datetime import datetime, timedelta
import polars as pl
from src.adsb.download_and_list_icaos import NUMBER_PARTS
def main():
parser = argparse.ArgumentParser(description="Process ADS-B data for a single day or date range")
parser.add_argument("--date", type=str, help="Single date in YYYY-MM-DD format")
parser.add_argument("--start_date", type=str, help="Start date (inclusive, YYYY-MM-DD)")
parser.add_argument("--end_date", type=str, help="End date (exclusive, YYYY-MM-DD)")
parser.add_argument("--concat_with_latest_csv", action="store_true", help="Also concatenate with latest CSV from GitHub releases")
args = parser.parse_args()
if args.date and (args.start_date or args.end_date):
raise SystemExit("Use --date or --start_date/--end_date, not both.")
if args.date:
start_date = datetime.strptime(args.date, "%Y-%m-%d")
end_date = start_date + timedelta(days=1)
else:
if not args.start_date or not args.end_date:
raise SystemExit("Provide --start_date and --end_date, or use --date.")
start_date = datetime.strptime(args.start_date, "%Y-%m-%d")
end_date = datetime.strptime(args.end_date, "%Y-%m-%d")
current = start_date
while current < end_date:
date_str = current.strftime("%Y-%m-%d")
print(f"Processing day: {date_str}")
# Download and split
subprocess.run([sys.executable, "-m", "src.adsb.download_and_list_icaos", "--date", date_str], check=True)
# Process parts
for part_id in range(NUMBER_PARTS):
subprocess.run([sys.executable, "-m", "src.adsb.process_icao_chunk", "--part-id", str(part_id), "--date", date_str], check=True)
# Concatenate
concat_cmd = [sys.executable, "-m", "src.adsb.concat_parquet_to_final", "--date", date_str]
if args.concat_with_latest_csv:
concat_cmd.append("--concat_with_latest_csv")
subprocess.run(concat_cmd, check=True)
current += timedelta(days=1)
if end_date - start_date > timedelta(days=1):
dates = []
cur = start_date
while cur < end_date:
dates.append(cur.strftime("%Y-%m-%d"))
cur += timedelta(days=1)
csv_files = [
f"data/outputs/openairframes_adsb_{d}_{d}.csv"
for d in dates
]
frames = [pl.read_csv(p) for p in csv_files]
df = pl.concat(frames, how="vertical", rechunk=True)
output_path = f"data/outputs/openairframes_adsb_{start_date.strftime('%Y-%m-%d')}_{end_date.strftime('%Y-%m-%d')}.csv"
df.write_csv(output_path)
print(f"Wrote combined CSV: {output_path}")
print("Done")
if __name__ == "__main__":
main()
+62 -241
View File
@@ -1,18 +1,9 @@
""" """
Processes a chunk of ICAOs from pre-extracted trace files. Processes trace files from a single archive part for a single day.
This is the map phase of the map-reduce pipeline. This is the map phase of the map-reduce pipeline.
Supports both single-day (daily) and multi-day (historical) modes.
Expects extract_dir to already exist with trace files.
Reads ICAO manifest to determine which ICAOs to process based on chunk-id.
Usage: Usage:
# Daily mode (single day) python -m src.adsb.process_icao_chunk --part-id 1 --date 2026-01-01
python -m src.adsb.process_icao_chunk --chunk-id 0 --total-chunks 4
# Historical mode (date range)
python -m src.adsb.process_icao_chunk --chunk-id 0 --total-chunks 4 --start-date 2024-01-01 --end-date 2024-01-07
""" """
import gc import gc
import os import os
@@ -21,6 +12,9 @@ import argparse
import time import time
import concurrent.futures import concurrent.futures
from datetime import datetime, timedelta from datetime import datetime, timedelta
import tarfile
import tempfile
import shutil
import pyarrow as pa import pyarrow as pa
import pyarrow.parquet as pq import pyarrow.parquet as pq
@@ -37,72 +31,21 @@ from src.adsb.download_adsb_data_to_parquet import (
) )
CHUNK_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "adsb_chunks")
os.makedirs(CHUNK_OUTPUT_DIR, exist_ok=True)
# Smaller batch size for memory efficiency # Smaller batch size for memory efficiency
BATCH_SIZE = 100_000 BATCH_SIZE = 100_000
def build_trace_file_map(archive_path: str) -> dict[str, str]:
def get_target_day() -> datetime: """Build a map of ICAO -> trace file path by extracting tar.gz archive."""
"""Get yesterday's date (the day we're processing).""" print(f"Extracting {archive_path}...")
return datetime.utcnow() - timedelta(days=1)
def read_manifest(manifest_id: str) -> list[str]:
"""Read ICAO manifest file.
Args: temp_dir = tempfile.mkdtemp(prefix="adsb_extract_")
manifest_id: Either a date string (YYYY-MM-DD) or range string (YYYY-MM-DD_YYYY-MM-DD)
"""
manifest_path = os.path.join(OUTPUT_DIR, f"icao_manifest_{manifest_id}.txt")
if not os.path.exists(manifest_path):
raise FileNotFoundError(f"Manifest not found: {manifest_path}")
with open(manifest_path, "r") as f: with tarfile.open(archive_path, 'r:gz') as tar:
icaos = [line.strip() for line in f if line.strip()] tar.extractall(path=temp_dir, filter='data')
return icaos
def deterministic_hash(s: str) -> int:
"""Return a deterministic hash for a string (unlike Python's hash() which is randomized)."""
# Use sum of byte values - simple but deterministic
return sum(ord(c) for c in s)
def get_chunk_icaos(icaos: list[str], chunk_id: int, total_chunks: int) -> list[str]:
"""Get the subset of ICAOs for this chunk based on deterministic hash partitioning."""
return [icao for icao in icaos if deterministic_hash(icao) % total_chunks == chunk_id]
def build_trace_file_map(extract_dir: str) -> dict[str, str]:
"""Build a map of ICAO -> trace file path using find command."""
print(f"Building trace file map from {extract_dir}...")
# Debug: check what's in extract_dir trace_map = collect_trace_files_with_find(temp_dir)
if os.path.isdir(extract_dir):
items = os.listdir(extract_dir)[:10]
print(f"First 10 items in extract_dir: {items}")
# Check if there are subdirectories
for item in items[:3]:
subpath = os.path.join(extract_dir, item)
if os.path.isdir(subpath):
subitems = os.listdir(subpath)[:5]
print(f" Contents of {item}/: {subitems}")
trace_map = collect_trace_files_with_find(extract_dir)
print(f"Found {len(trace_map)} trace files") print(f"Found {len(trace_map)} trace files")
if len(trace_map) == 0:
# Debug: try manual find
import subprocess
result = subprocess.run(
['find', extract_dir, '-type', 'f', '-name', 'trace_full_*'],
capture_output=True, text=True
)
print(f"Manual find output (first 500 chars): {result.stdout[:500]}")
print(f"Manual find stderr: {result.stderr[:200]}")
return trace_map return trace_map
@@ -125,42 +68,13 @@ def rows_to_table(rows: list) -> pa.Table:
def process_chunk( def process_chunk(
chunk_id: int, trace_files: list[str],
total_chunks: int, part_id: int,
trace_map: dict[str, str], date_str: str,
icaos: list[str],
output_id: str,
) -> str | None: ) -> str | None:
"""Process a chunk of ICAOs and write to parquet. """Process trace files and write to a single parquet file."""
Args: output_path = os.path.join(PARQUET_DIR, f"part_{part_id}_{date_str}.parquet")
chunk_id: This chunk's ID (0-indexed)
total_chunks: Total number of chunks
trace_map: Map of ICAO -> trace file path
icaos: Full list of ICAOs from manifest
output_id: Identifier for output file (date or date range)
"""
chunk_icaos = get_chunk_icaos(icaos, chunk_id, total_chunks)
print(f"Chunk {chunk_id}/{total_chunks}: Processing {len(chunk_icaos)} ICAOs")
if not chunk_icaos:
print(f"Chunk {chunk_id}: No ICAOs to process")
return None
# Get trace file paths from the map
trace_files = []
for icao in chunk_icaos:
if icao in trace_map:
trace_files.append(trace_map[icao])
print(f"Chunk {chunk_id}: Found {len(trace_files)} trace files")
if not trace_files:
print(f"Chunk {chunk_id}: No trace files found")
return None
# Process files and write parquet in batches
output_path = os.path.join(CHUNK_OUTPUT_DIR, f"chunk_{chunk_id}_{output_id}.parquet")
start_time = time.perf_counter() start_time = time.perf_counter()
total_rows = 0 total_rows = 0
@@ -168,7 +82,8 @@ def process_chunk(
writer = None writer = None
try: try:
# Process in parallel batches writer = pq.ParquetWriter(output_path, PARQUET_SCHEMA, compression='snappy')
files_per_batch = MAX_WORKERS * 100 files_per_batch = MAX_WORKERS * 100
for offset in range(0, len(trace_files), files_per_batch): for offset in range(0, len(trace_files), files_per_batch):
batch_files = trace_files[offset:offset + files_per_batch] batch_files = trace_files[offset:offset + files_per_batch]
@@ -178,166 +93,72 @@ def process_chunk(
if rows: if rows:
batch_rows.extend(rows) batch_rows.extend(rows)
# Write when batch is full
if len(batch_rows) >= BATCH_SIZE: if len(batch_rows) >= BATCH_SIZE:
table = rows_to_table(batch_rows) writer.write_table(rows_to_table(batch_rows))
total_rows += len(batch_rows) total_rows += len(batch_rows)
if writer is None:
writer = pq.ParquetWriter(output_path, PARQUET_SCHEMA, compression='snappy')
writer.write_table(table)
batch_rows = [] batch_rows = []
del table
gc.collect() gc.collect()
elapsed = time.perf_counter() - start_time
print(f"Chunk {chunk_id}: {total_rows} rows, {elapsed:.1f}s | {get_resource_usage()}")
gc.collect() gc.collect()
# Write remaining rows
if batch_rows: if batch_rows:
table = rows_to_table(batch_rows) writer.write_table(rows_to_table(batch_rows))
total_rows += len(batch_rows) total_rows += len(batch_rows)
if writer is None:
writer = pq.ParquetWriter(output_path, PARQUET_SCHEMA, compression='snappy')
writer.write_table(table)
del table
finally: finally:
if writer: if writer:
writer.close() writer.close()
elapsed = time.perf_counter() - start_time print(f"Part {part_id}: Done! {total_rows} rows in {time.perf_counter() - start_time:.1f}s | {get_resource_usage()}")
print(f"Chunk {chunk_id}: Done! {total_rows} rows in {elapsed:.1f}s | {get_resource_usage()}")
if total_rows > 0: return output_path if total_rows > 0 else None
return output_path
return None
def process_single_day(
chunk_id: int,
total_chunks: int,
target_day: datetime,
) -> str | None:
"""Process a single day for this chunk."""
date_str = target_day.strftime("%Y-%m-%d")
version_date = f"v{target_day.strftime('%Y.%m.%d')}"
extract_dir = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0.tar_0")
if not os.path.isdir(extract_dir):
print(f"Extract directory not found: {extract_dir}")
return None
trace_map = build_trace_file_map(extract_dir)
if not trace_map:
print("No trace files found")
return None
icaos = read_manifest(date_str)
print(f"Total ICAOs in manifest: {len(icaos)}")
return process_chunk(chunk_id, total_chunks, trace_map, icaos, date_str)
def process_date_range(
chunk_id: int,
total_chunks: int,
start_date: datetime,
end_date: datetime,
) -> str | None:
"""Process a date range for this chunk.
Combines trace files from all days in the range.
Args:
chunk_id: This chunk's ID (0-indexed)
total_chunks: Total number of chunks
start_date: Start date (inclusive)
end_date: End date (inclusive)
"""
start_str = start_date.strftime("%Y-%m-%d")
end_str = end_date.strftime("%Y-%m-%d")
manifest_id = f"{start_str}_{end_str}"
print(f"Processing date range: {start_str} to {end_str}")
# Build combined trace map from all days
combined_trace_map: dict[str, str] = {}
current = start_date
# Both start and end are inclusive
while current <= end_date:
version_date = f"v{current.strftime('%Y.%m.%d')}"
extract_dir = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0.tar_0")
if os.path.isdir(extract_dir):
trace_map = build_trace_file_map(extract_dir)
# Later days override earlier days (use most recent trace file)
combined_trace_map.update(trace_map)
print(f" {current.strftime('%Y-%m-%d')}: {len(trace_map)} trace files")
else:
print(f" {current.strftime('%Y-%m-%d')}: no extract directory")
current += timedelta(days=1)
if not combined_trace_map:
print("No trace files found in date range")
return None
print(f"Combined trace map: {len(combined_trace_map)} ICAOs")
icaos = read_manifest(manifest_id)
print(f"Total ICAOs in manifest: {len(icaos)}")
return process_chunk(chunk_id, total_chunks, combined_trace_map, icaos, manifest_id)
from pathlib import Path
def main(): def main():
parser = argparse.ArgumentParser(description="Process a chunk of ICAOs") parser = argparse.ArgumentParser(description="Process a single archive part for a day")
parser.add_argument("--chunk-id", type=int, required=True, help="Chunk ID (0-indexed)") parser.add_argument("--part-id", type=int, required=True, help="Part ID (1-indexed)")
parser.add_argument("--total-chunks", type=int, required=True, help="Total number of chunks") parser.add_argument("--date", type=str, required=True, help="Date in YYYY-MM-DD format")
parser.add_argument("--date", type=str, help="Single date in YYYY-MM-DD format (default: yesterday)")
parser.add_argument("--start-date", type=str, help="Start date for range (YYYY-MM-DD)")
parser.add_argument("--end-date", type=str, help="End date for range (YYYY-MM-DD)")
args = parser.parse_args() args = parser.parse_args()
print(f"Processing chunk {args.chunk_id}/{args.total_chunks}") print(f"Processing part {args.part_id} for {args.date}")
print(f"OUTPUT_DIR: {OUTPUT_DIR}")
print(f"CHUNK_OUTPUT_DIR: {CHUNK_OUTPUT_DIR}")
print(f"Resource usage at start: {get_resource_usage()}")
# Debug: List what's in OUTPUT_DIR # Get specific archive file for this part
print(f"\nContents of {OUTPUT_DIR}:") archive_dir = os.path.join(OUTPUT_DIR, "adsb_archives", args.date)
if os.path.isdir(OUTPUT_DIR): archive_path = os.path.join(archive_dir, f"{args.date}_part_{args.part_id}.tar.gz")
for item in os.listdir(OUTPUT_DIR)[:20]:
print(f" - {item}")
else:
print(f" Directory does not exist!")
# Determine mode: single day or date range if not os.path.isfile(archive_path):
if args.start_date and args.end_date: print(f"ERROR: Archive not found: {archive_path}")
# Historical mode if os.path.isdir(archive_dir):
start_date = datetime.strptime(args.start_date, "%Y-%m-%d") print(f"Files in {archive_dir}: {os.listdir(archive_dir)}")
end_date = datetime.strptime(args.end_date, "%Y-%m-%d")
output_path = process_date_range(args.chunk_id, args.total_chunks, start_date, end_date)
else:
# Daily mode
if args.date:
target_day = datetime.strptime(args.date, "%Y-%m-%d")
else: else:
target_day = get_target_day() print(f"Directory does not exist: {archive_dir}")
output_path = process_single_day(args.chunk_id, args.total_chunks, target_day) sys.exit(1)
if output_path: # Extract and collect trace files
print(f"Output: {output_path}") trace_map = build_trace_file_map(archive_path)
else: all_trace_files = list(trace_map.values())
print("No output generated")
print(f"Total trace files: {len(all_trace_files)}")
# Process and write output
output_path = process_chunk(all_trace_files, args.part_id, args.date)
from src.adsb.compress_adsb_to_aircraft_data import compress_parquet_part
df_compressed = compress_parquet_part(args.part_id, args.date)
# Write parquet
df_compressed_output = OUTPUT_DIR / "compressed" / args.date/ f"part_{args.part_id}_{args.date}.parquet"
os.makedirs(df_compressed_output.parent, exist_ok=True)
df_compressed.write_parquet(df_compressed_output, compression='snappy')
# Write CSV
csv_output = OUTPUT_DIR / "compressed" / args.date / f"part_{args.part_id}_{args.date}.csv"
df_compressed.write_csv(csv_output)
print(f"Raw output: {output_path}" if output_path else "No raw output generated")
print(f"Compressed parquet: {df_compressed_output}")
print(f"Compressed CSV: {csv_output}")
if __name__ == "__main__": if __name__ == "__main__":
main() main()
-97
View File
@@ -1,97 +0,0 @@
"""
Reduce step: downloads all chunk CSVs from S3, combines them,
deduplicates across the full dataset, and uploads the final result.
Environment variables:
S3_BUCKET — bucket with intermediate results
RUN_ID — run identifier matching the map workers
GLOBAL_START_DATE — overall start date for output filename
GLOBAL_END_DATE — overall end date for output filename
"""
import gzip
import os
import shutil
from pathlib import Path
import boto3
import polars as pl
from compress_adsb_to_aircraft_data import COLUMNS, deduplicate_by_signature
def main():
s3_bucket = os.environ["S3_BUCKET"]
run_id = os.environ.get("RUN_ID", "default")
global_start = os.environ["GLOBAL_START_DATE"]
global_end = os.environ["GLOBAL_END_DATE"]
s3 = boto3.client("s3")
prefix = f"intermediate/{run_id}/"
# List all chunk files for this run
paginator = s3.get_paginator("list_objects_v2")
chunk_keys = []
for page in paginator.paginate(Bucket=s3_bucket, Prefix=prefix):
for obj in page.get("Contents", []):
if obj["Key"].endswith(".csv.gz"):
chunk_keys.append(obj["Key"])
chunk_keys.sort()
print(f"Found {len(chunk_keys)} chunks to combine")
if not chunk_keys:
print("No chunks found — nothing to reduce.")
return
# Download and concatenate all chunks
download_dir = Path("/tmp/chunks")
download_dir.mkdir(parents=True, exist_ok=True)
dfs = []
for key in chunk_keys:
gz_path = download_dir / Path(key).name
csv_path = gz_path.with_suffix("") # Remove .gz
print(f"Downloading {key}...")
s3.download_file(s3_bucket, key, str(gz_path))
# Decompress
with gzip.open(gz_path, 'rb') as f_in:
with open(csv_path, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
gz_path.unlink()
df_chunk = pl.read_csv(csv_path)
print(f" Loaded {df_chunk.height} rows from {csv_path.name}")
dfs.append(df_chunk)
# Free disk space after loading
csv_path.unlink()
df_accumulated = pl.concat(dfs) if dfs else pl.DataFrame()
print(f"Combined: {df_accumulated.height} rows before dedup")
# Final global deduplication
df_accumulated = deduplicate_by_signature(df_accumulated)
print(f"After dedup: {df_accumulated.height} rows")
# Write and upload final result
output_name = f"openairframes_adsb_{global_start}_{global_end}.csv.gz"
csv_output = Path(f"/tmp/openairframes_adsb_{global_start}_{global_end}.csv")
gz_output = Path(f"/tmp/{output_name}")
df_accumulated.write_csv(csv_output)
with open(csv_output, 'rb') as f_in:
with gzip.open(gz_output, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
csv_output.unlink()
final_key = f"final/{output_name}"
print(f"Uploading to s3://{s3_bucket}/{final_key}")
s3.upload_file(str(gz_output), s3_bucket, final_key)
print(f"Final output: {df_accumulated.height} records -> {final_key}")
if __name__ == "__main__":
main()
-2
View File
@@ -1,2 +0,0 @@
polars>=1.0
boto3>=1.34
-5
View File
@@ -1,5 +0,0 @@
polars>=1.0
pyarrow>=14.0
orjson>=3.9
boto3>=1.34
zstandard>=0.22
-89
View File
@@ -1,89 +0,0 @@
"""
Map worker: processes a date range chunk, uploads result to S3.
Environment variables:
START_DATE — inclusive, YYYY-MM-DD
END_DATE — exclusive, YYYY-MM-DD
S3_BUCKET — bucket for intermediate results
RUN_ID — unique run identifier for namespacing S3 keys
"""
import os
import sys
from datetime import datetime, timedelta
from pathlib import Path
import boto3
import polars as pl
from compress_adsb_to_aircraft_data import (
load_historical_for_day,
deduplicate_by_signature,
COLUMNS,
)
def main():
start_date_str = os.environ["START_DATE"]
end_date_str = os.environ["END_DATE"]
s3_bucket = os.environ["S3_BUCKET"]
run_id = os.environ.get("RUN_ID", "default")
start_date = datetime.strptime(start_date_str, "%Y-%m-%d")
end_date = datetime.strptime(end_date_str, "%Y-%m-%d")
total_days = (end_date - start_date).days
print(f"Worker: processing {total_days} days [{start_date_str}, {end_date_str})")
dfs = []
current_date = start_date
while current_date < end_date:
day_str = current_date.strftime("%Y-%m-%d")
print(f" Loading {day_str}...")
df_compressed = load_historical_for_day(current_date)
if df_compressed.height == 0:
raise RuntimeError(f"No data found for {day_str}")
dfs.append(df_compressed)
total_rows = sum(df.height for df in dfs)
print(f" +{df_compressed.height} rows (total: {total_rows})")
# Delete local cache after each day to save disk in container
cache_dir = Path("data/adsb")
if cache_dir.exists():
import shutil
shutil.rmtree(cache_dir)
current_date += timedelta(days=1)
# Concatenate all days
df_accumulated = pl.concat(dfs) if dfs else pl.DataFrame()
# Deduplicate within this chunk
df_accumulated = deduplicate_by_signature(df_accumulated)
print(f"After dedup: {df_accumulated.height} rows")
# Write to local file then upload to S3
local_path = Path(f"/tmp/chunk_{start_date_str}_{end_date_str}.csv")
df_accumulated.write_csv(local_path)
# Compress with gzip
import gzip
import shutil
gz_path = Path(f"/tmp/chunk_{start_date_str}_{end_date_str}.csv.gz")
with open(local_path, 'rb') as f_in:
with gzip.open(gz_path, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
local_path.unlink() # Remove uncompressed file
s3_key = f"intermediate/{run_id}/chunk_{start_date_str}_{end_date_str}.csv.gz"
print(f"Uploading to s3://{s3_bucket}/{s3_key}")
s3 = boto3.client("s3")
s3.upload_file(str(gz_path), s3_bucket, s3_key)
print("Done.")
if __name__ == "__main__":
main()
+15 -4
View File
@@ -246,6 +246,20 @@ def process_submission(
if schema_updated: if schema_updated:
schema_note = f"\n**Schema Updated:** Added new tags: `{', '.join(new_tags)}`\n" schema_note = f"\n**Schema Updated:** Added new tags: `{', '.join(new_tags)}`\n"
# Truncate JSON preview to stay under GitHub's 65536 char body limit
max_json_preview = 50000
if len(content_json) > max_json_preview:
# Show first few entries as a preview
preview_entries = submissions[:10]
preview_json = json.dumps(preview_entries, indent=2, sort_keys=True)
json_section = (
f"### Submissions (showing 10 of {len(submissions)})\n"
f"```json\n{preview_json}\n```\n\n"
f"*Full submission ({len(submissions)} entries, {len(content_json):,} chars) is in the committed file.*"
)
else:
json_section = f"### Submissions\n```json\n{content_json}\n```"
pr_body = f"""## Community Submission pr_body = f"""## Community Submission
Adds {len(submissions)} submission(s) from @{author_username}. Adds {len(submissions)} submission(s) from @{author_username}.
@@ -257,10 +271,7 @@ Closes #{issue_number}
--- ---
### Submissions {json_section}"""
```json
{content_json}
```"""
pr = create_pull_request( pr = create_pull_request(
title=f"Community submission: {filename}", title=f"Community submission: {filename}",
@@ -0,0 +1,40 @@
#!/usr/bin/env python3
"""
Download ADS-B Exchange basic-ac-db.json.gz.
Usage:
python -m src.contributions.create_daily_adsbexchange_release [--date YYYY-MM-DD]
"""
from __future__ import annotations
import argparse
import shutil
from datetime import datetime, timezone
from pathlib import Path
from urllib.request import Request, urlopen
URL = "https://downloads.adsbexchange.com/downloads/basic-ac-db.json.gz"
OUT_ROOT = Path("data/openairframes")
def main() -> None:
parser = argparse.ArgumentParser(description="Create daily ADS-B Exchange JSON release")
parser.add_argument("--date", type=str, help="Date to process (YYYY-MM-DD format, default: today UTC)")
args = parser.parse_args()
date_str = args.date or datetime.now(timezone.utc).strftime("%Y-%m-%d")
OUT_ROOT.mkdir(parents=True, exist_ok=True)
gz_path = OUT_ROOT / f"basic-ac-db_{date_str}.json.gz"
print(f"Downloading {URL}...")
req = Request(URL, headers={"User-Agent": "openairframes-downloader/1.0"}, method="GET")
with urlopen(req, timeout=300) as r, gz_path.open("wb") as f:
shutil.copyfileobj(r, f)
print(f"Wrote: {gz_path}")
if __name__ == "__main__":
main()
@@ -24,7 +24,7 @@ def read_all_submissions(community_dir: Path) -> list[dict]:
"""Read all JSON submissions from the community directory.""" """Read all JSON submissions from the community directory."""
all_submissions = [] all_submissions = []
for json_file in sorted(community_dir.glob("*.json")): for json_file in sorted(community_dir.glob("**/*.json")):
try: try:
with open(json_file) as f: with open(json_file) as f:
data = json.load(f) data = json.load(f)
@@ -0,0 +1,55 @@
#!/usr/bin/env python3
"""
Download Mictronics aircraft database zip.
Usage:
python -m src.contributions.create_daily_microtonics_release [--date YYYY-MM-DD]
"""
from __future__ import annotations
import argparse
import shutil
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from urllib.error import URLError
from urllib.request import Request, urlopen
URL = "https://www.mictronics.de/aircraft-database/indexedDB_old.php"
OUT_ROOT = Path("data/openairframes")
MAX_RETRIES = 3
RETRY_DELAY = 30 # seconds
def main() -> None:
parser = argparse.ArgumentParser(description="Create daily Mictronics database release")
parser.add_argument("--date", type=str, help="Date to process (YYYY-MM-DD format, default: today UTC)")
args = parser.parse_args()
date_str = args.date or datetime.now(timezone.utc).strftime("%Y-%m-%d")
OUT_ROOT.mkdir(parents=True, exist_ok=True)
zip_path = OUT_ROOT / f"mictronics-db_{date_str}.zip"
for attempt in range(1, MAX_RETRIES + 1):
try:
print(f"Downloading {URL} (attempt {attempt}/{MAX_RETRIES})...")
req = Request(URL, headers={"User-Agent": "Mozilla/5.0 (compatible; openairframes-downloader/1.0)"}, method="GET")
with urlopen(req, timeout=120) as r, zip_path.open("wb") as f:
shutil.copyfileobj(r, f)
print(f"Wrote: {zip_path}")
return
except (URLError, TimeoutError) as e:
print(f"Attempt {attempt} failed: {e}")
if attempt < MAX_RETRIES:
print(f"Retrying in {RETRY_DELAY} seconds...")
time.sleep(RETRY_DELAY)
else:
print("All retries exhausted. Mictronics download failed.")
sys.exit(1)
if __name__ == "__main__":
main()
+65 -3
View File
@@ -36,6 +36,52 @@ def get_latest_schema_version() -> int:
return max_version return max_version
def _is_balanced_json(text: str) -> bool:
"""
Check if JSON has balanced brackets/braces.
This is a simple check to ensure we captured complete JSON.
Ignores brackets/braces inside strings.
Args:
text: JSON text to check
Returns:
True if balanced, False otherwise
"""
in_string = False
escape = False
stack = []
pairs = {'[': ']', '{': '}'}
for char in text:
if escape:
escape = False
continue
if char == '\\':
escape = True
continue
if char == '"' and not escape:
in_string = not in_string
continue
if in_string:
continue
if char in pairs:
stack.append(char)
elif char in pairs.values():
if not stack:
return False
if pairs[stack[-1]] != char:
return False
stack.pop()
return len(stack) == 0 and not in_string
def get_schema_path(version: int | None = None) -> Path: def get_schema_path(version: int | None = None) -> Path:
""" """
Get path to a specific schema version, or latest if version is None. Get path to a specific schema version, or latest if version is None.
@@ -162,10 +208,14 @@ def extract_json_from_issue_body(body: str) -> str | None:
return match.group(1).strip() return match.group(1).strip()
# Try: Raw JSON after "### Submission JSON" until next section or end # Try: Raw JSON after "### Submission JSON" until next section or end
pattern_raw = r"### Submission JSON\s*\n\s*([\[{][\s\S]*?[\]}])(?=\n###|\n\n###|$)" # Use greedy matching since we have a clear boundary (next ### or end)
pattern_raw = r"### Submission JSON\s*\n\s*([\[{][\s\S]*[\]}])(?=\s*\n###|\s*$)"
match = re.search(pattern_raw, body) match = re.search(pattern_raw, body)
if match: if match:
return match.group(1).strip() candidate = match.group(1).strip()
# Validate it's complete JSON by checking balanced brackets
if _is_balanced_json(candidate):
return candidate
# Try: Any JSON object/array in the body (fallback) # Try: Any JSON object/array in the body (fallback)
pattern_any = r"([\[{][\s\S]*?[\]}])" pattern_any = r"([\[{][\s\S]*?[\]}])"
@@ -219,7 +269,19 @@ def parse_and_validate(json_str: str, schema: dict | None = None) -> tuple[list
try: try:
data = json.loads(json_str) data = json.loads(json_str)
except json.JSONDecodeError as e: except json.JSONDecodeError as e:
return None, [f"Invalid JSON: {e}"] # Provide detailed error context
error_msg = f"Invalid JSON: {e}"
# Show context around the error position
if hasattr(e, 'pos') and e.pos is not None:
start = max(0, e.pos - 50)
end = min(len(json_str), e.pos + 50)
context = json_str[start:end]
# Escape for readability
context_escaped = repr(context)
error_msg += f"\n\nContext around position {e.pos}: {context_escaped}"
return None, [error_msg]
errors = validate_submission(data, schema) errors = validate_submission(data, schema)
return data, errors return data, errors
-84
View File
@@ -1,84 +0,0 @@
from pathlib import Path
from datetime import datetime, timezone, timedelta
import sys
import polars as pl
# Add adsb directory to path
sys.path.insert(0, str(Path(__file__).parent / "adsb")) # TODO: Fix this hacky path manipulation
from adsb.compress_adsb_to_aircraft_data import (
load_historical_for_day,
concat_compressed_dfs,
get_latest_aircraft_adsb_csv_df,
)
if __name__ == '__main__':
# Get yesterday's date (data for the previous day)
day = datetime.now(timezone.utc) - timedelta(days=1)
# Find a day with complete data
max_attempts = 2 # Don't look back more than a week
for attempt in range(max_attempts):
date_str = day.strftime("%Y-%m-%d")
print(f"Processing ADS-B data for {date_str}")
print("Loading new ADS-B data...")
df_new = load_historical_for_day(day)
if df_new.height == 0:
day = day - timedelta(days=1)
continue
max_time = df_new['time'].max()
if max_time is not None:
# Handle timezone
max_time_dt = max_time
if hasattr(max_time_dt, 'replace'):
max_time_dt = max_time_dt.replace(tzinfo=timezone.utc)
end_of_day = day.replace(hour=23, minute=59, second=59, tzinfo=timezone.utc) - timedelta(minutes=5)
# Convert polars datetime to python datetime if needed
if isinstance(max_time_dt, datetime):
if max_time_dt.replace(tzinfo=timezone.utc) >= end_of_day:
break
else:
# Polars returns python datetime already
if max_time >= day.replace(hour=23, minute=54, second=59):
break
print(f"WARNING: Latest data time is {max_time}, which is more than 5 minutes before end of day.")
day = day - timedelta(days=1)
else:
raise RuntimeError(f"Could not find complete data in the last {max_attempts} days")
try:
# Get the latest release data
print("Downloading latest ADS-B release...")
df_base, start_date_str = get_latest_aircraft_adsb_csv_df()
# Combine with historical data
print("Combining with historical data...")
df_combined = concat_compressed_dfs(df_base, df_new)
except Exception as e:
print(f"Error downloading latest ADS-B release: {e}")
df_combined = df_new
start_date_str = date_str
# Sort by time for consistent ordering
df_combined = df_combined.sort('time')
# Convert any list columns to strings for CSV compatibility
for col in df_combined.columns:
if df_combined[col].dtype == pl.List:
df_combined = df_combined.with_columns(
pl.col(col).list.join(",").alias(col)
)
# Save the result
OUT_ROOT = Path("data/openairframes")
OUT_ROOT.mkdir(parents=True, exist_ok=True)
output_file = OUT_ROOT / f"openairframes_adsb_{start_date_str}_{date_str}.csv"
df_combined.write_csv(output_file)
print(f"Saved: {output_file}")
print(f"Total aircraft: {df_combined.height}")
+5 -2
View File
@@ -47,6 +47,9 @@ def convert_faa_master_txt_to_df(zip_path: Path, date: str):
# Convert all NaN to empty strings # Convert all NaN to empty strings
df = df.fillna("") df = df.fillna("")
# The FAA parser can produce the literal string "None" for missing values;
# replace those so they match the empty-string convention used everywhere else.
df = df.replace("None", "")
return df return df
@@ -84,8 +87,8 @@ def concat_faa_historical_df(df_base, df_new):
# Convert to string # Convert to string
val_str = str(val).strip() val_str = str(val).strip()
# Handle empty strings # Handle empty strings and null-like literals
if val_str == "" or val_str == "nan": if val_str == "" or val_str == "nan" or val_str == "None":
return "" return ""
# Check if it looks like a list representation (starts with [ ) # Check if it looks like a list representation (starts with [ )
+93 -29
View File
@@ -27,6 +27,33 @@ def _http_get_json(url: str, headers: dict[str, str]) -> dict:
return json.loads(data.decode("utf-8")) return json.loads(data.decode("utf-8"))
def get_releases(repo: str = REPO, github_token: Optional[str] = None, per_page: int = 30) -> list[dict]:
"""Get a list of releases from the repository."""
url = f"https://api.github.com/repos/{repo}/releases?per_page={per_page}"
headers = {
"Accept": "application/vnd.github+json",
"User-Agent": "openairframes-downloader/1.0",
}
if github_token:
headers["Authorization"] = f"Bearer {github_token}"
return _http_get_json(url, headers=headers)
def get_release_assets_from_release_data(release_data: dict) -> list[ReleaseAsset]:
"""Extract assets from a release data dictionary."""
assets = []
for a in release_data.get("assets", []):
assets.append(
ReleaseAsset(
name=a["name"],
download_url=a["browser_download_url"],
size=int(a.get("size", 0)),
)
)
return assets
def get_latest_release_assets(repo: str = REPO, github_token: Optional[str] = None) -> list[ReleaseAsset]: def get_latest_release_assets(repo: str = REPO, github_token: Optional[str] = None) -> list[ReleaseAsset]:
url = f"https://api.github.com/repos/{repo}/releases/latest" url = f"https://api.github.com/repos/{repo}/releases/latest"
headers = { headers = {
@@ -37,16 +64,7 @@ def get_latest_release_assets(repo: str = REPO, github_token: Optional[str] = No
headers["Authorization"] = f"Bearer {github_token}" headers["Authorization"] = f"Bearer {github_token}"
payload = _http_get_json(url, headers=headers) payload = _http_get_json(url, headers=headers)
assets = [] return get_release_assets_from_release_data(payload)
for a in payload.get("assets", []):
assets.append(
ReleaseAsset(
name=a["name"],
download_url=a["browser_download_url"],
size=int(a.get("size", 0)),
)
)
return assets
def pick_asset( def pick_asset(
@@ -119,6 +137,7 @@ def download_latest_aircraft_csv(
Returns: Returns:
Path to the downloaded file Path to the downloaded file
""" """
output_dir = Path(output_dir)
assets = get_latest_release_assets(repo, github_token=github_token) assets = get_latest_release_assets(repo, github_token=github_token)
try: try:
asset = pick_asset(assets, name_regex=r"^openairframes_faa_.*\.csv$") asset = pick_asset(assets, name_regex=r"^openairframes_faa_.*\.csv$")
@@ -154,7 +173,8 @@ def download_latest_aircraft_adsb_csv(
repo: str = REPO, repo: str = REPO,
) -> Path: ) -> Path:
""" """
Download the latest openairframes_adsb_*.csv file from the latest GitHub release. Download the latest openairframes_adsb_*.csv file from GitHub releases.
If the latest release doesn't have the file, searches previous releases.
Args: Args:
output_dir: Directory to save the downloaded file (default: "downloads") output_dir: Directory to save the downloaded file (default: "downloads")
@@ -164,26 +184,70 @@ def download_latest_aircraft_adsb_csv(
Returns: Returns:
Path to the downloaded file Path to the downloaded file
""" """
assets = get_latest_release_assets(repo, github_token=github_token) output_dir = Path(output_dir)
asset = pick_asset(assets, name_regex=r"^openairframes_adsb_.*\.csv$")
saved_to = download_asset(asset, output_dir / asset.name, github_token=github_token)
print(f"Downloaded: {asset.name} ({asset.size} bytes) -> {saved_to}")
return saved_to
def get_latest_aircraft_adsb_csv_df():
csv_path = download_latest_aircraft_adsb_csv()
import pandas as pd
df = pd.read_csv(csv_path)
df = df.fillna("")
# Extract start date from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv
match = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path))
if not match:
raise ValueError(f"Could not extract date from filename: {csv_path.name}")
date_str = match.group(1) # Get multiple releases
return df, date_str releases = get_releases(repo, github_token=github_token, per_page=30)
# Try each release until we find one with the matching asset
for release in releases:
assets = get_release_assets_from_release_data(release)
try:
asset = pick_asset(assets, name_regex=r"^openairframes_adsb_.*\.csv(\.gz)?$")
saved_to = download_asset(asset, output_dir / asset.name, github_token=github_token)
print(f"Downloaded: {asset.name} ({asset.size} bytes) -> {saved_to}")
return saved_to
except FileNotFoundError:
# This release doesn't have the matching asset, try the next one
continue
raise FileNotFoundError(
f"No release in the last 30 releases has an asset matching 'openairframes_adsb_.*\\.csv(\\.gz)?$'"
)
import polars as pl
def get_latest_aircraft_adsb_csv_df():
"""Download and load the latest ADS-B CSV from GitHub releases.
Returns:
tuple: (df, start_date, end_date) where dates are in YYYY-MM-DD format
"""
import re
csv_path = download_latest_aircraft_adsb_csv()
df = pl.read_csv(csv_path, null_values=[""])
# Parse time column: values like "2025-12-31T00:00:00.040" or "2025-05-11T15:15:50.540+0000"
# Try with timezone first (convert to naive), then without timezone
df = df.with_columns(
pl.col("time").str.strptime(pl.Datetime("ms"), "%Y-%m-%dT%H:%M:%S%.f%z", strict=False)
.dt.replace_time_zone(None) # Convert to naive datetime first
.fill_null(pl.col("time").str.strptime(pl.Datetime("ms"), "%Y-%m-%dT%H:%M:%S%.f", strict=False))
)
# Cast dbFlags and year to strings to match the schema used in compress functions
for col in ['dbFlags', 'year']:
if col in df.columns:
df = df.with_columns(pl.col(col).cast(pl.Utf8))
# Fill nulls with empty strings for string columns
for col in df.columns:
if df[col].dtype == pl.Utf8:
df = df.with_columns(pl.col(col).fill_null(""))
# Extract start and end dates from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv[.gz]
match = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})\.csv", str(csv_path))
if not match:
raise ValueError(f"Could not extract dates from filename: {csv_path.name}")
start_date = match.group(1)
end_date = match.group(2)
print(df.columns)
print(df.dtypes)
return df, start_date, end_date
if __name__ == "__main__": if __name__ == "__main__":
download_latest_aircraft_csv() download_latest_aircraft_csv()
download_latest_aircraft_adsb_csv()