Mirror of https://github.com/PlaneQuery/OpenAirframes.git (synced 2026-05-03 16:25:08 +02:00)
Compare commits
26 Commits
| SHA1 |
|---|
| a8b2b66952 |
| 3f38263a0c |
| 1a553d5f44 |
| 405855c566 |
| 4e81dde201 |
| fde8ef029c |
| 18ab51bd60 |
| 83b9d2a76d |
| 8874619ab0 |
| 823f291728 |
| 982011b36f |
| 1b15e43669 |
| f17adc4574 |
| 6a250a63fb |
| 9e24fcbc63 |
| 8ce04f1f83 |
| 9441761ac9 |
| ccf55b2308 |
| 76eaf118ef |
| 0fcbad0fbc |
| 0c7484e7bf |
| 8c60ac611d |
| 145f1006be |
| f5465f0552 |
| 17098ae39a |
| 6f6b65780a |
@@ -1,182 +0,0 @@
```yaml
name: Historical ADS-B Processing

on:
  workflow_dispatch:
    inputs:
      date:
        description: 'YYYY-MM-DD'
        required: true
        type: string
      concat_with_latest_csv:
        description: 'Also concatenate with latest CSV from GitHub releases'
        required: false
        type: boolean
        default: false
  workflow_call:
    inputs:
      date:
        description: 'YYYY-MM-DD'
        required: true
        type: string
      concat_with_latest_csv:
        description: 'Also concatenate with latest CSV from GitHub releases'
        required: false
        type: boolean
        default: false

jobs:
  adsb-extract:
    runs-on: ubuntu-24.04-arm
    steps:
      - name: Checkout
        uses: actions/checkout@v6

      - name: Setup Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.12'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Download and split ADS-B data
        env:
          DATE: ${{ inputs.date }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          python -m src.adsb.download_and_list_icaos --date "$DATE"
          ls -lah data/output/adsb_archives/"$DATE" || true

      - name: Upload archive part 0
        uses: actions/upload-artifact@v4
        with:
          name: adsb-archive-${{ inputs.date }}-part-0
          path: data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_0.tar.gz
          retention-days: 1
          compression-level: 0
          if-no-files-found: error

      - name: Upload archive part 1
        uses: actions/upload-artifact@v4
        with:
          name: adsb-archive-${{ inputs.date }}-part-1
          path: data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_1.tar.gz
          retention-days: 1
          compression-level: 0
          if-no-files-found: error

      - name: Upload archive part 2
        uses: actions/upload-artifact@v4
        with:
          name: adsb-archive-${{ inputs.date }}-part-2
          path: data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_2.tar.gz
          retention-days: 1
          compression-level: 0
          if-no-files-found: error

      - name: Upload archive part 3
        uses: actions/upload-artifact@v4
        with:
          name: adsb-archive-${{ inputs.date }}-part-3
          path: data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_3.tar.gz
          retention-days: 1
          compression-level: 0
          if-no-files-found: error

  adsb-map:
    needs: adsb-extract
    runs-on: ubuntu-24.04-arm
    strategy:
      fail-fast: true
      matrix:
        part_id: [0, 1, 2, 3]
    steps:
      - name: Checkout
        uses: actions/checkout@v6

      - name: Setup Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.12'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Download archive part
        uses: actions/download-artifact@v4
        with:
          name: adsb-archive-${{ inputs.date }}-part-${{ matrix.part_id }}
          path: data/output/adsb_archives/${{ inputs.date }}

      - name: Verify archive
        run: |
          FILE="data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_${{ matrix.part_id }}.tar.gz"
          ls -lah data/output/adsb_archives/${{ inputs.date }}/
          if [ ! -f "$FILE" ]; then
            echo "::error::Archive not found: $FILE"
            exit 1
          fi
          echo "Verified: $(du -h "$FILE")"

      - name: Process part
        env:
          DATE: ${{ inputs.date }}
        run: |
          python -m src.adsb.process_icao_chunk --part-id ${{ matrix.part_id }} --date "$DATE"

      - name: Upload compressed outputs
        uses: actions/upload-artifact@v4
        with:
          name: adsb-compressed-${{ inputs.date }}-part-${{ matrix.part_id }}
          path: data/output/compressed/${{ inputs.date }}
          retention-days: 1
          compression-level: 0
          if-no-files-found: error

  adsb-reduce:
    needs: adsb-map
    runs-on: ubuntu-24.04-arm
    steps:
      - name: Checkout
        uses: actions/checkout@v6

      - name: Setup Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.12'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Download compressed outputs
        uses: actions/download-artifact@v4
        with:
          pattern: adsb-compressed-${{ inputs.date }}-part-*
          path: data/output/compressed/${{ inputs.date }}
          merge-multiple: true

      - name: Concatenate final outputs
        env:
          DATE: ${{ inputs.date }}
          CONCAT_WITH_LATEST_CSV: ${{ inputs.concat_with_latest_csv }}
        run: |
          EXTRA=""
          if [ "$CONCAT_WITH_LATEST_CSV" = "true" ]; then
            EXTRA="--concat_with_latest_csv"
          fi
          python -m src.adsb.concat_parquet_to_final --date "$DATE" $EXTRA
          ls -lah data/output/ || true

      - name: Upload final artifacts
        uses: actions/upload-artifact@v4
        with:
          name: openairframes_adsb-${{ inputs.date }}
          path: data/output/openairframes_adsb_*
          retention-days: 30
          if-no-files-found: error
```
@@ -1,118 +0,0 @@
```yaml
name: adsb-to-aircraft-multiple-day-run

on:
  workflow_dispatch:
    inputs:
      start_date:
        description: 'YYYY-MM-DD (inclusive)'
        required: true
        type: string
      end_date:
        description: 'YYYY-MM-DD (exclusive)'
        required: true
        type: string

jobs:
  generate-dates:
    runs-on: ubuntu-24.04-arm
    outputs:
      dates: ${{ steps.generate.outputs.dates }}
    steps:
      - name: Generate date list
        id: generate
        env:
          START_DATE: ${{ inputs.start_date }}
          END_DATE: ${{ inputs.end_date }}
        run: |
          python - <<'PY'
          import json
          import os
          from datetime import datetime, timedelta

          start = datetime.strptime(os.environ["START_DATE"], "%Y-%m-%d")
          end = datetime.strptime(os.environ["END_DATE"], "%Y-%m-%d")
          if end <= start:
              raise SystemExit("end_date must be after start_date")

          dates = []
          cur = start
          while cur < end:
              dates.append(cur.strftime("%Y-%m-%d"))
              cur += timedelta(days=1)

          with open(os.environ["GITHUB_OUTPUT"], "a") as f:
              f.write(f"dates={json.dumps(dates)}\n")
          PY

  adsb-day:
    needs: generate-dates
    strategy:
      fail-fast: true
      matrix:
        date: ${{ fromJson(needs.generate-dates.outputs.dates) }}
    uses: ./.github/workflows/adsb-to-aircraft-for-day.yaml
    with:
      date: ${{ matrix.date }}

  adsb-final:
    needs: adsb-day
    runs-on: ubuntu-24.04-arm
    steps:
      - name: Checkout
        uses: actions/checkout@v6

      - name: Setup Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.12'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Download daily CSVs
        uses: actions/download-artifact@v4
        with:
          pattern: openairframes_adsb-*
          path: outputs/daily/
          merge-multiple: true

      - name: Concatenate all days to final CSV
        env:
          START_DATE: ${{ inputs.start_date }}
          END_DATE: ${{ inputs.end_date }}
        run: |
          python - <<'PY'
          import os
          import re
          from pathlib import Path
          import polars as pl

          start = os.environ["START_DATE"]
          end = os.environ["END_DATE"]
          daily_dir = Path("outputs/daily")
          files = sorted(daily_dir.glob("openairframes_adsb_*.csv.gz"))
          if not files:
              raise SystemExit("No daily CSVs found")

          def date_key(path: Path) -> str:
              m = re.match(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_", path.name)
              return m.group(1) if m else path.name

          files = sorted(files, key=date_key)
          frames = [pl.read_csv(p) for p in files]
          df = pl.concat(frames, how="vertical", rechunk=True)

          output_path = Path("outputs") / f"openairframes_adsb_{start}_{end}.csv.gz"
          df.write_csv(output_path, compression="gzip")
          print(f"Wrote {output_path} with {df.height} rows")
          PY

      - name: Upload final CSV
        uses: actions/upload-artifact@v4
        with:
          name: openairframes_adsb-${{ inputs.start_date }}-${{ inputs.end_date }}
          path: outputs/openairframes_adsb_${{ inputs.start_date }}_${{ inputs.end_date }}.csv.gz
          retention-days: 30

# gh workflow run adsb-to-aircraft-multiple-day-run.yaml --repo ggman12/OpenAirframes --ref jonah/fix-historical-proper -f start_date=2025-12-31 -f end_date=2026-01-02
```
@@ -0,0 +1,286 @@
```yaml
name: Historical ADS-B Processing

on:
  workflow_dispatch:
    inputs:
      start_date:
        description: 'Start date (YYYY-MM-DD, inclusive)'
        required: true
        type: string
      end_date:
        description: 'End date (YYYY-MM-DD, exclusive)'
        required: true
        type: string
      chunk_days:
        description: 'Days per job chunk (default: 7)'
        required: false
        type: number
        default: 7

jobs:
  generate-matrix:
    runs-on: ubuntu-latest
    outputs:
      chunks: ${{ steps.generate.outputs.chunks }}
      global_start: ${{ inputs.start_date }}
      global_end: ${{ inputs.end_date }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Generate date chunks
        id: generate
        env:
          INPUT_START_DATE: ${{ inputs.start_date }}
          INPUT_END_DATE: ${{ inputs.end_date }}
          INPUT_CHUNK_DAYS: ${{ inputs.chunk_days }}
        run: python src/adsb/historical_generate_matrix.py

  adsb-extract:
    needs: generate-matrix
    runs-on: ubuntu-24.04-arm
    strategy:
      matrix:
        chunk: ${{ fromJson(needs.generate-matrix.outputs.chunks) }}
      max-parallel: 3
      fail-fast: true
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Free disk space
        run: |
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /opt/ghc
          sudo rm -rf /usr/local/share/boost
          df -h

      - name: Download and extract ADS-B data
        env:
          START_DATE: ${{ matrix.chunk.start_date }}
          END_DATE: ${{ matrix.chunk.end_date }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          python -m src.adsb.download_and_list_icaos --start-date "$START_DATE" --end-date "$END_DATE"
          ls -lah data/output/

      - name: Create tar of extracted data and split into chunks
        run: |
          cd data/output
          echo "=== Disk space before tar ==="
          df -h .
          echo "=== Files to tar ==="
          ls -lah *-planes-readsb-prod-0.tar_0 icao_manifest_*.txt 2>/dev/null || echo "No files found"

          # Create tar with explicit error checking
          if ls *-planes-readsb-prod-0.tar_0 1>/dev/null 2>&1; then
            tar -cvf extracted_data.tar *-planes-readsb-prod-0.tar_0 icao_manifest_*.txt
            echo "=== Tar file created ==="
            ls -lah extracted_data.tar
            # Verify tar integrity
            tar -tf extracted_data.tar > /dev/null && echo "Tar integrity check passed" || { echo "Tar integrity check FAILED"; exit 1; }

            # Record tar size and checksum for verification after reassembly
            echo "=== Recording tar metadata ==="
            ORIGINAL_SIZE=$(stat --format=%s extracted_data.tar)
            ORIGINAL_SHA=$(sha256sum extracted_data.tar | awk '{print $1}')
            echo "Size: $ORIGINAL_SIZE"
            echo "SHA256: $ORIGINAL_SHA"

            # Split into 500MB chunks to avoid artifact upload issues
            echo "=== Splitting tar into 500MB chunks ==="
            mkdir -p tar_chunks
            split -b 500M extracted_data.tar tar_chunks/extracted_data.tar.part_
            rm extracted_data.tar

            # Write metadata file (plain text so artifact upload won't skip it)
            echo "$ORIGINAL_SHA extracted_data.tar" > tar_chunks/checksum.txt
            echo "$ORIGINAL_SIZE" >> tar_chunks/checksum.txt

            echo "=== Chunks created ==="
            ls -lah tar_chunks/
            echo "=== Checksum file ==="
            cat tar_chunks/checksum.txt
          else
            echo "ERROR: No extracted directories found, cannot create tar"
            exit 1
          fi

      - name: Upload extracted data chunks
        uses: actions/upload-artifact@v4
        with:
          name: adsb-extracted-${{ matrix.chunk.start_date }}-${{ matrix.chunk.end_date }}
          path: data/output/tar_chunks/
          retention-days: 1
          compression-level: 0
          if-no-files-found: warn

  adsb-map:
    needs: [generate-matrix, adsb-extract]
    runs-on: ubuntu-24.04-arm
    strategy:
      fail-fast: true
      matrix:
        chunk: ${{ fromJson(needs.generate-matrix.outputs.chunks) }}
        icao_chunk: [0, 1, 2, 3]
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Free disk space
        run: |
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /opt/ghc
          sudo rm -rf /usr/local/share/boost
          df -h

      - name: Download extracted data
        uses: actions/download-artifact@v4
        with:
          name: adsb-extracted-${{ matrix.chunk.start_date }}-${{ matrix.chunk.end_date }}
          path: data/output/tar_chunks/

      - name: Reassemble and extract tar
        id: extract
        run: |
          cd data/output
          if [ -d tar_chunks ] && ls tar_chunks/extracted_data.tar.part_* 1>/dev/null 2>&1; then
            echo "=== Chunk files info ==="
            ls -lah tar_chunks/

            cd tar_chunks

            # Reassemble tar with explicit sorting
            echo "=== Reassembling tar file ==="
            ls -1 extracted_data.tar.part_?? | sort | while read part; do
              echo "Appending $part..."
              cat "$part" >> ../extracted_data.tar
            done
            cd ..

            echo "=== Reassembled tar file info ==="
            ls -lah extracted_data.tar

            # Verify integrity
            echo "=== Verifying reassembled tar ==="
            if [ -f tar_chunks/checksum.txt ]; then
              EXPECTED_SHA=$(head -1 tar_chunks/checksum.txt | awk '{print $1}')
              EXPECTED_SIZE=$(sed -n '2p' tar_chunks/checksum.txt)
              ACTUAL_SHA=$(sha256sum extracted_data.tar | awk '{print $1}')
              ACTUAL_SIZE=$(stat --format=%s extracted_data.tar)
              echo "Expected: SHA=$EXPECTED_SHA Size=$EXPECTED_SIZE"
              echo "Actual: SHA=$ACTUAL_SHA Size=$ACTUAL_SIZE"
              if [ "$EXPECTED_SHA" != "$ACTUAL_SHA" ] || [ "$EXPECTED_SIZE" != "$ACTUAL_SIZE" ]; then
                echo "ERROR: Reassembled tar does not match original - data corrupted during transfer"
                exit 1
              fi
              echo "Checksum and size verified"
            else
              echo "WARNING: No checksum file found, falling back to tar integrity check"
              tar -tf extracted_data.tar > /dev/null || { echo "ERROR: Tar file is corrupted"; exit 1; }
              echo "Tar integrity check passed"
            fi

            rm -rf tar_chunks

            echo "=== Extracting ==="
            tar -xf extracted_data.tar
            rm extracted_data.tar
            echo "has_data=true" >> "$GITHUB_OUTPUT"
            echo "=== Contents of data/output ==="
            ls -lah
          else
            echo "No tar chunks found"
            echo "has_data=false" >> "$GITHUB_OUTPUT"
          fi

      - name: Process ICAO chunk
        if: steps.extract.outputs.has_data == 'true'
        env:
          START_DATE: ${{ matrix.chunk.start_date }}
          END_DATE: ${{ matrix.chunk.end_date }}
        run: |
          python -m src.adsb.process_icao_chunk --chunk-id ${{ matrix.icao_chunk }} --total-chunks 4 --start-date "$START_DATE" --end-date "$END_DATE"
          ls -lah data/output/adsb_chunks/ || echo "No chunks created"

      - name: Upload chunk artifacts
        if: steps.extract.outputs.has_data == 'true'
        uses: actions/upload-artifact@v4
        with:
          name: adsb-map-${{ matrix.chunk.start_date }}-${{ matrix.chunk.end_date }}-chunk-${{ matrix.icao_chunk }}
          path: data/output/adsb_chunks/
          retention-days: 1
          if-no-files-found: ignore

  adsb-reduce:
    needs: [generate-matrix, adsb-map]
    runs-on: ubuntu-24.04-arm
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Download all chunk artifacts
        uses: actions/download-artifact@v4
        with:
          pattern: adsb-map-*
          path: data/output/adsb_chunks/
          merge-multiple: true

      - name: Debug downloaded files
        run: |
          echo "=== Disk space before processing ==="
          df -h
          echo "=== Listing data/output/adsb_chunks/ ==="
          find data/output/adsb_chunks/ -type f 2>/dev/null | wc -l
          echo "=== Total parquet size ==="
          du -sh data/output/adsb_chunks/ || echo "No chunks dir"

      - name: Combine chunks to CSV
        env:
          START_DATE: ${{ needs.generate-matrix.outputs.global_start }}
          END_DATE: ${{ needs.generate-matrix.outputs.global_end }}
        run: |
          python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks --start-date "$START_DATE" --end-date "$END_DATE" --skip-base --stream
          ls -lah data/openairframes/

      - name: Upload final artifact
        uses: actions/upload-artifact@v4
        with:
          name: openairframes_adsb-${{ needs.generate-matrix.outputs.global_start }}-${{ needs.generate-matrix.outputs.global_end }}
          path: data/openairframes/*.csv.gz
          retention-days: 30
```
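The generate-matrix job above delegates chunking to `src/adsb/historical_generate_matrix.py`, which is not part of this diff. A minimal sketch of what such a script could look like, assuming it only reads `INPUT_START_DATE`, `INPUT_END_DATE`, and `INPUT_CHUNK_DAYS` and writes a `chunks` JSON array of `{start_date, end_date}` objects to `GITHUB_OUTPUT` (matching how `matrix.chunk` is consumed):

```python
# Hypothetical sketch, not the repo's actual historical_generate_matrix.py:
# split [INPUT_START_DATE, INPUT_END_DATE) into INPUT_CHUNK_DAYS-sized chunks
# and emit them as the `chunks` step output for the job matrix.
import json
import os
from datetime import datetime, timedelta

start = datetime.strptime(os.environ["INPUT_START_DATE"], "%Y-%m-%d")
end = datetime.strptime(os.environ["INPUT_END_DATE"], "%Y-%m-%d")
chunk_days = int(os.environ.get("INPUT_CHUNK_DAYS") or 7)

chunks = []
cur = start
while cur < end:
    chunk_end = min(cur + timedelta(days=chunk_days), end)
    chunks.append({
        "start_date": cur.strftime("%Y-%m-%d"),
        "end_date": chunk_end.strftime("%Y-%m-%d"),
    })
    cur = chunk_end

# GITHUB_OUTPUT is provided by the Actions runner.
with open(os.environ["GITHUB_OUTPUT"], "a") as f:
    f.write(f"chunks={json.dumps(chunks)}\n")
```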
@@ -1,4 +1,4 @@
```
name: openairframes-daily-release
name: OpenAirframes Daily Release

on:
  schedule:
```
@@ -76,75 +76,159 @@ jobs:
```
data/faa_releasable/ReleasableAircraft_*.zip
retention-days: 1

resolve-dates:
runs-on: ubuntu-latest
adsb-extract:
runs-on: ubuntu-24.04-arm
if: github.event_name != 'schedule'
outputs:
date: ${{ steps.out.outputs.date }}
adsb_date: ${{ steps.out.outputs.adsb_date }}
steps:
- id: out
run: |
if [ -n "${{ inputs.date }}" ]; then
echo "date=${{ inputs.date }}" >> "$GITHUB_OUTPUT"
echo "adsb_date=${{ inputs.date }}" >> "$GITHUB_OUTPUT"
else
echo "date=$(date -u -d 'yesterday' +%Y-%m-%d)" >> "$GITHUB_OUTPUT"
echo "adsb_date=$(date -u -d 'yesterday' +%Y-%m-%d)" >> "$GITHUB_OUTPUT"
fi

adsb-to-aircraft:
needs: resolve-dates
if: github.event_name != 'schedule'
uses: ./.github/workflows/adsb-to-aircraft-for-day.yaml
with:
date: ${{ needs.resolve-dates.outputs.adsb_date }}
concat_with_latest_csv: true

adsb-reduce:
needs: [resolve-dates, adsb-to-aircraft]
if: always() && github.event_name != 'schedule' && needs.adsb-to-aircraft.result == 'failure'
runs-on: ubuntu-24.04-arm
manifest-exists: ${{ steps.check.outputs.exists }}
steps:
- name: Checkout
uses: actions/checkout@v6
with:
fetch-depth: 0

- name: Setup Python
uses: actions/setup-python@v6
with:
python-version: '3.12'
python-version: "3.14"

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt

- name: Download compressed outputs
uses: actions/download-artifact@v4
with:
pattern: adsb-compressed-${{ needs.resolve-dates.outputs.adsb_date }}-part-*
path: data/output/compressed/${{ needs.resolve-dates.outputs.adsb_date }}
merge-multiple: true

- name: Concatenate final outputs
- name: Download and extract ADS-B data
env:
DATE: ${{ needs.resolve-dates.outputs.adsb_date }}
CONCAT_WITH_LATEST_CSV: true
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
EXTRA=""
if [ "$CONCAT_WITH_LATEST_CSV" = "true" ]; then
EXTRA="--concat_with_latest_csv"
fi
python -m src.adsb.concat_parquet_to_final --date "$DATE" $EXTRA
ls -lah data/output/ || true
python -m src.adsb.download_and_list_icaos ${{ inputs.date && format('--date {0}', inputs.date) || '' }}
ls -lah data/output/

- name: Upload final artifacts
- name: Check manifest exists
id: check
run: |
if ls data/output/icao_manifest_*.txt 1>/dev/null 2>&1; then
echo "exists=true" >> "$GITHUB_OUTPUT"
else
echo "exists=false" >> "$GITHUB_OUTPUT"
fi

- name: Create tar of extracted data
run: |
cd data/output
tar -cf extracted_data.tar *-planes-readsb-prod-0.tar_0 icao_manifest_*.txt
ls -lah extracted_data.tar

- name: Upload extracted data
uses: actions/upload-artifact@v4
with:
name: openairframes_adsb-${{ needs.resolve-dates.outputs.adsb_date }}
path: data/output/openairframes_adsb_*
retention-days: 30
if-no-files-found: error
name: adsb-extracted
path: data/output/extracted_data.tar
retention-days: 1
compression-level: 0 # Already compressed trace files

adsb-map:
runs-on: ubuntu-24.04-arm
needs: adsb-extract
if: github.event_name != 'schedule' && needs.adsb-extract.outputs.manifest-exists == 'true'
strategy:
fail-fast: false
matrix:
chunk: [0, 1, 2, 3]
steps:
- name: Checkout
uses: actions/checkout@v6
with:
fetch-depth: 0

- name: Setup Python
uses: actions/setup-python@v6
with:
python-version: "3.14"

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt

- name: Download extracted data
uses: actions/download-artifact@v4
with:
name: adsb-extracted
path: data/output/

- name: Extract tar
run: |
cd data/output
tar -xf extracted_data.tar
rm extracted_data.tar
echo "=== Contents of data/output ==="
ls -lah
echo "=== Looking for manifest ==="
cat icao_manifest_*.txt | head -20 || echo "No manifest found"
echo "=== Looking for extracted dirs ==="
ls -d *-planes-readsb-prod-0* 2>/dev/null || echo "No extracted dirs"

- name: Process chunk ${{ matrix.chunk }}
run: |
python -m src.adsb.process_icao_chunk --chunk-id ${{ matrix.chunk }} --total-chunks 4 ${{ inputs.date && format('--date {0}', inputs.date) || '' }}
mkdir -p data/output/adsb_chunks
ls -lah data/output/adsb_chunks/ || echo "No chunks created"

- name: Upload chunk artifacts
uses: actions/upload-artifact@v4
with:
name: adsb-chunk-${{ matrix.chunk }}
path: data/output/adsb_chunks/
retention-days: 1

adsb-reduce:
runs-on: ubuntu-24.04-arm
needs: adsb-map
if: github.event_name != 'schedule'
steps:
- name: Checkout
uses: actions/checkout@v6
with:
fetch-depth: 0

- name: Setup Python
uses: actions/setup-python@v6
with:
python-version: "3.14"

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt

- name: Download all chunk artifacts
uses: actions/download-artifact@v4
with:
pattern: adsb-chunk-*
path: data/output/adsb_chunks/
merge-multiple: true

- name: Debug downloaded files
run: |
echo "=== Listing data/ ==="
find data/ -type f 2>/dev/null | head -50 || echo "No files in data/"
echo "=== Looking for parquet files ==="
find . -name "*.parquet" 2>/dev/null | head -20 || echo "No parquet files found"

- name: Combine chunks to CSV
run: |
mkdir -p data/output/adsb_chunks
ls -lah data/output/adsb_chunks/ || echo "Directory empty or does not exist"
python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks ${{ inputs.date && format('--date {0}', inputs.date) || '' }}
ls -lah data/openairframes/

- name: Upload ADS-B artifacts
uses: actions/upload-artifact@v4
with:
name: adsb-release
path: data/openairframes/openairframes_adsb_*.csv.gz
retention-days: 1

build-community:
runs-on: ubuntu-latest
```
@@ -233,14 +317,9 @@ jobs:
```
create-release:
runs-on: ubuntu-latest
needs: [resolve-dates, build-faa, adsb-to-aircraft, adsb-reduce, build-community, build-adsbexchange-json, build-mictronics-db]
if: github.event_name != 'schedule' && !cancelled()
needs: [build-faa, adsb-reduce, build-community, build-adsbexchange-json, build-mictronics-db]
if: github.event_name != 'schedule' && !failure() && !cancelled()
steps:
- name: Check ADS-B workflow status
if: needs.adsb-to-aircraft.result != 'success' && needs.adsb-reduce.result != 'success'
run: |
echo "WARNING: ADS-B workflow failed (adsb-to-aircraft='${{ needs.adsb-to-aircraft.result }}', adsb-reduce='${{ needs.adsb-reduce.result }}'), will continue without ADS-B artifacts"

- name: Checkout for gh CLI
uses: actions/checkout@v4
with:
```
@@ -249,33 +328,31 @@ jobs:
```
sparse-checkout-cone-mode: false

- name: Download FAA artifacts
uses: actions/download-artifact@v5
uses: actions/download-artifact@v4
with:
name: faa-release
path: artifacts/faa

- name: Download ADS-B artifacts
uses: actions/download-artifact@v5
if: needs.adsb-to-aircraft.result == 'success' || needs.adsb-reduce.result == 'success'
continue-on-error: true
uses: actions/download-artifact@v4
with:
name: openairframes_adsb-${{ needs.resolve-dates.outputs.adsb_date }}
name: adsb-release
path: artifacts/adsb

- name: Download Community artifacts
uses: actions/download-artifact@v5
uses: actions/download-artifact@v4
with:
name: community-release
path: artifacts/community

- name: Download ADS-B Exchange JSON artifact
uses: actions/download-artifact@v5
uses: actions/download-artifact@v4
with:
name: adsbexchange-json
path: artifacts/adsbexchange

- name: Download Mictronics DB artifact
uses: actions/download-artifact@v5
uses: actions/download-artifact@v4
continue-on-error: true
with:
name: mictronics-db
```
@@ -311,11 +388,7 @@ jobs:
```
# Find files from artifacts using find (handles nested structures)
CSV_FILE_FAA=$(find artifacts/faa -name "openairframes_faa_*.csv" -type f 2>/dev/null | head -1)
# Prefer concatenated file (with date range) over single-day file
CSV_FILE_ADSB=$(find artifacts/adsb -name "openairframes_adsb_*_*.csv.gz" -type f 2>/dev/null | head -1)
if [ -z "$CSV_FILE_ADSB" ]; then
CSV_FILE_ADSB=$(find artifacts/adsb -name "openairframes_adsb_*.csv.gz" -type f 2>/dev/null | head -1)
fi
CSV_FILE_ADSB=$(find artifacts/adsb -name "openairframes_adsb_*.csv.gz" -type f 2>/dev/null | head -1)
CSV_FILE_COMMUNITY=$(find artifacts/community -name "openairframes_community_*.csv" -type f 2>/dev/null | head -1)
ZIP_FILE=$(find artifacts/faa -name "ReleasableAircraft_*.zip" -type f 2>/dev/null | head -1)
JSON_FILE_ADSBX=$(find artifacts/adsbexchange -name "basic-ac-db_*.json.gz" -type f 2>/dev/null | head -1)
```
@@ -326,6 +399,9 @@ jobs:
```
if [ -z "$CSV_FILE_FAA" ] || [ ! -f "$CSV_FILE_FAA" ]; then
MISSING_FILES="$MISSING_FILES FAA_CSV"
fi
if [ -z "$CSV_FILE_ADSB" ] || [ ! -f "$CSV_FILE_ADSB" ]; then
MISSING_FILES="$MISSING_FILES ADSB_CSV"
fi
if [ -z "$ZIP_FILE" ] || [ ! -f "$ZIP_FILE" ]; then
MISSING_FILES="$MISSING_FILES FAA_ZIP"
fi
```
@@ -335,11 +411,6 @@ jobs:
```
# Optional files - warn but don't fail
OPTIONAL_MISSING=""
if [ -z "$CSV_FILE_ADSB" ] || [ ! -f "$CSV_FILE_ADSB" ]; then
OPTIONAL_MISSING="$OPTIONAL_MISSING ADSB_CSV"
CSV_FILE_ADSB=""
CSV_BASENAME_ADSB=""
fi
if [ -z "$ZIP_FILE_MICTRONICS" ] || [ ! -f "$ZIP_FILE_MICTRONICS" ]; then
OPTIONAL_MISSING="$OPTIONAL_MISSING MICTRONICS_ZIP"
ZIP_FILE_MICTRONICS=""
```
@@ -357,9 +428,7 @@ jobs:
```
# Get basenames for display
CSV_BASENAME_FAA=$(basename "$CSV_FILE_FAA")
if [ -n "$CSV_FILE_ADSB" ]; then
CSV_BASENAME_ADSB=$(basename "$CSV_FILE_ADSB")
fi
CSV_BASENAME_ADSB=$(basename "$CSV_FILE_ADSB")
CSV_BASENAME_COMMUNITY=$(basename "$CSV_FILE_COMMUNITY" 2>/dev/null || echo "")
ZIP_BASENAME=$(basename "$ZIP_FILE")
JSON_BASENAME_ADSBX=$(basename "$JSON_FILE_ADSBX")
```
@@ -414,7 +483,7 @@ jobs:
```
Assets:
- ${{ steps.meta.outputs.csv_basename_faa }}
${{ steps.meta.outputs.csv_basename_adsb && format('- {0}', steps.meta.outputs.csv_basename_adsb) || '' }}
- ${{ steps.meta.outputs.csv_basename_adsb }}
- ${{ steps.meta.outputs.csv_basename_community }}
- ${{ steps.meta.outputs.zip_basename }}
- ${{ steps.meta.outputs.json_basename_adsbx }}
```
@@ -281,7 +281,4 @@ read*lock
```
.nx/

# jsii-rosetta files
type-fingerprints.txt

notebooks/whatever.ipynb
.snapshots/
type-fingerprints.txt
```
@@ -16,19 +16,11 @@ A daily release is created at **06:00 UTC** and includes:
- **openairframes_community.csv**
  All community submissions

- **openairframes_adsb.csv**
  Airframes dataset derived from ADSB.lol network data. For each UTC day, a row is created for every ICAO address observed in that day's ADS-B messages, using registration data from [tar1090-db](https://github.com/wiedehopf/tar1090-db) (ADSBExchange & Mictronics).
  Example usage:
  ```python
  import pandas as pd

  url = "https://github.com/PlaneQuery/OpenAirframes/releases/download/openairframes-2026-03-18-main/openairframes_adsb_2024-01-01_2026-03-17.csv.gz"  # 1 GB
  df = pd.read_csv(url)
  df
  ```

- **openairframes_faa.csv**
  All [FAA registration data](https://www.faa.gov/licenses_certificates/aircraft_certification/aircraft_registry/releasable_aircraft_download) from 2023-08-16 to present (~260 MB)

- **openairframes_adsb.csv**
  Airframe information derived from ADS-B messages on the [ADSB.lol](https://www.adsb.lol/) network, from 2026-02-12 to present (will be from 2024-01-01 soon). The airframe information originates from the [Mictronics aircraft database](https://www.mictronics.de/aircraft-database/) (~5 MB).

- **ReleasableAircraft_{date}.zip**
  A daily snapshot of the FAA database, which updates at **05:30 UTC**
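The concatenated release CSV above is around 1 GB, so loading it whole may not be practical. A minimal sketch (not from the repo) of streaming it in chunks and keeping only one airframe; the column name `icao` is an assumption and the released file's header should be checked first:

```python
# Hypothetical sketch: stream the ~1 GB release CSV and keep rows for one airframe.
# The "icao" column name is an assumption, not confirmed by the repo.
import pandas as pd

url = "https://github.com/PlaneQuery/OpenAirframes/releases/download/openairframes-2026-03-18-main/openairframes_adsb_2024-01-01_2026-03-17.csv.gz"
target = "43c81c"  # example hex from the community submission sample below

matches = []
for chunk in pd.read_csv(url, chunksize=500_000):
    matches.append(chunk[chunk["icao"].str.lower() == target])  # assumed column name
df = pd.concat(matches, ignore_index=True)
print(df.head())
```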
@@ -1,40 +0,0 @@
```json
[
  {
    "contributor_name": "JohnSmith.com",
    "contributor_uuid": "2981c3ee-8712-5f96-84bf-732eda515a3f",
    "creation_timestamp": "2026-02-18T22:18:11.349009+00:00",
    "registration_number": "ZM146",
    "tags": {
      "citation_0": "https://assets.publishing.service.gov.uk/media/5c07a65f40f0b6705f11cf37/10389.pdf",
      "icao_aircraft_type": "L1J",
      "manufacturer_icao": "LOCKHEED MARTIN",
      "manufacturer_name": "Lockheed-martin",
      "model": "F-35B Lightning II",
      "operator": "Royal Air Force",
      "operator_callsign": "RAFAIR",
      "operator_icao": "RFR",
      "serial_number": "BK-12",
      "type_code": "VF35"
    },
    "transponder_code_hex": "43C81C"
  },
  {
    "contributor_name": "JohnSmith.com",
    "contributor_uuid": "2981c3ee-8712-5f96-84bf-732eda515a3f",
    "creation_timestamp": "2026-02-18T22:18:11.349009+00:00",
    "registration_number": "ZM148",
    "tags": {
      "citation_0": "https://assets.publishing.service.gov.uk/media/5c07a65f40f0b6705f11cf37/10389.pdf",
      "icao_aircraft_type": "L1J",
      "manufacturer_icao": "LOCKHEED MARTIN",
      "manufacturer_name": "Lockheed-martin",
      "model": "F-35B Lightning II",
      "operator": "Royal Air Force",
      "operator_callsign": "RAFAIR",
      "operator_icao": "RFR",
      "serial_number": "BK-14",
      "type_code": "VF35"
    },
    "transponder_code_hex": "43C811"
  }
]
```
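The deleted sample above illustrates the community submission format, with per-airframe attributes nested under `tags`. A minimal sketch (not from the repo) of flattening such a file into one row per airframe; the input path is hypothetical:

```python
# Hypothetical sketch: flatten community submission JSON into one row per airframe,
# promoting each tag key to a column.
import json
import pandas as pd

with open("community_submissions.json") as f:  # hypothetical path
    submissions = json.load(f)

rows = [
    {
        "registration_number": s["registration_number"],
        "transponder_code_hex": s["transponder_code_hex"],
        **s.get("tags", {}),
    }
    for s in submissions
]
print(pd.DataFrame(rows).head())
```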
Binary file not shown (removed image, previously 99 KiB).
@@ -54,38 +54,7 @@
```
"additionalProperties": {
  "$ref": "#/$defs/tagValue"
},
"properties": {
  "citation_0": {
    "type": "string"
  },
  "icao_aircraft_type": {
    "type": "string"
  },
  "manufacturer_icao": {
    "type": "string"
  },
  "manufacturer_name": {
    "type": "string"
  },
  "model": {
    "type": "string"
  },
  "operator": {
    "type": "string"
  },
  "operator_callsign": {
    "type": "string"
  },
  "operator_icao": {
    "type": "string"
  },
  "serial_number": {
    "type": "string"
  },
  "type_code": {
    "type": "string"
  }
}
"properties": {}
}
},
"allOf": [
```
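The hunk above relaxes the `tags` definition: the enumerated per-key `properties` give way to an empty `properties` object, leaving `additionalProperties` with `$defs/tagValue` to constrain values. A minimal sketch (not from the repo) of validating submissions against this schema with the `jsonschema` package; the file paths and the assumption that the submission file is a JSON array are mine:

```python
# Hypothetical sketch: validate community submissions against the
# community_submission schema using the jsonschema package.
import json

from jsonschema import Draft202012Validator

with open("schemas/community_submission.v1.json") as f:  # hypothetical path
    schema = json.load(f)
with open("community_submissions.json") as f:  # hypothetical path
    submissions = json.load(f)

validator = Draft202012Validator(schema)
for i, submission in enumerate(submissions):
    for error in validator.iter_errors(submission):
        location = "/".join(str(p) for p in error.path)
        print(f"submission[{i}] {location}: {error.message}")
```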
@@ -1,49 +0,0 @@
```python
#!/usr/bin/env python3
import re
from pathlib import Path
import polars as pl

# Find all CSV.gz files in the downloaded artifacts
artifacts_dir = Path("downloads/adsb_artifacts")
files = sorted(artifacts_dir.glob("*/openairframes_adsb_*.csv.gz"))

if not files:
    raise SystemExit("No CSV.gz files found in downloads/adsb_artifacts/")

print(f"Found {len(files)} files to concatenate")

# Extract dates from filenames to determine range
def extract_dates(path: Path) -> tuple[str, str]:
    """Extract start and end dates from filename"""
    m = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})\.csv\.gz", path.name)
    if m:
        return m.group(1), m.group(2)
    return None, None

# Collect all dates
all_dates = []
for f in files:
    start, end = extract_dates(f)
    if start and end:
        all_dates.extend([start, end])
        print(f"  {f.name}: {start} to {end}")

if not all_dates:
    raise SystemExit("Could not extract dates from filenames")

# Find earliest and latest dates
earliest = min(all_dates)
latest = max(all_dates)
print(f"\nDate range: {earliest} to {latest}")

# Read and concatenate all files
print("\nReading and concatenating files...")
frames = [pl.read_csv(f) for f in files]
df = pl.concat(frames, how="vertical", rechunk=True)

# Write output
output_path = Path("downloads") / f"openairframes_adsb_{earliest}_{latest}.csv.gz"
output_path.parent.mkdir(parents=True, exist_ok=True)
df.write_csv(output_path, compression="gzip")

print(f"\nWrote {output_path} with {df.height:,} rows")
```
@@ -1,40 +0,0 @@
```bash
#!/bin/bash

# Create download directory
mkdir -p downloads/adsb_artifacts

# Repository from the workflow comment
REPO="ggman12/OpenAirframes"

# Get last 15 runs of the workflow and download matching artifacts
gh run list \
  --repo "$REPO" \
  --workflow adsb-to-aircraft-multiple-day-run.yaml \
  --limit 15 \
  --json databaseId \
  --jq '.[].databaseId' | while read -r run_id; do

  echo "Checking run ID: $run_id"

  # List artifacts for this run using the API
  # Match pattern: openairframes_adsb-YYYY-MM-DD-YYYY-MM-DD (with second date)
  gh api \
    --paginate \
    "repos/$REPO/actions/runs/$run_id/artifacts" \
    --jq '.artifacts[] | select(.name | test("^openairframes_adsb-[0-9]{4}-[0-9]{2}-[0-9]{2}-[0-9]{4}-[0-9]{2}-[0-9]{2}$")) | .name' | while read -r artifact_name; do

    # Check if artifact directory already exists and has files
    if [ -d "downloads/adsb_artifacts/$artifact_name" ] && [ -n "$(ls -A "downloads/adsb_artifacts/$artifact_name" 2>/dev/null)" ]; then
      echo "  Skipping (already exists): $artifact_name"
      continue
    fi

    echo "  Downloading: $artifact_name"
    gh run download "$run_id" \
      --repo "$REPO" \
      --name "$artifact_name" \
      --dir "downloads/adsb_artifacts/$artifact_name"
  done
done

echo "Download complete! Files saved to downloads/adsb_artifacts/"
```
@@ -1,182 +0,0 @@
```python
#!/usr/bin/env python3
"""
Download and concatenate artifacts from a specific set of workflow runs.

Usage:
    python scripts/download_and_concat_runs.py triggered_runs_20260216_123456.json
"""

import argparse
import json
import os
import subprocess
import sys
from pathlib import Path


def download_run_artifact(run_id, output_dir):
    """Download artifact from a specific workflow run."""
    print(f"  Downloading artifacts from run {run_id}...")

    cmd = [
        'gh', 'run', 'download', str(run_id),
        '--pattern', 'openairframes_adsb-*',
        '--dir', output_dir
    ]

    result = subprocess.run(cmd, capture_output=True, text=True)

    if result.returncode == 0:
        print(f"  ✓ Downloaded")
        return True
    else:
        if "no artifacts" in result.stderr.lower():
            print(f"  ⚠ No artifacts found (workflow may still be running)")
        else:
            print(f"  ✗ Failed: {result.stderr}")
        return False


def find_csv_files(download_dir):
    """Find all CSV.gz files in the download directory."""
    csv_files = []
    for root, dirs, files in os.walk(download_dir):
        for file in files:
            if file.endswith('.csv.gz'):
                csv_files.append(os.path.join(root, file))
    return sorted(csv_files)


def concatenate_csv_files(csv_files, output_file):
    """Concatenate CSV files in order, preserving headers."""
    import gzip

    print(f"\nConcatenating {len(csv_files)} CSV files...")

    with gzip.open(output_file, 'wt') as outf:
        header_written = False

        for i, csv_file in enumerate(csv_files, 1):
            print(f"  [{i}/{len(csv_files)}] Processing {os.path.basename(csv_file)}")

            with gzip.open(csv_file, 'rt') as inf:
                lines = inf.readlines()

            if not header_written:
                # Write header from first file
                outf.writelines(lines)
                header_written = True
            else:
                # Skip header for subsequent files
                outf.writelines(lines[1:])

    print(f"\n✓ Concatenated CSV saved to: {output_file}")

    # Show file size
    size_mb = os.path.getsize(output_file) / (1024 * 1024)
    print(f"  Size: {size_mb:.1f} MB")


def main():
    parser = argparse.ArgumentParser(
        description='Download and concatenate artifacts from workflow runs'
    )
    parser.add_argument(
        'runs_file',
        help='JSON file containing run IDs (from run_historical_adsb_action.py)'
    )
    parser.add_argument(
        '--output-dir',
        default='./downloads/historical_concat',
        help='Directory for downloads (default: ./downloads/historical_concat)'
    )
    parser.add_argument(
        '--wait',
        action='store_true',
        help='Wait for workflows to complete before downloading'
    )

    args = parser.parse_args()

    # Load run IDs
    if not os.path.exists(args.runs_file):
        print(f"Error: File not found: {args.runs_file}")
        sys.exit(1)

    with open(args.runs_file, 'r') as f:
        data = json.load(f)

    runs = data['runs']
    start_date = data['start_date']
    end_date = data['end_date']

    print("=" * 60)
    print("Download and Concatenate Historical Artifacts")
    print("=" * 60)
    print(f"Date range: {start_date} to {end_date}")
    print(f"Workflow runs: {len(runs)}")
    print(f"Output directory: {args.output_dir}")
    print("=" * 60)

    # Create output directory
    os.makedirs(args.output_dir, exist_ok=True)

    # Wait for workflows to complete if requested
    if args.wait:
        print("\nWaiting for workflows to complete...")
        for run_info in runs:
            run_id = run_info['run_id']
            print(f"  Checking run {run_id}...")

            cmd = ['gh', 'run', 'watch', str(run_id)]
            subprocess.run(cmd)

    # Download artifacts
    print("\nDownloading artifacts...")
    successful_downloads = 0

    for i, run_info in enumerate(runs, 1):
        run_id = run_info['run_id']
        print(f"\n[{i}/{len(runs)}] Run {run_id} ({run_info['start']} to {run_info['end']})")

        if download_run_artifact(run_id, args.output_dir):
            successful_downloads += 1

    print(f"\n\nDownload Summary: {successful_downloads}/{len(runs)} artifacts downloaded")

    if successful_downloads == 0:
        print("\nNo artifacts downloaded. Workflows may still be running.")
        print("Use --wait to wait for completion, or try again later.")
        sys.exit(1)

    # Find all CSV files
    csv_files = find_csv_files(args.output_dir)

    if not csv_files:
        print("\nError: No CSV files found in download directory")
        sys.exit(1)

    print(f"\nFound {len(csv_files)} CSV file(s):")
    for csv_file in csv_files:
        print(f"  - {os.path.basename(csv_file)}")

    # Concatenate
    # Calculate actual end date for filename (end_date - 1 day since it's exclusive)
    from datetime import datetime, timedelta
    end_dt = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=1)
    actual_end = end_dt.strftime('%Y-%m-%d')

    output_file = os.path.join(
        args.output_dir,
        f"openairframes_adsb_{start_date}_{actual_end}.csv.gz"
    )

    concatenate_csv_files(csv_files, output_file)

    print("\n" + "=" * 60)
    print("Done!")
    print("=" * 60)


if __name__ == '__main__':
    main()
```
@@ -1,215 +0,0 @@
```python
#!/usr/bin/env python3
"""
Script to trigger adsb-to-aircraft-multiple-day-run workflow runs in monthly chunks.

Usage:
    python scripts/run_historical_adsb_action.py --start-date 2025-01-01 --end-date 2025-06-01
"""

import argparse
import subprocess
import sys
from datetime import datetime, timedelta
from calendar import monthrange


def generate_monthly_chunks(start_date_str, end_date_str):
    """Generate date ranges in monthly chunks from start to end date.

    End dates are exclusive (e.g., to process Jan 1-31, end_date should be Feb 1).
    """
    start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
    end_date = datetime.strptime(end_date_str, '%Y-%m-%d')

    chunks = []
    current = start_date

    while current < end_date:
        # Get the first day of the next month (exclusive end)
        _, days_in_month = monthrange(current.year, current.month)
        month_end = current.replace(day=days_in_month)
        next_month_start = month_end + timedelta(days=1)

        # Don't go past the global end date
        chunk_end = min(next_month_start, end_date)

        chunks.append({
            'start': current.strftime('%Y-%m-%d'),
            'end': chunk_end.strftime('%Y-%m-%d')
        })

        # Move to first day of next month
        if next_month_start >= end_date:
            break
        current = next_month_start

    return chunks


def trigger_workflow(start_date, end_date, repo='ggman12/OpenAirframes', branch='main', dry_run=False):
    """Trigger the adsb-to-aircraft-multiple-day-run workflow via GitHub CLI."""
    cmd = [
        'gh', 'workflow', 'run', 'adsb-to-aircraft-multiple-day-run.yaml',
        '--repo', repo,
        '--ref', branch,
        '-f', f'start_date={start_date}',
        '-f', f'end_date={end_date}'
    ]

    if dry_run:
        print(f"[DRY RUN] Would run: {' '.join(cmd)}")
        return True, None

    print(f"Triggering workflow: {start_date} to {end_date} (on {branch})")
    result = subprocess.run(cmd, capture_output=True, text=True)

    if result.returncode == 0:
        print(f"✓ Successfully triggered workflow for {start_date} to {end_date}")

        # Get the run ID of the workflow we just triggered
        # Wait a moment for it to appear
        import time
        time.sleep(2)

        # Get the most recent run (should be the one we just triggered)
        list_cmd = [
            'gh', 'run', 'list',
            '--repo', repo,
            '--workflow', 'adsb-to-aircraft-multiple-day-run.yaml',
            '--branch', branch,
            '--limit', '1',
            '--json', 'databaseId',
            '--jq', '.[0].databaseId'
        ]
        list_result = subprocess.run(list_cmd, capture_output=True, text=True)
        run_id = list_result.stdout.strip() if list_result.returncode == 0 else None

        return True, run_id
    else:
        print(f"✗ Failed to trigger workflow for {start_date} to {end_date}")
        print(f"Error: {result.stderr}")
        return False, None


def main():
    parser = argparse.ArgumentParser(
        description='Trigger adsb-to-aircraft-multiple-day-run workflow runs in monthly chunks'
    )
    parser.add_argument(
        '--start-date', '--start_date',
        dest='start_date',
        required=True,
        help='Start date in YYYY-MM-DD format (inclusive)'
    )
    parser.add_argument(
        '--end-date', '--end_date',
        dest='end_date',
        required=True,
        help='End date in YYYY-MM-DD format (exclusive)'
    )
    parser.add_argument(
        '--repo',
        type=str,
        default='ggman12/OpenAirframes',
        help='GitHub repository (default: ggman12/OpenAirframes)'
    )
    parser.add_argument(
        '--branch',
        type=str,
        default='main',
        help='Branch to run the workflow on (default: main)'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Print commands without executing them'
    )
    parser.add_argument(
        '--delay',
        type=int,
        default=5,
        help='Delay in seconds between workflow triggers (default: 5)'
    )

    args = parser.parse_args()

    # Validate dates
    try:
        start = datetime.strptime(args.start_date, '%Y-%m-%d')
        end = datetime.strptime(args.end_date, '%Y-%m-%d')
        if start > end:
            print("Error: start_date must be before or equal to end_date")
            sys.exit(1)
    except ValueError as e:
        print(f"Error: Invalid date format - {e}")
        sys.exit(1)

    # Generate monthly chunks
    chunks = generate_monthly_chunks(args.start_date, args.end_date)

    print(f"\nGenerating {len(chunks)} monthly workflow runs on branch '{args.branch}' (repo: {args.repo}):")
    for i, chunk in enumerate(chunks, 1):
        print(f"  {i}. {chunk['start']} to {chunk['end']}")

    if not args.dry_run:
        response = input(f"\nProceed with triggering {len(chunks)} workflows on '{args.branch}'? [y/N]: ")
        if response.lower() != 'y':
            print("Cancelled.")
            sys.exit(0)

    print()

    # Trigger workflows
    import time
    success_count = 0
    triggered_runs = []

    for i, chunk in enumerate(chunks, 1):
        print(f"\n[{i}/{len(chunks)}] ", end='')

        success, run_id = trigger_workflow(
            chunk['start'],
            chunk['end'],
            repo=args.repo,
            branch=args.branch,
            dry_run=args.dry_run
        )

        if success:
            success_count += 1
            if run_id:
                triggered_runs.append({
                    'run_id': run_id,
                    'start': chunk['start'],
                    'end': chunk['end']
                })

        # Add delay between triggers (except for last one)
        if i < len(chunks) and not args.dry_run:
            time.sleep(args.delay)

    print(f"\n\nSummary: {success_count}/{len(chunks)} workflows triggered successfully")

    # Save triggered run IDs to a file
    if triggered_runs and not args.dry_run:
        import json
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        runs_file = f"./output/triggered_runs_{timestamp}.json"
        with open(runs_file, 'w') as f:
            json.dump({
                'start_date': args.start_date,
                'end_date': args.end_date,
                'repo': args.repo,
                'branch': args.branch,
                'runs': triggered_runs
            }, f, indent=2)
        print(f"\nRun IDs saved to: {runs_file}")
        print(f"\nTo download and concatenate these artifacts, run:")
        print(f"  python scripts/download_and_concat_runs.py {runs_file}")

    if success_count < len(chunks):
        sys.exit(1)


if __name__ == '__main__':
    main()
```
@@ -1,82 +0,0 @@
```python
#!/usr/bin/env python3
"""
Run src.adsb.main in an isolated git worktree so edits in the main
working tree won't affect subprocess imports during the run.

Usage:
    python scripts/run_main_isolated.py 2026-01-01
    python scripts/run_main_isolated.py --start_date 2026-01-01 --end_date 2026-01-03
"""
import argparse
import os
import shutil
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path


def run(
    cmd: list[str],
    *,
    cwd: Path | None = None,
    check: bool = True,
) -> subprocess.CompletedProcess:
    print(f"\n>>> {' '.join(cmd)}")
    return subprocess.run(cmd, cwd=cwd, check=check)


def main() -> int:
    parser = argparse.ArgumentParser(description="Run src.adsb.main in an isolated worktree")
    parser.add_argument("date", nargs="?", help="Single date to process (YYYY-MM-DD)")
    parser.add_argument("--start_date", help="Start date (inclusive, YYYY-MM-DD)")
    parser.add_argument("--end_date", help="End date (exclusive, YYYY-MM-DD)")
    parser.add_argument("--concat_with_latest_csv", action="store_true", help="Also concatenate with latest CSV from GitHub releases")
    args = parser.parse_args()

    if args.date and (args.start_date or args.end_date):
        raise SystemExit("Use a single date or --start_date/--end_date, not both.")

    if args.date:
        datetime.strptime(args.date, "%Y-%m-%d")
        main_args = ["--date", args.date]
    else:
        if not args.start_date or not args.end_date:
            raise SystemExit("Provide --start_date and --end_date, or a single date.")
        datetime.strptime(args.start_date, "%Y-%m-%d")
        datetime.strptime(args.end_date, "%Y-%m-%d")
        main_args = ["--start_date", args.start_date, "--end_date", args.end_date]

    if args.concat_with_latest_csv:
        main_args.append("--concat_with_latest_csv")

    repo_root = Path(__file__).resolve().parents[1]
    snapshots_root = repo_root / ".snapshots"
    snapshots_root.mkdir(exist_ok=True)

    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    snapshot_root = snapshots_root / f"run_{timestamp}"
    snapshot_src = snapshot_root / "src"

    exit_code = 0
    try:
        shutil.copytree(repo_root / "src", snapshot_src)

        runner = (
            "import sys, runpy; "
            f"sys.path.insert(0, {repr(str(snapshot_root))}); "
            f"sys.argv = ['src.adsb.main'] + {main_args!r}; "
            "runpy.run_module('src.adsb.main', run_name='__main__')"
        )
        cmd = [sys.executable, "-c", runner]
        run(cmd, cwd=repo_root)
    except subprocess.CalledProcessError as exc:
        exit_code = exc.returncode
    finally:
        shutil.rmtree(snapshot_root, ignore_errors=True)

    return exit_code


if __name__ == "__main__":
    raise SystemExit(main())
```
@@ -1,242 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Parse TheAirTraffic Database CSV and produce community_submission.v1 JSON.
|
||||
|
||||
Source: "TheAirTraffic Database - Aircraft 2.csv"
|
||||
Output: community/YYYY-MM-DD/theairtraffic_<date>_<hash>.json
|
||||
|
||||
Categories in the spreadsheet columns (paired: name, registrations, separator):
|
||||
Col 1-3: Business
|
||||
Col 4-6: Government
|
||||
Col 7-9: People
|
||||
Col 10-12: Sports
|
||||
Col 13-15: Celebrity
|
||||
Col 16-18: State Govt./Law
|
||||
Col 19-21: Other
|
||||
Col 22-24: Test Aircraft
|
||||
Col 25-27: YouTubers
|
||||
Col 28-30: Formula 1 VIP's
|
||||
Col 31-33: Active GII's and GIII's (test/demo aircraft)
|
||||
Col 34-37: Russia & Ukraine (extra col for old/new)
|
||||
Col 38-40: Helicopters & Blimps
|
||||
Col 41-43: Unique Reg's
|
||||
Col 44-46: Saudi & UAE
|
||||
Col 47-49: Schools
|
||||
Col 50-52: Special Charter
|
||||
Col 53-55: Unknown Owners
|
||||
Col 56-59: Frequent Flyers (extra cols: name, aircraft, logged, hours)
|
||||
"""
|
||||
|
||||
import csv
|
||||
import json
|
||||
import hashlib
|
||||
import re
|
||||
import sys
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
# ── Category mapping ────────────────────────────────────────────────────────
|
||||
# Each entry: (name_col, reg_col, owner_category_tags)
|
||||
# owner_category_tags is a dict of tag keys to add beyond "owner"
|
||||
CATEGORY_COLUMNS = [
|
||||
# (name_col, reg_col, {tag_key: tag_value, ...})
|
||||
(1, 2, {"owner_category_0": "business"}),
|
||||
(4, 5, {"owner_category_0": "government"}),
|
||||
(7, 8, {"owner_category_0": "celebrity"}),
|
||||
(10, 11, {"owner_category_0": "sports"}),
|
||||
(13, 14, {"owner_category_0": "celebrity"}),
|
||||
(16, 17, {"owner_category_0": "government", "owner_category_1": "law_enforcement"}),
|
||||
(19, 20, {"owner_category_0": "other"}),
|
||||
(22, 23, {"owner_category_0": "test_aircraft"}),
|
||||
(25, 26, {"owner_category_0": "youtuber", "owner_category_1": "celebrity"}),
|
||||
(28, 29, {"owner_category_0": "celebrity", "owner_category_1": "motorsport"}),
|
||||
(31, 32, {"owner_category_0": "test_aircraft"}),
|
||||
# Russia & Ukraine: col 34=name, col 35 or 36 may have reg
|
||||
(34, 35, {"owner_category_0": "russia_ukraine"}),
|
||||
(38, 39, {"owner_category_0": "celebrity", "category": "helicopter_or_blimp"}),
|
||||
(41, 42, {"owner_category_0": "other"}),
|
||||
(44, 45, {"owner_category_0": "government", "owner_category_1": "royal_family"}),
|
||||
(47, 48, {"owner_category_0": "education"}),
|
||||
(50, 51, {"owner_category_0": "charter"}),
|
||||
(53, 54, {"owner_category_0": "unknown"}),
|
||||
(56, 57, {"owner_category_0": "celebrity"}), # Frequent Flyers name col, aircraft col
|
||||
]
|
||||
|
||||
# First data row index (0-based) in the CSV
|
||||
DATA_START_ROW = 4
|
||||
|
||||
# ── Contributor info ────────────────────────────────────────────────────────
|
||||
CONTRIBUTOR_NAME = "TheAirTraffic"
|
||||
# Deterministic UUID v5 derived from the contributor's URL
|
||||
CONTRIBUTOR_UUID = str(uuid.uuid5(uuid.NAMESPACE_URL, "https://theairtraffic.com"))
|
||||
|
||||
# Citation
|
||||
CITATION = "https://docs.google.com/spreadsheets/d/1JHhfJBnJPNBA6TgiSHjkXFkHBdVTTz_nXxaUDRWcHpk"
|
||||
|
||||
|
||||
def looks_like_military_serial(reg: str) -> bool:
|
||||
"""
|
||||
Detect military-style serials like 92-9000, 82-8000, 98-0001
|
||||
or pure numeric IDs like 929000, 828000, 980001.
|
||||
These aren't standard civil registrations; use openairframes_id.
|
||||
"""
|
||||
# Pattern: NN-NNNN
|
||||
if re.match(r'^\d{2}-\d{4}$', reg):
|
||||
return True
|
||||
# Pure 6-digit numbers (likely ICAO hex or military mode-S)
|
||||
if re.match(r'^\d{6}$', reg):
|
||||
return True
|
||||
# Short numeric-only (1-5 digits) like "01", "02", "676"
|
||||
if re.match(r'^\d{1,5}$', reg):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def normalize_reg(raw: str) -> str:
|
||||
"""Clean up a registration string."""
|
||||
reg = raw.strip().rstrip(',').strip()
|
||||
# Remove carriage returns and other whitespace
|
||||
reg = reg.replace('\r', '').replace('\n', '').strip()
|
||||
return reg
|
||||
|
||||
|
||||
def parse_regs(cell_value: str) -> list[str]:
|
||||
"""
|
||||
Parse a cell that may contain one or many registrations,
|
||||
separated by commas, possibly wrapped in quotes.
|
||||
"""
|
||||
if not cell_value or not cell_value.strip():
|
||||
return []
|
||||
|
||||
# Some cells have ADS-B exchange URLs – skip those
|
||||
if 'globe.adsbexchange.com' in cell_value:
|
||||
return []
|
||||
if cell_value.strip() in ('.', ',', ''):
|
||||
return []
|
||||
|
||||
results = []
|
||||
# Split on comma
|
||||
parts = cell_value.split(',')
|
||||
for part in parts:
|
||||
reg = normalize_reg(part)
|
||||
if not reg:
|
||||
continue
|
||||
# Skip URLs, section labels, etc.
|
||||
if reg.startswith('http') or reg.startswith('Link') or reg == 'Section 1':
|
||||
continue
|
||||
# Skip if it's just whitespace or dots
|
||||
if reg in ('.', '..', '...'):
|
||||
continue
|
||||
results.append(reg)
|
||||
return results
|
||||
|
||||
|
||||
def make_submission(
|
||||
reg: str,
|
||||
owner: str,
|
||||
category_tags: dict[str, str],
|
||||
) -> dict:
|
||||
"""Build a single community_submission.v1 object."""
|
||||
|
||||
entry: dict = {}
|
||||
|
||||
# Decide identifier field
|
||||
if looks_like_military_serial(reg):
|
||||
entry["openairframes_id"] = reg
|
||||
else:
|
||||
entry["registration_number"] = reg
|
||||
|
||||
# Tags
|
||||
tags: dict = {
|
||||
"citation_0": CITATION,
|
||||
}
|
||||
if owner:
|
||||
tags["owner"] = owner.strip()
|
||||
tags.update(category_tags)
|
||||
entry["tags"] = tags
|
||||
|
||||
return entry
|
||||
|
||||
|
||||
def main():
|
||||
csv_path = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(
|
||||
"/Users/jonahgoode/Downloads/TheAirTraffic Database - Aircraft 2.csv"
|
||||
)
|
||||
|
||||
if not csv_path.exists():
|
||||
print(f"ERROR: CSV not found at {csv_path}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# Read CSV
|
||||
with open(csv_path, 'r', encoding='utf-8-sig') as f:
|
||||
reader = csv.reader(f)
|
||||
rows = list(reader)
|
||||
|
||||
print(f"Read {len(rows)} rows from {csv_path.name}")
|
||||
|
||||
date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
|
||||
|
||||
submissions: list[dict] = []
|
||||
seen: set[tuple] = set() # (reg, owner) dedup
|
||||
|
||||
for row_idx in range(DATA_START_ROW, len(rows)):
|
||||
row = rows[row_idx]
|
||||
if len(row) < 3:
|
||||
continue
|
||||
|
||||
for name_col, reg_col, cat_tags in CATEGORY_COLUMNS:
|
||||
if reg_col >= len(row) or name_col >= len(row):
|
||||
continue
|
||||
|
||||
owner_raw = row[name_col].strip().rstrip(',').strip()
|
||||
reg_raw = row[reg_col]
|
||||
|
||||
# Clean owner name
|
||||
owner = owner_raw.replace('\r', '').replace('\n', '').strip()
|
||||
if not owner or owner in ('.', ',', 'Section 1'):
|
||||
continue
|
||||
# Skip header-like values
|
||||
if owner.startswith('http') or owner.startswith('Link '):
|
||||
continue
|
||||
|
||||
regs = parse_regs(reg_raw)
|
||||
if not regs:
|
||||
# For Russia & Ukraine, try the next column too (col 35 might have old reg, col 36 new)
|
||||
if name_col == 34 and reg_col + 1 < len(row):
|
||||
regs = parse_regs(row[reg_col + 1])
|
||||
|
||||
for reg in regs:
|
||||
key = (reg, owner)
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
submissions.append(make_submission(reg, owner, cat_tags))
|
||||
|
||||
print(f"Generated {len(submissions)} submissions")
|
||||
|
||||
# Write output
|
||||
proj_root = Path(__file__).resolve().parent.parent
|
||||
out_dir = proj_root / "community" / date_str
|
||||
out_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
out_file = out_dir / f"theairtraffic_{date_str}.json"
|
||||
|
||||
with open(out_file, 'w', encoding='utf-8') as f:
|
||||
json.dump(submissions, f, indent=2, ensure_ascii=False)
|
||||
|
||||
print(f"Written to {out_file}")
|
||||
print(f"Sample entry:\n{json.dumps(submissions[0], indent=2)}")
|
||||
|
||||
# Quick stats
|
||||
cats = {}
|
||||
for s in submissions:
|
||||
c = s['tags'].get('owner_category_0', 'NONE')
|
||||
cats[c] = cats.get(c, 0) + 1
|
||||
print("\nCategory breakdown:")
|
||||
for c, n in sorted(cats.items(), key=lambda x: -x[1]):
|
||||
print(f" {c}: {n}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,69 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Validate the generated theairtraffic JSON output."""
|
||||
import json
|
||||
import glob
|
||||
import re
import sys
|
||||
|
||||
# Find the latest output
|
||||
files = sorted(glob.glob("community/2026-02-*/theairtraffic_*.json"))
|
||||
if not files:
|
||||
print("No output files found!")
|
||||
sys.exit(1)
|
||||
|
||||
path = files[-1]
|
||||
print(f"Validating: {path}")
|
||||
|
||||
with open(path) as f:
|
||||
data = json.load(f)
|
||||
|
||||
print(f"Total entries: {len(data)}")
|
||||
|
||||
# Check military serial handling
|
||||
mil = [d for d in data if "openairframes_id" in d]
|
||||
print(f"\nEntries using openairframes_id: {len(mil)}")
|
||||
for m in mil[:10]:
|
||||
print(f" {m['openairframes_id']} -> owner: {m['tags'].get('owner','?')}")
|
||||
|
||||
# Check youtuber entries
|
||||
yt = [d for d in data if d["tags"].get("owner_category_0") == "youtuber"]
|
||||
print(f"\nYouTuber entries: {len(yt)}")
|
||||
for y in yt[:5]:
|
||||
reg = y.get("registration_number", y.get("openairframes_id"))
|
||||
c0 = y["tags"].get("owner_category_0")
|
||||
c1 = y["tags"].get("owner_category_1")
|
||||
print(f" {reg} -> owner: {y['tags']['owner']}, cat0: {c0}, cat1: {c1}")
|
||||
|
||||
# Check US Govt / military
|
||||
gov = [d for d in data if d["tags"].get("owner") == "United States of America 747/757"]
|
||||
print(f"\nUSA 747/757 entries: {len(gov)}")
|
||||
for g in gov:
|
||||
oid = g.get("openairframes_id", g.get("registration_number"))
|
||||
print(f" {oid}")
|
||||
|
||||
# Schema validation
|
||||
issues = 0
|
||||
for i, d in enumerate(data):
|
||||
has_id = any(k in d for k in ["registration_number", "transponder_code_hex", "openairframes_id"])
|
||||
if not has_id:
|
||||
print(f" Entry {i}: no identifier!")
|
||||
issues += 1
|
||||
if "tags" not in d:
|
||||
print(f" Entry {i}: no tags!")
|
||||
issues += 1
|
||||
# Check tag key format
|
||||
for k in d.get("tags", {}):
if not re.match(r"^[a-z][a-z0-9_]{0,63}$", k):
|
||||
print(f" Entry {i}: invalid tag key '{k}'")
|
||||
issues += 1
|
||||
|
||||
print(f"\nSchema issues: {issues}")
|
||||
|
||||
# Category breakdown
|
||||
cats = {}
|
||||
for s in data:
|
||||
c = s["tags"].get("owner_category_0", "NONE")
|
||||
cats[c] = cats.get(c, 0) + 1
|
||||
print("\nCategory breakdown:")
|
||||
for c, n in sorted(cats.items(), key=lambda x: -x[1]):
|
||||
print(f" {c}: {n}")
|
||||
@@ -0,0 +1,261 @@
|
||||
"""
|
||||
Combines chunk parquet files and compresses to final aircraft CSV.
|
||||
This is the reduce phase of the map-reduce pipeline.
|
||||
|
||||
Supports both single-day (daily) and multi-day (historical) modes.
|
||||
|
||||
Memory-efficient: processes each chunk separately, compresses, then combines.
|
||||
|
||||
Usage:
|
||||
# Daily mode
|
||||
python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks
|
||||
|
||||
# Historical mode
|
||||
python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks --start-date 2024-01-01 --end-date 2024-01-07 --skip-base
|
||||
"""
|
||||
import gc
|
||||
import gzip
|
||||
import os
|
||||
import sys
|
||||
import glob
|
||||
import argparse
|
||||
from datetime import datetime, timedelta, timezone
|
||||
|
||||
import polars as pl
|
||||
|
||||
from src.adsb.download_adsb_data_to_parquet import OUTPUT_DIR, get_resource_usage
|
||||
from src.adsb.compress_adsb_to_aircraft_data import compress_multi_icao_df, COLUMNS
|
||||
|
||||
|
||||
DEFAULT_CHUNK_DIR = os.path.join(OUTPUT_DIR, "adsb_chunks")
|
||||
FINAL_OUTPUT_DIR = "./data/openairframes"
|
||||
os.makedirs(FINAL_OUTPUT_DIR, exist_ok=True)
|
||||
|
||||
|
||||
def get_target_day() -> datetime:
|
||||
"""Get yesterday's date (the day we're processing)."""
|
||||
return datetime.now(timezone.utc) - timedelta(days=1)
|
||||
|
||||
|
||||
def process_single_chunk(chunk_path: str, delete_after_load: bool = False) -> pl.DataFrame:
|
||||
"""Load and compress a single chunk parquet file.
|
||||
|
||||
Args:
|
||||
chunk_path: Path to parquet file
|
||||
delete_after_load: If True, delete the parquet file after loading to free disk space
|
||||
"""
|
||||
print(f"Processing {os.path.basename(chunk_path)}... | {get_resource_usage()}")
|
||||
|
||||
# Load chunk - only columns we need
|
||||
needed_columns = ['time', 'icao'] + COLUMNS
|
||||
df = pl.read_parquet(chunk_path, columns=needed_columns)
|
||||
print(f" Loaded {len(df)} rows")
|
||||
|
||||
# Delete file immediately after loading to free disk space
|
||||
if delete_after_load:
|
||||
try:
|
||||
os.remove(chunk_path)
|
||||
print(f" Deleted {chunk_path} to free disk space")
|
||||
except Exception as e:
|
||||
print(f" Warning: Failed to delete {chunk_path}: {e}")
|
||||
|
||||
# Compress to aircraft records (one per ICAO) using shared function
|
||||
compressed = compress_multi_icao_df(df, verbose=True)
|
||||
print(f" Compressed to {len(compressed)} aircraft records")
|
||||
|
||||
del df
|
||||
gc.collect()
|
||||
|
||||
return compressed
|
||||
|
||||
|
||||
def combine_compressed_chunks(compressed_dfs: list[pl.DataFrame]) -> pl.DataFrame:
|
||||
"""Combine multiple compressed DataFrames.
|
||||
|
||||
Since chunks are partitioned by ICAO hash, each ICAO only appears in one chunk.
|
||||
No deduplication needed here - just concatenate.
|
||||
"""
|
||||
print(f"Combining {len(compressed_dfs)} compressed chunks... | {get_resource_usage()}")
|
||||
|
||||
# Concat all
|
||||
combined = pl.concat(compressed_dfs)
|
||||
print(f"Combined: {len(combined)} records")
|
||||
|
||||
return combined
|
||||
|
||||
|
||||
def download_and_merge_base_release(compressed_df: pl.DataFrame) -> tuple[pl.DataFrame, str | None]:
|
||||
"""Download base release and merge with new data.
|
||||
|
||||
Returns:
|
||||
Tuple of (merged_df, earliest_date_str) where earliest_date_str is None if no base release was merged
|
||||
"""
|
||||
from src.get_latest_release import download_latest_aircraft_adsb_csv
|
||||
|
||||
print("Downloading base ADS-B release...")
|
||||
base_path = download_latest_aircraft_adsb_csv(
|
||||
output_dir="./data/openairframes_base"
|
||||
)
|
||||
print(f"Download returned: {base_path}")
|
||||
|
||||
print(f"Loading base release from {base_path}")
|
||||
|
||||
# Extract start date from filename (e.g., openairframes_adsb_2025-05-01_2026-02-14.csv.gz)
|
||||
import re
|
||||
filename = os.path.basename(str(base_path))
|
||||
match = re.search(r'openairframes_adsb_(\d{4}-\d{2}-\d{2})_', filename)
|
||||
earliest_date = match.group(1) if match else None
|
||||
print(f"Start date from base filename: {earliest_date}")
|
||||
|
||||
# Read CSV with schema matching the new data
|
||||
base_df = pl.read_csv(base_path, schema=compressed_df.schema)
|
||||
print(f"Base release has {len(base_df)} records")
|
||||
|
||||
# Ensure columns match
|
||||
base_cols = set(base_df.columns)
|
||||
new_cols = set(compressed_df.columns)
|
||||
print(f"Base columns: {sorted(base_cols)}")
|
||||
print(f"New columns: {sorted(new_cols)}")
|
||||
|
||||
# Add missing columns
|
||||
for col in new_cols - base_cols:
|
||||
base_df = base_df.with_columns(pl.lit(None).alias(col))
|
||||
for col in base_cols - new_cols:
|
||||
compressed_df = compressed_df.with_columns(pl.lit(None).alias(col))
|
||||
|
||||
# Reorder columns to match
|
||||
compressed_df = compressed_df.select(base_df.columns)
|
||||
|
||||
# Concat base and new data (new data comes last)
|
||||
combined = pl.concat([base_df, compressed_df])
|
||||
print(f"After concat: {len(combined)} records")
|
||||
|
||||
return combined, earliest_date
|
||||
|
||||
def cleanup_chunks(output_id: str, chunks_dir: str):
|
||||
"""Delete chunk parquet files after successful merge."""
|
||||
pattern = os.path.join(chunks_dir, f"chunk_*_{output_id}.parquet")
|
||||
chunk_files = glob.glob(pattern)
|
||||
for f in chunk_files:
|
||||
try:
|
||||
os.remove(f)
|
||||
print(f"Deleted {f}")
|
||||
except Exception as e:
|
||||
print(f"Failed to delete {f}: {e}")
|
||||
|
||||
|
||||
def find_chunk_files(chunks_dir: str, output_id: str) -> list[str]:
|
||||
"""Find chunk parquet files matching the output ID."""
|
||||
pattern = os.path.join(chunks_dir, f"chunk_*_{output_id}.parquet")
|
||||
chunk_files = sorted(glob.glob(pattern))
|
||||
|
||||
if not chunk_files:
|
||||
# Try recursive search for historical mode with merged artifacts
|
||||
pattern = os.path.join(chunks_dir, "**", "*.parquet")
|
||||
chunk_files = sorted(glob.glob(pattern, recursive=True))
|
||||
|
||||
return chunk_files
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Combine chunk parquets to final CSV")
|
||||
parser.add_argument("--date", type=str, help="Single date in YYYY-MM-DD format (default: yesterday)")
|
||||
parser.add_argument("--start-date", type=str, help="Start date for range (YYYY-MM-DD)")
|
||||
parser.add_argument("--end-date", type=str, help="End date for range (YYYY-MM-DD)")
|
||||
parser.add_argument("--chunks-dir", type=str, default=DEFAULT_CHUNK_DIR, help="Directory containing chunk parquet files")
|
||||
parser.add_argument("--skip-base", action="store_true", help="Skip downloading and merging base release")
|
||||
parser.add_argument("--keep-chunks", action="store_true", help="Keep chunk files after merging")
|
||||
parser.add_argument("--stream", action="store_true", help="Delete parquet files immediately after loading to save disk space")
|
||||
args = parser.parse_args()
|
||||
|
||||
# Determine output ID and filename based on mode
|
||||
if args.start_date and args.end_date:
|
||||
# Historical mode
|
||||
output_id = f"{args.start_date}_{args.end_date}"
|
||||
output_filename = f"openairframes_adsb_{args.start_date}_{args.end_date}.csv.gz"
|
||||
print(f"Combining chunks for date range: {args.start_date} to {args.end_date}")
|
||||
else:
|
||||
# Daily mode - use same date for start and end
|
||||
if args.date:
|
||||
target_day = datetime.strptime(args.date, "%Y-%m-%d")
|
||||
else:
|
||||
target_day = get_target_day()
|
||||
|
||||
date_str = target_day.strftime("%Y-%m-%d")
|
||||
output_id = date_str
|
||||
output_filename = f"openairframes_adsb_{date_str}_{date_str}.csv.gz"
|
||||
print(f"Combining chunks for {date_str}")
|
||||
|
||||
chunks_dir = args.chunks_dir
|
||||
print(f"Chunks directory: {chunks_dir}")
|
||||
print(f"Resource usage at start: {get_resource_usage()}")
|
||||
|
||||
# Find chunk files
|
||||
chunk_files = find_chunk_files(chunks_dir, output_id)
|
||||
|
||||
if not chunk_files:
|
||||
print(f"No chunk files found in: {chunks_dir}")
|
||||
sys.exit(1)
|
||||
|
||||
print(f"Found {len(chunk_files)} chunk files")
|
||||
|
||||
# Process each chunk separately to save memory
|
||||
# With --stream, delete parquet files immediately after loading to save disk space
|
||||
compressed_chunks = []
|
||||
for chunk_path in chunk_files:
|
||||
compressed = process_single_chunk(chunk_path, delete_after_load=args.stream)
|
||||
compressed_chunks.append(compressed)
|
||||
gc.collect()
|
||||
|
||||
# Combine all compressed chunks
|
||||
combined = combine_compressed_chunks(compressed_chunks)
|
||||
|
||||
# Free memory from individual chunks
|
||||
del compressed_chunks
|
||||
gc.collect()
|
||||
print(f"After combining: {get_resource_usage()}")
|
||||
|
||||
# Merge with base release (unless skipped)
|
||||
base_start_date = None
|
||||
if not args.skip_base:
|
||||
combined, base_start_date = download_and_merge_base_release(combined)
|
||||
|
||||
# Update filename if we merged with base release and got a start date
|
||||
if base_start_date and not (args.start_date and args.end_date):
|
||||
# Only update filename for daily mode when base was merged
|
||||
output_filename = f"openairframes_adsb_{base_start_date}_{date_str}.csv.gz"
|
||||
print(f"Updated filename to reflect date range: {output_filename}")
|
||||
|
||||
# Convert list columns to strings for CSV compatibility
|
||||
for col in combined.columns:
|
||||
if combined[col].dtype == pl.List:
|
||||
combined = combined.with_columns(
|
||||
pl.col(col).list.join(",").alias(col)
|
||||
)
|
||||
|
||||
# Sort by time for consistent output
|
||||
if 'time' in combined.columns:
|
||||
combined = combined.sort('time')
|
||||
|
||||
# Replace empty strings with null across all string columns to avoid quoted empty strings
|
||||
for col in combined.columns:
|
||||
if combined[col].dtype == pl.Utf8:
|
||||
combined = combined.with_columns(
|
||||
pl.when(pl.col(col) == "").then(None).otherwise(pl.col(col)).alias(col)
|
||||
)
|
||||
|
||||
# Write final CSV
|
||||
output_path = os.path.join(FINAL_OUTPUT_DIR, output_filename)
|
||||
with gzip.open(output_path, "wb", compresslevel=9) as f:
|
||||
combined.write_csv(f, null_value='', quote_style='necessary')
|
||||
print(f"Wrote {len(combined)} records to {output_path}")
|
||||
|
||||
# Cleanup
|
||||
if not args.keep_chunks:
|
||||
cleanup_chunks(output_id, chunks_dir)
|
||||
|
||||
print(f"Done! | {get_resource_usage()}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -5,6 +5,23 @@ import polars as pl
|
||||
COLUMNS = ['dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category', 'r', 't']
|
||||
|
||||
|
||||
def deduplicate_by_signature(df: pl.DataFrame) -> pl.DataFrame:
|
||||
"""For each icao, keep only the earliest row with each unique signature.
|
||||
|
||||
This is used for deduplicating across multiple compressed chunks.
|
||||
"""
|
||||
# Create signature column
|
||||
df = df.with_columns(
|
||||
pl.concat_str([pl.col(c).cast(pl.Utf8).fill_null("") for c in COLUMNS], separator="|").alias("_signature")
|
||||
)
|
||||
# Group by icao and signature, take first row (earliest due to time sort)
|
||||
df = df.sort("time")
|
||||
df_deduped = df.group_by(["icao", "_signature"]).first()
|
||||
df_deduped = df_deduped.drop("_signature")
|
||||
df_deduped = df_deduped.sort("time")
|
||||
return df_deduped
|
||||
|
||||
|
||||
def compress_df_polars(df: pl.DataFrame, icao: str) -> pl.DataFrame:
|
||||
"""Compress a single ICAO group to its most informative row using Polars."""
|
||||
# Create signature string
|
||||
@@ -82,6 +99,9 @@ def compress_df_polars(df: pl.DataFrame, icao: str) -> pl.DataFrame:
|
||||
def compress_multi_icao_df(df: pl.DataFrame, verbose: bool = True) -> pl.DataFrame:
|
||||
"""Compress a DataFrame with multiple ICAOs to one row per ICAO.
|
||||
|
||||
This is the main entry point for compressing ADS-B data.
|
||||
Used by both daily GitHub Actions runs and historical AWS runs.
|
||||
|
||||
Args:
|
||||
df: DataFrame with columns ['time', 'icao'] + COLUMNS
|
||||
verbose: Whether to print progress
|
||||
@@ -100,27 +120,29 @@ def compress_multi_icao_df(df: pl.DataFrame, verbose: bool = True) -> pl.DataFra
|
||||
if col in df.columns:
|
||||
df = df.with_columns(pl.col(col).cast(pl.Utf8).fill_null(""))
|
||||
|
||||
# Quick deduplication of exact duplicates
|
||||
# First pass: quick deduplication of exact duplicates
|
||||
df = df.unique(subset=['icao'] + COLUMNS, keep='first')
|
||||
if verbose:
|
||||
print(f"After quick dedup: {df.height} records")
|
||||
|
||||
# Compress per ICAO
|
||||
# Second pass: sophisticated compression per ICAO
|
||||
if verbose:
|
||||
print("Compressing per ICAO...")
|
||||
|
||||
# Process each ICAO group
|
||||
icao_groups = df.partition_by('icao', as_dict=True, maintain_order=True)
|
||||
compressed_dfs = []
|
||||
|
||||
for icao_key, group_df in icao_groups.items():
|
||||
icao = icao_key[0]
|
||||
# partition_by with as_dict=True returns tuple keys, extract first element
|
||||
icao = icao_key[0] if isinstance(icao_key, tuple) else icao_key
|
||||
compressed = compress_df_polars(group_df, str(icao))
|
||||
compressed_dfs.append(compressed)
|
||||
|
||||
if compressed_dfs:
|
||||
df_compressed = pl.concat(compressed_dfs)
|
||||
else:
|
||||
df_compressed = df.head(0)
|
||||
df_compressed = df.head(0) # Empty with same schema
|
||||
|
||||
if verbose:
|
||||
print(f"After compress: {df_compressed.height} records")
|
||||
@@ -133,22 +155,45 @@ def compress_multi_icao_df(df: pl.DataFrame, verbose: bool = True) -> pl.DataFra
|
||||
return df_compressed
|
||||
|
||||
|
||||
def load_parquet_part(part_id: int, date: str) -> pl.DataFrame:
|
||||
"""Load a single parquet part file for a date.
|
||||
|
||||
Args:
|
||||
part_id: Part ID (e.g., 1, 2, 3)
|
||||
date: Date string in YYYY-MM-DD format
|
||||
|
||||
Returns:
|
||||
DataFrame with ADS-B data
|
||||
"""
|
||||
def load_raw_adsb_for_day(day):
|
||||
"""Load raw ADS-B data for a day from parquet file."""
|
||||
from datetime import timedelta
|
||||
from pathlib import Path
|
||||
|
||||
parquet_file = Path(f"data/output/parquet_output/part_{part_id}_{date}.parquet")
|
||||
start_time = day.replace(hour=0, minute=0, second=0, microsecond=0)
|
||||
|
||||
# Check for parquet file first
|
||||
version_date = f"v{start_time.strftime('%Y.%m.%d')}"
|
||||
parquet_file = Path(f"data/output/parquet_output/{version_date}.parquet")
|
||||
|
||||
if not parquet_file.exists():
|
||||
print(f"Parquet file not found: {parquet_file}")
|
||||
# Try to generate parquet file by calling the download function
|
||||
print(f" Parquet file not found: {parquet_file}")
|
||||
print(f" Attempting to download and generate parquet for {start_time.strftime('%Y-%m-%d')}...")
|
||||
|
||||
from download_adsb_data_to_parquet import create_parquet_for_day
|
||||
result_path = create_parquet_for_day(start_time, keep_folders=False)
|
||||
|
||||
if result_path:
|
||||
print(f" Successfully generated parquet file: {result_path}")
|
||||
else:
|
||||
raise Exception("Failed to generate parquet file")
|
||||
|
||||
if parquet_file.exists():
|
||||
print(f" Loading from parquet: {parquet_file}")
|
||||
df = pl.read_parquet(
|
||||
parquet_file,
|
||||
columns=['time', 'icao', 'r', 't', 'dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category']
|
||||
)
|
||||
|
||||
# Convert to timezone-naive datetime
|
||||
if df["time"].dtype == pl.Datetime:
|
||||
df = df.with_columns(pl.col("time").dt.replace_time_zone(None))
|
||||
|
||||
return df
|
||||
else:
|
||||
# Return empty DataFrame if parquet file doesn't exist
|
||||
print(f" No data available for {start_time.strftime('%Y-%m-%d')}")
|
||||
return pl.DataFrame(schema={
|
||||
'time': pl.Datetime,
|
||||
'icao': pl.Utf8,
|
||||
@@ -160,33 +205,17 @@ def load_parquet_part(part_id: int, date: str) -> pl.DataFrame:
|
||||
'desc': pl.Utf8,
|
||||
'aircraft_category': pl.Utf8
|
||||
})
|
||||
|
||||
print(f"Loading from parquet: {parquet_file}")
|
||||
df = pl.read_parquet(
|
||||
parquet_file,
|
||||
columns=['time', 'icao', 'r', 't', 'dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category']
|
||||
)
|
||||
|
||||
# Convert to timezone-naive datetime
|
||||
if df["time"].dtype == pl.Datetime:
|
||||
df = df.with_columns(pl.col("time").dt.replace_time_zone(None))
|
||||
os.remove(parquet_file)
|
||||
return df
|
||||
|
||||
|
||||
def compress_parquet_part(part_id: int, date: str) -> pl.DataFrame:
|
||||
"""Load and compress a single parquet part file."""
|
||||
df = load_parquet_part(part_id, date)
|
||||
|
||||
def load_historical_for_day(day):
|
||||
"""Load and compress historical ADS-B data for a day."""
|
||||
df = load_raw_adsb_for_day(day)
|
||||
if df.height == 0:
|
||||
return df
|
||||
|
||||
# Filter to rows within the given date (UTC-naive). This is because sometimes adsb.lol export can have rows at 00:00:00 of next day or similar.
|
||||
date_lit = pl.lit(date).str.strptime(pl.Date, "%Y-%m-%d")
|
||||
df = df.filter(pl.col("time").dt.date() == date_lit)
|
||||
|
||||
print(f"Loaded {df.height} raw records for part {part_id}, date {date}")
|
||||
print(f"Loaded {df.height} raw records for {day.strftime('%Y-%m-%d')}")
|
||||
|
||||
# Use shared compression function
|
||||
return compress_multi_icao_df(df, verbose=True)
|
||||
|
||||
|
||||
@@ -194,4 +223,52 @@ def concat_compressed_dfs(df_base, df_new):
|
||||
"""Concatenate base and new compressed dataframes, keeping the most informative row per ICAO."""
|
||||
# Combine both dataframes
|
||||
df_combined = pl.concat([df_base, df_new])
|
||||
return df_combined
|
||||
|
||||
# Sort by ICAO and time
|
||||
df_combined = df_combined.sort(['icao', 'time'])
|
||||
|
||||
# Fill null values
|
||||
for col in COLUMNS:
|
||||
if col in df_combined.columns:
|
||||
df_combined = df_combined.with_columns(pl.col(col).fill_null(""))
|
||||
|
||||
# Apply compression logic per ICAO to get the best row
|
||||
icao_groups = df_combined.partition_by('icao', as_dict=True, maintain_order=True)
|
||||
compressed_dfs = []
|
||||
|
||||
for icao, group_df in icao_groups.items():
|
||||
compressed = compress_df_polars(group_df, icao)
|
||||
compressed_dfs.append(compressed)
|
||||
|
||||
if compressed_dfs:
|
||||
df_compressed = pl.concat(compressed_dfs)
|
||||
else:
|
||||
df_compressed = df_combined.head(0)
|
||||
|
||||
# Sort by time
|
||||
df_compressed = df_compressed.sort('time')
|
||||
|
||||
return df_compressed
|
||||
|
||||
|
||||
def get_latest_aircraft_adsb_csv_df():
|
||||
"""Download and load the latest ADS-B CSV from GitHub releases."""
|
||||
from get_latest_release import download_latest_aircraft_adsb_csv
|
||||
import re
|
||||
|
||||
csv_path = download_latest_aircraft_adsb_csv()
|
||||
df = pl.read_csv(csv_path, null_values=[""])
|
||||
|
||||
# Fill nulls with empty strings
|
||||
for col in df.columns:
|
||||
if df[col].dtype == pl.Utf8:
|
||||
df = df.with_columns(pl.col(col).fill_null(""))
|
||||
|
||||
# Extract start date from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv[.gz]
|
||||
match = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path))
|
||||
if not match:
|
||||
raise ValueError(f"Could not extract date from filename: {csv_path.name}")
|
||||
|
||||
date_str = match.group(1)
|
||||
return df, date_str
|
||||
|
||||
|
||||
@@ -1,67 +0,0 @@
|
||||
from pathlib import Path
|
||||
import polars as pl
|
||||
import argparse
|
||||
import os
|
||||
OUTPUT_DIR = Path("./data/output")
|
||||
CORRECT_ORDER_OF_COLUMNS = ["time", "icao", "r", "t", "dbFlags", "ownOp", "year", "desc", "aircraft_category"]
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Concatenate compressed parquet files for a single day")
|
||||
parser.add_argument("--date", type=str, required=True, help="Date in YYYY-MM-DD format")
|
||||
parser.add_argument("--concat_with_latest_csv", action="store_true", help="Whether to also concatenate with the latest CSV from GitHub releases")
|
||||
args = parser.parse_args()
|
||||
|
||||
compressed_dir = OUTPUT_DIR / "compressed"
|
||||
date_dir = compressed_dir / args.date
|
||||
|
||||
parquet_files = sorted(date_dir.glob("*.parquet"))
|
||||
df = None
|
||||
if not parquet_files:  # TODO: This logic could be updated slightly.
print(f"No parquet files found in {date_dir}")
|
||||
|
||||
frames = [pl.read_parquet(p) for p in parquet_files]
|
||||
df = pl.concat(frames, how="vertical", rechunk=True)
|
||||
|
||||
df = df.sort(["time", "icao"])
|
||||
df = df.select(CORRECT_ORDER_OF_COLUMNS)
|
||||
|
||||
output_path = OUTPUT_DIR / f"openairframes_adsb_{args.date}.parquet"
|
||||
print(f"Writing combined parquet to {output_path} with {df.height} rows")
|
||||
df.write_parquet(output_path)
|
||||
|
||||
csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{args.date}.csv.gz"
|
||||
print(f"Writing combined csv.gz to {csv_output_path} with {df.height} rows")
|
||||
df.write_csv(csv_output_path, compression="gzip")
|
||||
|
||||
if args.concat_with_latest_csv:
|
||||
print("Loading latest CSV from GitHub releases to concatenate with...")
|
||||
from src.get_latest_release import get_latest_aircraft_adsb_csv_df
|
||||
from datetime import datetime
|
||||
|
||||
df_latest_csv, csv_start_date, csv_end_date = get_latest_aircraft_adsb_csv_df()
|
||||
|
||||
# Compare dates: end_date is exclusive, so if csv_end_date > args.date,
|
||||
# the latest CSV already includes this day's data
|
||||
csv_end_dt = datetime.strptime(csv_end_date, "%Y-%m-%d")
|
||||
args_dt = datetime.strptime(args.date, "%Y-%m-%d")
|
||||
|
||||
if df is None or csv_end_dt >= args_dt:
|
||||
print(f"Latest CSV already includes data through {args.date} (end_date={csv_end_date} is exclusive)")
|
||||
print("Writing latest CSV directly without concatenation to avoid duplicates")
|
||||
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
||||
final_csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{csv_start_date}_{csv_end_date}.csv.gz"
|
||||
df_latest_csv = df_latest_csv.select(CORRECT_ORDER_OF_COLUMNS)
|
||||
df_latest_csv.write_csv(final_csv_output_path, compression="gzip")
|
||||
else:
|
||||
print(f"Concatenating latest CSV (through {csv_end_date}) with new data ({args.date})")
|
||||
# Ensure column order matches before concatenating
|
||||
df_latest_csv = df_latest_csv.select(CORRECT_ORDER_OF_COLUMNS)
|
||||
from src.adsb.compress_adsb_to_aircraft_data import concat_compressed_dfs
|
||||
df_final = concat_compressed_dfs(df_latest_csv, df)
|
||||
df_final = df_final.select(CORRECT_ORDER_OF_COLUMNS)
|
||||
final_csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{csv_start_date}_{args.date}.csv.gz"
|
||||
df_final.write_csv(final_csv_output_path, compression="gzip")
|
||||
print(f"Final CSV written to {final_csv_output_path}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,33 +1,42 @@
|
||||
"""
|
||||
Downloads adsb.lol data and writes to Parquet files.
|
||||
|
||||
This file contains utility functions for downloading and processing adsb.lol trace data.
|
||||
Used by the historical ADS-B processing pipeline.
|
||||
Usage:
|
||||
python -m src.process_historical_adsb_data.download_to_parquet 2025-01-01 2025-01-02
|
||||
|
||||
This will download trace data for the specified date range and output Parquet files.
|
||||
|
||||
This file is self-contained and does not import from other project modules.
|
||||
"""
|
||||
import datetime as dt
|
||||
import gc
|
||||
import glob
|
||||
import gzip
|
||||
import os
|
||||
import re
|
||||
import resource
|
||||
import shutil
|
||||
import signal
|
||||
import subprocess
|
||||
import sys
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from datetime import datetime
|
||||
import logging
|
||||
import time
|
||||
import re
|
||||
import signal
|
||||
import concurrent.futures
|
||||
import subprocess
|
||||
import os
|
||||
import argparse
|
||||
import datetime as dt
|
||||
from datetime import datetime, timedelta, timezone
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
|
||||
import orjson
|
||||
import pyarrow as pa
|
||||
import pyarrow.parquet as pq
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Configuration
|
||||
# ============================================================================
|
||||
|
||||
OUTPUT_DIR = Path("./data/output")
|
||||
OUTPUT_DIR = "./data/output"
|
||||
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
||||
|
||||
PARQUET_DIR = os.path.join(OUTPUT_DIR, "parquet_output")
|
||||
@@ -76,7 +85,7 @@ def _fetch_releases_from_repo(year: str, version_date: str) -> list:
|
||||
|
||||
while True:
|
||||
max_retries = 10
|
||||
retry_delay = 60*5
|
||||
retry_delay = 60
|
||||
|
||||
for attempt in range(1, max_retries + 1):
|
||||
try:
|
||||
@@ -88,7 +97,7 @@ def _fetch_releases_from_repo(year: str, version_date: str) -> list:
|
||||
else:
|
||||
print(f"Failed to fetch releases (attempt {attempt}/{max_retries}): {response.status} {response.reason}")
|
||||
if attempt < max_retries:
|
||||
print(f"Waiting {retry_delay} seconds before retry")
|
||||
print(f"Waiting {retry_delay} seconds before retry...")
|
||||
time.sleep(retry_delay)
|
||||
else:
|
||||
print(f"Giving up after {max_retries} attempts")
|
||||
@@ -96,7 +105,7 @@ def _fetch_releases_from_repo(year: str, version_date: str) -> list:
|
||||
except Exception as e:
|
||||
print(f"Request exception (attempt {attempt}/{max_retries}): {e}")
|
||||
if attempt < max_retries:
|
||||
print(f"Waiting {retry_delay} seconds before retry")
|
||||
print(f"Waiting {retry_delay} seconds before retry...")
|
||||
time.sleep(retry_delay)
|
||||
else:
|
||||
print(f"Giving up after {max_retries} attempts")
|
||||
@@ -123,105 +132,47 @@ def fetch_releases(version_date: str) -> list:
|
||||
# For last day of year, also check next year's repo if nothing found
|
||||
if not releases and version_date.endswith(".12.31"):
|
||||
next_year = str(int(year) + 1)
|
||||
print(f"No releases found for {version_date} in {year} repo, checking {next_year} repo")
|
||||
print(f"No releases found for {version_date} in {year} repo, checking {next_year} repo...")
|
||||
releases = _fetch_releases_from_repo(next_year, version_date)
|
||||
|
||||
return releases
|
||||
|
||||
|
||||
def download_asset(asset_url: str, file_path: str, expected_size: int | None = None) -> bool:
|
||||
"""Download a single release asset with size verification.
|
||||
|
||||
Args:
|
||||
asset_url: URL to download from
|
||||
file_path: Local path to save to
|
||||
expected_size: Expected file size in bytes (for verification)
|
||||
|
||||
Returns:
|
||||
True if download succeeded and size matches (if provided), False otherwise
|
||||
"""
|
||||
def download_asset(asset_url: str, file_path: str) -> bool:
|
||||
"""Download a single release asset."""
|
||||
os.makedirs(os.path.dirname(file_path) or OUTPUT_DIR, exist_ok=True)
|
||||
|
||||
# Check if file exists and has correct size
|
||||
if os.path.exists(file_path):
|
||||
if expected_size is not None:
|
||||
actual_size = os.path.getsize(file_path)
|
||||
if actual_size == expected_size:
|
||||
print(f"[SKIP] {file_path} already downloaded and verified ({actual_size} bytes).")
|
||||
print(f"[SKIP] {file_path} already downloaded.")
|
||||
return True
|
||||
|
||||
print(f"Downloading {asset_url}...")
|
||||
try:
|
||||
signal.signal(signal.SIGALRM, timeout_handler)
|
||||
signal.alarm(40) # 40-second timeout
|
||||
|
||||
req = urllib.request.Request(asset_url, headers=HEADERS)
|
||||
with urllib.request.urlopen(req) as response:
|
||||
signal.alarm(0)
|
||||
|
||||
if response.status == 200:
|
||||
with open(file_path, "wb") as file:
|
||||
while True:
|
||||
chunk = response.read(8192)
|
||||
if not chunk:
|
||||
break
|
||||
file.write(chunk)
|
||||
print(f"Saved {file_path}")
|
||||
return True
|
||||
else:
|
||||
print(f"[WARN] {file_path} exists but size mismatch (expected {expected_size}, got {actual_size}). Re-downloading.")
|
||||
os.remove(file_path)
|
||||
else:
|
||||
print(f"[SKIP] {file_path} already downloaded.")
|
||||
return True
|
||||
|
||||
max_retries = 2
|
||||
retry_delay = 30
|
||||
timeout_seconds = 140
|
||||
|
||||
for attempt in range(1, max_retries + 1):
|
||||
print(f"Downloading {asset_url} (attempt {attempt}/{max_retries})")
|
||||
try:
|
||||
req = urllib.request.Request(asset_url, headers=HEADERS)
|
||||
with urllib.request.urlopen(req, timeout=timeout_seconds) as response:
|
||||
if response.status == 200:
|
||||
with open(file_path, "wb") as file:
|
||||
while True:
|
||||
chunk = response.read(8192)
|
||||
if not chunk:
|
||||
break
|
||||
file.write(chunk)
|
||||
|
||||
# Verify file size if expected_size was provided
|
||||
if expected_size is not None:
|
||||
actual_size = os.path.getsize(file_path)
|
||||
if actual_size != expected_size:
|
||||
print(f"[ERROR] Size mismatch for {file_path}: expected {expected_size} bytes, got {actual_size} bytes")
|
||||
os.remove(file_path)
|
||||
if attempt < max_retries:
|
||||
print(f"Waiting {retry_delay} seconds before retry")
|
||||
time.sleep(retry_delay)
|
||||
continue
|
||||
return False
|
||||
print(f"Saved {file_path} ({actual_size} bytes, verified)")
|
||||
else:
|
||||
print(f"Saved {file_path}")
|
||||
return True
|
||||
else:
|
||||
print(f"Failed to download {asset_url}: {response.status} {response.msg}")
|
||||
if attempt < max_retries:
|
||||
print(f"Waiting {retry_delay} seconds before retry")
|
||||
time.sleep(retry_delay)
|
||||
else:
|
||||
return False
|
||||
except urllib.error.HTTPError as e:
|
||||
if e.code == 404:
|
||||
print(f"404 Not Found: {asset_url}")
|
||||
raise Exception(f"Asset not found (404): {asset_url}")
|
||||
else:
|
||||
print(f"HTTP error occurred (attempt {attempt}/{max_retries}): {e.code} {e.reason}")
|
||||
if attempt < max_retries:
|
||||
print(f"Waiting {retry_delay} seconds before retry")
|
||||
time.sleep(retry_delay)
|
||||
else:
|
||||
return False
|
||||
except urllib.error.URLError as e:
|
||||
print(f"URL/Timeout error (attempt {attempt}/{max_retries}): {e}")
|
||||
if attempt < max_retries:
|
||||
print(f"Waiting {retry_delay} seconds before retry")
|
||||
time.sleep(retry_delay)
|
||||
else:
|
||||
print(f"Failed to download {asset_url}: {response.status} {response.msg}")
|
||||
return False
|
||||
except Exception as e:
|
||||
print(f"An error occurred (attempt {attempt}/{max_retries}): {e}")
|
||||
if attempt < max_retries:
|
||||
print(f"Waiting {retry_delay} seconds before retry")
|
||||
time.sleep(retry_delay)
|
||||
else:
|
||||
return False
|
||||
|
||||
return False
|
||||
except DownloadTimeoutException as e:
|
||||
print(f"Download aborted for {asset_url}: {e}")
|
||||
return False
|
||||
except Exception as e:
|
||||
print(f"An error occurred while downloading {asset_url}: {e}")
|
||||
return False
|
||||
|
||||
|
||||
def extract_split_archive(file_paths: list, extract_dir: str) -> bool:
|
||||
@@ -260,6 +211,7 @@ def extract_split_archive(file_paths: list, extract_dir: str) -> bool:
|
||||
stdin=cat_proc.stdout,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
check=True
|
||||
)
|
||||
cat_proc.stdout.close()
|
||||
cat_stderr = cat_proc.stderr.read().decode() if cat_proc.stderr else ""
|
||||
@@ -268,24 +220,6 @@ def extract_split_archive(file_paths: list, extract_dir: str) -> bool:
|
||||
if cat_stderr:
|
||||
print(f"cat stderr: {cat_stderr}")
|
||||
|
||||
tar_stderr = result.stderr.decode() if result.stderr else ""
|
||||
if result.returncode != 0:
|
||||
# GNU tar exits non-zero for format issues that BSD tar silently
|
||||
# tolerates (e.g. trailing junk after the last valid entry).
|
||||
# Check whether files were actually extracted before giving up.
|
||||
extracted_items = os.listdir(extract_dir)
|
||||
if extracted_items:
|
||||
print(f"[WARN] tar exited {result.returncode} but extracted "
|
||||
f"{len(extracted_items)} items — treating as success")
|
||||
if tar_stderr:
|
||||
print(f"tar stderr: {tar_stderr}")
|
||||
else:
|
||||
print(f"Failed to extract split archive (tar exit {result.returncode})")
|
||||
if tar_stderr:
|
||||
print(f"tar stderr: {tar_stderr}")
|
||||
shutil.rmtree(extract_dir, ignore_errors=True)
|
||||
return False
|
||||
|
||||
print(f"Successfully extracted archive to {extract_dir}")
|
||||
|
||||
# Delete tar files immediately after extraction
|
||||
@@ -302,9 +236,11 @@ def extract_split_archive(file_paths: list, extract_dir: str) -> bool:
|
||||
print(f"Disk space after tar deletion: {free_gb:.1f}GB free")
|
||||
|
||||
return True
|
||||
except Exception as e:
|
||||
except subprocess.CalledProcessError as e:
|
||||
stderr_output = e.stderr.decode() if e.stderr else ""
|
||||
print(f"Failed to extract split archive: {e}")
|
||||
shutil.rmtree(extract_dir, ignore_errors=True)
|
||||
if stderr_output:
|
||||
print(f"tar stderr: {stderr_output}")
|
||||
return False
|
||||
|
||||
|
||||
@@ -468,6 +404,8 @@ COLUMNS = [
|
||||
|
||||
OS_CPU_COUNT = os.cpu_count() or 1
|
||||
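# Use every core on runners with more than 4 CPUs; otherwise process files with a single worker.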
MAX_WORKERS = OS_CPU_COUNT if OS_CPU_COUNT > 4 else 1
|
||||
CHUNK_SIZE = MAX_WORKERS * 500 # Reduced for lower RAM usage
|
||||
BATCH_SIZE = 250_000 # Fixed size for predictable memory usage (~500MB per batch)
|
||||
|
||||
# PyArrow schema for efficient Parquet writing
|
||||
PARQUET_SCHEMA = pa.schema([
|
||||
@@ -555,6 +493,217 @@ def collect_trace_files_with_find(root_dir):
|
||||
return trace_dict
|
||||
|
||||
|
||||
def generate_version_dates(start_date: str, end_date: str) -> list:
|
||||
"""Generate a list of dates from start_date to end_date inclusive."""
|
||||
start = datetime.strptime(start_date, "%Y-%m-%d")
|
||||
end = datetime.strptime(end_date, "%Y-%m-%d")
|
||||
delta = end - start
|
||||
return [start + timedelta(days=i) for i in range(delta.days + 1)]
|
||||
|
||||
|
||||
def safe_process(fp):
|
||||
"""Safely process a file, returning empty list on error."""
|
||||
try:
|
||||
return process_file(fp)
|
||||
except Exception as e:
|
||||
logging.error(f"Error processing {fp}: {e}")
|
||||
return []
|
||||
|
||||
|
||||
def rows_to_arrow_table(rows: list) -> pa.Table:
|
||||
"""Convert list of rows to a PyArrow Table directly (no pandas)."""
|
||||
# Transpose rows into columns
|
||||
columns = list(zip(*rows))
|
||||
|
||||
# Build arrays for each column according to schema
|
||||
arrays = []
|
||||
for i, field in enumerate(PARQUET_SCHEMA):
|
||||
col_data = list(columns[i]) if i < len(columns) else [None] * len(rows)
|
||||
arrays.append(pa.array(col_data, type=field.type))
|
||||
|
||||
return pa.Table.from_arrays(arrays, schema=PARQUET_SCHEMA)
|
||||
|
||||
|
||||
def write_batch_to_parquet(rows: list, version_date: str, batch_idx: int):
|
||||
"""Write a batch of rows to a Parquet file."""
|
||||
if not rows:
|
||||
return
|
||||
|
||||
table = rows_to_arrow_table(rows)
|
||||
|
||||
parquet_path = os.path.join(PARQUET_DIR, f"{version_date}_batch_{batch_idx:04d}.parquet")
|
||||
|
||||
pq.write_table(table, parquet_path, compression='snappy')
|
||||
|
||||
print(f"Written parquet batch {batch_idx} ({len(rows)} rows) | {get_resource_usage()}")
|
||||
|
||||
|
||||
def merge_parquet_files(version_date: str, delete_batches: bool = True):
|
||||
"""Merge all batch parquet files for a version_date into a single file using streaming."""
|
||||
pattern = os.path.join(PARQUET_DIR, f"{version_date}_batch_*.parquet")
|
||||
batch_files = sorted(glob.glob(pattern))
|
||||
|
||||
if not batch_files:
|
||||
print(f"No batch files found for {version_date}")
|
||||
return None
|
||||
|
||||
print(f"Merging {len(batch_files)} batch files for {version_date} (streaming)...")
|
||||
|
||||
merged_path = os.path.join(PARQUET_DIR, f"{version_date}.parquet")
|
||||
total_rows = 0
|
||||
|
||||
# Stream write: read one batch at a time to minimize RAM usage
|
||||
writer = None
|
||||
try:
|
||||
for i, f in enumerate(batch_files):
|
||||
table = pq.read_table(f)
|
||||
total_rows += table.num_rows
|
||||
|
||||
if writer is None:
|
||||
writer = pq.ParquetWriter(merged_path, table.schema, compression='snappy')
|
||||
|
||||
writer.write_table(table)
|
||||
|
||||
# Delete batch file immediately after reading to free disk space
|
||||
if delete_batches:
|
||||
os.remove(f)
|
||||
|
||||
# Free memory
|
||||
del table
|
||||
if (i + 1) % 10 == 0:
|
||||
gc.collect()
|
||||
print(f" Merged {i + 1}/{len(batch_files)} batches... | {get_resource_usage()}")
|
||||
finally:
|
||||
if writer is not None:
|
||||
writer.close()
|
||||
|
||||
print(f"Merged parquet file written to {merged_path} ({total_rows} total rows) | {get_resource_usage()}")
|
||||
|
||||
if delete_batches:
|
||||
print(f"Deleted {len(batch_files)} batch files during merge")
|
||||
|
||||
gc.collect()
|
||||
return merged_path
|
||||
|
||||
|
||||
def process_version_date(version_date: str, keep_folders: bool = False):
|
||||
"""Download, extract, and process trace files for a single version date."""
|
||||
print(f"\nProcessing version_date: {version_date}")
|
||||
extract_dir = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0.tar_0")
|
||||
|
||||
def collect_trace_files_for_version_date(vd):
|
||||
releases = fetch_releases(vd)
|
||||
if len(releases) == 0:
|
||||
print(f"No releases found for {vd}.")
|
||||
return None
|
||||
|
||||
# Prefer non-tmp releases; only use tmp if no normal releases exist
|
||||
normal_releases = [r for r in releases if "tmp" not in r["tag_name"]]
|
||||
tmp_releases = [r for r in releases if "tmp" in r["tag_name"]]
|
||||
releases = normal_releases if normal_releases else tmp_releases
|
||||
print(f"Using {'normal' if normal_releases else 'tmp'} releases ({len(releases)} found)")
|
||||
|
||||
downloaded_files = []
|
||||
for release in releases:
|
||||
tag_name = release["tag_name"]
|
||||
print(f"Processing release: {tag_name}")
|
||||
|
||||
# Only download prod-0 if available, else prod-0tmp
|
||||
assets = release.get("assets", [])
|
||||
normal_assets = [
|
||||
a for a in assets
|
||||
if "planes-readsb-prod-0." in a["name"] and "tmp" not in a["name"]
|
||||
]
|
||||
tmp_assets = [
|
||||
a for a in assets
|
||||
if "planes-readsb-prod-0tmp" in a["name"]
|
||||
]
|
||||
use_assets = normal_assets if normal_assets else tmp_assets
|
||||
|
||||
for asset in use_assets:
|
||||
asset_name = asset["name"]
|
||||
asset_url = asset["browser_download_url"]
|
||||
file_path = os.path.join(OUTPUT_DIR, asset_name)
|
||||
result = download_asset(asset_url, file_path)
|
||||
if result:
|
||||
downloaded_files.append(file_path)
|
||||
|
||||
extract_split_archive(downloaded_files, extract_dir)
|
||||
return collect_trace_files_with_find(extract_dir)
|
||||
|
||||
# Check if files already exist
|
||||
pattern = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0*")
|
||||
matches = [p for p in glob.glob(pattern) if os.path.isfile(p)]
|
||||
|
||||
if matches:
|
||||
print(f"Found existing files for {version_date}:")
|
||||
# Prefer non-tmp slices when reusing existing files
|
||||
normal_matches = [
|
||||
p for p in matches
|
||||
if "-planes-readsb-prod-0." in os.path.basename(p)
|
||||
and "tmp" not in os.path.basename(p)
|
||||
]
|
||||
downloaded_files = normal_matches if normal_matches else matches
|
||||
|
||||
extract_split_archive(downloaded_files, extract_dir)
|
||||
trace_files = collect_trace_files_with_find(extract_dir)
|
||||
else:
|
||||
trace_files = collect_trace_files_for_version_date(version_date)
|
||||
|
||||
if trace_files is None or len(trace_files) == 0:
|
||||
print(f"No trace files found for version_date: {version_date}")
|
||||
return 0
|
||||
|
||||
file_list = list(trace_files.values())
|
||||
|
||||
start_time = time.perf_counter()
|
||||
total_num_rows = 0
|
||||
batch_rows = []
|
||||
batch_idx = 0
|
||||
|
||||
# Process files in chunks
|
||||
for offset in range(0, len(file_list), CHUNK_SIZE):
|
||||
chunk = file_list[offset:offset + CHUNK_SIZE]
|
||||
with concurrent.futures.ProcessPoolExecutor(max_workers=MAX_WORKERS) as process_executor:
|
||||
for rows in process_executor.map(safe_process, chunk):
|
||||
if not rows:
|
||||
continue
|
||||
batch_rows.extend(rows)
|
||||
|
||||
if len(batch_rows) >= BATCH_SIZE:
|
||||
total_num_rows += len(batch_rows)
|
||||
write_batch_to_parquet(batch_rows, version_date, batch_idx)
|
||||
batch_idx += 1
|
||||
batch_rows = []
|
||||
|
||||
elapsed = time.perf_counter() - start_time
|
||||
speed = total_num_rows / elapsed if elapsed > 0 else 0
|
||||
print(f"[{version_date}] processed {total_num_rows} rows in {elapsed:.2f}s ({speed:.2f} rows/s)")
|
||||
|
||||
gc.collect()
|
||||
|
||||
# Final batch
|
||||
if batch_rows:
|
||||
total_num_rows += len(batch_rows)
|
||||
write_batch_to_parquet(batch_rows, version_date, batch_idx)
|
||||
elapsed = time.perf_counter() - start_time
|
||||
speed = total_num_rows / elapsed if elapsed > 0 else 0
|
||||
print(f"[{version_date}] processed {total_num_rows} rows in {elapsed:.2f}s ({speed:.2f} rows/s)")
|
||||
|
||||
print(f"Total rows processed for version_date {version_date}: {total_num_rows}")
|
||||
|
||||
# Clean up extracted directory immediately after processing (before merging parquet files)
|
||||
if not keep_folders and os.path.isdir(extract_dir):
|
||||
print(f"Deleting extraction directory with 100,000+ files: {extract_dir}")
|
||||
shutil.rmtree(extract_dir)
|
||||
print(f"Successfully deleted extraction directory: {extract_dir} | {get_resource_usage()}")
|
||||
|
||||
# Merge batch files into a single parquet file
|
||||
merge_parquet_files(version_date, delete_batches=True)
|
||||
|
||||
return total_num_rows
|
||||
|
||||
|
||||
def create_parquet_for_day(day, keep_folders: bool = False):
|
||||
"""Create parquet file for a single day.
|
||||
|
||||
@@ -578,10 +727,42 @@ def create_parquet_for_day(day, keep_folders: bool = False):
|
||||
print(f"Parquet file already exists: {parquet_path}")
|
||||
return parquet_path
|
||||
|
||||
print(f"Creating parquet for {version_date}")
|
||||
print(f"Creating parquet for {version_date}...")
|
||||
rows_processed = process_version_date(version_date, keep_folders)
|
||||
|
||||
if rows_processed > 0 and parquet_path.exists():
|
||||
return parquet_path
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
def main(start_date: str, end_date: str, keep_folders: bool = False):
|
||||
"""Main function to download and convert adsb.lol data to Parquet."""
|
||||
version_dates = [f"v{date.strftime('%Y.%m.%d')}" for date in generate_version_dates(start_date, end_date)]
|
||||
print(f"Processing dates: {version_dates}")
|
||||
|
||||
total_rows_all = 0
|
||||
for version_date in version_dates:
|
||||
rows_processed = process_version_date(version_date, keep_folders)
|
||||
total_rows_all += rows_processed
|
||||
|
||||
print(f"\n=== Summary ===")
|
||||
print(f"Total dates processed: {len(version_dates)}")
|
||||
print(f"Total rows written to Parquet: {total_rows_all}")
|
||||
print(f"Parquet files location: {PARQUET_DIR}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.basicConfig(level=logging.INFO, stream=sys.stdout, force=True)
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Download adsb.lol data and write to Parquet files"
|
||||
)
|
||||
parser.add_argument("start_date", type=str, help="Start date in YYYY-MM-DD format")
|
||||
parser.add_argument("end_date", type=str, help="End date in YYYY-MM-DD format")
|
||||
parser.add_argument("--keep-folders", action="store_true",
|
||||
help="Keep extracted folders after processing")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
main(args.start_date, args.end_date, args.keep_folders)
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
"""
|
||||
Downloads and extracts adsb.lol tar files for a single day, then lists all ICAO folders.
|
||||
Downloads and extracts adsb.lol tar files, then lists all ICAO folders.
|
||||
This is the first step of the map-reduce pipeline.
|
||||
|
||||
Supports both single-day (daily) and multi-day (historical) modes.
|
||||
|
||||
Outputs:
|
||||
- Extracted trace files in data/output/{version_date}-planes-readsb-prod-0.tar_0/
|
||||
- ICAO manifest at data/output/icao_manifest_{date}.txt
|
||||
@@ -23,6 +25,11 @@ from src.adsb.download_adsb_data_to_parquet import (
|
||||
)
|
||||
|
||||
|
||||
def get_target_day() -> datetime:
|
||||
"""Get yesterday's date (the day we're processing)."""
|
||||
return datetime.utcnow() - timedelta(days=1)
|
||||
|
||||
|
||||
def download_and_extract(version_date: str) -> str | None:
|
||||
"""Download and extract tar files, return extract directory path."""
|
||||
extract_dir = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0.tar_0")
|
||||
@@ -77,9 +84,8 @@ def download_and_extract(version_date: str) -> str | None:
|
||||
for asset in use_assets:
|
||||
asset_name = asset["name"]
|
||||
asset_url = asset["browser_download_url"]
|
||||
asset_size = asset.get("size") # Get expected file size
|
||||
file_path = os.path.join(OUTPUT_DIR, asset_name)
|
||||
if download_asset(asset_url, file_path, expected_size=asset_size):
|
||||
if download_asset(asset_url, file_path):
|
||||
downloaded_files.append(file_path)
|
||||
|
||||
if not downloaded_files:
|
||||
@@ -100,6 +106,21 @@ def list_icao_folders(extract_dir: str) -> list[str]:
|
||||
return icaos
|
||||
|
||||
|
||||
def write_manifest(icaos: list[str], manifest_id: str) -> str:
|
||||
"""Write ICAO list to manifest file.
|
||||
|
||||
Args:
|
||||
icaos: List of ICAO codes
|
||||
manifest_id: Identifier for manifest file (date or date range)
|
||||
"""
|
||||
manifest_path = os.path.join(OUTPUT_DIR, f"icao_manifest_{manifest_id}.txt")
|
||||
with open(manifest_path, "w") as f:
|
||||
for icao in sorted(icaos):
|
||||
f.write(f"{icao}\n")
|
||||
print(f"Wrote manifest with {len(icaos)} ICAOs to {manifest_path}")
|
||||
return manifest_path
|
||||
|
||||
|
||||
def process_single_day(target_day: datetime) -> tuple[str | None, list[str]]:
|
||||
"""Process a single day: download, extract, list ICAOs.
|
||||
|
||||
@@ -114,50 +135,82 @@ def process_single_day(target_day: datetime) -> tuple[str | None, list[str]]:
|
||||
extract_dir = download_and_extract(version_date)
|
||||
if not extract_dir:
|
||||
print(f"Failed to download/extract data for {date_str}")
|
||||
raise Exception(f"No data available for {date_str}")
|
||||
return None, []
|
||||
|
||||
icaos = list_icao_folders(extract_dir)
|
||||
print(f"Found {len(icaos)} ICAOs for {date_str}")
|
||||
|
||||
return extract_dir, icaos
|
||||
|
||||
from pathlib import Path
|
||||
import tarfile
|
||||
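# Number of tar.gz parts the extracted trace buckets are split into.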
NUMBER_PARTS = 4
|
||||
def split_folders_into_gzip_archives(extract_dir: Path, tar_output_dir: Path, icaos: list[str], parts=NUMBER_PARTS) -> None:
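"""Split the trace bucket folders under extract_dir/traces into `parts` gzip tar archives inside tar_output_dir."""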
|
||||
traces_dir = extract_dir / "traces"
|
||||
buckets = sorted(traces_dir.iterdir())
|
||||
tars = []
|
||||
for i in range(parts):
|
||||
tar_path = tar_output_dir / f"{tar_output_dir.name}_part_{i}.tar.gz"
|
||||
tars.append(tarfile.open(tar_path, "w:gz"))
|
||||
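# Assign bucket folders to the archives round-robin.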
for idx, bucket_path in enumerate(buckets):
|
||||
tar_idx = idx % parts
|
||||
tars[tar_idx].add(bucket_path, arcname=bucket_path.name)
|
||||
for tar in tars:
|
||||
tar.close()
|
||||
|
||||
def process_date_range(start_date: datetime, end_date: datetime) -> set[str]:
    """Process multiple days: download, extract, combine ICAO lists.

    Args:
        start_date: Start date (inclusive)
        end_date: End date (inclusive)

    Returns:
        Combined set of all ICAOs across the date range
    """
    all_icaos: set[str] = set()
    current = start_date

    # Both start and end are inclusive
    while current <= end_date:
        _, icaos = process_single_day(current)
        all_icaos.update(icaos)
        current += timedelta(days=1)

    return all_icaos


def main():
    parser = argparse.ArgumentParser(description="Download and list ICAOs from adsb.lol data for a single day")
    parser = argparse.ArgumentParser(description="Download and list ICAOs from adsb.lol data")
    parser.add_argument("--date", type=str, help="Single date in YYYY-MM-DD format (default: yesterday)")
    parser.add_argument("--start-date", type=str, help="Start date for range (YYYY-MM-DD)")
    parser.add_argument("--end-date", type=str, help="End date for range (YYYY-MM-DD)")
    args = parser.parse_args()

    target_day = datetime.strptime(args.date, "%Y-%m-%d")
    date_str = target_day.strftime("%Y-%m-%d")
    tar_output_dir = Path(f"./data/output/adsb_archives/{date_str}")

    extract_dir, icaos = process_single_day(target_day)
    extract_dir = Path(extract_dir)
    print(extract_dir)
    tar_output_dir.mkdir(parents=True, exist_ok=True)
    split_folders_into_gzip_archives(extract_dir, tar_output_dir, icaos)
    if not icaos:
        print("No ICAOs found")
        sys.exit(1)

    print(f"\nDone! Extract dir: {extract_dir}")
    print(f"Total ICAOs: {len(icaos)}")
    # Determine mode: single day or date range
    if args.start_date and args.end_date:
        # Historical mode: process date range
        start_date = datetime.strptime(args.start_date, "%Y-%m-%d")
        end_date = datetime.strptime(args.end_date, "%Y-%m-%d")

        print(f"Processing date range: {args.start_date} to {args.end_date}")

        all_icaos = process_date_range(start_date, end_date)

        if not all_icaos:
            print("No ICAOs found in date range")
            sys.exit(1)

        # Write combined manifest with range identifier
        manifest_id = f"{args.start_date}_{args.end_date}"
        write_manifest(list(all_icaos), manifest_id)

        print(f"\nDone! Total ICAOs: {len(all_icaos)}")

    else:
        # Daily mode: single day
        if args.date:
            target_day = datetime.strptime(args.date, "%Y-%m-%d")
        else:
            target_day = get_target_day()

        date_str = target_day.strftime("%Y-%m-%d")

        extract_dir, icaos = process_single_day(target_day)

        if not icaos:
            print("No ICAOs found")
            sys.exit(1)

        write_manifest(icaos, date_str)

        print(f"\nDone! Extract dir: {extract_dir}")
        print(f"Total ICAOs: {len(icaos)}")


if __name__ == "__main__":

@@ -41,7 +41,7 @@ def main() -> None:
    """Main entry point for GitHub Actions."""
    start_date = os.environ.get("INPUT_START_DATE")
    end_date = os.environ.get("INPUT_END_DATE")
    chunk_days = int(os.environ.get("INPUT_CHUNK_DAYS", "1"))
    chunk_days = int(os.environ.get("INPUT_CHUNK_DAYS", "7"))

    if not start_date or not end_date:
        print("ERROR: INPUT_START_DATE and INPUT_END_DATE must be set", file=sys.stderr)
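The workflow runner above reads INPUT_START_DATE, INPUT_END_DATE and INPUT_CHUNK_DAYS from the environment; the code that turns them into chunks is not part of this hunk. A minimal sketch of splitting an inclusive date range into chunk_days-sized windows could look like this (the function name and return shape are illustrative, not the repository's actual helper):

from datetime import datetime, timedelta

def split_range(start: str, end: str, chunk_days: int = 7) -> list[tuple[str, str]]:
    """Split an inclusive YYYY-MM-DD range into consecutive windows of chunk_days."""
    cur = datetime.strptime(start, "%Y-%m-%d")
    stop = datetime.strptime(end, "%Y-%m-%d")
    chunks = []
    while cur <= stop:
        window_end = min(cur + timedelta(days=chunk_days - 1), stop)
        chunks.append((cur.strftime("%Y-%m-%d"), window_end.strftime("%Y-%m-%d")))
        cur = window_end + timedelta(days=1)
    return chunks

# split_range("2024-01-01", "2024-01-10", 7)
# -> [("2024-01-01", "2024-01-07"), ("2024-01-08", "2024-01-10")]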
@@ -1,78 +0,0 @@
"""
Main pipeline for processing ADS-B data from adsb.lol.

Usage:
    python -m src.adsb.main --date 2026-01-01
    python -m src.adsb.main --start_date 2026-01-01 --end_date 2026-01-03
"""
import argparse
import subprocess
import sys
from datetime import datetime, timedelta

import polars as pl

from src.adsb.download_and_list_icaos import NUMBER_PARTS


def main():
    parser = argparse.ArgumentParser(description="Process ADS-B data for a single day or date range")
    parser.add_argument("--date", type=str, help="Single date in YYYY-MM-DD format")
    parser.add_argument("--start_date", type=str, help="Start date (inclusive, YYYY-MM-DD)")
    parser.add_argument("--end_date", type=str, help="End date (exclusive, YYYY-MM-DD)")
    parser.add_argument("--concat_with_latest_csv", action="store_true", help="Also concatenate with latest CSV from GitHub releases")
    args = parser.parse_args()

    if args.date and (args.start_date or args.end_date):
        raise SystemExit("Use --date or --start_date/--end_date, not both.")

    if args.date:
        start_date = datetime.strptime(args.date, "%Y-%m-%d")
        end_date = start_date + timedelta(days=1)
    else:
        if not args.start_date or not args.end_date:
            raise SystemExit("Provide --start_date and --end_date, or use --date.")
        start_date = datetime.strptime(args.start_date, "%Y-%m-%d")
        end_date = datetime.strptime(args.end_date, "%Y-%m-%d")

    current = start_date
    while current < end_date:
        date_str = current.strftime("%Y-%m-%d")
        print(f"Processing day: {date_str}")

        # Download and split
        subprocess.run([sys.executable, "-m", "src.adsb.download_and_list_icaos", "--date", date_str], check=True)

        # Process parts
        for part_id in range(NUMBER_PARTS):
            subprocess.run([sys.executable, "-m", "src.adsb.process_icao_chunk", "--part-id", str(part_id), "--date", date_str], check=True)

        # Concatenate
        concat_cmd = [sys.executable, "-m", "src.adsb.concat_parquet_to_final", "--date", date_str]
        if args.concat_with_latest_csv:
            concat_cmd.append("--concat_with_latest_csv")
        subprocess.run(concat_cmd, check=True)

        current += timedelta(days=1)

    if end_date - start_date > timedelta(days=1):
        dates = []
        cur = start_date
        while cur < end_date:
            dates.append(cur.strftime("%Y-%m-%d"))
            cur += timedelta(days=1)
        csv_files = [
            f"data/outputs/openairframes_adsb_{d}_{d}.csv"
            for d in dates
        ]
        frames = [pl.read_csv(p) for p in csv_files]
        df = pl.concat(frames, how="vertical", rechunk=True)
        output_path = f"data/outputs/openairframes_adsb_{start_date.strftime('%Y-%m-%d')}_{end_date.strftime('%Y-%m-%d')}.csv"
        df.write_csv(output_path)
        print(f"Wrote combined CSV: {output_path}")

    print("Done")


if __name__ == "__main__":
    main()
+241 -62
@@ -1,9 +1,18 @@
"""
Processes trace files from a single archive part for a single day.
Processes a chunk of ICAOs from pre-extracted trace files.
This is the map phase of the map-reduce pipeline.

Supports both single-day (daily) and multi-day (historical) modes.

Expects extract_dir to already exist with trace files.
Reads ICAO manifest to determine which ICAOs to process based on chunk-id.

Usage:
    python -m src.adsb.process_icao_chunk --part-id 1 --date 2026-01-01
    # Daily mode (single day)
    python -m src.adsb.process_icao_chunk --chunk-id 0 --total-chunks 4

    # Historical mode (date range)
    python -m src.adsb.process_icao_chunk --chunk-id 0 --total-chunks 4 --start-date 2024-01-01 --end-date 2024-01-07
"""
import gc
import os
@@ -12,9 +21,6 @@ import argparse
import time
import concurrent.futures
from datetime import datetime, timedelta
import tarfile
import tempfile
import shutil

import pyarrow as pa
import pyarrow.parquet as pq
@@ -31,21 +37,72 @@ from src.adsb.download_adsb_data_to_parquet import (
)


CHUNK_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "adsb_chunks")
os.makedirs(CHUNK_OUTPUT_DIR, exist_ok=True)

# Smaller batch size for memory efficiency
BATCH_SIZE = 100_000
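BATCH_SIZE caps how many rows are buffered before they are flushed to the parquet writer, which keeps peak memory roughly constant no matter how many trace files a chunk covers. A stripped-down sketch of that batching pattern, with a placeholder schema and row source rather than the module's real PARQUET_SCHEMA and trace parsing:

import pyarrow as pa
import pyarrow.parquet as pq

# Illustrative schema; the real module uses PARQUET_SCHEMA and rows parsed from trace files.
schema = pa.schema([("icao", pa.string()), ("alt", pa.int64())])

def write_in_batches(rows_iter, output_path, batch_size=100_000):
    writer = None
    batch = []
    try:
        for row in rows_iter:
            batch.append(row)
            if len(batch) >= batch_size:
                table = pa.Table.from_pylist(batch, schema=schema)
                if writer is None:
                    writer = pq.ParquetWriter(output_path, schema, compression="snappy")
                writer.write_table(table)
                batch = []  # drop the buffered rows so memory stays bounded
        if batch:
            table = pa.Table.from_pylist(batch, schema=schema)
            if writer is None:
                writer = pq.ParquetWriter(output_path, schema, compression="snappy")
            writer.write_table(table)
    finally:
        if writer is not None:
            writer.close()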
def build_trace_file_map(archive_path: str) -> dict[str, str]:
    """Build a map of ICAO -> trace file path by extracting tar.gz archive."""
    print(f"Extracting {archive_path}...")

def get_target_day() -> datetime:
    """Get yesterday's date (the day we're processing)."""
    return datetime.utcnow() - timedelta(days=1)


def read_manifest(manifest_id: str) -> list[str]:
    """Read ICAO manifest file.

    temp_dir = tempfile.mkdtemp(prefix="adsb_extract_")
    Args:
        manifest_id: Either a date string (YYYY-MM-DD) or range string (YYYY-MM-DD_YYYY-MM-DD)
    """
    manifest_path = os.path.join(OUTPUT_DIR, f"icao_manifest_{manifest_id}.txt")
    if not os.path.exists(manifest_path):
        raise FileNotFoundError(f"Manifest not found: {manifest_path}")

    with tarfile.open(archive_path, 'r:gz') as tar:
        tar.extractall(path=temp_dir, filter='data')
    with open(manifest_path, "r") as f:
        icaos = [line.strip() for line in f if line.strip()]
    return icaos


def deterministic_hash(s: str) -> int:
    """Return a deterministic hash for a string (unlike Python's hash() which is randomized)."""
    # Use sum of byte values - simple but deterministic
    return sum(ord(c) for c in s)


def get_chunk_icaos(icaos: list[str], chunk_id: int, total_chunks: int) -> list[str]:
    """Get the subset of ICAOs for this chunk based on deterministic hash partitioning."""
    return [icao for icao in icaos if deterministic_hash(icao) % total_chunks == chunk_id]
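Because deterministic_hash depends only on the characters of the ICAO string, every worker that reads the same manifest computes the same partition, and each ICAO lands in exactly one chunk. A quick illustration using the two helpers above (the ICAO codes are made up):

icaos = ["3c6444", "a1b2c3", "ab34cd", "4ca123"]
total_chunks = 4

chunks = [get_chunk_icaos(icaos, chunk_id, total_chunks) for chunk_id in range(total_chunks)]

# Every ICAO appears in exactly one chunk, no matter which process computes this.
assert sorted(icao for chunk in chunks for icao in chunk) == sorted(icaos)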
def build_trace_file_map(extract_dir: str) -> dict[str, str]:
    """Build a map of ICAO -> trace file path using find command."""
    print(f"Building trace file map from {extract_dir}...")

    trace_map = collect_trace_files_with_find(temp_dir)
    # Debug: check what's in extract_dir
    if os.path.isdir(extract_dir):
        items = os.listdir(extract_dir)[:10]
        print(f"First 10 items in extract_dir: {items}")
        # Check if there are subdirectories
        for item in items[:3]:
            subpath = os.path.join(extract_dir, item)
            if os.path.isdir(subpath):
                subitems = os.listdir(subpath)[:5]
                print(f" Contents of {item}/: {subitems}")

    trace_map = collect_trace_files_with_find(extract_dir)
    print(f"Found {len(trace_map)} trace files")

    if len(trace_map) == 0:
        # Debug: try manual find
        import subprocess
        result = subprocess.run(
            ['find', extract_dir, '-type', 'f', '-name', 'trace_full_*'],
            capture_output=True, text=True
        )
        print(f"Manual find output (first 500 chars): {result.stdout[:500]}")
        print(f"Manual find stderr: {result.stderr[:200]}")

    return trace_map
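collect_trace_files_with_find is imported from download_adsb_data_to_parquet and is not shown in this diff. The sketch below is only a guess at the general idea (walk the extract directory and index files named trace_full_* by ICAO), not the repository's actual implementation; the .json suffix handling is an assumption:

import os

def collect_trace_files_sketch(extract_dir: str) -> dict[str, str]:
    """Hypothetical stand-in: map ICAO -> path for files named trace_full_<icao>..."""
    trace_map: dict[str, str] = {}
    for root, _dirs, files in os.walk(extract_dir):
        for name in files:
            if name.startswith("trace_full_"):
                # e.g. "trace_full_a1b2c3.json" -> "a1b2c3" (naming assumed, not confirmed)
                icao = name[len("trace_full_"):].split(".")[0]
                trace_map[icao] = os.path.join(root, name)
    return trace_map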
@@ -68,13 +125,42 @@ def rows_to_table(rows: list) -> pa.Table:


def process_chunk(
    trace_files: list[str],
    part_id: int,
    date_str: str,
    chunk_id: int,
    total_chunks: int,
    trace_map: dict[str, str],
    icaos: list[str],
    output_id: str,
) -> str | None:
    """Process trace files and write to a single parquet file."""
    """Process a chunk of ICAOs and write to parquet.

    output_path = os.path.join(PARQUET_DIR, f"part_{part_id}_{date_str}.parquet")
    Args:
        chunk_id: This chunk's ID (0-indexed)
        total_chunks: Total number of chunks
        trace_map: Map of ICAO -> trace file path
        icaos: Full list of ICAOs from manifest
        output_id: Identifier for output file (date or date range)
    """
    chunk_icaos = get_chunk_icaos(icaos, chunk_id, total_chunks)
    print(f"Chunk {chunk_id}/{total_chunks}: Processing {len(chunk_icaos)} ICAOs")

    if not chunk_icaos:
        print(f"Chunk {chunk_id}: No ICAOs to process")
        return None

    # Get trace file paths from the map
    trace_files = []
    for icao in chunk_icaos:
        if icao in trace_map:
            trace_files.append(trace_map[icao])

    print(f"Chunk {chunk_id}: Found {len(trace_files)} trace files")

    if not trace_files:
        print(f"Chunk {chunk_id}: No trace files found")
        return None

    # Process files and write parquet in batches
    output_path = os.path.join(CHUNK_OUTPUT_DIR, f"chunk_{chunk_id}_{output_id}.parquet")

    start_time = time.perf_counter()
    total_rows = 0
@@ -82,8 +168,7 @@ def process_chunk(
    writer = None

    try:
        writer = pq.ParquetWriter(output_path, PARQUET_SCHEMA, compression='snappy')

        # Process in parallel batches
        files_per_batch = MAX_WORKERS * 100
        for offset in range(0, len(trace_files), files_per_batch):
            batch_files = trace_files[offset:offset + files_per_batch]
@@ -93,72 +178,166 @@ def process_chunk(
                if rows:
                    batch_rows.extend(rows)

            # Write when batch is full
            if len(batch_rows) >= BATCH_SIZE:
                writer.write_table(rows_to_table(batch_rows))
                table = rows_to_table(batch_rows)
                total_rows += len(batch_rows)

                if writer is None:
                    writer = pq.ParquetWriter(output_path, PARQUET_SCHEMA, compression='snappy')
                writer.write_table(table)

                batch_rows = []
                del table
                gc.collect()

            elapsed = time.perf_counter() - start_time
            print(f"Chunk {chunk_id}: {total_rows} rows, {elapsed:.1f}s | {get_resource_usage()}")

            gc.collect()

        # Write remaining rows
        if batch_rows:
            writer.write_table(rows_to_table(batch_rows))
            table = rows_to_table(batch_rows)
            total_rows += len(batch_rows)

            if writer is None:
                writer = pq.ParquetWriter(output_path, PARQUET_SCHEMA, compression='snappy')
            writer.write_table(table)
            del table

    finally:
        if writer:
            writer.close()

    print(f"Part {part_id}: Done! {total_rows} rows in {time.perf_counter() - start_time:.1f}s | {get_resource_usage()}")
    elapsed = time.perf_counter() - start_time
    print(f"Chunk {chunk_id}: Done! {total_rows} rows in {elapsed:.1f}s | {get_resource_usage()}")

    return output_path if total_rows > 0 else None
    if total_rows > 0:
        return output_path
    return None


def process_single_day(
    chunk_id: int,
    total_chunks: int,
    target_day: datetime,
) -> str | None:
    """Process a single day for this chunk."""
    date_str = target_day.strftime("%Y-%m-%d")
    version_date = f"v{target_day.strftime('%Y.%m.%d')}"

    extract_dir = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0.tar_0")

    if not os.path.isdir(extract_dir):
        print(f"Extract directory not found: {extract_dir}")
        return None

    trace_map = build_trace_file_map(extract_dir)
    if not trace_map:
        print("No trace files found")
        return None

    icaos = read_manifest(date_str)
    print(f"Total ICAOs in manifest: {len(icaos)}")

    return process_chunk(chunk_id, total_chunks, trace_map, icaos, date_str)


def process_date_range(
    chunk_id: int,
    total_chunks: int,
    start_date: datetime,
    end_date: datetime,
) -> str | None:
    """Process a date range for this chunk.

    Combines trace files from all days in the range.

    Args:
        chunk_id: This chunk's ID (0-indexed)
        total_chunks: Total number of chunks
        start_date: Start date (inclusive)
        end_date: End date (inclusive)
    """
    start_str = start_date.strftime("%Y-%m-%d")
    end_str = end_date.strftime("%Y-%m-%d")
    manifest_id = f"{start_str}_{end_str}"

    print(f"Processing date range: {start_str} to {end_str}")

    # Build combined trace map from all days
    combined_trace_map: dict[str, str] = {}
    current = start_date

    # Both start and end are inclusive
    while current <= end_date:
        version_date = f"v{current.strftime('%Y.%m.%d')}"
        extract_dir = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0.tar_0")

        if os.path.isdir(extract_dir):
            trace_map = build_trace_file_map(extract_dir)
            # Later days override earlier days (use most recent trace file)
            combined_trace_map.update(trace_map)
            print(f" {current.strftime('%Y-%m-%d')}: {len(trace_map)} trace files")
        else:
            print(f" {current.strftime('%Y-%m-%d')}: no extract directory")

        current += timedelta(days=1)

    if not combined_trace_map:
        print("No trace files found in date range")
        return None

    print(f"Combined trace map: {len(combined_trace_map)} ICAOs")

    icaos = read_manifest(manifest_id)
    print(f"Total ICAOs in manifest: {len(icaos)}")

    return process_chunk(chunk_id, total_chunks, combined_trace_map, icaos, manifest_id)

from pathlib import Path

def main():
    parser = argparse.ArgumentParser(description="Process a single archive part for a day")
    parser.add_argument("--part-id", type=int, required=True, help="Part ID (1-indexed)")
    parser.add_argument("--date", type=str, required=True, help="Date in YYYY-MM-DD format")
    parser = argparse.ArgumentParser(description="Process a chunk of ICAOs")
    parser.add_argument("--chunk-id", type=int, required=True, help="Chunk ID (0-indexed)")
    parser.add_argument("--total-chunks", type=int, required=True, help="Total number of chunks")
    parser.add_argument("--date", type=str, help="Single date in YYYY-MM-DD format (default: yesterday)")
    parser.add_argument("--start-date", type=str, help="Start date for range (YYYY-MM-DD)")
    parser.add_argument("--end-date", type=str, help="End date for range (YYYY-MM-DD)")
    args = parser.parse_args()

    print(f"Processing part {args.part_id} for {args.date}")
    print(f"Processing chunk {args.chunk_id}/{args.total_chunks}")
    print(f"OUTPUT_DIR: {OUTPUT_DIR}")
    print(f"CHUNK_OUTPUT_DIR: {CHUNK_OUTPUT_DIR}")
    print(f"Resource usage at start: {get_resource_usage()}")

    # Get specific archive file for this part
    archive_dir = os.path.join(OUTPUT_DIR, "adsb_archives", args.date)
    archive_path = os.path.join(archive_dir, f"{args.date}_part_{args.part_id}.tar.gz")
    # Debug: List what's in OUTPUT_DIR
    print(f"\nContents of {OUTPUT_DIR}:")
    if os.path.isdir(OUTPUT_DIR):
        for item in os.listdir(OUTPUT_DIR)[:20]:
            print(f" - {item}")
    else:
        print(f" Directory does not exist!")

    if not os.path.isfile(archive_path):
        print(f"ERROR: Archive not found: {archive_path}")
        if os.path.isdir(archive_dir):
            print(f"Files in {archive_dir}: {os.listdir(archive_dir)}")
    # Determine mode: single day or date range
    if args.start_date and args.end_date:
        # Historical mode
        start_date = datetime.strptime(args.start_date, "%Y-%m-%d")
        end_date = datetime.strptime(args.end_date, "%Y-%m-%d")
        output_path = process_date_range(args.chunk_id, args.total_chunks, start_date, end_date)
    else:
        # Daily mode
        if args.date:
            target_day = datetime.strptime(args.date, "%Y-%m-%d")
        else:
            print(f"Directory does not exist: {archive_dir}")
            sys.exit(1)
            target_day = get_target_day()
        output_path = process_single_day(args.chunk_id, args.total_chunks, target_day)

    # Extract and collect trace files
    trace_map = build_trace_file_map(archive_path)
    all_trace_files = list(trace_map.values())

    print(f"Total trace files: {len(all_trace_files)}")

    # Process and write output
    output_path = process_chunk(all_trace_files, args.part_id, args.date)

    from src.adsb.compress_adsb_to_aircraft_data import compress_parquet_part
    df_compressed = compress_parquet_part(args.part_id, args.date)

    # Write parquet
    df_compressed_output = OUTPUT_DIR / "compressed" / args.date / f"part_{args.part_id}_{args.date}.parquet"
    os.makedirs(df_compressed_output.parent, exist_ok=True)
    df_compressed.write_parquet(df_compressed_output, compression='snappy')

    # Write CSV
    csv_output = OUTPUT_DIR / "compressed" / args.date / f"part_{args.part_id}_{args.date}.csv"
    df_compressed.write_csv(csv_output)

    print(f"Raw output: {output_path}" if output_path else "No raw output generated")
    print(f"Compressed parquet: {df_compressed_output}")
    print(f"Compressed CSV: {csv_output}")
    if output_path:
        print(f"Output: {output_path}")
    else:
        print("No output generated")


if __name__ == "__main__":
    main()
    main()
@@ -0,0 +1,2 @@
polars>=1.0
boto3>=1.34
@@ -0,0 +1,5 @@
polars>=1.0
pyarrow>=14.0
orjson>=3.9
boto3>=1.34
zstandard>=0.22
@@ -0,0 +1,155 @@
#!/usr/bin/env python3
"""
Run the full ADS-B processing pipeline locally.

Downloads adsb.lol data, processes trace files, and outputs openairframes_adsb CSV.

Usage:
    # Single day (yesterday by default)
    python -m src.adsb.run_local

    # Single day (specific date)
    python -m src.adsb.run_local 2024-01-15

    # Date range (inclusive)
    python -m src.adsb.run_local 2024-01-01 2024-01-07
"""
import argparse
import os
import subprocess
import sys
from datetime import datetime, timedelta


def run_cmd(cmd: list[str], description: str) -> None:
    """Run a command and exit on failure."""
    print(f"\n>>> {' '.join(cmd)}")
    result = subprocess.run(cmd)
    if result.returncode != 0:
        print(f"ERROR: {description} failed with exit code {result.returncode}")
        sys.exit(result.returncode)


def main():
    parser = argparse.ArgumentParser(
        description="Run full ADS-B processing pipeline locally",
        usage="python -m src.adsb.run_local [start_date] [end_date]"
    )
    parser.add_argument(
        "start_date",
        nargs="?",
        help="Start date (YYYY-MM-DD). Default: yesterday"
    )
    parser.add_argument(
        "end_date",
        nargs="?",
        help="End date (YYYY-MM-DD, inclusive). If omitted, processes single day"
    )
    parser.add_argument(
        "--chunks",
        type=int,
        default=4,
        help="Number of parallel chunks (default: 4)"
    )
    parser.add_argument(
        "--skip-base",
        action="store_true",
        help="Skip downloading and merging with base release"
    )
    args = parser.parse_args()

    # Determine dates
    if args.start_date:
        start_date = datetime.strptime(args.start_date, "%Y-%m-%d")
    else:
        start_date = datetime.utcnow() - timedelta(days=1)

    end_date = None
    if args.end_date:
        end_date = datetime.strptime(args.end_date, "%Y-%m-%d")

    start_str = start_date.strftime("%Y-%m-%d")
    end_str = end_date.strftime("%Y-%m-%d") if end_date else None

    print("=" * 60)
    print("ADS-B Processing Pipeline")
    print("=" * 60)
    if end_str:
        print(f"Date range: {start_str} to {end_str}")
    else:
        print(f"Date: {start_str}")
    print(f"Chunks: {args.chunks}")
    print("=" * 60)

    # Step 1: Download and extract
    print("\n" + "=" * 60)
    print("Step 1: Download and Extract")
    print("=" * 60)

    if end_str:
        cmd = ["python", "-m", "src.adsb.download_and_list_icaos",
               "--start-date", start_str, "--end-date", end_str]
    else:
        cmd = ["python", "-m", "src.adsb.download_and_list_icaos",
               "--date", start_str]
    run_cmd(cmd, "Download and extract")

    # Step 2: Process chunks
    print("\n" + "=" * 60)
    print("Step 2: Process Chunks")
    print("=" * 60)

    for chunk_id in range(args.chunks):
        print(f"\n--- Chunk {chunk_id + 1}/{args.chunks} ---")
        if end_str:
            cmd = ["python", "-m", "src.adsb.process_icao_chunk",
                   "--chunk-id", str(chunk_id),
                   "--total-chunks", str(args.chunks),
                   "--start-date", start_str,
                   "--end-date", end_str]
        else:
            cmd = ["python", "-m", "src.adsb.process_icao_chunk",
                   "--chunk-id", str(chunk_id),
                   "--total-chunks", str(args.chunks),
                   "--date", start_str]
        run_cmd(cmd, f"Process chunk {chunk_id}")

    # Step 3: Combine chunks to CSV
    print("\n" + "=" * 60)
    print("Step 3: Combine to CSV")
    print("=" * 60)

    chunks_dir = "./data/output/adsb_chunks"
    cmd = ["python", "-m", "src.adsb.combine_chunks_to_csv",
           "--chunks-dir", chunks_dir]

    if end_str:
        cmd.extend(["--start-date", start_str, "--end-date", end_str])
    else:
        cmd.extend(["--date", start_str])

    if args.skip_base:
        cmd.append("--skip-base")

    run_cmd(cmd, "Combine chunks")

    print("\n" + "=" * 60)
    print("Done!")
    print("=" * 60)

    # Show output
    output_dir = "./data/openairframes"
    if end_str:
        output_file = f"openairframes_adsb_{start_str}_{end_str}.csv"
    else:
        output_file = f"openairframes_adsb_{start_str}_{start_str}.csv"

    output_path = os.path.join(output_dir, output_file)
    if os.path.exists(output_path):
        size_mb = os.path.getsize(output_path) / (1024 * 1024)
        print(f"Output: {output_path}")
        print(f"Size: {size_mb:.1f} MB")


if __name__ == "__main__":
    main()
@@ -246,20 +246,6 @@ def process_submission(
    if schema_updated:
        schema_note = f"\n**Schema Updated:** Added new tags: `{', '.join(new_tags)}`\n"

    # Truncate JSON preview to stay under GitHub's 65536 char body limit
    max_json_preview = 50000
    if len(content_json) > max_json_preview:
        # Show first few entries as a preview
        preview_entries = submissions[:10]
        preview_json = json.dumps(preview_entries, indent=2, sort_keys=True)
        json_section = (
            f"### Submissions (showing 10 of {len(submissions)})\n"
            f"```json\n{preview_json}\n```\n\n"
            f"*Full submission ({len(submissions)} entries, {len(content_json):,} chars) is in the committed file.*"
        )
    else:
        json_section = f"### Submissions\n```json\n{content_json}\n```"

    pr_body = f"""## Community Submission

Adds {len(submissions)} submission(s) from @{author_username}.
@@ -271,7 +257,10 @@ Closes #{issue_number}

---

{json_section}"""
### Submissions
```json
{content_json}
```"""

    pr = create_pull_request(
        title=f"Community submission: {filename}",

@@ -24,7 +24,7 @@ def read_all_submissions(community_dir: Path) -> list[dict]:
    """Read all JSON submissions from the community directory."""
    all_submissions = []

    for json_file in sorted(community_dir.glob("**/*.json")):
    for json_file in sorted(community_dir.glob("*.json")):
        try:
            with open(json_file) as f:
                data = json.load(f)

@@ -36,52 +36,6 @@ def get_latest_schema_version() -> int:
    return max_version


def _is_balanced_json(text: str) -> bool:
    """
    Check if JSON has balanced brackets/braces.

    This is a simple check to ensure we captured complete JSON.
    Ignores brackets/braces inside strings.

    Args:
        text: JSON text to check

    Returns:
        True if balanced, False otherwise
    """
    in_string = False
    escape = False
    stack = []
    pairs = {'[': ']', '{': '}'}

    for char in text:
        if escape:
            escape = False
            continue

        if char == '\\':
            escape = True
            continue

        if char == '"' and not escape:
            in_string = not in_string
            continue

        if in_string:
            continue

        if char in pairs:
            stack.append(char)
        elif char in pairs.values():
            if not stack:
                return False
            if pairs[stack[-1]] != char:
                return False
            stack.pop()

    return len(stack) == 0 and not in_string


def get_schema_path(version: int | None = None) -> Path:
    """
    Get path to a specific schema version, or latest if version is None.
@@ -208,14 +162,10 @@ def extract_json_from_issue_body(body: str) -> str | None:
    return match.group(1).strip()

    # Try: Raw JSON after "### Submission JSON" until next section or end
    # Use greedy matching since we have a clear boundary (next ### or end)
    pattern_raw = r"### Submission JSON\s*\n\s*([\[{][\s\S]*[\]}])(?=\s*\n###|\s*$)"
    pattern_raw = r"### Submission JSON\s*\n\s*([\[{][\s\S]*?[\]}])(?=\n###|\n\n###|$)"
    match = re.search(pattern_raw, body)
    if match:
        candidate = match.group(1).strip()
        # Validate it's complete JSON by checking balanced brackets
        if _is_balanced_json(candidate):
            return candidate
        return match.group(1).strip()

    # Try: Any JSON object/array in the body (fallback)
    pattern_any = r"([\[{][\s\S]*?[\]}])"
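The hunk above swaps a non-greedy capture plus a bracket-balance check for a single greedy pattern anchored on the next section header or end of body. The sketch below uses a made-up issue body and deliberately simplified patterns to show why a lazy [\s\S]*? is fragile for nested JSON: without a reliable boundary it can stop at the first closing brace, while the greedy variant with a lookahead keeps the whole array.

import re

body = """### Submission JSON
[
  {"registration": "N12345"},
  {"registration": "D-ABCD"}
]

### Notes
none"""

lazy = r"### Submission JSON\s*\n\s*([\[{][\s\S]*?[\]}])"
greedy = r"### Submission JSON\s*\n\s*([\[{][\s\S]*[\]}])(?=\s*\n###|\s*$)"

print(re.search(lazy, body).group(1))    # stops at the first '}': the second entry is lost
print(re.search(greedy, body).group(1))  # captures the full array up to the closing ']'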
@@ -269,19 +219,7 @@ def parse_and_validate(json_str: str, schema: dict | None = None) -> tuple[list
    try:
        data = json.loads(json_str)
    except json.JSONDecodeError as e:
        # Provide detailed error context
        error_msg = f"Invalid JSON: {e}"

        # Show context around the error position
        if hasattr(e, 'pos') and e.pos is not None:
            start = max(0, e.pos - 50)
            end = min(len(json_str), e.pos + 50)
            context = json_str[start:end]
            # Escape for readability
            context_escaped = repr(context)
            error_msg += f"\n\nContext around position {e.pos}: {context_escaped}"

        return None, [error_msg]
        return None, [f"Invalid JSON: {e}"]

    errors = validate_submission(data, schema)
    return data, errors

@@ -0,0 +1,84 @@
from pathlib import Path
from datetime import datetime, timezone, timedelta
import sys

import polars as pl

# Add adsb directory to path
sys.path.insert(0, str(Path(__file__).parent / "adsb")) # TODO: Fix this hacky path manipulation

from adsb.compress_adsb_to_aircraft_data import (
    load_historical_for_day,
    concat_compressed_dfs,
    get_latest_aircraft_adsb_csv_df,
)

if __name__ == '__main__':
    # Get yesterday's date (data for the previous day)
    day = datetime.now(timezone.utc) - timedelta(days=1)

    # Find a day with complete data
    max_attempts = 2 # Don't look back more than a week
    for attempt in range(max_attempts):
        date_str = day.strftime("%Y-%m-%d")
        print(f"Processing ADS-B data for {date_str}")

        print("Loading new ADS-B data...")
        df_new = load_historical_for_day(day)
        if df_new.height == 0:
            day = day - timedelta(days=1)
            continue
        max_time = df_new['time'].max()
        if max_time is not None:
            # Handle timezone
            max_time_dt = max_time
            if hasattr(max_time_dt, 'replace'):
                max_time_dt = max_time_dt.replace(tzinfo=timezone.utc)

            end_of_day = day.replace(hour=23, minute=59, second=59, tzinfo=timezone.utc) - timedelta(minutes=5)

            # Convert polars datetime to python datetime if needed
            if isinstance(max_time_dt, datetime):
                if max_time_dt.replace(tzinfo=timezone.utc) >= end_of_day:
                    break
            else:
                # Polars returns python datetime already
                if max_time >= day.replace(hour=23, minute=54, second=59):
                    break

        print(f"WARNING: Latest data time is {max_time}, which is more than 5 minutes before end of day.")
        day = day - timedelta(days=1)
    else:
        raise RuntimeError(f"Could not find complete data in the last {max_attempts} days")

    try:
        # Get the latest release data
        print("Downloading latest ADS-B release...")
        df_base, start_date_str = get_latest_aircraft_adsb_csv_df()
        # Combine with historical data
        print("Combining with historical data...")
        df_combined = concat_compressed_dfs(df_base, df_new)
    except Exception as e:
        print(f"Error downloading latest ADS-B release: {e}")
        df_combined = df_new
        start_date_str = date_str

    # Sort by time for consistent ordering
    df_combined = df_combined.sort('time')

    # Convert any list columns to strings for CSV compatibility
    for col in df_combined.columns:
        if df_combined[col].dtype == pl.List:
            df_combined = df_combined.with_columns(
                pl.col(col).list.join(",").alias(col)
            )

    # Save the result
    OUT_ROOT = Path("data/openairframes")
    OUT_ROOT.mkdir(parents=True, exist_ok=True)

    output_file = OUT_ROOT / f"openairframes_adsb_{start_date_str}_{date_str}.csv.gz"
    df_combined.write_csv(output_file)

    print(f"Saved: {output_file}")
    print(f"Total aircraft: {df_combined.height}")
+25 -87
@@ -27,33 +27,6 @@ def _http_get_json(url: str, headers: dict[str, str]) -> dict:
    return json.loads(data.decode("utf-8"))


def get_releases(repo: str = REPO, github_token: Optional[str] = None, per_page: int = 30) -> list[dict]:
    """Get a list of releases from the repository."""
    url = f"https://api.github.com/repos/{repo}/releases?per_page={per_page}"
    headers = {
        "Accept": "application/vnd.github+json",
        "User-Agent": "openairframes-downloader/1.0",
    }
    if github_token:
        headers["Authorization"] = f"Bearer {github_token}"

    return _http_get_json(url, headers=headers)


def get_release_assets_from_release_data(release_data: dict) -> list[ReleaseAsset]:
    """Extract assets from a release data dictionary."""
    assets = []
    for a in release_data.get("assets", []):
        assets.append(
            ReleaseAsset(
                name=a["name"],
                download_url=a["browser_download_url"],
                size=int(a.get("size", 0)),
            )
        )
    return assets


def get_latest_release_assets(repo: str = REPO, github_token: Optional[str] = None) -> list[ReleaseAsset]:
    url = f"https://api.github.com/repos/{repo}/releases/latest"
    headers = {
@@ -64,7 +37,16 @@ def get_latest_release_assets(repo: str = REPO, github_token: Optional[str] = No
        headers["Authorization"] = f"Bearer {github_token}"

    payload = _http_get_json(url, headers=headers)
    return get_release_assets_from_release_data(payload)
    assets = []
    for a in payload.get("assets", []):
        assets.append(
            ReleaseAsset(
                name=a["name"],
                download_url=a["browser_download_url"],
                size=int(a.get("size", 0)),
            )
        )
    return assets


def pick_asset(
@@ -173,8 +155,7 @@ def download_latest_aircraft_adsb_csv(
    repo: str = REPO,
) -> Path:
    """
    Download the latest openairframes_adsb_*.csv file from GitHub releases.
    If the latest release doesn't have the file, searches previous releases.
    Download the latest openairframes_adsb_*.csv file from the latest GitHub release.

    Args:
        output_dir: Directory to save the downloaded file (default: "downloads")
@@ -185,69 +166,26 @@
        Path to the downloaded file
    """
    output_dir = Path(output_dir)

    # Get multiple releases
    releases = get_releases(repo, github_token=github_token, per_page=30)

    # Try each release until we find one with the matching asset
    for release in releases:
        assets = get_release_assets_from_release_data(release)
        try:
            asset = pick_asset(assets, name_regex=r"^openairframes_adsb_.*\.csv(\.gz)?$")
            saved_to = download_asset(asset, output_dir / asset.name, github_token=github_token)
            print(f"Downloaded: {asset.name} ({asset.size} bytes) -> {saved_to}")
            return saved_to
        except FileNotFoundError:
            # This release doesn't have the matching asset, try the next one
            continue

    raise FileNotFoundError(
        f"No release in the last 30 releases has an asset matching 'openairframes_adsb_.*\\.csv(\\.gz)?$'"
    )
    assets = get_latest_release_assets(repo, github_token=github_token)
    asset = pick_asset(assets, name_regex=r"^openairframes_adsb_.*\.csv(\.gz)?$")
    saved_to = download_asset(asset, output_dir / asset.name, github_token=github_token)
    print(f"Downloaded: {asset.name} ({asset.size} bytes) -> {saved_to}")
    return saved_to


import polars as pl
def get_latest_aircraft_adsb_csv_df():
    """Download and load the latest ADS-B CSV from GitHub releases.

    Returns:
        tuple: (df, start_date, end_date) where dates are in YYYY-MM-DD format
    """
    import re

    csv_path = download_latest_aircraft_adsb_csv()
    df = pl.read_csv(csv_path, null_values=[""])

    # Parse time column: values like "2025-12-31T00:00:00.040" or "2025-05-11T15:15:50.540+0000"
    # Try with timezone first (convert to naive), then without timezone
    df = df.with_columns(
        pl.col("time").str.strptime(pl.Datetime("ms"), "%Y-%m-%dT%H:%M:%S%.f%z", strict=False)
        .dt.replace_time_zone(None) # Convert to naive datetime first
        .fill_null(pl.col("time").str.strptime(pl.Datetime("ms"), "%Y-%m-%dT%H:%M:%S%.f", strict=False))
    )

    # Cast dbFlags and year to strings to match the schema used in compress functions
    for col in ['dbFlags', 'year']:
        if col in df.columns:
            df = df.with_columns(pl.col(col).cast(pl.Utf8))

    # Fill nulls with empty strings for string columns
    for col in df.columns:
        if df[col].dtype == pl.Utf8:
            df = df.with_columns(pl.col(col).fill_null(""))

    # Extract start and end dates from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv[.gz]
    match = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})\.csv", str(csv_path))
    import pandas as pd
    df = pd.read_csv(csv_path)
    df = df.fillna("")
    # Extract start date from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv[.gz]
    match = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path))
    if not match:
        raise ValueError(f"Could not extract dates from filename: {csv_path.name}")
        raise ValueError(f"Could not extract date from filename: {csv_path.name}")

    start_date = match.group(1)
    end_date = match.group(2)
    print(df.columns)
    print(df.dtypes)
    return df, start_date, end_date

    date_str = match.group(1)
    return df, date_str


if __name__ == "__main__":
    download_latest_aircraft_csv()
    download_latest_aircraft_adsb_csv()