Merge pull request #3 from PlaneQuery/develop

Merge develop into main: new historical ADS-B processing workflow and community submission form updates.
This commit is contained in:
JG
2026-02-11 23:42:40 -05:00
committed by GitHub
7 changed files with 367 additions and 310 deletions
@@ -43,7 +43,7 @@ body:
id: contributor_name
attributes:
label: Contributor Name
description: Your display name for attribution. Leave blank to use your GitHub username. Max 150 characters.
description: Your display name for attribution. Leave blank for no attribution. Max 150 characters.
placeholder: "e.g., JamesBerry.com or leave blank"
validations:
required: false
@@ -58,28 +58,6 @@ body:
validations:
required: true
- type: dropdown
id: submission_type
attributes:
label: What did you submit?
options:
- Single object
- Multiple objects (array)
validations:
required: true
- type: checkboxes
id: confirmations
attributes:
label: Confirmations
options:
- label: "I confirm this is valid JSON (not JSONL) and matches the field names exactly."
required: true
- label: "I confirm `transponder_code_hex` values (if provided) are 6 hex characters."
required: true
- label: "I understand submissions are reviewed and may be rejected or require changes."
required: true
- type: textarea
id: notes
attributes:
+128
View File
@@ -0,0 +1,128 @@
# Manually triggered fan-out workflow: split a date range into chunks,
# process each chunk in parallel, then combine the results into one artifact.
name: Historical ADS-B Processing

on:
  workflow_dispatch:
    inputs:
      start_date:
        description: 'Start date (YYYY-MM-DD, inclusive)'
        required: true
        type: string
      end_date:
        description: 'End date (YYYY-MM-DD, inclusive)'
        required: true
        type: string
      chunk_days:
        description: 'Days per job chunk (default: 7)'
        required: false
        type: number
        default: 7

jobs:
  # Splits [start_date, end_date] into chunk_days-sized windows and exposes
  # them as a JSON matrix for the fan-out job below.
  generate-matrix:
    runs-on: ubuntu-latest
    outputs:
      chunks: ${{ steps.generate.outputs.chunks }}
      # Re-exported so downstream jobs can name artifacts after the full range.
      global_start: ${{ inputs.start_date }}
      global_end: ${{ inputs.end_date }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Generate date chunks
        id: generate
        env:
          INPUT_START_DATE: ${{ inputs.start_date }}
          INPUT_END_DATE: ${{ inputs.end_date }}
          INPUT_CHUNK_DAYS: ${{ inputs.chunk_days }}
        run: python src/adsb/historical_generate_matrix.py

  # One job per date chunk; a failing chunk does not cancel its siblings.
  process-chunk:
    needs: generate-matrix
    runs-on: ubuntu-latest
    strategy:
      matrix:
        chunk: ${{ fromJson(needs.generate-matrix.outputs.chunks) }}
      # max-parallel/fail-fast belong to strategy, not matrix.
      max-parallel: 3
      fail-fast: false
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install polars pyarrow orjson zstandard

      # Hosted runners are tight on disk; drop large preinstalled toolchains.
      - name: Free disk space
        run: |
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /opt/ghc
          sudo rm -rf /usr/local/share/boost
          df -h

      - name: Process date range
        env:
          CHUNK_START_DATE: ${{ matrix.chunk.start_date }}
          CHUNK_END_DATE: ${{ matrix.chunk.end_date }}
        working-directory: src/adsb
        run: python historical_process_chunk.py

      - name: Upload chunk artifact
        uses: actions/upload-artifact@v4
        with:
          name: chunk-${{ matrix.chunk.start_date }}-${{ matrix.chunk.end_date }}
          # The processing script writes to <repo root>/data/chunks.
          path: data/chunks/*.csv
          retention-days: 1
          # Empty chunks (no data for the range) are legitimate.
          if-no-files-found: ignore

  # Merges every chunk CSV into the final release artifact.
  combine-chunks:
    needs: [generate-matrix, process-chunk]
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install polars

      - name: Download all chunk artifacts
        uses: actions/download-artifact@v4
        with:
          path: chunks
          pattern: chunk-*
          merge-multiple: true

      - name: List downloaded chunks
        run: |
          echo "Downloaded chunks:"
          find chunks -name "*.csv" -type f 2>/dev/null || echo "No CSV files found"

      - name: Combine chunks
        env:
          GLOBAL_START_DATE: ${{ needs.generate-matrix.outputs.global_start }}
          GLOBAL_END_DATE: ${{ needs.generate-matrix.outputs.global_end }}
        run: python src/adsb/historical_combine_chunks.py

      - name: Upload final artifact
        uses: actions/upload-artifact@v4
        with:
          name: planequery_aircraft_adsb-${{ needs.generate-matrix.outputs.global_start }}-${{ needs.generate-matrix.outputs.global_end }}
          path: data/planequery_aircraft/*.csv
          retention-days: 30
@@ -1,171 +0,0 @@
# Manually triggered: rebuild historical FAA registry data from the scraped
# git history, process it in parallel 4-day chunks, and publish releases.
name: Process Historical FAA Data

on:
  workflow_dispatch:  # Manual trigger

jobs:
  generate-matrix:
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
      - name: Generate date ranges
        id: set-matrix
        run: |
          python3 << 'EOF'
          import json
          import os
          from datetime import datetime, timedelta

          start = datetime(2023, 8, 16)
          end = datetime(2026, 1, 1)
          ranges = []
          current = start
          # Process in 4-day chunks
          while current < end:
              chunk_end = current + timedelta(days=4)
              # Don't go past the end date
              if chunk_end > end:
                  chunk_end = end
              ranges.append({
                  "since": current.strftime("%Y-%m-%d"),
                  "until": chunk_end.strftime("%Y-%m-%d")
              })
              current = chunk_end
          # ::set-output is deprecated/removed; append to GITHUB_OUTPUT instead.
          with open(os.environ["GITHUB_OUTPUT"], "a") as f:
              f.write(f"matrix={json.dumps(ranges)}\n")
          EOF

  # Clones (or restores from cache) the scraped FAA registry repository.
  clone-faa-repo:
    runs-on: ubuntu-latest
    steps:
      - name: Cache FAA repository
        id: cache-faa-repo
        uses: actions/cache@v4
        with:
          path: data/scrape-faa-releasable-aircraft
          key: faa-repo-v1

      - name: Clone FAA repository
        if: steps.cache-faa-repo.outputs.cache-hit != 'true'
        run: |
          mkdir -p data
          git clone https://github.com/simonw/scrape-faa-releasable-aircraft data/scrape-faa-releasable-aircraft
          echo "Repository cloned successfully"

  process-chunk:
    needs: [generate-matrix, clone-faa-repo]
    runs-on: ubuntu-latest
    strategy:
      max-parallel: 5  # Process 5 chunks at a time
      matrix:
        range: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Restore FAA repository cache
        uses: actions/cache/restore@v4
        with:
          path: data/scrape-faa-releasable-aircraft
          key: faa-repo-v1
          fail-on-cache-miss: true

      - name: Install dependencies
        run: |
          pip install -r requirements.txt

      - name: Process chunk ${{ matrix.range.since }} to ${{ matrix.range.until }}
        run: |
          python src/get_historical_faa.py "${{ matrix.range.since }}" "${{ matrix.range.until }}"

      - name: Upload CSV artifact
        uses: actions/upload-artifact@v4
        with:
          name: csv-${{ matrix.range.since }}-to-${{ matrix.range.until }}
          path: data/faa_releasable_historical/*.csv
          retention-days: 1

  # Publishes each chunk CSV as-is.
  create-release:
    needs: process-chunk
    runs-on: ubuntu-latest
    permissions:
      contents: write
    steps:
      - name: Download all artifacts
        uses: actions/download-artifact@v4
        with:
          path: artifacts

      - name: Prepare release files
        run: |
          mkdir -p release-files
          find artifacts -name "*.csv" -exec cp {} release-files/ \;
          ls -lh release-files/

      - name: Create Release
        uses: softprops/action-gh-release@v1
        with:
          tag_name: historical-faa-${{ github.run_number }}
          name: Historical FAA Data Release ${{ github.run_number }}
          body: |
            Automated release of historical FAA aircraft data
            Processing period: 2023-08-16 to 2026-01-01
            Generated: ${{ github.event.repository.updated_at }}
          files: release-files/*.csv
          draft: false
          prerelease: false

  # Publishes one concatenated CSV covering every chunk.
  concatenate-and-release:
    needs: process-chunk
    runs-on: ubuntu-latest
    permissions:
      contents: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Install dependencies
        run: |
          pip install -r requirements.txt

      - name: Download all artifacts
        uses: actions/download-artifact@v4
        with:
          path: artifacts

      - name: Prepare CSVs for concatenation
        run: |
          mkdir -p data/faa_releasable_historical
          find artifacts -name "*.csv" -exec cp {} data/faa_releasable_historical/ \;
          ls -lh data/faa_releasable_historical/

      - name: Concatenate all CSVs
        run: |
          python scripts/concat_csvs.py

      - name: Create Combined Release
        uses: softprops/action-gh-release@v1
        with:
          tag_name: historical-faa-combined-${{ github.run_number }}
          name: Historical FAA Data Combined Release ${{ github.run_number }}
          body: |
            Combined historical FAA aircraft data (all chunks concatenated)
            Processing period: 2023-08-16 to 2026-01-01
            Generated: ${{ github.event.repository.updated_at }}
          # NOTE(review): assumes concat_csvs.py writes to data/planequery_aircraft
          # rather than data/faa_releasable_historical — verify against the script.
          files: data/planequery_aircraft/*.csv
          draft: false
          prerelease: false
+85
View File
@@ -0,0 +1,85 @@
#!/usr/bin/env python3
"""Combine processed chunks into final historical ADS-B release."""
import os
import sys
from pathlib import Path
import polars as pl
def combine_chunks(chunks_dir: Path, output_dir: Path, start_date: str, end_date: str) -> Path:
    """Concatenate every chunk CSV, dedupe, and write the final release file.

    Args:
        chunks_dir: Directory containing chunk CSV files.
        output_dir: Directory to write the final output into.
        start_date: Global start date, used in the output filename.
        end_date: Global end date, used in the output filename.

    Returns:
        Path to the final combined CSV.
    """
    # Deferred import so the script can also be launched from the repo root.
    sys.path.insert(0, str(Path(__file__).parent))
    from compress_adsb_to_aircraft_data import deduplicate_by_signature

    chunk_paths = sorted(chunks_dir.glob("**/*.csv"))
    print(f"Found {len(chunk_paths)} chunk files")
    if not chunk_paths:
        print("ERROR: No chunk files found", file=sys.stderr)
        sys.exit(1)

    frames: list[pl.DataFrame] = []
    for path in chunk_paths:
        print(f"Loading {path}")
        frame = pl.read_csv(path, null_values=[""])
        frames.append(frame)
        print(f"  {frame.height} rows")

    combined = pl.concat(frames)
    print(f"Combined: {combined.height} rows")

    combined = deduplicate_by_signature(combined)
    print(f"After final dedup: {combined.height} rows")

    # Chronological ordering when a time column exists.
    if "time" in combined.columns:
        combined = combined.sort("time")

    # CSV cannot hold list columns: flatten them to comma-joined strings.
    list_cols = [c for c in combined.columns if combined[c].dtype == pl.List]
    for col_name in list_cols:
        combined = combined.with_columns(pl.col(col_name).list.join(",").alias(col_name))

    output_dir.mkdir(parents=True, exist_ok=True)
    final_path = output_dir / f"planequery_aircraft_adsb_{start_date}_{end_date}.csv"
    combined.write_csv(final_path)
    print(f"Wrote final output: {final_path}")
    print(f"Total records: {combined.height}")
    return final_path
def main() -> None:
    """GitHub Actions entry point: read the global date range from env vars."""
    start = os.environ.get("GLOBAL_START_DATE")
    end = os.environ.get("GLOBAL_END_DATE")
    if not (start and end):
        print("ERROR: GLOBAL_START_DATE and GLOBAL_END_DATE must be set", file=sys.stderr)
        sys.exit(1)
    # Artifacts are downloaded into ./chunks; the release CSV goes under data/.
    combine_chunks(Path("chunks"), Path("data/planequery_aircraft"), start, end)


if __name__ == "__main__":
    main()
+62
View File
@@ -0,0 +1,62 @@
#!/usr/bin/env python3
"""Generate date chunk matrix for historical ADS-B processing."""
import json
import os
import sys
from datetime import datetime, timedelta
def generate_chunks(start_date: str, end_date: str, chunk_days: int) -> list[dict]:
    """Generate date chunks for parallel processing.

    Args:
        start_date: Start date in YYYY-MM-DD format (inclusive).
        end_date: End date in YYYY-MM-DD format (inclusive).
        chunk_days: Number of days per chunk; must be >= 1.

    Returns:
        List of chunk dictionaries with "start_date" and "end_date" keys,
        both inclusive. Empty when start_date is after end_date.

    Raises:
        ValueError: If chunk_days < 1 (which would otherwise loop forever),
            or if a date does not match YYYY-MM-DD.
    """
    # Guard: with chunk_days == 0 the original loop never advanced `current`.
    if chunk_days < 1:
        raise ValueError(f"chunk_days must be >= 1, got {chunk_days}")
    start = datetime.strptime(start_date, "%Y-%m-%d")
    end = datetime.strptime(end_date, "%Y-%m-%d")
    chunks = []
    current = start
    while current <= end:
        # Inclusive window: a 7-day chunk spans current .. current+6.
        chunk_end = min(current + timedelta(days=chunk_days - 1), end)
        chunks.append({
            "start_date": current.strftime("%Y-%m-%d"),
            "end_date": chunk_end.strftime("%Y-%m-%d"),
        })
        current = chunk_end + timedelta(days=1)
    return chunks
def main() -> None:
    """GitHub Actions entry point: env vars in, GITHUB_OUTPUT (or stdout) out."""
    start_date = os.environ.get("INPUT_START_DATE")
    end_date = os.environ.get("INPUT_END_DATE")
    chunk_days = int(os.environ.get("INPUT_CHUNK_DAYS", "7"))
    if not (start_date and end_date):
        print("ERROR: INPUT_START_DATE and INPUT_END_DATE must be set", file=sys.stderr)
        sys.exit(1)

    chunks = generate_chunks(start_date, end_date, chunk_days)
    print(f"Generated {len(chunks)} chunks for {start_date} to {end_date}")

    output_file = os.environ.get("GITHUB_OUTPUT")
    if output_file:
        # Append the step output in the `name=value` form Actions expects.
        with open(output_file, "a") as fh:
            fh.write(f"chunks={json.dumps(chunks)}\n")
    else:
        # Local runs have no GITHUB_OUTPUT: dump the matrix for inspection.
        print(json.dumps(chunks, indent=2))


if __name__ == "__main__":
    main()
+91
View File
@@ -0,0 +1,91 @@
#!/usr/bin/env python3
"""Process a single date chunk for historical ADS-B data."""
import os
import sys
from datetime import datetime, timedelta
from pathlib import Path
# Add parent directory to path for imports when run from repo root
sys.path.insert(0, str(Path(__file__).parent))
def process_chunk(start_date: str, end_date: str, output_dir: Path) -> Path | None:
    """Process a date range and output a deduplicated, compressed CSV.

    Args:
        start_date: Start date in YYYY-MM-DD format (inclusive).
        end_date: End date in YYYY-MM-DD format (inclusive).
        output_dir: Directory to write the output CSV into.

    Returns:
        Path to the output CSV, or None if no data was found for the range.
    """
    # Deferred imports keep the module importable without project deps loaded.
    from compress_adsb_to_aircraft_data import (
        load_historical_for_day,
        deduplicate_by_signature,
    )
    import polars as pl

    start = datetime.strptime(start_date, "%Y-%m-%d")
    end = datetime.strptime(end_date, "%Y-%m-%d")
    total_days = (end - start).days + 1
    print(f"Processing {total_days} days [{start_date}, {end_date}]")

    dfs: list[pl.DataFrame] = []
    # Running total instead of re-summing all frames each day (was O(n^2)).
    total_rows = 0
    current_date = start
    while current_date <= end:
        day_str = current_date.strftime("%Y-%m-%d")
        print(f" Loading {day_str}...")
        try:
            df_compressed = load_historical_for_day(current_date)
            if df_compressed.height > 0:
                dfs.append(df_compressed)
                total_rows += df_compressed.height
                print(f" +{df_compressed.height} rows (total: {total_rows})")
        except Exception as e:
            # Best-effort: one bad day should not fail the whole chunk.
            print(f" Warning: Failed to load {day_str}: {e}")
        current_date += timedelta(days=1)

    if not dfs:
        print("No data found for this chunk")
        return None

    df_accumulated = pl.concat(dfs)
    df_accumulated = deduplicate_by_signature(df_accumulated)
    print(f"After dedup: {df_accumulated.height} rows")

    # Write output
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / f"chunk_{start_date}_{end_date}.csv"
    df_accumulated.write_csv(output_path)
    print(f"Wrote {output_path}")
    return output_path
def main() -> None:
    """GitHub Actions entry point: chunk bounds come from the environment."""
    start = os.environ.get("CHUNK_START_DATE")
    end = os.environ.get("CHUNK_END_DATE")
    if not (start and end):
        print("ERROR: CHUNK_START_DATE and CHUNK_END_DATE must be set", file=sys.stderr)
        sys.exit(1)

    # This script runs from src/adsb; outputs go to <repo root>/data/chunks.
    repo_root = Path(__file__).parent.parent.parent
    produced = process_chunk(start, end, repo_root / "data" / "chunks")
    if produced is None:
        # An empty chunk is not an error: exit 0 so the matrix job succeeds.
        print("No data produced for this chunk")
        sys.exit(0)


if __name__ == "__main__":
    main()
-116
View File
@@ -1,116 +0,0 @@
"""
For each commit-day in Feb 2024 (last commit per day):
- Write ALL FAA text files from that commit into: data/faa_releasable_historical/YYYY-MM-DD/
ACFTREF.txt, DEALER.txt, DOCINDEX.txt, ENGINE.txt, RESERVED.txt
- Recombine MASTER-*.txt into Master.txt
- Produce Master.csv via convert_faa_master_txt_to_csv
Assumes the non-master files are present in every commit.
"""
import subprocess, re
from pathlib import Path
import shutil
from collections import OrderedDict
from derive_from_faa_master_txt import convert_faa_master_txt_to_df, concat_faa_historical_df
import zipfile
import pandas as pd
import argparse
from datetime import datetime, timedelta
# Parse command line arguments
parser = argparse.ArgumentParser(description="Process historical FAA data from git commits")
parser.add_argument("since", help="Start date (YYYY-MM-DD)")
parser.add_argument("until", help="End date (YYYY-MM-DD)")
args = parser.parse_args()
# Clone repository if it doesn't exist
REPO = Path("data/scrape-faa-releasable-aircraft")
OUT_ROOT = Path("data/faa_releasable_historical")
OUT_ROOT.mkdir(parents=True, exist_ok=True)
def run_git_text(*args: str) -> str:
return subprocess.check_output(["git", "-C", str(REPO), *args], text=True).strip()
def run_git_bytes(*args: str) -> bytes:
return subprocess.check_output(["git", "-C", str(REPO), *args])
# Parse dates and adjust --since to the day before
since_date = datetime.strptime(args.since, "%Y-%m-%d")
adjusted_since = (since_date - timedelta(days=1)).strftime("%Y-%m-%d")
# All commits in specified date range (oldest -> newest)
log = run_git_text(
"log",
"--reverse",
"--format=%H %cs",
f"--since={adjusted_since}",
f"--until={args.until}",
)
lines = [ln for ln in log.splitlines() if ln.strip()]
if not lines:
raise SystemExit(f"No commits found between {args.since} and {args.until}.")
# date -> last SHA that day
date_to_sha = OrderedDict()
for ln in lines:
sha, date = ln.split()
date_to_sha[date] = sha
OTHER_FILES = ["ACFTREF.txt", "DEALER.txt", "DOCINDEX.txt", "ENGINE.txt", "RESERVED.txt"]
master_re = re.compile(r"^MASTER-(\d+)\.txt$")
df_base = pd.DataFrame()
start_date = None
end_date = None
for date, sha in date_to_sha.items():
if start_date is None:
start_date = date
end_date = date
day_dir = OUT_ROOT / date
day_dir.mkdir(parents=True, exist_ok=True)
# Write auxiliary files (assumed present)
for fname in OTHER_FILES:
(day_dir / fname).write_bytes(run_git_bytes("show", f"{sha}:{fname}"))
# Recombine MASTER parts
names = run_git_text("ls-tree", "--name-only", sha).splitlines()
parts = []
for n in names:
m = master_re.match(n)
if m:
parts.append((int(m.group(1)), n))
parts.sort()
if not parts:
raise RuntimeError(f"{date} {sha[:7]}: no MASTER-*.txt parts found")
master_path = day_dir / "MASTER.txt"
with master_path.open("wb") as w:
for _, fname in parts:
data = run_git_bytes("show", f"{sha}:{fname}")
w.write(data)
if data and not data.endswith(b"\n"):
w.write(b"\n")
# 3) Zip the day's files
zip_path = day_dir / f"ReleasableAircraft.zip"
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as z:
for p in day_dir.iterdir():
z.write(p, arcname=p.name)
print(f"{date} {sha[:7]} -> {day_dir} (master parts: {len(parts)})")
# 4) Convert ZIP -> CSV
df_new = convert_faa_master_txt_to_df(zip_path, date)
if df_base.empty:
df_base = df_new
print(len(df_base), "total entries so far")
# Delete all files in the day directory
shutil.rmtree(day_dir)
continue
df_base = concat_faa_historical_df(df_base, df_new)
shutil.rmtree(day_dir)
print(len(df_base), "total entries so far")
assert df_base['download_date'].is_monotonic_increasing, "download_date is not monotonic increasing"
df_base.to_csv(OUT_ROOT / f"planequery_aircraft_faa_{start_date}_{end_date}.csv", index=False)
# TODO: get average number of new rows per day.