From 4e803dbb45743d83a0452e222836d782fffeace3 Mon Sep 17 00:00:00 2001 From: ggman12 Date: Wed, 11 Feb 2026 23:28:50 -0500 Subject: [PATCH 1/3] remove confirmations --- .../ISSUE_TEMPLATE/community_submission.yaml | 24 +------------------ 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/community_submission.yaml b/.github/ISSUE_TEMPLATE/community_submission.yaml index 104aa8e..938843d 100644 --- a/.github/ISSUE_TEMPLATE/community_submission.yaml +++ b/.github/ISSUE_TEMPLATE/community_submission.yaml @@ -43,7 +43,7 @@ body: id: contributor_name attributes: label: Contributor Name - description: Your display name for attribution. Leave blank to use your GitHub username. Max 150 characters. + description: Your display name for attribution. Leave blank for no attribution. Max 150 characters. placeholder: "e.g., JamesBerry.com or leave blank" validations: required: false @@ -58,28 +58,6 @@ body: validations: required: true - - type: dropdown - id: submission_type - attributes: - label: What did you submit? - options: - - Single object - - Multiple objects (array) - validations: - required: true - - - type: checkboxes - id: confirmations - attributes: - label: Confirmations - options: - - label: "I confirm this is valid JSON (not JSONL) and matches the field names exactly." - required: true - - label: "I confirm `transponder_code_hex` values (if provided) are 6 hex characters." - required: true - - label: "I understand submissions are reviewed and may be rejected or require changes." 
- required: true - - type: textarea id: notes attributes: From e5c99b611cfb50c4a5826402e4f55afb2a735d16 Mon Sep 17 00:00:00 2001 From: ggman12 Date: Wed, 11 Feb 2026 23:39:19 -0500 Subject: [PATCH 2/3] make a historical runner for adsb --- .github/workflows/historical-adsb.yaml | 128 +++++++++++++++++++++++++ src/adsb/historical_combine_chunks.py | 85 ++++++++++++++++ src/adsb/historical_generate_matrix.py | 62 ++++++++++++ src/adsb/historical_process_chunk.py | 91 ++++++++++++++++++ 4 files changed, 366 insertions(+) create mode 100644 .github/workflows/historical-adsb.yaml create mode 100644 src/adsb/historical_combine_chunks.py create mode 100644 src/adsb/historical_generate_matrix.py create mode 100644 src/adsb/historical_process_chunk.py diff --git a/.github/workflows/historical-adsb.yaml b/.github/workflows/historical-adsb.yaml new file mode 100644 index 0000000..db85e99 --- /dev/null +++ b/.github/workflows/historical-adsb.yaml @@ -0,0 +1,128 @@ +name: Historical ADS-B Processing + +on: + workflow_dispatch: + inputs: + start_date: + description: 'Start date (YYYY-MM-DD, inclusive)' + required: true + type: string + end_date: + description: 'End date (YYYY-MM-DD, inclusive)' + required: true + type: string + chunk_days: + description: 'Days per job chunk (default: 7)' + required: false + type: number + default: 7 + +jobs: + generate-matrix: + runs-on: ubuntu-latest + outputs: + chunks: ${{ steps.generate.outputs.chunks }} + global_start: ${{ inputs.start_date }} + global_end: ${{ inputs.end_date }} + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Generate date chunks + id: generate + env: + INPUT_START_DATE: ${{ inputs.start_date }} + INPUT_END_DATE: ${{ inputs.end_date }} + INPUT_CHUNK_DAYS: ${{ inputs.chunk_days }} + run: python src/adsb/historical_generate_matrix.py + + process-chunk: + needs: generate-matrix + runs-on: ubuntu-latest + strategy: + 
matrix: + chunk: ${{ fromJson(needs.generate-matrix.outputs.chunks) }} + max-parallel: 3 + fail-fast: false + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install polars pyarrow orjson zstandard + + - name: Free disk space + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf /usr/local/share/boost + df -h + + - name: Process date range + env: + CHUNK_START_DATE: ${{ matrix.chunk.start_date }} + CHUNK_END_DATE: ${{ matrix.chunk.end_date }} + working-directory: src/adsb + run: python historical_process_chunk.py + + - name: Upload chunk artifact + uses: actions/upload-artifact@v4 + with: + name: chunk-${{ matrix.chunk.start_date }}-${{ matrix.chunk.end_date }} + path: data/chunks/*.csv + retention-days: 1 + if-no-files-found: ignore + + combine-chunks: + needs: [generate-matrix, process-chunk] + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install polars + + - name: Download all chunk artifacts + uses: actions/download-artifact@v4 + with: + path: chunks + pattern: chunk-* + merge-multiple: true + + - name: List downloaded chunks + run: | + echo "Downloaded chunks:" + find chunks -name "*.csv" -type f 2>/dev/null || echo "No CSV files found" + + - name: Combine chunks + env: + GLOBAL_START_DATE: ${{ needs.generate-matrix.outputs.global_start }} + GLOBAL_END_DATE: ${{ needs.generate-matrix.outputs.global_end }} + run: python src/adsb/historical_combine_chunks.py + + - name: Upload final artifact + uses: actions/upload-artifact@v4 + with: + name: planequery_aircraft_adsb-${{ needs.generate-matrix.outputs.global_start }}-${{ 
needs.generate-matrix.outputs.global_end }} + path: data/planequery_aircraft/*.csv + retention-days: 30 diff --git a/src/adsb/historical_combine_chunks.py b/src/adsb/historical_combine_chunks.py new file mode 100644 index 0000000..06ce5c6 --- /dev/null +++ b/src/adsb/historical_combine_chunks.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python3 +"""Combine processed chunks into final historical ADS-B release.""" + +import os +import sys +from pathlib import Path + +import polars as pl + + +def combine_chunks(chunks_dir: Path, output_dir: Path, start_date: str, end_date: str) -> Path: + """Combine all chunk CSVs into final output. + + Args: + chunks_dir: Directory containing chunk CSV files + output_dir: Directory to write final output + start_date: Global start date for filename + end_date: Global end date for filename + + Returns: + Path to final output CSV + """ + # Import here to allow script to be run from repo root + sys.path.insert(0, str(Path(__file__).parent)) + from compress_adsb_to_aircraft_data import deduplicate_by_signature + + csv_files = sorted(chunks_dir.glob("**/*.csv")) + print(f"Found {len(csv_files)} chunk files") + + if not csv_files: + print("ERROR: No chunk files found", file=sys.stderr) + sys.exit(1) + + dfs: list[pl.DataFrame] = [] + for csv_file in csv_files: + print(f"Loading {csv_file}") + df = pl.read_csv(csv_file, null_values=[""]) + dfs.append(df) + print(f" {df.height} rows") + + df_combined = pl.concat(dfs) + print(f"Combined: {df_combined.height} rows") + + df_combined = deduplicate_by_signature(df_combined) + print(f"After final dedup: {df_combined.height} rows") + + # Sort by time + if "time" in df_combined.columns: + df_combined = df_combined.sort("time") + + # Convert list columns to strings for CSV compatibility + for col in df_combined.columns: + if df_combined[col].dtype == pl.List: + df_combined = df_combined.with_columns( + pl.col(col).list.join(",").alias(col) + ) + + # Write output + output_dir.mkdir(parents=True, exist_ok=True) 
+ output_path = output_dir / f"planequery_aircraft_adsb_{start_date}_{end_date}.csv" + + df_combined.write_csv(output_path) + print(f"Wrote final output: {output_path}") + print(f"Total records: {df_combined.height}") + + return output_path + + +def main() -> None: + """Main entry point for GitHub Actions.""" + start_date = os.environ.get("GLOBAL_START_DATE") + end_date = os.environ.get("GLOBAL_END_DATE") + + if not start_date or not end_date: + print("ERROR: GLOBAL_START_DATE and GLOBAL_END_DATE must be set", file=sys.stderr) + sys.exit(1) + + chunks_dir = Path("chunks") + output_dir = Path("data/planequery_aircraft") + + combine_chunks(chunks_dir, output_dir, start_date, end_date) + + +if __name__ == "__main__": + main() diff --git a/src/adsb/historical_generate_matrix.py b/src/adsb/historical_generate_matrix.py new file mode 100644 index 0000000..3a687e5 --- /dev/null +++ b/src/adsb/historical_generate_matrix.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 +"""Generate date chunk matrix for historical ADS-B processing.""" + +import json +import os +import sys +from datetime import datetime, timedelta + + +def generate_chunks(start_date: str, end_date: str, chunk_days: int) -> list[dict]: + """Generate date chunks for parallel processing. 
+ + Args: + start_date: Start date in YYYY-MM-DD format + end_date: End date in YYYY-MM-DD format + chunk_days: Number of days per chunk + + Returns: + List of chunk dictionaries with start_date and end_date + """ + start = datetime.strptime(start_date, "%Y-%m-%d") + end = datetime.strptime(end_date, "%Y-%m-%d") + + chunks = [] + current = start + + while current <= end: + chunk_end = min(current + timedelta(days=chunk_days - 1), end) + chunks.append({ + "start_date": current.strftime("%Y-%m-%d"), + "end_date": chunk_end.strftime("%Y-%m-%d"), + }) + current = chunk_end + timedelta(days=1) + + return chunks + + +def main() -> None: + """Main entry point for GitHub Actions.""" + start_date = os.environ.get("INPUT_START_DATE") + end_date = os.environ.get("INPUT_END_DATE") + chunk_days = int(os.environ.get("INPUT_CHUNK_DAYS", "7")) + + if not start_date or not end_date: + print("ERROR: INPUT_START_DATE and INPUT_END_DATE must be set", file=sys.stderr) + sys.exit(1) + + chunks = generate_chunks(start_date, end_date, chunk_days) + print(f"Generated {len(chunks)} chunks for {start_date} to {end_date}") + + # Write to GitHub Actions output + github_output = os.environ.get("GITHUB_OUTPUT") + if github_output: + with open(github_output, "a") as f: + f.write(f"chunks={json.dumps(chunks)}\n") + else: + # For local testing, just print + print(json.dumps(chunks, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/src/adsb/historical_process_chunk.py b/src/adsb/historical_process_chunk.py new file mode 100644 index 0000000..f5dbe1e --- /dev/null +++ b/src/adsb/historical_process_chunk.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 +"""Process a single date chunk for historical ADS-B data.""" + +import os +import sys +from datetime import datetime, timedelta +from pathlib import Path + +# Add parent directory to path for imports when run from repo root +sys.path.insert(0, str(Path(__file__).parent)) + + +def process_chunk(start_date: str, end_date: str, output_dir: Path) 
-> Path | None: + """Process a date range and output compressed CSV. + + Args: + start_date: Start date in YYYY-MM-DD format + end_date: End date in YYYY-MM-DD format + output_dir: Directory to write output CSV + + Returns: + Path to output CSV, or None if no data + """ + from compress_adsb_to_aircraft_data import ( + load_historical_for_day, + deduplicate_by_signature, + ) + import polars as pl + + start = datetime.strptime(start_date, "%Y-%m-%d") + end = datetime.strptime(end_date, "%Y-%m-%d") + + total_days = (end - start).days + 1 + print(f"Processing {total_days} days [{start_date}, {end_date}]") + + dfs: list[pl.DataFrame] = [] + current_date = start + + while current_date <= end: + day_str = current_date.strftime("%Y-%m-%d") + print(f" Loading {day_str}...") + + try: + df_compressed = load_historical_for_day(current_date) + if df_compressed.height > 0: + dfs.append(df_compressed) + total_rows = sum(df.height for df in dfs) + print(f" +{df_compressed.height} rows (total: {total_rows})") + except Exception as e: + print(f" Warning: Failed to load {day_str}: {e}") + + current_date += timedelta(days=1) + + if not dfs: + print("No data found for this chunk") + return None + + df_accumulated = pl.concat(dfs) + df_accumulated = deduplicate_by_signature(df_accumulated) + print(f"After dedup: {df_accumulated.height} rows") + + # Write output + output_dir.mkdir(parents=True, exist_ok=True) + output_path = output_dir / f"chunk_{start_date}_{end_date}.csv" + df_accumulated.write_csv(output_path) + print(f"Wrote {output_path}") + + return output_path + + +def main() -> None: + """Main entry point for GitHub Actions.""" + start_date = os.environ.get("CHUNK_START_DATE") + end_date = os.environ.get("CHUNK_END_DATE") + + if not start_date or not end_date: + print("ERROR: CHUNK_START_DATE and CHUNK_END_DATE must be set", file=sys.stderr) + sys.exit(1) + + # Output to repo root data/chunks (script runs from src/adsb) + repo_root = Path(__file__).parent.parent.parent + 
output_dir = repo_root / "data" / "chunks" + result = process_chunk(start_date, end_date, output_dir) + + if result is None: + print("No data produced for this chunk") + sys.exit(0) + + +if __name__ == "__main__": + main() From 953a3647dfca14721eb3c9e358d528299f356ef6 Mon Sep 17 00:00:00 2001 From: ggman12 Date: Wed, 11 Feb 2026 23:40:46 -0500 Subject: [PATCH 3/3] remove process historical-faa github workflow --- .github/workflows/process-historical-faa.yaml | 171 ------------------ src/get_historical_faa.py | 116 ------------ 2 files changed, 287 deletions(-) delete mode 100644 .github/workflows/process-historical-faa.yaml delete mode 100644 src/get_historical_faa.py diff --git a/.github/workflows/process-historical-faa.yaml b/.github/workflows/process-historical-faa.yaml deleted file mode 100644 index d015499..0000000 --- a/.github/workflows/process-historical-faa.yaml +++ /dev/null @@ -1,171 +0,0 @@ -name: Process Historical FAA Data - -on: - workflow_dispatch: # Manual trigger - -jobs: - generate-matrix: - runs-on: ubuntu-latest - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - steps: - - name: Generate date ranges - id: set-matrix - run: | - python3 << 'EOF' - import json - from datetime import datetime, timedelta - - start = datetime(2023, 8, 16) - end = datetime(2026, 1, 1) - - ranges = [] - current = start - - # Process in 4-day chunks - while current < end: - chunk_end = current + timedelta(days=4) - # Don't go past the end date - if chunk_end > end: - chunk_end = end - - ranges.append({ - "since": current.strftime("%Y-%m-%d"), - "until": chunk_end.strftime("%Y-%m-%d") - }) - - current = chunk_end - - print(f"::set-output name=matrix::{json.dumps(ranges)}") - EOF - - clone-faa-repo: - runs-on: ubuntu-latest - steps: - - name: Cache FAA repository - id: cache-faa-repo - uses: actions/cache@v4 - with: - path: data/scrape-faa-releasable-aircraft - key: faa-repo-v1 - - - name: Clone FAA repository - if: steps.cache-faa-repo.outputs.cache-hit != 
'true' - run: | - mkdir -p data - git clone https://github.com/simonw/scrape-faa-releasable-aircraft data/scrape-faa-releasable-aircraft - echo "Repository cloned successfully" - - process-chunk: - needs: [generate-matrix, clone-faa-repo] - runs-on: ubuntu-latest - strategy: - max-parallel: 5 # Process 5 chunks at a time - matrix: - range: ${{ fromJson(needs.generate-matrix.outputs.matrix) }} - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.12' - - - name: Restore FAA repository cache - uses: actions/cache/restore@v4 - with: - path: data/scrape-faa-releasable-aircraft - key: faa-repo-v1 - fail-on-cache-miss: true - - - name: Install dependencies - run: | - pip install -r requirements.txt - - - name: Process chunk ${{ matrix.range.since }} to ${{ matrix.range.until }} - run: | - python src/get_historical_faa.py "${{ matrix.range.since }}" "${{ matrix.range.until }}" - - - name: Upload CSV artifact - uses: actions/upload-artifact@v4 - with: - name: csv-${{ matrix.range.since }}-to-${{ matrix.range.until }} - path: data/faa_releasable_historical/*.csv - retention-days: 1 - - create-release: - needs: process-chunk - runs-on: ubuntu-latest - permissions: - contents: write - steps: - - name: Download all artifacts - uses: actions/download-artifact@v4 - with: - path: artifacts - - - name: Prepare release files - run: | - mkdir -p release-files - find artifacts -name "*.csv" -exec cp {} release-files/ \; - ls -lh release-files/ - - - name: Create Release - uses: softprops/action-gh-release@v1 - with: - tag_name: historical-faa-${{ github.run_number }} - name: Historical FAA Data Release ${{ github.run_number }} - body: | - Automated release of historical FAA aircraft data - Processing period: 2023-08-16 to 2026-01-01 - Generated: ${{ github.event.repository.updated_at }} - files: release-files/*.csv - draft: false - prerelease: false - - concatenate-and-release: - 
needs: process-chunk - runs-on: ubuntu-latest - permissions: - contents: write - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.12' - - - name: Install dependencies - run: | - pip install -r requirements.txt - - - name: Download all artifacts - uses: actions/download-artifact@v4 - with: - path: artifacts - - - name: Prepare CSVs for concatenation - run: | - mkdir -p data/faa_releasable_historical - find artifacts -name "*.csv" -exec cp {} data/faa_releasable_historical/ \; - ls -lh data/faa_releasable_historical/ - - - name: Concatenate all CSVs - run: | - python scripts/concat_csvs.py - - - name: Create Combined Release - uses: softprops/action-gh-release@v1 - with: - tag_name: historical-faa-combined-${{ github.run_number }} - name: Historical FAA Data Combined Release ${{ github.run_number }} - body: | - Combined historical FAA aircraft data (all chunks concatenated) - Processing period: 2023-08-16 to 2026-01-01 - Generated: ${{ github.event.repository.updated_at }} - files: data/planequery_aircraft/*.csv - draft: false - prerelease: false \ No newline at end of file diff --git a/src/get_historical_faa.py b/src/get_historical_faa.py deleted file mode 100644 index 656345e..0000000 --- a/src/get_historical_faa.py +++ /dev/null @@ -1,116 +0,0 @@ -""" -For each commit-day in Feb 2024 (last commit per day): -- Write ALL FAA text files from that commit into: data/faa_releasable_historical/YYYY-MM-DD/ - ACFTREF.txt, DEALER.txt, DOCINDEX.txt, ENGINE.txt, RESERVED.txt -- Recombine MASTER-*.txt into Master.txt -- Produce Master.csv via convert_faa_master_txt_to_csv - -Assumes the non-master files are present in every commit. 
-""" -import subprocess, re -from pathlib import Path -import shutil -from collections import OrderedDict -from derive_from_faa_master_txt import convert_faa_master_txt_to_df, concat_faa_historical_df -import zipfile -import pandas as pd -import argparse -from datetime import datetime, timedelta - -# Parse command line arguments -parser = argparse.ArgumentParser(description="Process historical FAA data from git commits") -parser.add_argument("since", help="Start date (YYYY-MM-DD)") -parser.add_argument("until", help="End date (YYYY-MM-DD)") -args = parser.parse_args() - -# Clone repository if it doesn't exist -REPO = Path("data/scrape-faa-releasable-aircraft") -OUT_ROOT = Path("data/faa_releasable_historical") -OUT_ROOT.mkdir(parents=True, exist_ok=True) - -def run_git_text(*args: str) -> str: - return subprocess.check_output(["git", "-C", str(REPO), *args], text=True).strip() - -def run_git_bytes(*args: str) -> bytes: - return subprocess.check_output(["git", "-C", str(REPO), *args]) - -# Parse dates and adjust --since to the day before -since_date = datetime.strptime(args.since, "%Y-%m-%d") -adjusted_since = (since_date - timedelta(days=1)).strftime("%Y-%m-%d") - -# All commits in specified date range (oldest -> newest) -log = run_git_text( - "log", - "--reverse", - "--format=%H %cs", - f"--since={adjusted_since}", - f"--until={args.until}", -) -lines = [ln for ln in log.splitlines() if ln.strip()] -if not lines: - raise SystemExit(f"No commits found between {args.since} and {args.until}.") - -# date -> last SHA that day -date_to_sha = OrderedDict() -for ln in lines: - sha, date = ln.split() - date_to_sha[date] = sha - -OTHER_FILES = ["ACFTREF.txt", "DEALER.txt", "DOCINDEX.txt", "ENGINE.txt", "RESERVED.txt"] -master_re = re.compile(r"^MASTER-(\d+)\.txt$") -df_base = pd.DataFrame() -start_date = None -end_date = None -for date, sha in date_to_sha.items(): - if start_date is None: - start_date = date - end_date = date - day_dir = OUT_ROOT / date - 
day_dir.mkdir(parents=True, exist_ok=True) - - # Write auxiliary files (assumed present) - for fname in OTHER_FILES: - (day_dir / fname).write_bytes(run_git_bytes("show", f"{sha}:{fname}")) - - # Recombine MASTER parts - names = run_git_text("ls-tree", "--name-only", sha).splitlines() - parts = [] - for n in names: - m = master_re.match(n) - if m: - parts.append((int(m.group(1)), n)) - parts.sort() - if not parts: - raise RuntimeError(f"{date} {sha[:7]}: no MASTER-*.txt parts found") - - master_path = day_dir / "MASTER.txt" - with master_path.open("wb") as w: - for _, fname in parts: - data = run_git_bytes("show", f"{sha}:{fname}") - w.write(data) - if data and not data.endswith(b"\n"): - w.write(b"\n") - - # 3) Zip the day's files - zip_path = day_dir / f"ReleasableAircraft.zip" - with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as z: - for p in day_dir.iterdir(): - z.write(p, arcname=p.name) - - print(f"{date} {sha[:7]} -> {day_dir} (master parts: {len(parts)})") - # 4) Convert ZIP -> CSV - df_new = convert_faa_master_txt_to_df(zip_path, date) - if df_base.empty: - df_base = df_new - print(len(df_base), "total entries so far") - # Delete all files in the day directory - shutil.rmtree(day_dir) - continue - - df_base = concat_faa_historical_df(df_base, df_new) - shutil.rmtree(day_dir) - print(len(df_base), "total entries so far") - -assert df_base['download_date'].is_monotonic_increasing, "download_date is not monotonic increasing" -df_base.to_csv(OUT_ROOT / f"planequery_aircraft_faa_{start_date}_{end_date}.csv", index=False) -# TODO: get average number of new rows per day.