Daily ADSB and Historical updates. Update readme.md

ggman12
2026-02-13 11:49:18 -05:00
parent 4015a5fcf1
commit d216ea9329
32 changed files with 1489 additions and 1744 deletions
+49
@@ -0,0 +1,49 @@
#!/usr/bin/env python3
import gzip
import re
from pathlib import Path

import polars as pl

# Find all CSV.gz files in the downloaded artifacts
artifacts_dir = Path("downloads/adsb_artifacts")
files = sorted(artifacts_dir.glob("*/openairframes_adsb_*.csv.gz"))
if not files:
    raise SystemExit("No CSV.gz files found in downloads/adsb_artifacts/")
print(f"Found {len(files)} files to concatenate")


# Extract dates from filenames to determine range
def extract_dates(path: Path) -> tuple[str, str] | tuple[None, None]:
    """Extract start and end dates from filename"""
    m = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})\.csv\.gz", path.name)
    if m:
        return m.group(1), m.group(2)
    return None, None


# Collect all dates
all_dates = []
for f in files:
    start, end = extract_dates(f)
    if start and end:
        all_dates.extend([start, end])
        print(f" {f.name}: {start} to {end}")

if not all_dates:
    raise SystemExit("Could not extract dates from filenames")

# Find earliest and latest dates
earliest = min(all_dates)
latest = max(all_dates)
print(f"\nDate range: {earliest} to {latest}")

# Read and concatenate all files (polars decompresses .gz CSVs on read)
print("\nReading and concatenating files...")
frames = [pl.read_csv(f) for f in files]
df = pl.concat(frames, how="vertical", rechunk=True)

# Write output; DataFrame.write_csv has no compression option, so write
# through a gzip file object instead
output_path = Path("downloads") / f"openairframes_adsb_{earliest}_{latest}.csv.gz"
output_path.parent.mkdir(parents=True, exist_ok=True)
with gzip.open(output_path, "wb") as out:
    df.write_csv(out)
print(f"\nWrote {output_path} with {df.height:,} rows")
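As a quick standalone sanity check of the filename parsing above (the names below are made up but follow the expected pattern), note that ISO-8601 date strings compare correctly as plain strings, which is why min()/max() over the collected dates yields the true range:

import re

# Hypothetical artifact filenames matching the expected pattern
names = [
    "openairframes_adsb_2026-01-01_2026-01-31.csv.gz",
    "openairframes_adsb_2026-02-01_2026-02-13.csv.gz",
]
pattern = r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})\.csv\.gz"
dates = [d for n in names for d in re.search(pattern, n).groups()]
# ISO-8601 strings sort lexicographically, so string min/max give the true range
assert min(dates) == "2026-01-01"
assert max(dates) == "2026-02-13"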
+34
@@ -0,0 +1,34 @@
#!/bin/bash

# Create download directory
mkdir -p downloads/adsb_artifacts

# Repository from the workflow comment
REPO="ggman12/OpenAirframes"

# Get last 15 runs of the workflow and download matching artifacts
gh run list \
  --repo "$REPO" \
  --workflow adsb-to-aircraft-multiple-day-run.yaml \
  --limit 15 \
  --json databaseId \
  --jq '.[].databaseId' | while read -r run_id; do
  echo "Checking run ID: $run_id"

  # List artifacts for this run using the API
  # Match pattern: openairframes_adsb-YYYY-MM-DD-YYYY-MM-DD (with second date)
  gh api \
    --paginate \
    "repos/$REPO/actions/runs/$run_id/artifacts" \
    --jq '.artifacts[] | select(.name | test("^openairframes_adsb-[0-9]{4}-[0-9]{2}-[0-9]{2}-[0-9]{4}-[0-9]{2}-[0-9]{2}$")) | .name' | while read -r artifact_name; do
    echo "  Downloading: $artifact_name"
    gh run download "$run_id" \
      --repo "$REPO" \
      --name "$artifact_name" \
      --dir "downloads/adsb_artifacts/$artifact_name"
  done
done

echo "Download complete! Files saved to downloads/adsb_artifacts/"
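For reference, the jq test() filter above only keeps artifact names carrying both dates; a rough Python equivalent of the same check (artifact names here are illustrative):

import re

# Same pattern as the jq filter: openairframes_adsb-YYYY-MM-DD-YYYY-MM-DD
artifact_re = re.compile(r"^openairframes_adsb-\d{4}-\d{2}-\d{2}-\d{4}-\d{2}-\d{2}$")
assert artifact_re.match("openairframes_adsb-2026-01-01-2026-01-31")
# Names without the second date are skipped
assert not artifact_re.match("openairframes_adsb-2026-01-01")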
scripts/download_and_concat_runs.py
+182
@@ -0,0 +1,182 @@
#!/usr/bin/env python3
"""
Download and concatenate artifacts from a specific set of workflow runs.

Usage:
    python scripts/download_and_concat_runs.py triggered_runs_20260216_123456.json
"""
import argparse
import gzip
import json
import os
import subprocess
import sys
from datetime import datetime, timedelta


def download_run_artifact(run_id, repo, output_dir):
    """Download the artifact from a specific workflow run."""
    print(f" Downloading artifacts from run {run_id}...")
    cmd = [
        'gh', 'run', 'download', str(run_id),
        '--repo', repo,
        '--pattern', 'openairframes_adsb-*',
        '--dir', output_dir
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode == 0:
        print(" ✓ Downloaded")
        return True
    if "no artifacts" in result.stderr.lower():
        print(" ⚠ No artifacts found (workflow may still be running)")
    else:
        print(f" ✗ Failed: {result.stderr}")
    return False


def find_csv_files(download_dir):
    """Find all CSV.gz files in the download directory."""
    csv_files = []
    for root, dirs, files in os.walk(download_dir):
        for file in files:
            if file.endswith('.csv.gz'):
                csv_files.append(os.path.join(root, file))
    return sorted(csv_files)


def concatenate_csv_files(csv_files, output_file):
    """Concatenate CSV files in order, keeping only the first file's header."""
    print(f"\nConcatenating {len(csv_files)} CSV files...")
    with gzip.open(output_file, 'wt') as outf:
        header_written = False
        for i, csv_file in enumerate(csv_files, 1):
            print(f" [{i}/{len(csv_files)}] Processing {os.path.basename(csv_file)}")
            with gzip.open(csv_file, 'rt') as inf:
                for line_no, line in enumerate(inf):
                    # Skip the header line of every file after the first
                    if line_no == 0 and header_written:
                        continue
                    outf.write(line)
            header_written = True
    print(f"\n✓ Concatenated CSV saved to: {output_file}")
    # Show file size
    size_mb = os.path.getsize(output_file) / (1024 * 1024)
    print(f" Size: {size_mb:.1f} MB")


def main():
    parser = argparse.ArgumentParser(
        description='Download and concatenate artifacts from workflow runs'
    )
    parser.add_argument(
        'runs_file',
        help='JSON file containing run IDs (from run_historical_adsb_action.py)'
    )
    parser.add_argument(
        '--output-dir',
        default='./downloads/historical_concat',
        help='Directory for downloads (default: ./downloads/historical_concat)'
    )
    parser.add_argument(
        '--wait',
        action='store_true',
        help='Wait for workflows to complete before downloading'
    )
    args = parser.parse_args()

    # Load run IDs
    if not os.path.exists(args.runs_file):
        print(f"Error: File not found: {args.runs_file}")
        sys.exit(1)
    with open(args.runs_file, 'r') as f:
        data = json.load(f)
    runs = data['runs']
    start_date = data['start_date']
    end_date = data['end_date']
    # The runs file records the repo the workflows were triggered in
    repo = data.get('repo', 'ggman12/OpenAirframes')

    print("=" * 60)
    print("Download and Concatenate Historical Artifacts")
    print("=" * 60)
    print(f"Date range: {start_date} to {end_date}")
    print(f"Workflow runs: {len(runs)}")
    print(f"Output directory: {args.output_dir}")
    print("=" * 60)

    # Create output directory
    os.makedirs(args.output_dir, exist_ok=True)

    # Wait for workflows to complete if requested
    if args.wait:
        print("\nWaiting for workflows to complete...")
        for run_info in runs:
            run_id = run_info['run_id']
            print(f" Checking run {run_id}...")
            subprocess.run(['gh', 'run', 'watch', str(run_id), '--repo', repo])

    # Download artifacts
    print("\nDownloading artifacts...")
    successful_downloads = 0
    for i, run_info in enumerate(runs, 1):
        run_id = run_info['run_id']
        print(f"\n[{i}/{len(runs)}] Run {run_id} ({run_info['start']} to {run_info['end']})")
        if download_run_artifact(run_id, repo, args.output_dir):
            successful_downloads += 1

    print(f"\n\nDownload Summary: {successful_downloads}/{len(runs)} artifacts downloaded")
    if successful_downloads == 0:
        print("\nNo artifacts downloaded. Workflows may still be running.")
        print("Use --wait to wait for completion, or try again later.")
        sys.exit(1)

    # Find all CSV files
    csv_files = find_csv_files(args.output_dir)
    if not csv_files:
        print("\nError: No CSV files found in download directory")
        sys.exit(1)
    print(f"\nFound {len(csv_files)} CSV file(s):")
    for csv_file in csv_files:
        print(f" - {os.path.basename(csv_file)}")

    # Concatenate.
    # Calculate the actual end date for the filename (end_date - 1 day, since
    # end_date is exclusive)
    end_dt = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=1)
    actual_end = end_dt.strftime('%Y-%m-%d')
    output_file = os.path.join(
        args.output_dir,
        f"openairframes_adsb_{start_date}_{actual_end}.csv.gz"
    )
    concatenate_csv_files(csv_files, output_file)

    print("\n" + "=" * 60)
    print("Done!")
    print("=" * 60)


if __name__ == '__main__':
    main()
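For context, this is the shape of the triggered_runs_*.json file the script consumes, as written by run_historical_adsb_action.py (run IDs and dates here are illustrative):

example_runs_file = {
    "start_date": "2025-01-01",
    "end_date": "2025-03-01",  # exclusive, per the trigger script
    "repo": "ggman12/OpenAirframes",
    "branch": "main",
    "runs": [
        {"run_id": "12345678901", "start": "2025-01-01", "end": "2025-02-01"},
        {"run_id": "12345678902", "start": "2025-02-01", "end": "2025-03-01"},
    ],
}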
scripts/run_historical_adsb_action.py
+215
@@ -0,0 +1,215 @@
#!/usr/bin/env python3
"""
Script to trigger adsb-to-aircraft-multiple-day-run workflow runs in monthly chunks.

Usage:
    python scripts/run_historical_adsb_action.py --start-date 2025-01-01 --end-date 2025-06-01
"""
import argparse
import json
import subprocess
import sys
import time
from calendar import monthrange
from datetime import datetime, timedelta


def generate_monthly_chunks(start_date_str, end_date_str):
    """Generate date ranges in monthly chunks from start to end date.

    End dates are exclusive (e.g., to process Jan 1-31, end_date should be Feb 1).
    """
    start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
    end_date = datetime.strptime(end_date_str, '%Y-%m-%d')
    chunks = []
    current = start_date
    while current < end_date:
        # Get the first day of the next month (exclusive end)
        _, days_in_month = monthrange(current.year, current.month)
        month_end = current.replace(day=days_in_month)
        next_month_start = month_end + timedelta(days=1)
        # Don't go past the global end date
        chunk_end = min(next_month_start, end_date)
        chunks.append({
            'start': current.strftime('%Y-%m-%d'),
            'end': chunk_end.strftime('%Y-%m-%d')
        })
        # Move to the first day of the next month
        if next_month_start >= end_date:
            break
        current = next_month_start
    return chunks


def trigger_workflow(start_date, end_date, repo='ggman12/OpenAirframes', branch='main', dry_run=False):
    """Trigger the adsb-to-aircraft-multiple-day-run workflow via the GitHub CLI."""
    cmd = [
        'gh', 'workflow', 'run', 'adsb-to-aircraft-multiple-day-run.yaml',
        '--repo', repo,
        '--ref', branch,
        '-f', f'start_date={start_date}',
        '-f', f'end_date={end_date}'
    ]
    if dry_run:
        print(f"[DRY RUN] Would run: {' '.join(cmd)}")
        return True, None
    print(f"Triggering workflow: {start_date} to {end_date} (on {branch})")
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        print(f"✗ Failed to trigger workflow for {start_date} to {end_date}")
        print(f"Error: {result.stderr}")
        return False, None
    print(f"✓ Successfully triggered workflow for {start_date} to {end_date}")
    # `gh workflow run` does not report the new run ID, so wait a moment for it
    # to appear, then grab the most recent run (which should be the one we just
    # triggered).
    time.sleep(2)
    list_cmd = [
        'gh', 'run', 'list',
        '--repo', repo,
        '--workflow', 'adsb-to-aircraft-multiple-day-run.yaml',
        '--branch', branch,
        '--limit', '1',
        '--json', 'databaseId',
        '--jq', '.[0].databaseId'
    ]
    list_result = subprocess.run(list_cmd, capture_output=True, text=True)
    run_id = list_result.stdout.strip() if list_result.returncode == 0 else None
    return True, run_id


def main():
    parser = argparse.ArgumentParser(
        description='Trigger adsb-to-aircraft-multiple-day-run workflow runs in monthly chunks'
    )
    parser.add_argument(
        '--start-date', '--start_date',
        dest='start_date',
        required=True,
        help='Start date in YYYY-MM-DD format (inclusive)'
    )
    parser.add_argument(
        '--end-date', '--end_date',
        dest='end_date',
        required=True,
        help='End date in YYYY-MM-DD format (exclusive)'
    )
    parser.add_argument(
        '--repo',
        type=str,
        default='ggman12/OpenAirframes',
        help='GitHub repository (default: ggman12/OpenAirframes)'
    )
    parser.add_argument(
        '--branch',
        type=str,
        default='main',
        help='Branch to run the workflow on (default: main)'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Print commands without executing them'
    )
    parser.add_argument(
        '--delay',
        type=int,
        default=5,
        help='Delay in seconds between workflow triggers (default: 5)'
    )
    args = parser.parse_args()

    # Validate dates (end_date is exclusive, so it must be strictly after start_date)
    try:
        start = datetime.strptime(args.start_date, '%Y-%m-%d')
        end = datetime.strptime(args.end_date, '%Y-%m-%d')
        if start >= end:
            print("Error: start_date must be before end_date (end_date is exclusive)")
            sys.exit(1)
    except ValueError as e:
        print(f"Error: Invalid date format - {e}")
        sys.exit(1)

    # Generate monthly chunks
    chunks = generate_monthly_chunks(args.start_date, args.end_date)
    print(f"\nGenerating {len(chunks)} monthly workflow runs on branch '{args.branch}' (repo: {args.repo}):")
    for i, chunk in enumerate(chunks, 1):
        print(f" {i}. {chunk['start']} to {chunk['end']}")

    if not args.dry_run:
        response = input(f"\nProceed with triggering {len(chunks)} workflows on '{args.branch}'? [y/N]: ")
        if response.lower() != 'y':
            print("Cancelled.")
            sys.exit(0)
        print()

    # Trigger workflows
    success_count = 0
    triggered_runs = []
    for i, chunk in enumerate(chunks, 1):
        print(f"\n[{i}/{len(chunks)}] ", end='')
        success, run_id = trigger_workflow(
            chunk['start'],
            chunk['end'],
            repo=args.repo,
            branch=args.branch,
            dry_run=args.dry_run
        )
        if success:
            success_count += 1
            if run_id:
                triggered_runs.append({
                    'run_id': run_id,
                    'start': chunk['start'],
                    'end': chunk['end']
                })
        # Add delay between triggers (except after the last one)
        if i < len(chunks) and not args.dry_run:
            time.sleep(args.delay)

    print(f"\n\nSummary: {success_count}/{len(chunks)} workflows triggered successfully")

    # Save triggered run IDs to a file
    if triggered_runs and not args.dry_run:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        runs_file = f"./triggered_runs_{timestamp}.json"
        with open(runs_file, 'w') as f:
            json.dump({
                'start_date': args.start_date,
                'end_date': args.end_date,
                'repo': args.repo,
                'branch': args.branch,
                'runs': triggered_runs
            }, f, indent=2)
        print(f"\nRun IDs saved to: {runs_file}")
        print("\nTo download and concatenate these artifacts, run:")
        print(f" python scripts/download_and_concat_runs.py {runs_file}")

    if success_count < len(chunks):
        sys.exit(1)


if __name__ == '__main__':
    main()
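To illustrate the chunking, here is what generate_monthly_chunks returns for a mid-month start, traced by hand from the function above (the import path is hypothetical and assumes the script is importable):

from run_historical_adsb_action import generate_monthly_chunks  # hypothetical import

chunks = generate_monthly_chunks("2025-01-15", "2025-03-10")
# Each chunk ends on the first day of the next month (exclusive), and the
# final chunk is clipped to the requested end date:
# [{'start': '2025-01-15', 'end': '2025-02-01'},
#  {'start': '2025-02-01', 'end': '2025-03-01'},
#  {'start': '2025-03-01', 'end': '2025-03-10'}]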
scripts/run_main_isolated.py
+82
@@ -0,0 +1,82 @@
#!/usr/bin/env python3
"""
Run src.adsb.main from an isolated snapshot of src/ so edits in the main
working tree won't affect subprocess imports during the run.

Usage:
    python scripts/run_main_isolated.py 2026-01-01
    python scripts/run_main_isolated.py --start_date 2026-01-01 --end_date 2026-01-03
"""
import argparse
import shutil
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path


def run(
    cmd: list[str],
    *,
    cwd: Path | None = None,
    check: bool = True,
) -> subprocess.CompletedProcess:
    print(f"\n>>> {' '.join(cmd)}")
    return subprocess.run(cmd, cwd=cwd, check=check)


def main() -> int:
    parser = argparse.ArgumentParser(description="Run src.adsb.main from an isolated snapshot of src/")
    parser.add_argument("date", nargs="?", help="Single date to process (YYYY-MM-DD)")
    parser.add_argument("--start_date", help="Start date (inclusive, YYYY-MM-DD)")
    parser.add_argument("--end_date", help="End date (exclusive, YYYY-MM-DD)")
    parser.add_argument("--concat_with_latest_csv", action="store_true",
                        help="Also concatenate with latest CSV from GitHub releases")
    args = parser.parse_args()

    if args.date and (args.start_date or args.end_date):
        raise SystemExit("Use a single date or --start_date/--end_date, not both.")
    if args.date:
        datetime.strptime(args.date, "%Y-%m-%d")  # validate format
        main_args = ["--date", args.date]
    else:
        if not args.start_date or not args.end_date:
            raise SystemExit("Provide --start_date and --end_date, or a single date.")
        datetime.strptime(args.start_date, "%Y-%m-%d")
        datetime.strptime(args.end_date, "%Y-%m-%d")
        main_args = ["--start_date", args.start_date, "--end_date", args.end_date]
    if args.concat_with_latest_csv:
        main_args.append("--concat_with_latest_csv")

    repo_root = Path(__file__).resolve().parents[1]
    snapshots_root = repo_root / ".snapshots"
    snapshots_root.mkdir(exist_ok=True)
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    snapshot_root = snapshots_root / f"run_{timestamp}"
    snapshot_src = snapshot_root / "src"

    exit_code = 0
    try:
        # Copy src/ so the subprocess imports the snapshot, not the live tree
        shutil.copytree(repo_root / "src", snapshot_src)
        runner = (
            "import sys, runpy; "
            f"sys.path.insert(0, {repr(str(snapshot_root))}); "
            f"sys.argv = ['src.adsb.main'] + {main_args!r}; "
            "runpy.run_module('src.adsb.main', run_name='__main__')"
        )
        cmd = [sys.executable, "-c", runner]
        run(cmd, cwd=repo_root)
    except subprocess.CalledProcessError as exc:
        exit_code = exc.returncode
    finally:
        shutil.rmtree(snapshot_root, ignore_errors=True)
    return exit_code


if __name__ == "__main__":
    raise SystemExit(main())
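For a single-date invocation such as `python scripts/run_main_isolated.py 2026-01-01`, the generated -c runner is equivalent to the following (the snapshot path shown is illustrative):

import runpy
import sys

# The snapshot directory is prepended to sys.path so `src.adsb.main` resolves
# to the copied tree rather than the live working tree
sys.path.insert(0, "/path/to/repo/.snapshots/run_20260101_120000")
sys.argv = ["src.adsb.main", "--date", "2026-01-01"]
runpy.run_module("src.adsb.main", run_name="__main__")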