FIX: trigger for planequery-aircraft daily release workflow. Update contributions issue template.

2026-04-23 11:36:35 +02:00 · 2026-02-11 23:28:50 -05:00
parent 59c2aab5c7
commit b349c01d31
9 changed files with 529 additions and 398 deletions
@@ -43,7 +43,7 @@ body:
    id: contributor_name
    attributes:
      label: Contributor Name
-      description: Your display name for attribution. Leave blank to use your GitHub username. Max 150 characters.
+      description: Your display name for attribution. Leave blank for no attribution. Max 150 characters.
      placeholder: "e.g., JamesBerry.com or leave blank"
    validations:
      required: false
@@ -58,28 +58,6 @@ body:
    validations:
      required: true

-  - type: dropdown
-    id: submission_type
-    attributes:
-      label: What did you submit?
-      options:
-        - Single object
-        - Multiple objects (array)
-    validations:
-      required: true
-
-  - type: checkboxes
-    id: confirmations
-    attributes:
-      label: Confirmations
-      options:
-        - label: "I confirm this is valid JSON (not JSONL) and matches the field names exactly."
-          required: true
-        - label: "I confirm `transponder_code_hex` values (if provided) are 6 hex characters."
-          required: true
-        - label: "I understand submissions are reviewed and may be rejected or require changes."
-          required: true
-
  - type: textarea
    id: notes
    attributes:
@@ -0,0 +1,209 @@
+name: Historical ADS-B Processing
+
+on:
+  workflow_dispatch:
+    inputs:
+      start_date:
+        description: 'Start date (YYYY-MM-DD, inclusive)'
+        required: true
+        type: string
+      end_date:
+        description: 'End date (YYYY-MM-DD, exclusive)'
+        required: true
+        type: string
+      chunk_days:
+        description: 'Days per job chunk (default: 7)'
+        required: false
+        type: number
+        default: 7
+
+jobs:
+  generate-matrix:
+    runs-on: ubuntu-latest
+    outputs:
+      chunks: ${{ steps.generate.outputs.chunks }}
+      global_start: ${{ inputs.start_date }}
+      global_end: ${{ inputs.end_date }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      - name: Generate date chunks
+        id: generate
+        env:
+          INPUT_START_DATE: ${{ inputs.start_date }}
+          INPUT_END_DATE: ${{ inputs.end_date }}
+          INPUT_CHUNK_DAYS: ${{ inputs.chunk_days }}
+        run: python src/adsb/historical_generate_matrix.py
+
+  adsb-extract:
+    needs: generate-matrix
+    runs-on: ubuntu-24.04-arm
+    strategy:
+      matrix:
+        chunk: ${{ fromJson(needs.generate-matrix.outputs.chunks) }}
+      max-parallel: 3
+      fail-fast: false
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Free disk space
+        run: |
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /usr/local/share/boost
+          df -h
+
+      - name: Download and extract ADS-B data
+        env:
+          START_DATE: ${{ matrix.chunk.start_date }}
+          END_DATE: ${{ matrix.chunk.end_date }}
+        run: |
+          python -m src.adsb.download_and_list_icaos --start-date "$START_DATE" --end-date "$END_DATE"
+          ls -lah data/output/
+
+      - name: Create tar of extracted data
+        run: |
+          cd data/output
+          tar -cf extracted_data.tar *-planes-readsb-prod-0.tar_0 icao_manifest_*.txt 2>/dev/null || echo "Some files may not exist"
+          ls -lah extracted_data.tar || echo "No tar created"
+
+      - name: Upload extracted data
+        uses: actions/upload-artifact@v4
+        with:
+          name: adsb-extracted-${{ matrix.chunk.start_date }}-${{ matrix.chunk.end_date }}
+          path: data/output/extracted_data.tar
+          retention-days: 1
+          compression-level: 0
+          if-no-files-found: warn
+
+  adsb-map:
+    needs: [generate-matrix, adsb-extract]
+    runs-on: ubuntu-24.04-arm
+    strategy:
+      fail-fast: false
+      matrix:
+        chunk: ${{ fromJson(needs.generate-matrix.outputs.chunks) }}
+        icao_chunk: [0, 1, 2, 3]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Free disk space
+        run: |
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /usr/local/share/boost
+          df -h
+
+      - name: Download extracted data
+        uses: actions/download-artifact@v4
+        with:
+          name: adsb-extracted-${{ matrix.chunk.start_date }}-${{ matrix.chunk.end_date }}
+          path: data/output/
+        continue-on-error: true
+
+      - name: Extract tar
+        id: extract
+        run: |
+          cd data/output
+          if [ -f extracted_data.tar ]; then
+            tar -xf extracted_data.tar
+            rm extracted_data.tar
+            echo "has_data=true" >> "$GITHUB_OUTPUT"
+            echo "=== Contents of data/output ==="
+            ls -lah
+          else
+            echo "No extracted_data.tar found"
+            echo "has_data=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Process ICAO chunk
+        if: steps.extract.outputs.has_data == 'true'
+        env:
+          START_DATE: ${{ matrix.chunk.start_date }}
+          END_DATE: ${{ matrix.chunk.end_date }}
+        run: |
+          python -m src.adsb.process_icao_chunk --chunk-id ${{ matrix.icao_chunk }} --total-chunks 4 --start-date "$START_DATE" --end-date "$END_DATE"
+          ls -lah data/output/adsb_chunks/ || echo "No chunks created"
+
+      - name: Upload chunk artifacts
+        if: steps.extract.outputs.has_data == 'true'
+        uses: actions/upload-artifact@v4
+        with:
+          name: adsb-map-${{ matrix.chunk.start_date }}-${{ matrix.chunk.end_date }}-chunk-${{ matrix.icao_chunk }}
+          path: data/output/adsb_chunks/
+          retention-days: 1
+          if-no-files-found: ignore
+
+  adsb-reduce:
+    needs: [generate-matrix, adsb-map]
+    runs-on: ubuntu-24.04-arm
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Download all chunk artifacts
+        uses: actions/download-artifact@v4
+        with:
+          pattern: adsb-map-*
+          path: data/output/adsb_chunks/
+          merge-multiple: true
+
+      - name: Debug downloaded files
+        run: |
+          echo "=== Listing data/output/adsb_chunks/ ==="
+          find data/output/adsb_chunks/ -type f 2>/dev/null | head -50 || echo "No files found"
+          echo "=== Looking for parquet files ==="
+          find . -name "*.parquet" 2>/dev/null | head -20 || echo "No parquet files found"
+
+      - name: Combine chunks to CSV
+        env:
+          START_DATE: ${{ needs.generate-matrix.outputs.global_start }}
+          END_DATE: ${{ needs.generate-matrix.outputs.global_end }}
+        run: |
+          python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks --start-date "$START_DATE" --end-date "$END_DATE" --skip-base
+          ls -lah data/planequery_aircraft/
+
+      - name: Upload final artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: planequery_aircraft_adsb-${{ needs.generate-matrix.outputs.global_start }}-${{ needs.generate-matrix.outputs.global_end }}
+          path: data/planequery_aircraft/*.csv
+          retention-days: 30
@@ -8,6 +8,7 @@ on:

 permissions:
  contents: write
+  actions: write

 jobs:
  trigger-releases:
@@ -1,171 +0,0 @@
-name: Process Historical FAA Data
-
-on:
-  workflow_dispatch:  # Manual trigger
-
-jobs:
-  generate-matrix:
-    runs-on: ubuntu-latest
-    outputs:
-      matrix: ${{ steps.set-matrix.outputs.matrix }}
-    steps:
-      - name: Generate date ranges
-        id: set-matrix
-        run: |
-          python3 << 'EOF'
-          import json
-          from datetime import datetime, timedelta
-          
-          start = datetime(2023, 8, 16)
-          end = datetime(2026, 1, 1)
-          
-          ranges = []
-          current = start
-          
-          # Process in 4-day chunks
-          while current < end:
-            chunk_end = current + timedelta(days=4)
-            # Don't go past the end date
-            if chunk_end > end:
-              chunk_end = end
-            
-            ranges.append({
-              "since": current.strftime("%Y-%m-%d"),
-              "until": chunk_end.strftime("%Y-%m-%d")
-            })
-            
-            current = chunk_end
-          
-          print(f"::set-output name=matrix::{json.dumps(ranges)}")
-          EOF
-
-  clone-faa-repo:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Cache FAA repository
-        id: cache-faa-repo
-        uses: actions/cache@v4
-        with:
-          path: data/scrape-faa-releasable-aircraft
-          key: faa-repo-v1
-          
-      - name: Clone FAA repository
-        if: steps.cache-faa-repo.outputs.cache-hit != 'true'
-        run: |
-          mkdir -p data
-          git clone https://github.com/simonw/scrape-faa-releasable-aircraft data/scrape-faa-releasable-aircraft
-          echo "Repository cloned successfully"
-
-  process-chunk:
-    needs: [generate-matrix, clone-faa-repo]
-    runs-on: ubuntu-latest
-    strategy:
-      max-parallel: 5  # Process 5 chunks at a time
-      matrix:
-        range: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.12'
-      
-      - name: Restore FAA repository cache
-        uses: actions/cache/restore@v4
-        with:
-          path: data/scrape-faa-releasable-aircraft
-          key: faa-repo-v1
-          fail-on-cache-miss: true
-      
-      - name: Install dependencies
-        run: |
-          pip install -r requirements.txt
-      
-      - name: Process chunk ${{ matrix.range.since }} to ${{ matrix.range.until }}
-        run: |
-          python src/get_historical_faa.py "${{ matrix.range.since }}" "${{ matrix.range.until }}"
-      
-      - name: Upload CSV artifact
-        uses: actions/upload-artifact@v4
-        with:
-          name: csv-${{ matrix.range.since }}-to-${{ matrix.range.until }}
-          path: data/faa_releasable_historical/*.csv
-          retention-days: 1
-
-  create-release:
-    needs: process-chunk
-    runs-on: ubuntu-latest
-    permissions:
-      contents: write
-    steps:
-      - name: Download all artifacts
-        uses: actions/download-artifact@v4
-        with:
-          path: artifacts
-      
-      - name: Prepare release files
-        run: |
-          mkdir -p release-files
-          find artifacts -name "*.csv" -exec cp {} release-files/ \;
-          ls -lh release-files/
-      
-      - name: Create Release
-        uses: softprops/action-gh-release@v1
-        with:
-          tag_name: historical-faa-${{ github.run_number }}
-          name: Historical FAA Data Release ${{ github.run_number }}
-          body: |
-            Automated release of historical FAA aircraft data
-            Processing period: 2023-08-16 to 2026-01-01
-            Generated: ${{ github.event.repository.updated_at }}
-          files: release-files/*.csv
-          draft: false
-          prerelease: false
-
-  concatenate-and-release:
-    needs: process-chunk
-    runs-on: ubuntu-latest
-    permissions:
-      contents: write
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.12'
-      
-      - name: Install dependencies
-        run: |
-          pip install -r requirements.txt
-      
-      - name: Download all artifacts
-        uses: actions/download-artifact@v4
-        with:
-          path: artifacts
-      
-      - name: Prepare CSVs for concatenation
-        run: |
-          mkdir -p data/faa_releasable_historical
-          find artifacts -name "*.csv" -exec cp {} data/faa_releasable_historical/ \;
-          ls -lh data/faa_releasable_historical/
-      
-      - name: Concatenate all CSVs
-        run: |
-          python scripts/concat_csvs.py
-      
-      - name: Create Combined Release
-        uses: softprops/action-gh-release@v1
-        with:
-          tag_name: historical-faa-combined-${{ github.run_number }}
-          name: Historical FAA Data Combined Release ${{ github.run_number }}
-          body: |
-            Combined historical FAA aircraft data (all chunks concatenated)
-            Processing period: 2023-08-16 to 2026-01-01
-            Generated: ${{ github.event.repository.updated_at }}
-          files: data/planequery_aircraft/*.csv
-          draft: false
-          prerelease: false
@@ -2,10 +2,16 @@
 Combines chunk parquet files and compresses to final aircraft CSV.
 This is the reduce phase of the map-reduce pipeline.

+Supports both single-day (daily) and multi-day (historical) modes.
+
 Memory-efficient: processes each chunk separately, compresses, then combines.

 Usage:
+    # Daily mode
    python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks
+    
+    # Historical mode
+    python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks --start-date 2024-01-01 --end-date 2024-01-07 --skip-base
 """
 import gc
 import os
@@ -117,9 +123,9 @@ def download_and_merge_base_release(compressed_df: pl.DataFrame) -> pl.DataFrame
        return compressed_df


-def cleanup_chunks(date_str: str, chunks_dir: str):
+def cleanup_chunks(output_id: str, chunks_dir: str):
    """Delete chunk parquet files after successful merge."""
-    pattern = os.path.join(chunks_dir, f"chunk_*_{date_str}.parquet")
+    pattern = os.path.join(chunks_dir, f"chunk_*_{output_id}.parquet")
    chunk_files = glob.glob(pattern)
    for f in chunk_files:
        try:
@@ -129,32 +135,56 @@ def cleanup_chunks(date_str: str, chunks_dir: str):
            print(f"Failed to delete {f}: {e}")


+def find_chunk_files(chunks_dir: str, output_id: str) -> list[str]:
+    """Find chunk parquet files matching the output ID."""
+    pattern = os.path.join(chunks_dir, f"chunk_*_{output_id}.parquet")
+    chunk_files = sorted(glob.glob(pattern))
+    
+    if not chunk_files:
+        # Try recursive search for historical mode with merged artifacts
+        pattern = os.path.join(chunks_dir, "**", "*.parquet")
+        chunk_files = sorted(glob.glob(pattern, recursive=True))
+    
+    return chunk_files
+
+
 def main():
    parser = argparse.ArgumentParser(description="Combine chunk parquets to final CSV")
-    parser.add_argument("--date", type=str, help="Date in YYYY-MM-DD format (default: yesterday)")
+    parser.add_argument("--date", type=str, help="Single date in YYYY-MM-DD format (default: yesterday)")
+    parser.add_argument("--start-date", type=str, help="Start date for range (YYYY-MM-DD)")
+    parser.add_argument("--end-date", type=str, help="End date for range (YYYY-MM-DD)")
    parser.add_argument("--chunks-dir", type=str, default=DEFAULT_CHUNK_DIR, help="Directory containing chunk parquet files")
    parser.add_argument("--skip-base", action="store_true", help="Skip downloading and merging base release")
    parser.add_argument("--keep-chunks", action="store_true", help="Keep chunk files after merging")
    args = parser.parse_args()
    
-    if args.date:
-        target_day = datetime.strptime(args.date, "%Y-%m-%d")
+    # Determine output ID and filename based on mode
+    if args.start_date and args.end_date:
+        # Historical mode
+        output_id = f"{args.start_date}_{args.end_date}"
+        output_filename = f"planequery_aircraft_adsb_{args.start_date}_{args.end_date}.csv"
+        print(f"Combining chunks for date range: {args.start_date} to {args.end_date}")
    else:
-        target_day = get_target_day()
+        # Daily mode
+        if args.date:
+            target_day = datetime.strptime(args.date, "%Y-%m-%d")
+        else:
+            target_day = get_target_day()
+        
+        date_str = target_day.strftime("%Y-%m-%d")
+        output_id = date_str
+        output_filename = f"planequery_aircraft_adsb_{date_str}.csv"
+        print(f"Combining chunks for {date_str}")
    
-    date_str = target_day.strftime("%Y-%m-%d")
    chunks_dir = args.chunks_dir
-    
-    print(f"Combining chunks for {date_str}")
    print(f"Chunks directory: {chunks_dir}")
    print(f"Resource usage at start: {get_resource_usage()}")
    
    # Find chunk files
-    pattern = os.path.join(chunks_dir, f"chunk_*_{date_str}.parquet")
-    chunk_files = sorted(glob.glob(pattern))
+    chunk_files = find_chunk_files(chunks_dir, output_id)
    
    if not chunk_files:
-        print(f"No chunk files found matching: {pattern}")
+        print(f"No chunk files found in: {chunks_dir}")
        sys.exit(1)
    
    print(f"Found {len(chunk_files)} chunk files")
@@ -174,7 +204,7 @@ def main():
    gc.collect()
    print(f"After combining: {get_resource_usage()}")
    
-    # Merge with base release
+    # Merge with base release (unless skipped)
    if not args.skip_base:
        combined = download_and_merge_base_release(combined)
    
@@ -190,13 +220,13 @@ def main():
        combined = combined.sort('time')
    
    # Write final CSV
-    output_path = os.path.join(FINAL_OUTPUT_DIR, f"planequery_aircraft_adsb_{date_str}.csv")
+    output_path = os.path.join(FINAL_OUTPUT_DIR, output_filename)
    combined.write_csv(output_path)
    print(f"Wrote {len(combined)} records to {output_path}")
    
    # Cleanup
    if not args.keep_chunks:
-        cleanup_chunks(date_str, chunks_dir)
+        cleanup_chunks(output_id, chunks_dir)
    
    print(f"Done! | {get_resource_usage()}")

@@ -2,6 +2,8 @@
 Downloads and extracts adsb.lol tar files, then lists all ICAO folders.
 This is the first step of the map-reduce pipeline.

+Supports both single-day (daily) and multi-day (historical) modes.
+
 Outputs:
 - Extracted trace files in data/output/{version_date}-planes-readsb-prod-0.tar_0/
 - ICAO manifest at data/output/icao_manifest_{date}.txt
@@ -25,7 +27,6 @@ from src.adsb.download_adsb_data_to_parquet import (

 def get_target_day() -> datetime:
    """Get yesterday's date (the day we're processing)."""
-    # return datetime.utcnow() - timedelta(days=1)
    return datetime.utcnow() - timedelta(days=1)


@@ -99,49 +100,111 @@ def list_icao_folders(extract_dir: str) -> list[str]:
    return icaos


-def write_manifest(icaos: list[str], date_str: str) -> str:
-    """Write ICAO list to manifest file."""
-    manifest_path = os.path.join(OUTPUT_DIR, f"icao_manifest_{date_str}.txt")
+def write_manifest(icaos: list[str], manifest_id: str) -> str:
+    """Write ICAO list to manifest file.
+    
+    Args:
+        icaos: List of ICAO codes
+        manifest_id: Identifier for manifest file (date or date range)
+    """
+    manifest_path = os.path.join(OUTPUT_DIR, f"icao_manifest_{manifest_id}.txt")
    with open(manifest_path, "w") as f:
-        for icao in icaos:
+        for icao in sorted(icaos):
            f.write(f"{icao}\n")
    print(f"Wrote manifest with {len(icaos)} ICAOs to {manifest_path}")
    return manifest_path


-def main():
-    parser = argparse.ArgumentParser(description="Download and list ICAOs from adsb.lol data")
-    parser.add_argument("--date", type=str, help="Date in YYYY-MM-DD format (default: yesterday)")
-    args = parser.parse_args()
-    
-    if args.date:
-        target_day = datetime.strptime(args.date, "%Y-%m-%d")
-    else:
-        target_day = get_target_day()
+def process_single_day(target_day: datetime) -> tuple[str | None, list[str]]:
+    """Process a single day: download, extract, list ICAOs.
    
+    Returns:
+        Tuple of (extract_dir, icaos)
+    """
    date_str = target_day.strftime("%Y-%m-%d")
    version_date = f"v{target_day.strftime('%Y.%m.%d')}"
    
    print(f"Processing date: {date_str} (version: {version_date})")
    
-    # Download and extract
    extract_dir = download_and_extract(version_date)
    if not extract_dir:
-        print("Failed to download/extract data")
-        sys.exit(1)
+        print(f"Failed to download/extract data for {date_str}")
+        return None, []
    
-    # List ICAOs
    icaos = list_icao_folders(extract_dir)
-    if not icaos:
-        print("No ICAOs found")
-        sys.exit(1)
+    print(f"Found {len(icaos)} ICAOs for {date_str}")
    
-    # Write manifest
-    manifest_path = write_manifest(icaos, date_str)
+    return extract_dir, icaos
+
+
+def process_date_range(start_date: datetime, end_date: datetime) -> set[str]:
+    """Process multiple days: download, extract, combine ICAO lists.
    
-    print(f"\nDone! Extract dir: {extract_dir}")
-    print(f"Manifest: {manifest_path}")
-    print(f"Total ICAOs: {len(icaos)}")
+    Args:
+        start_date: Start date (inclusive)
+        end_date: End date (inclusive)
+    
+    Returns:
+        Combined set of all ICAOs across the date range
+    """
+    all_icaos: set[str] = set()
+    current = start_date
+    
+    # Both start and end are inclusive
+    while current <= end_date:
+        _, icaos = process_single_day(current)
+        all_icaos.update(icaos)
+        current += timedelta(days=1)
+    
+    return all_icaos
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Download and list ICAOs from adsb.lol data")
+    parser.add_argument("--date", type=str, help="Single date in YYYY-MM-DD format (default: yesterday)")
+    parser.add_argument("--start-date", type=str, help="Start date for range (YYYY-MM-DD)")
+    parser.add_argument("--end-date", type=str, help="End date for range (YYYY-MM-DD)")
+    args = parser.parse_args()
+    
+    # Determine mode: single day or date range
+    if args.start_date and args.end_date:
+        # Historical mode: process date range
+        start_date = datetime.strptime(args.start_date, "%Y-%m-%d")
+        end_date = datetime.strptime(args.end_date, "%Y-%m-%d")
+        
+        print(f"Processing date range: {args.start_date} to {args.end_date}")
+        
+        all_icaos = process_date_range(start_date, end_date)
+        
+        if not all_icaos:
+            print("No ICAOs found in date range")
+            sys.exit(1)
+        
+        # Write combined manifest with range identifier
+        manifest_id = f"{args.start_date}_{args.end_date}"
+        write_manifest(list(all_icaos), manifest_id)
+        
+        print(f"\nDone! Total ICAOs: {len(all_icaos)}")
+        
+    else:
+        # Daily mode: single day
+        if args.date:
+            target_day = datetime.strptime(args.date, "%Y-%m-%d")
+        else:
+            target_day = get_target_day()
+        
+        date_str = target_day.strftime("%Y-%m-%d")
+        
+        extract_dir, icaos = process_single_day(target_day)
+        
+        if not icaos:
+            print("No ICAOs found")
+            sys.exit(1)
+        
+        write_manifest(icaos, date_str)
+        
+        print(f"\nDone! Extract dir: {extract_dir}")
+        print(f"Total ICAOs: {len(icaos)}")


 if __name__ == "__main__":
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+"""Generate date chunk matrix for historical ADS-B processing."""
+
+import json
+import os
+import sys
+from datetime import datetime, timedelta
+
+
+def generate_chunks(start_date: str, end_date: str, chunk_days: int) -> list[dict]:
+    """Generate date chunks for parallel processing.
+    
+    Args:
+        start_date: Start date in YYYY-MM-DD format (inclusive)
+        end_date: End date in YYYY-MM-DD format (exclusive)
+        chunk_days: Number of days per chunk
+        
+    Returns:
+        List of chunk dictionaries with start_date and end_date (both inclusive within chunk)
+    """
+    start = datetime.strptime(start_date, "%Y-%m-%d")
+    end = datetime.strptime(end_date, "%Y-%m-%d")
+    
+    chunks = []
+    current = start
+    
+    # end_date is exclusive, so we process up to but not including it
+    while current < end:
+        # chunk_end is inclusive, so subtract 1 from the next chunk start
+        chunk_end = min(current + timedelta(days=chunk_days - 1), end - timedelta(days=1))
+        chunks.append({
+            "start_date": current.strftime("%Y-%m-%d"),
+            "end_date": chunk_end.strftime("%Y-%m-%d"),
+        })
+        current = chunk_end + timedelta(days=1)
+    
+    return chunks
+
+
+def main() -> None:
+    """Main entry point for GitHub Actions."""
+    start_date = os.environ.get("INPUT_START_DATE")
+    end_date = os.environ.get("INPUT_END_DATE")
+    chunk_days = int(os.environ.get("INPUT_CHUNK_DAYS", "7"))
+    
+    if not start_date or not end_date:
+        print("ERROR: INPUT_START_DATE and INPUT_END_DATE must be set", file=sys.stderr)
+        sys.exit(1)
+    
+    chunks = generate_chunks(start_date, end_date, chunk_days)
+    print(f"Generated {len(chunks)} chunks for {start_date} to {end_date}")
+    
+    # Write to GitHub Actions output
+    github_output = os.environ.get("GITHUB_OUTPUT")
+    if github_output:
+        with open(github_output, "a") as f:
+            f.write(f"chunks={json.dumps(chunks)}\n")
+    else:
+        # For local testing, just print
+        print(json.dumps(chunks, indent=2))
+
+
+if __name__ == "__main__":
+    main()
@@ -2,11 +2,17 @@
 Processes a chunk of ICAOs from pre-extracted trace files.
 This is the map phase of the map-reduce pipeline.

+Supports both single-day (daily) and multi-day (historical) modes.
+
 Expects extract_dir to already exist with trace files.
 Reads ICAO manifest to determine which ICAOs to process based on chunk-id.

 Usage:
+    # Daily mode (single day)
    python -m src.adsb.process_icao_chunk --chunk-id 0 --total-chunks 4
+    
+    # Historical mode (date range)
+    python -m src.adsb.process_icao_chunk --chunk-id 0 --total-chunks 4 --start-date 2024-01-01 --end-date 2024-01-07
 """
 import gc
 import os
@@ -43,9 +49,13 @@ def get_target_day() -> datetime:
    return datetime.utcnow() - timedelta(days=1)


-def read_manifest(date_str: str) -> list[str]:
-    """Read ICAO manifest file."""
-    manifest_path = os.path.join(OUTPUT_DIR, f"icao_manifest_{date_str}.txt")
+def read_manifest(manifest_id: str) -> list[str]:
+    """Read ICAO manifest file.
+    
+    Args:
+        manifest_id: Either a date string (YYYY-MM-DD) or range string (YYYY-MM-DD_YYYY-MM-DD)
+    """
+    manifest_path = os.path.join(OUTPUT_DIR, f"icao_manifest_{manifest_id}.txt")
    if not os.path.exists(manifest_path):
        raise FileNotFoundError(f"Manifest not found: {manifest_path}")
    
@@ -119,9 +129,17 @@ def process_chunk(
    total_chunks: int,
    trace_map: dict[str, str],
    icaos: list[str],
-    date_str: str,
+    output_id: str,
 ) -> str | None:
-    """Process a chunk of ICAOs and write to parquet."""
+    """Process a chunk of ICAOs and write to parquet.
+    
+    Args:
+        chunk_id: This chunk's ID (0-indexed)
+        total_chunks: Total number of chunks
+        trace_map: Map of ICAO -> trace file path
+        icaos: Full list of ICAOs from manifest
+        output_id: Identifier for output file (date or date range)
+    """
    chunk_icaos = get_chunk_icaos(icaos, chunk_id, total_chunks)
    print(f"Chunk {chunk_id}/{total_chunks}: Processing {len(chunk_icaos)} ICAOs")
    
@@ -142,7 +160,7 @@ def process_chunk(
        return None
    
    # Process files and write parquet in batches
-    output_path = os.path.join(CHUNK_OUTPUT_DIR, f"chunk_{chunk_id}_{date_str}.parquet")
+    output_path = os.path.join(CHUNK_OUTPUT_DIR, f"chunk_{chunk_id}_{output_id}.parquet")
    
    start_time = time.perf_counter()
    total_rows = 0
@@ -200,22 +218,95 @@ def process_chunk(
    return None


+def process_single_day(
+    chunk_id: int,
+    total_chunks: int,
+    target_day: datetime,
+) -> str | None:
+    """Process a single day for this chunk."""
+    date_str = target_day.strftime("%Y-%m-%d")
+    version_date = f"v{target_day.strftime('%Y.%m.%d')}"
+    
+    extract_dir = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0.tar_0")
+    
+    if not os.path.isdir(extract_dir):
+        print(f"Extract directory not found: {extract_dir}")
+        return None
+    
+    trace_map = build_trace_file_map(extract_dir)
+    if not trace_map:
+        print("No trace files found")
+        return None
+    
+    icaos = read_manifest(date_str)
+    print(f"Total ICAOs in manifest: {len(icaos)}")
+    
+    return process_chunk(chunk_id, total_chunks, trace_map, icaos, date_str)
+
+
+def process_date_range(
+    chunk_id: int,
+    total_chunks: int,
+    start_date: datetime,
+    end_date: datetime,
+) -> str | None:
+    """Process a date range for this chunk.
+    
+    Combines trace files from all days in the range.
+    
+    Args:
+        chunk_id: This chunk's ID (0-indexed)
+        total_chunks: Total number of chunks
+        start_date: Start date (inclusive)
+        end_date: End date (inclusive)
+    """
+    start_str = start_date.strftime("%Y-%m-%d")
+    end_str = end_date.strftime("%Y-%m-%d")
+    manifest_id = f"{start_str}_{end_str}"
+    
+    print(f"Processing date range: {start_str} to {end_str}")
+    
+    # Build combined trace map from all days
+    combined_trace_map: dict[str, str] = {}
+    current = start_date
+    
+    # Both start and end are inclusive
+    while current <= end_date:
+        version_date = f"v{current.strftime('%Y.%m.%d')}"
+        extract_dir = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0.tar_0")
+        
+        if os.path.isdir(extract_dir):
+            trace_map = build_trace_file_map(extract_dir)
+            # Later days override earlier days (use most recent trace file)
+            combined_trace_map.update(trace_map)
+            print(f"  {current.strftime('%Y-%m-%d')}: {len(trace_map)} trace files")
+        else:
+            print(f"  {current.strftime('%Y-%m-%d')}: no extract directory")
+        
+        current += timedelta(days=1)
+    
+    if not combined_trace_map:
+        print("No trace files found in date range")
+        return None
+    
+    print(f"Combined trace map: {len(combined_trace_map)} ICAOs")
+    
+    icaos = read_manifest(manifest_id)
+    print(f"Total ICAOs in manifest: {len(icaos)}")
+    
+    return process_chunk(chunk_id, total_chunks, combined_trace_map, icaos, manifest_id)
+
+
 def main():
    parser = argparse.ArgumentParser(description="Process a chunk of ICAOs")
    parser.add_argument("--chunk-id", type=int, required=True, help="Chunk ID (0-indexed)")
    parser.add_argument("--total-chunks", type=int, required=True, help="Total number of chunks")
-    parser.add_argument("--date", type=str, help="Date in YYYY-MM-DD format (default: yesterday)")
+    parser.add_argument("--date", type=str, help="Single date in YYYY-MM-DD format (default: yesterday)")
+    parser.add_argument("--start-date", type=str, help="Start date for range (YYYY-MM-DD)")
+    parser.add_argument("--end-date", type=str, help="End date for range (YYYY-MM-DD)")
    args = parser.parse_args()
    
-    if args.date:
-        target_day = datetime.strptime(args.date, "%Y-%m-%d")
-    else:
-        target_day = get_target_day()
-    
-    date_str = target_day.strftime("%Y-%m-%d")
-    version_date = f"v{target_day.strftime('%Y.%m.%d')}"
-    
-    print(f"Processing chunk {args.chunk_id}/{args.total_chunks} for {date_str}")
+    print(f"Processing chunk {args.chunk_id}/{args.total_chunks}")
    print(f"OUTPUT_DIR: {OUTPUT_DIR}")
    print(f"CHUNK_OUTPUT_DIR: {CHUNK_OUTPUT_DIR}")
    print(f"Resource usage at start: {get_resource_usage()}")
@@ -228,37 +319,19 @@ def main():
    else:
        print(f"  Directory does not exist!")
    
-    # Find extract directory
-    extract_dir = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0.tar_0")
-    print(f"\nLooking for extract_dir: {extract_dir}")
-    if not os.path.isdir(extract_dir):
-        print(f"Extract directory not found: {extract_dir}")
-        # Try to find any extracted directory
-        import glob
-        pattern = os.path.join(OUTPUT_DIR, "*-planes-readsb-prod-0*")
-        matches = glob.glob(pattern)
-        print(f"Searching for pattern: {pattern}")
-        print(f"Found matches: {matches}")
-        sys.exit(1)
-    
-    # Build trace file map using find
-    trace_map = build_trace_file_map(extract_dir)
-    if not trace_map:
-        print("No trace files found in extract directory")
-        sys.exit(1)
-    
-    # Read manifest
-    icaos = read_manifest(date_str)
-    print(f"Total ICAOs in manifest: {len(icaos)}")
-    
-    # Process chunk
-    output_path = process_chunk(
-        args.chunk_id,
-        args.total_chunks,
-        trace_map,
-        icaos,
-        date_str,
-    )
+    # Determine mode: single day or date range
+    if args.start_date and args.end_date:
+        # Historical mode
+        start_date = datetime.strptime(args.start_date, "%Y-%m-%d")
+        end_date = datetime.strptime(args.end_date, "%Y-%m-%d")
+        output_path = process_date_range(args.chunk_id, args.total_chunks, start_date, end_date)
+    else:
+        # Daily mode
+        if args.date:
+            target_day = datetime.strptime(args.date, "%Y-%m-%d")
+        else:
+            target_day = get_target_day()
+        output_path = process_single_day(args.chunk_id, args.total_chunks, target_day)
    
    if output_path:
        print(f"Output: {output_path}")
@@ -1,116 +0,0 @@
-"""
-For each commit-day in Feb 2024 (last commit per day):
- Write ALL FAA text files from that commit into: data/faa_releasable_historical/YYYY-MM-DD/
-    ACFTREF.txt, DEALER.txt, DOCINDEX.txt, ENGINE.txt, RESERVED.txt
- Recombine MASTER-*.txt into Master.txt
- Produce Master.csv via convert_faa_master_txt_to_csv
-
-Assumes the non-master files are present in every commit.
-"""
-import subprocess, re
-from pathlib import Path
-import shutil
-from collections import OrderedDict
-from derive_from_faa_master_txt import convert_faa_master_txt_to_df, concat_faa_historical_df
-import zipfile
-import pandas as pd
-import argparse
-from datetime import datetime, timedelta
-
-# Parse command line arguments
-parser = argparse.ArgumentParser(description="Process historical FAA data from git commits")
-parser.add_argument("since", help="Start date (YYYY-MM-DD)")
-parser.add_argument("until", help="End date (YYYY-MM-DD)")
-args = parser.parse_args()
-
-# Clone repository if it doesn't exist
-REPO = Path("data/scrape-faa-releasable-aircraft")
-OUT_ROOT = Path("data/faa_releasable_historical")
-OUT_ROOT.mkdir(parents=True, exist_ok=True)
-
-def run_git_text(*args: str) -> str:
-    return subprocess.check_output(["git", "-C", str(REPO), *args], text=True).strip()
-
-def run_git_bytes(*args: str) -> bytes:
-    return subprocess.check_output(["git", "-C", str(REPO), *args])
-
-# Parse dates and adjust --since to the day before
-since_date = datetime.strptime(args.since, "%Y-%m-%d")
-adjusted_since = (since_date - timedelta(days=1)).strftime("%Y-%m-%d")
-
-# All commits in specified date range (oldest -> newest)
-log = run_git_text(
-    "log",
-    "--reverse",
-    "--format=%H %cs",
-    f"--since={adjusted_since}",
-    f"--until={args.until}",
-)
-lines = [ln for ln in log.splitlines() if ln.strip()]
-if not lines:
-    raise SystemExit(f"No commits found between {args.since} and {args.until}.")
-
-# date -> last SHA that day
-date_to_sha = OrderedDict()
-for ln in lines:
-    sha, date = ln.split()
-    date_to_sha[date] = sha
-
-OTHER_FILES = ["ACFTREF.txt", "DEALER.txt", "DOCINDEX.txt", "ENGINE.txt", "RESERVED.txt"]
-master_re = re.compile(r"^MASTER-(\d+)\.txt$")
-df_base = pd.DataFrame()
-start_date = None
-end_date = None
-for date, sha in date_to_sha.items():
-    if start_date is None:
-        start_date = date
-    end_date = date
-    day_dir = OUT_ROOT / date
-    day_dir.mkdir(parents=True, exist_ok=True)
-
-    # Write auxiliary files (assumed present)
-    for fname in OTHER_FILES:
-        (day_dir / fname).write_bytes(run_git_bytes("show", f"{sha}:{fname}"))
-
-    # Recombine MASTER parts
-    names = run_git_text("ls-tree", "--name-only", sha).splitlines()
-    parts = []
-    for n in names:
-        m = master_re.match(n)
-        if m:
-            parts.append((int(m.group(1)), n))
-    parts.sort()
-    if not parts:
-        raise RuntimeError(f"{date} {sha[:7]}: no MASTER-*.txt parts found")
-
-    master_path = day_dir / "MASTER.txt"
-    with master_path.open("wb") as w:
-        for _, fname in parts:
-            data = run_git_bytes("show", f"{sha}:{fname}")
-            w.write(data)
-            if data and not data.endswith(b"\n"):
-                w.write(b"\n")
-
-    # 3) Zip the day's files
-    zip_path = day_dir / f"ReleasableAircraft.zip"
-    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as z:
-        for p in day_dir.iterdir():
-            z.write(p, arcname=p.name)
-
-    print(f"{date} {sha[:7]} -> {day_dir} (master parts: {len(parts)})")
-    # 4) Convert ZIP -> CSV
-    df_new = convert_faa_master_txt_to_df(zip_path, date)
-    if df_base.empty:
-        df_base = df_new
-        print(len(df_base), "total entries so far")
-        # Delete all files in the day directory
-        shutil.rmtree(day_dir)
-        continue
-    
-    df_base = concat_faa_historical_df(df_base, df_new)
-    shutil.rmtree(day_dir)
-    print(len(df_base), "total entries so far")
-
-assert df_base['download_date'].is_monotonic_increasing, "download_date is not monotonic increasing"
-df_base.to_csv(OUT_ROOT / f"planequery_aircraft_faa_{start_date}_{end_date}.csv", index=False)
-# TODO: get average number of new rows per day.