Remove step-level timeouts to allow artifact downloads to complete

Co-authored-by: ggman12 <17393221+ggman12@users.noreply.github.com>
Fix YAML property ordering for timeout-minutes
2026-06-18 10:40:07 +02:00 · 2026-02-16 19:38:27 +00:00 · 2026-02-16 17:50:18 +00:00 · 2026-02-16 17:49:41 +00:00 · 2026-02-16 17:44:39 +00:00
17 changed files with 1284 additions and 841 deletions
@@ -1,115 +0,0 @@
-name: Historical ADS-B Run
-
-on:
-  workflow_dispatch:
-    inputs:
-      start_date:
-        description: 'YYYY-MM-DD (inclusive)'
-        required: true
-        type: string
-      end_date:
-        description: 'YYYY-MM-DD (exclusive)'
-        required: true
-        type: string
-
-jobs:
-  generate-dates:
-    runs-on: ubuntu-24.04-arm
-    outputs:
-      dates: ${{ steps.generate.outputs.dates }}
-    steps:
-      - name: Generate date list
-        id: generate
-        env:
-          START_DATE: ${{ inputs.start_date }}
-          END_DATE: ${{ inputs.end_date }}
-        run: |
-          python - <<'PY'
-          import json
-          from datetime import datetime, timedelta
-
-          start = datetime.strptime("${START_DATE}", "%Y-%m-%d")
-          end = datetime.strptime("${END_DATE}", "%Y-%m-%d")
-          if end <= start:
-            raise SystemExit("end_date must be after start_date")
-
-          dates = []
-          cur = start
-          while cur < end:
-            dates.append(cur.strftime("%Y-%m-%d"))
-            cur += timedelta(days=1)
-
-          with open("$GITHUB_OUTPUT", "a") as f:
-            f.write(f"dates={json.dumps(dates)}\n")
-          PY
-
-  adsb-day:
-    needs: generate-dates
-    strategy:
-      fail-fast: true
-      matrix:
-        date: ${{ fromJson(needs.generate-dates.outputs.dates) }}
-    uses: ./.github/workflows/historical-adsb.yaml
-    with:
-      date: ${{ matrix.date }}
-
-  adsb-final:
-    needs: adsb-day
-    runs-on: ubuntu-24.04-arm
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v6
-
-      - name: Setup Python
-        uses: actions/setup-python@v6
-        with:
-          python-version: '3.12'
-
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install -r requirements.txt
-
-      - name: Download daily CSVs
-        uses: actions/download-artifact@v4
-        with:
-          pattern: openairframes_adsb-*
-          path: outputs/daily/
-          merge-multiple: true
-
-      - name: Concatenate all days to final CSV
-        env:
-          START_DATE: ${{ inputs.start_date }}
-          END_DATE: ${{ inputs.end_date }}
-        run: |
-          python - <<'PY'
-          import re
-          from pathlib import Path
-          import polars as pl
-
-          start = "${START_DATE}"
-          end = "${END_DATE}"
-          daily_dir = Path("outputs/daily")
-          files = sorted(daily_dir.glob("openairframes_adsb_*.csv"))
-          if not files:
-            raise SystemExit("No daily CSVs found")
-
-          def date_key(path: Path) -> str:
-            m = re.match(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_", path.name)
-            return m.group(1) if m else path.name
-
-          files = sorted(files, key=date_key)
-          frames = [pl.read_csv(p) for p in files]
-          df = pl.concat(frames, how="vertical", rechunk=True)
-
-          output_path = Path("outputs") / f"openairframes_adsb_{start}_{end}.csv"
-          df.write_csv(output_path)
-          print(f"Wrote {output_path} with {df.height} rows")
-          PY
-
-      - name: Upload final CSV
-        uses: actions/upload-artifact@v4
-        with:
-          name: openairframes_adsb-${{ inputs.start_date }}-${{ inputs.end_date }}
-          path: outputs/openairframes_adsb_${{ inputs.start_date }}_${{ inputs.end_date }}.csv
-          retention-days: 30
@@ -3,26 +3,58 @@ name: Historical ADS-B Processing
 on:
  workflow_dispatch:
    inputs:
-      date:
-        description: 'YYYY-MM-DD'
+      start_date:
+        description: 'Start date (YYYY-MM-DD, inclusive)'
        required: true
        type: string
-  workflow_call:
-    inputs:
-      date:
-        description: 'YYYY-MM-DD'
+      end_date:
+        description: 'End date (YYYY-MM-DD, exclusive)'
        required: true
        type: string
+      chunk_days:
+        description: 'Days per job chunk (default: 7)'
+        required: false
+        type: number
+        default: 7

 jobs:
-  adsb-extract:
-    runs-on: ubuntu-24.04-arm
+  generate-matrix:
+    runs-on: ubuntu-latest
+    outputs:
+      chunks: ${{ steps.generate.outputs.chunks }}
+      global_start: ${{ inputs.start_date }}
+      global_end: ${{ inputs.end_date }}
    steps:
      - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4

      - name: Setup Python
-        uses: actions/setup-python@v6
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      - name: Generate date chunks
+        id: generate
+        env:
+          INPUT_START_DATE: ${{ inputs.start_date }}
+          INPUT_END_DATE: ${{ inputs.end_date }}
+          INPUT_CHUNK_DAYS: ${{ inputs.chunk_days }}
+        run: python src/adsb/historical_generate_matrix.py
+
+  adsb-extract:
+    needs: generate-matrix
+    runs-on: ubuntu-24.04-arm
+    strategy:
+      matrix:
+        chunk: ${{ fromJson(needs.generate-matrix.outputs.chunks) }}
+      max-parallel: 3
+      fail-fast: true
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

@@ -31,36 +63,87 @@ jobs:
          python -m pip install --upgrade pip
          pip install -r requirements.txt

-      - name: Download and split ADS-B data
+      - name: Free disk space
+        run: |
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /usr/local/share/boost
+          df -h
+
+      - name: Download and extract ADS-B data
        env:
-          DATE: ${{ inputs.date }}
+          START_DATE: ${{ matrix.chunk.start_date }}
+          END_DATE: ${{ matrix.chunk.end_date }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
-          python -m src.adsb.download_and_list_icaos --date "$DATE"
-          ls -lah data/output/adsb_archives/"$DATE" || true
+          python -m src.adsb.download_and_list_icaos --start-date "$START_DATE" --end-date "$END_DATE"
+          ls -lah data/output/

-      - name: Upload archives
+      - name: Create tar of extracted data and split into chunks
+        run: |
+          cd data/output
+          echo "=== Disk space before tar ==="
+          df -h .
+          echo "=== Files to tar ==="
+          ls -lah *-planes-readsb-prod-0.tar_0 icao_manifest_*.txt 2>/dev/null || echo "No files found"
+          
+          # Create tar with explicit error checking
+          if ls *-planes-readsb-prod-0.tar_0 1>/dev/null 2>&1; then
+            tar -cvf extracted_data.tar *-planes-readsb-prod-0.tar_0 icao_manifest_*.txt
+            echo "=== Tar file created ==="
+            ls -lah extracted_data.tar
+            # Verify tar integrity
+            tar -tf extracted_data.tar > /dev/null && echo "Tar integrity check passed" || { echo "Tar integrity check FAILED"; exit 1; }
+            
+            # Record tar size and checksum for verification after reassembly
+            echo "=== Recording tar metadata ==="
+            ORIGINAL_SIZE=$(stat --format=%s extracted_data.tar)
+            ORIGINAL_SHA=$(sha256sum extracted_data.tar | awk '{print $1}')
+            echo "Size: $ORIGINAL_SIZE"
+            echo "SHA256: $ORIGINAL_SHA"
+            
+            # Split into 500MB chunks to avoid artifact upload issues
+            echo "=== Splitting tar into 500MB chunks ==="
+            mkdir -p tar_chunks
+            split -b 500M extracted_data.tar tar_chunks/extracted_data.tar.part_
+            rm extracted_data.tar
+            
+            # Write metadata file (plain text so artifact upload won't skip it)
+            echo "$ORIGINAL_SHA  extracted_data.tar" > tar_chunks/checksum.txt
+            echo "$ORIGINAL_SIZE" >> tar_chunks/checksum.txt
+            
+            echo "=== Chunks created ==="
+            ls -lah tar_chunks/
+            echo "=== Checksum file ==="
+            cat tar_chunks/checksum.txt
+          else
+            echo "ERROR: No extracted directories found, cannot create tar"
+            exit 1
+          fi
+
+      - name: Upload extracted data chunks
        uses: actions/upload-artifact@v4
        with:
-          name: adsb-archives-${{ inputs.date }}
-          path: data/output/adsb_archives/${{ inputs.date }}
+          name: adsb-extracted-${{ matrix.chunk.start_date }}-${{ matrix.chunk.end_date }}
+          path: data/output/tar_chunks/
          retention-days: 1
          compression-level: 0
-          if-no-files-found: error
+          if-no-files-found: warn

  adsb-map:
-    needs: adsb-extract
+    needs: [generate-matrix, adsb-extract]
    runs-on: ubuntu-24.04-arm
    strategy:
      fail-fast: true
      matrix:
-        part_id: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
+        chunk: ${{ fromJson(needs.generate-matrix.outputs.chunks) }}
+        icao_chunk: [0, 1, 2, 3]
    steps:
      - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4

      - name: Setup Python
-        uses: actions/setup-python@v6
+        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

@@ -69,36 +152,101 @@ jobs:
          python -m pip install --upgrade pip
          pip install -r requirements.txt

-      - name: Download archives
+      - name: Free disk space
+        run: |
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /usr/local/share/boost
+          df -h
+
+      - name: Download extracted data
        uses: actions/download-artifact@v4
        with:
-          name: adsb-archives-${{ inputs.date }}
-          path: data/output/adsb_archives/${{ inputs.date }}
+          name: adsb-extracted-${{ matrix.chunk.start_date }}-${{ matrix.chunk.end_date }}
+          path: data/output/tar_chunks/

-      - name: Process part
-        env:
-          DATE: ${{ inputs.date }}
+      - name: Reassemble and extract tar
+        id: extract
        run: |
-          python -m src.adsb.process_icao_chunk --part-id ${{ matrix.part_id }} --date "$DATE"
+          cd data/output
+          if [ -d tar_chunks ] && ls tar_chunks/extracted_data.tar.part_* 1>/dev/null 2>&1; then
+            echo "=== Chunk files info ==="
+            ls -lah tar_chunks/
+            
+            cd tar_chunks
+            
+            # Reassemble tar with explicit sorting
+            echo "=== Reassembling tar file ==="
+            ls -1 extracted_data.tar.part_?? | sort | while read part; do
+              echo "Appending $part..."
+              cat "$part" >> ../extracted_data.tar
+            done
+            cd ..
+            
+            echo "=== Reassembled tar file info ==="
+            ls -lah extracted_data.tar
+            
+            # Verify integrity
+            echo "=== Verifying reassembled tar ==="
+            if [ -f tar_chunks/checksum.txt ]; then
+              EXPECTED_SHA=$(head -1 tar_chunks/checksum.txt | awk '{print $1}')
+              EXPECTED_SIZE=$(sed -n '2p' tar_chunks/checksum.txt)
+              ACTUAL_SHA=$(sha256sum extracted_data.tar | awk '{print $1}')
+              ACTUAL_SIZE=$(stat --format=%s extracted_data.tar)
+              echo "Expected: SHA=$EXPECTED_SHA Size=$EXPECTED_SIZE"
+              echo "Actual:   SHA=$ACTUAL_SHA Size=$ACTUAL_SIZE"
+              if [ "$EXPECTED_SHA" != "$ACTUAL_SHA" ] || [ "$EXPECTED_SIZE" != "$ACTUAL_SIZE" ]; then
+                echo "ERROR: Reassembled tar does not match original - data corrupted during transfer"
+                exit 1
+              fi
+              echo "Checksum and size verified"
+            else
+              echo "WARNING: No checksum file found, falling back to tar integrity check"
+              tar -tf extracted_data.tar > /dev/null || { echo "ERROR: Tar file is corrupted"; exit 1; }
+              echo "Tar integrity check passed"
+            fi
+            
+            rm -rf tar_chunks
+            
+            echo "=== Extracting ==="
+            tar -xf extracted_data.tar
+            rm extracted_data.tar
+            echo "has_data=true" >> "$GITHUB_OUTPUT"
+            echo "=== Contents of data/output ==="
+            ls -lah
+          else
+            echo "No tar chunks found"
+            echo "has_data=false" >> "$GITHUB_OUTPUT"
+          fi

-      - name: Upload compressed outputs
+      - name: Process ICAO chunk
+        if: steps.extract.outputs.has_data == 'true'
+        env:
+          START_DATE: ${{ matrix.chunk.start_date }}
+          END_DATE: ${{ matrix.chunk.end_date }}
+        run: |
+          python -m src.adsb.process_icao_chunk --chunk-id ${{ matrix.icao_chunk }} --total-chunks 4 --start-date "$START_DATE" --end-date "$END_DATE"
+          ls -lah data/output/adsb_chunks/ || echo "No chunks created"
+
+      - name: Upload chunk artifacts
+        if: steps.extract.outputs.has_data == 'true'
        uses: actions/upload-artifact@v4
        with:
-          name: adsb-compressed-${{ inputs.date }}-part-${{ matrix.part_id }}
-          path: data/output/compressed/${{ inputs.date }}
+          name: adsb-map-${{ matrix.chunk.start_date }}-${{ matrix.chunk.end_date }}-chunk-${{ matrix.icao_chunk }}
+          path: data/output/adsb_chunks/
          retention-days: 1
-          compression-level: 0
-          if-no-files-found: error
+          if-no-files-found: ignore

  adsb-reduce:
-    needs: adsb-map
+    needs: [generate-matrix, adsb-map]
    runs-on: ubuntu-24.04-arm
+    timeout-minutes: 120
    steps:
      - name: Checkout
-        uses: actions/checkout@v6
+        uses: actions/checkout@v4

      - name: Setup Python
-        uses: actions/setup-python@v6
+        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

@@ -107,23 +255,43 @@ jobs:
          python -m pip install --upgrade pip
          pip install -r requirements.txt

-      - name: Download compressed outputs
+      - name: Download all chunk artifacts
+        id: download
+        continue-on-error: true
        uses: actions/download-artifact@v4
        with:
-          pattern: adsb-compressed-${{ inputs.date }}-part-*
-          path: outputs/compressed/${{ inputs.date }}
+          pattern: adsb-map-*
+          path: data/output/adsb_chunks/
          merge-multiple: true

-      - name: Concatenate final outputs
-        env:
-          DATE: ${{ inputs.date }}
-        run: |
-          python src/adsb/concat_parquet_to_final.py --date "$DATE"
-          ls -lah outputs/ || true
+      - name: Retry artifact download on failure
+        if: steps.download.outcome == 'failure'
+        uses: actions/download-artifact@v4
+        with:
+          pattern: adsb-map-*
+          path: data/output/adsb_chunks/
+          merge-multiple: true

-      - name: Upload final artifacts
+      - name: Debug downloaded files
+        run: |
+          echo "=== Disk space before processing ==="
+          df -h
+          echo "=== Listing data/output/adsb_chunks/ ==="
+          find data/output/adsb_chunks/ -type f 2>/dev/null | wc -l
+          echo "=== Total parquet size ==="
+          du -sh data/output/adsb_chunks/ || echo "No chunks dir"
+
+      - name: Combine chunks to CSV
+        env:
+          START_DATE: ${{ needs.generate-matrix.outputs.global_start }}
+          END_DATE: ${{ needs.generate-matrix.outputs.global_end }}
+        run: |
+          python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks --start-date "$START_DATE" --end-date "$END_DATE" --skip-base --stream
+          ls -lah data/openairframes/
+
+      - name: Upload final artifact
        uses: actions/upload-artifact@v4
        with:
-          name: openairframes_adsb-${{ inputs.date }}
-          path: outputs/openairframes_adsb_${{ inputs.date }}_${{ inputs.date }}.*
+          name: openairframes_adsb-${{ needs.generate-matrix.outputs.global_start }}-${{ needs.generate-matrix.outputs.global_end }}
+          path: data/openairframes/*.csv.gz
          retention-days: 30
@@ -281,7 +281,4 @@ read*lock
 .nx/

 # jsii-rosetta files
-type-fingerprints.txt
-
-notebooks/whatever.ipynb
-.snapshots/
+type-fingerprints.txt
@@ -1,182 +0,0 @@
-#!/usr/bin/env python3
-"""
-Download and concatenate artifacts from a specific set of workflow runs.
-
-Usage:
-    python scripts/download_and_concat_runs.py triggered_runs_20260216_123456.json
-"""
-
-import argparse
-import json
-import os
-import subprocess
-import sys
-from pathlib import Path
-
-
-def download_run_artifact(run_id, output_dir):
-    """Download artifact from a specific workflow run."""
-    print(f"  Downloading artifacts from run {run_id}...")
-    
-    cmd = [
-        'gh', 'run', 'download', str(run_id),
-        '--pattern', 'openairframes_adsb-*',
-        '--dir', output_dir
-    ]
-    
-    result = subprocess.run(cmd, capture_output=True, text=True)
-    
-    if result.returncode == 0:
-        print(f"  ✓ Downloaded")
-        return True
-    else:
-        if "no artifacts" in result.stderr.lower():
-            print(f"  ⚠ No artifacts found (workflow may still be running)")
-        else:
-            print(f"  ✗ Failed: {result.stderr}")
-        return False
-
-
-def find_csv_files(download_dir):
-    """Find all CSV.gz files in the download directory."""
-    csv_files = []
-    for root, dirs, files in os.walk(download_dir):
-        for file in files:
-            if file.endswith('.csv.gz'):
-                csv_files.append(os.path.join(root, file))
-    return sorted(csv_files)
-
-
-def concatenate_csv_files(csv_files, output_file):
-    """Concatenate CSV files in order, preserving headers."""
-    import gzip
-    
-    print(f"\nConcatenating {len(csv_files)} CSV files...")
-    
-    with gzip.open(output_file, 'wt') as outf:
-        header_written = False
-        
-        for i, csv_file in enumerate(csv_files, 1):
-            print(f"  [{i}/{len(csv_files)}] Processing {os.path.basename(csv_file)}")
-            
-            with gzip.open(csv_file, 'rt') as inf:
-                lines = inf.readlines()
-                
-                if not header_written:
-                    # Write header from first file
-                    outf.writelines(lines)
-                    header_written = True
-                else:
-                    # Skip header for subsequent files
-                    outf.writelines(lines[1:])
-    
-    print(f"\n✓ Concatenated CSV saved to: {output_file}")
-    
-    # Show file size
-    size_mb = os.path.getsize(output_file) / (1024 * 1024)
-    print(f"  Size: {size_mb:.1f} MB")
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description='Download and concatenate artifacts from workflow runs'
-    )
-    parser.add_argument(
-        'runs_file',
-        help='JSON file containing run IDs (from run_historical_adsb_action.py)'
-    )
-    parser.add_argument(
-        '--output-dir',
-        default='./downloads/historical_concat',
-        help='Directory for downloads (default: ./downloads/historical_concat)'
-    )
-    parser.add_argument(
-        '--wait',
-        action='store_true',
-        help='Wait for workflows to complete before downloading'
-    )
-    
-    args = parser.parse_args()
-    
-    # Load run IDs
-    if not os.path.exists(args.runs_file):
-        print(f"Error: File not found: {args.runs_file}")
-        sys.exit(1)
-    
-    with open(args.runs_file, 'r') as f:
-        data = json.load(f)
-    
-    runs = data['runs']
-    start_date = data['start_date']
-    end_date = data['end_date']
-    
-    print("=" * 60)
-    print("Download and Concatenate Historical Artifacts")
-    print("=" * 60)
-    print(f"Date range: {start_date} to {end_date}")
-    print(f"Workflow runs: {len(runs)}")
-    print(f"Output directory: {args.output_dir}")
-    print("=" * 60)
-    
-    # Create output directory
-    os.makedirs(args.output_dir, exist_ok=True)
-    
-    # Wait for workflows to complete if requested
-    if args.wait:
-        print("\nWaiting for workflows to complete...")
-        for run_info in runs:
-            run_id = run_info['run_id']
-            print(f"  Checking run {run_id}...")
-            
-            cmd = ['gh', 'run', 'watch', str(run_id)]
-            subprocess.run(cmd)
-    
-    # Download artifacts
-    print("\nDownloading artifacts...")
-    successful_downloads = 0
-    
-    for i, run_info in enumerate(runs, 1):
-        run_id = run_info['run_id']
-        print(f"\n[{i}/{len(runs)}] Run {run_id} ({run_info['start']} to {run_info['end']})")
-        
-        if download_run_artifact(run_id, args.output_dir):
-            successful_downloads += 1
-    
-    print(f"\n\nDownload Summary: {successful_downloads}/{len(runs)} artifacts downloaded")
-    
-    if successful_downloads == 0:
-        print("\nNo artifacts downloaded. Workflows may still be running.")
-        print("Use --wait to wait for completion, or try again later.")
-        sys.exit(1)
-    
-    # Find all CSV files
-    csv_files = find_csv_files(args.output_dir)
-    
-    if not csv_files:
-        print("\nError: No CSV files found in download directory")
-        sys.exit(1)
-    
-    print(f"\nFound {len(csv_files)} CSV file(s):")
-    for csv_file in csv_files:
-        print(f"  - {os.path.basename(csv_file)}")
-    
-    # Concatenate
-    # Calculate actual end date for filename (end_date - 1 day since it's exclusive)
-    from datetime import datetime, timedelta
-    end_dt = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=1)
-    actual_end = end_dt.strftime('%Y-%m-%d')
-    
-    output_file = os.path.join(
-        args.output_dir,
-        f"openairframes_adsb_{start_date}_{actual_end}.csv.gz"
-    )
-    
-    concatenate_csv_files(csv_files, output_file)
-    
-    print("\n" + "=" * 60)
-    print("Done!")
-    print("=" * 60)
-
-
-if __name__ == '__main__':
-    main()
@@ -1,209 +0,0 @@
-#!/usr/bin/env python3
-"""
-Script to trigger historical-adsb workflow runs in 15-day chunks.
-
-Usage:
-    python scripts/run_historical_adsb_action.py --start-date 2025-01-01 --end-date 2025-06-01
-"""
-
-import argparse
-import subprocess
-import sys
-from datetime import datetime, timedelta
-
-
-def generate_date_chunks(start_date_str, end_date_str, chunk_days=15):
-    """Generate date ranges in fixed-day chunks from start to end date."""
-    start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
-    end_date = datetime.strptime(end_date_str, '%Y-%m-%d')
-    
-    chunks = []
-    current = start_date
-    
-    while current < end_date:
-        # Calculate end of current chunk
-        chunk_end = current + timedelta(days=chunk_days)
-        
-        # Don't go past the global end date
-        if chunk_end > end_date:
-            chunk_end = end_date
-        
-        chunks.append({
-            'start': current.strftime('%Y-%m-%d'),
-            'end': chunk_end.strftime('%Y-%m-%d')
-        })
-        
-        current = chunk_end
-    
-    return chunks
-
-
-def trigger_workflow(start_date, end_date, chunk_days=1, branch='main', dry_run=False):
-    """Trigger the historical-adsb workflow via GitHub CLI."""
-    cmd = [
-        'gh', 'workflow', 'run', 'historical-adsb.yaml',
-        '--ref', branch,
-        '-f', f'start_date={start_date}',
-        '-f', f'end_date={end_date}',
-        '-f', f'chunk_days={chunk_days}'
-    ]
-    
-    if dry_run:
-        print(f"[DRY RUN] Would run: {' '.join(cmd)}")
-        return True, None
-    
-    print(f"Triggering workflow: {start_date} to {end_date} (on {branch})")
-    result = subprocess.run(cmd, capture_output=True, text=True)
-    
-    if result.returncode == 0:
-        print(f"✓ Successfully triggered workflow for {start_date} to {end_date}")
-        
-        # Get the run ID of the workflow we just triggered
-        # Wait a moment for it to appear
-        import time
-        time.sleep(2)
-        
-        # Get the most recent run (should be the one we just triggered)
-        list_cmd = [
-            'gh', 'run', 'list',
-            '--workflow', 'historical-adsb.yaml',
-            '--branch', branch,
-            '--limit', '1',
-            '--json', 'databaseId',
-            '--jq', '.[0].databaseId'
-        ]
-        list_result = subprocess.run(list_cmd, capture_output=True, text=True)
-        run_id = list_result.stdout.strip() if list_result.returncode == 0 else None
-        
-        return True, run_id
-    else:
-        print(f"✗ Failed to trigger workflow for {start_date} to {end_date}")
-        print(f"Error: {result.stderr}")
-        return False, None
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description='Trigger historical-adsb workflow runs in monthly chunks'
-    )
-    parser.add_argument(
-        '--start-date',
-        required=True,
-        help='Start date in YYYY-MM-DD format (inclusive)'
-    )
-    parser.add_argument(
-        '--end-date',
-        required=True,
-        help='End date in YYYY-MM-DD format (exclusive)'
-    )
-    parser.add_argument(
-        '--chunk-days',
-        type=int,
-        default=1,
-        help='Days per job chunk within each workflow run (default: 1)'
-    )
-    parser.add_argument(
-        '--workflow-chunk-days',
-        type=int,
-        default=15,
-        help='Days per workflow run (default: 15)'
-    )
-    parser.add_argument(
-        '--branch',
-        type=str,
-        default='main',
-        help='Branch to run the workflow on (default: main)'
-    )
-    parser.add_argument(
-        '--dry-run',
-        action='store_true',
-        help='Print commands without executing them'
-    )
-    parser.add_argument(
-        '--delay',
-        type=int,
-        default=5,
-        help='Delay in seconds between workflow triggers (default: 5)'
-    )
-    
-    args = parser.parse_args()
-    
-    # Validate dates
-    try:
-        start = datetime.strptime(args.start_date, '%Y-%m-%d')
-        end = datetime.strptime(args.end_date, '%Y-%m-%d')
-        if start >= end:
-            print("Error: start_date must be before end_date")
-            sys.exit(1)
-    except ValueError as e:
-        print(f"Error: Invalid date format - {e}")
-        sys.exit(1)
-    
-    # Generate date chunks
-    chunks = generate_date_chunks(args.start_date, args.end_date, chunk_days=args.workflow_chunk_days)
-    
-    print(f"\nGenerating {len(chunks)} workflow runs ({args.workflow_chunk_days} days each) on branch '{args.branch}':")
-    for i, chunk in enumerate(chunks, 1):
-        print(f"  {i}. {chunk['start']} to {chunk['end']}")
-    
-    if not args.dry_run:
-        response = input(f"\nProceed with triggering {len(chunks)} workflows on '{args.branch}'? [y/N]: ")
-        if response.lower() != 'y':
-            print("Cancelled.")
-            sys.exit(0)
-    
-    print()
-    
-    # Trigger workflows
-    import time
-    success_count = 0
-    triggered_runs = []
-    
-    for i, chunk in enumerate(chunks, 1):
-        print(f"\n[{i}/{len(chunks)}] ", end='')
-        
-        success, run_id = trigger_workflow(
-            chunk['start'],
-            chunk['end'],
-            chunk_days=args.chunk_days,
-            branch=args.branch,
-            dry_run=args.dry_run
-        )
-        
-        if success:
-            success_count += 1
-            if run_id:
-                triggered_runs.append({
-                    'run_id': run_id,
-                    'start': chunk['start'],
-                    'end': chunk['end']
-                })
-        
-        # Add delay between triggers (except for last one)
-        if i < len(chunks) and not args.dry_run:
-            time.sleep(args.delay)
-    
-    print(f"\n\nSummary: {success_count}/{len(chunks)} workflows triggered successfully")
-    
-    # Save triggered run IDs to a file
-    if triggered_runs and not args.dry_run:
-        import json
-        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
-        runs_file = f"./triggered_runs_{timestamp}.json"
-        with open(runs_file, 'w') as f:
-            json.dump({
-                'start_date': args.start_date,
-                'end_date': args.end_date,
-                'branch': args.branch,
-                'runs': triggered_runs
-            }, f, indent=2)
-        print(f"\nRun IDs saved to: {runs_file}")
-        print(f"\nTo download and concatenate these artifacts, run:")
-        print(f"  python scripts/download_and_concat_runs.py {runs_file}")
-    
-    if success_count < len(chunks):
-        sys.exit(1)
-
-
-if __name__ == '__main__':
-    main()
@@ -0,0 +1,261 @@
+"""
+Combines chunk parquet files and compresses to final aircraft CSV.
+This is the reduce phase of the map-reduce pipeline.
+
+Supports both single-day (daily) and multi-day (historical) modes.
+
+Memory-efficient: processes each chunk separately, compresses, then combines.
+
+Usage:
+    # Daily mode
+    python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks
+    
+    # Historical mode
+    python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks --start-date 2024-01-01 --end-date 2024-01-07 --skip-base
+"""
+import gc
+import gzip
+import os
+import sys
+import glob
+import argparse
+from datetime import datetime, timedelta, timezone
+
+import polars as pl
+
+from src.adsb.download_adsb_data_to_parquet import OUTPUT_DIR, get_resource_usage
+from src.adsb.compress_adsb_to_aircraft_data import compress_multi_icao_df, COLUMNS
+
+
+DEFAULT_CHUNK_DIR = os.path.join(OUTPUT_DIR, "adsb_chunks")
+FINAL_OUTPUT_DIR = "./data/openairframes"
+os.makedirs(FINAL_OUTPUT_DIR, exist_ok=True)
+
+
+def get_target_day() -> datetime:
+    """Get yesterday's date (the day we're processing)."""
+    return datetime.now(timezone.utc) - timedelta(days=1)
+
+
+def process_single_chunk(chunk_path: str, delete_after_load: bool = False) -> pl.DataFrame:
+    """Load and compress a single chunk parquet file.
+    
+    Args:
+        chunk_path: Path to parquet file
+        delete_after_load: If True, delete the parquet file after loading to free disk space
+    """
+    print(f"Processing {os.path.basename(chunk_path)}... | {get_resource_usage()}")
+    
+    # Load chunk - only columns we need
+    needed_columns = ['time', 'icao'] + COLUMNS
+    df = pl.read_parquet(chunk_path, columns=needed_columns)
+    print(f"  Loaded {len(df)} rows")
+    
+    # Delete file immediately after loading to free disk space
+    if delete_after_load:
+        try:
+            os.remove(chunk_path)
+            print(f"  Deleted {chunk_path} to free disk space")
+        except Exception as e:
+            print(f"  Warning: Failed to delete {chunk_path}: {e}")
+    
+    # Compress to aircraft records (one per ICAO) using shared function
+    compressed = compress_multi_icao_df(df, verbose=True)
+    print(f"  Compressed to {len(compressed)} aircraft records")
+    
+    del df
+    gc.collect()
+    
+    return compressed
+
+
+def combine_compressed_chunks(compressed_dfs: list[pl.DataFrame]) -> pl.DataFrame:
+    """Combine multiple compressed DataFrames.
+    
+    Since chunks are partitioned by ICAO hash, each ICAO only appears in one chunk.
+    No deduplication needed here - just concatenate.
+    """
+    print(f"Combining {len(compressed_dfs)} compressed chunks... | {get_resource_usage()}")
+    
+    # Concat all
+    combined = pl.concat(compressed_dfs)
+    print(f"Combined: {len(combined)} records")
+    
+    return combined
+
+
+def download_and_merge_base_release(compressed_df: pl.DataFrame) -> tuple[pl.DataFrame, str | None]:
+    """Download base release and merge with new data.
+    
+    Returns:
+        Tuple of (merged_df, earliest_date_str) where earliest_date_str is None if no base release was merged
+    """
+    from src.get_latest_release import download_latest_aircraft_adsb_csv
+    
+    print("Downloading base ADS-B release...")
+    base_path = download_latest_aircraft_adsb_csv(
+        output_dir="./data/openairframes_base"
+    )
+    print(f"Download returned: {base_path}")
+    
+    print(f"Loading base release from {base_path}")
+    
+    # Extract start date from filename (e.g., openairframes_adsb_2025-05-01_2026-02-14.csv.gz)
+    import re
+    filename = os.path.basename(str(base_path))
+    match = re.search(r'openairframes_adsb_(\d{4}-\d{2}-\d{2})_', filename)
+    earliest_date = match.group(1) if match else None
+    print(f"Start date from base filename: {earliest_date}")
+    
+    # Read CSV with schema matching the new data
+    base_df = pl.read_csv(base_path, schema=compressed_df.schema)
+    print(f"Base release has {len(base_df)} records")
+    
+    # Ensure columns match
+    base_cols = set(base_df.columns)
+    new_cols = set(compressed_df.columns)
+    print(f"Base columns: {sorted(base_cols)}")
+    print(f"New columns: {sorted(new_cols)}")
+    
+    # Add missing columns
+    for col in new_cols - base_cols:
+        base_df = base_df.with_columns(pl.lit(None).alias(col))
+    for col in base_cols - new_cols:
+        compressed_df = compressed_df.with_columns(pl.lit(None).alias(col))
+    
+    # Reorder columns to match
+    compressed_df = compressed_df.select(base_df.columns)
+    
+    # Concat and deduplicate by icao (keep new data - it comes last)
+    combined = pl.concat([base_df, compressed_df])
+    print(f"After concat: {len(combined)} records")
+
+    return combined, earliest_date
+
+def cleanup_chunks(output_id: str, chunks_dir: str):
+    """Delete chunk parquet files after successful merge."""
+    pattern = os.path.join(chunks_dir, f"chunk_*_{output_id}.parquet")
+    chunk_files = glob.glob(pattern)
+    for f in chunk_files:
+        try:
+            os.remove(f)
+            print(f"Deleted {f}")
+        except Exception as e:
+            print(f"Failed to delete {f}: {e}")
+
+
+def find_chunk_files(chunks_dir: str, output_id: str) -> list[str]:
+    """Find chunk parquet files matching the output ID."""
+    pattern = os.path.join(chunks_dir, f"chunk_*_{output_id}.parquet")
+    chunk_files = sorted(glob.glob(pattern))
+    
+    if not chunk_files:
+        # Try recursive search for historical mode with merged artifacts
+        pattern = os.path.join(chunks_dir, "**", "*.parquet")
+        chunk_files = sorted(glob.glob(pattern, recursive=True))
+    
+    return chunk_files
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Combine chunk parquets to final CSV")
+    parser.add_argument("--date", type=str, help="Single date in YYYY-MM-DD format (default: yesterday)")
+    parser.add_argument("--start-date", type=str, help="Start date for range (YYYY-MM-DD)")
+    parser.add_argument("--end-date", type=str, help="End date for range (YYYY-MM-DD)")
+    parser.add_argument("--chunks-dir", type=str, default=DEFAULT_CHUNK_DIR, help="Directory containing chunk parquet files")
+    parser.add_argument("--skip-base", action="store_true", help="Skip downloading and merging base release")
+    parser.add_argument("--keep-chunks", action="store_true", help="Keep chunk files after merging")
+    parser.add_argument("--stream", action="store_true", help="Delete parquet files immediately after loading to save disk space")
+    args = parser.parse_args()
+    
+    # Determine output ID and filename based on mode
+    if args.start_date and args.end_date:
+        # Historical mode
+        output_id = f"{args.start_date}_{args.end_date}"
+        output_filename = f"openairframes_adsb_{args.start_date}_{args.end_date}.csv.gz"
+        print(f"Combining chunks for date range: {args.start_date} to {args.end_date}")
+    else:
+        # Daily mode - use same date for start and end
+        if args.date:
+            target_day = datetime.strptime(args.date, "%Y-%m-%d")
+        else:
+            target_day = get_target_day()
+        
+        date_str = target_day.strftime("%Y-%m-%d")
+        output_id = date_str
+        output_filename = f"openairframes_adsb_{date_str}_{date_str}.csv.gz"
+        print(f"Combining chunks for {date_str}")
+    
+    chunks_dir = args.chunks_dir
+    print(f"Chunks directory: {chunks_dir}")
+    print(f"Resource usage at start: {get_resource_usage()}")
+    
+    # Find chunk files
+    chunk_files = find_chunk_files(chunks_dir, output_id)
+    
+    if not chunk_files:
+        print(f"No chunk files found in: {chunks_dir}")
+        sys.exit(1)
+    
+    print(f"Found {len(chunk_files)} chunk files")
+    
+    # Process each chunk separately to save memory
+    # With --stream, delete parquet files immediately after loading to save disk space
+    compressed_chunks = []
+    for chunk_path in chunk_files:
+        compressed = process_single_chunk(chunk_path, delete_after_load=args.stream)
+        compressed_chunks.append(compressed)
+        gc.collect()
+    
+    # Combine all compressed chunks
+    combined = combine_compressed_chunks(compressed_chunks)
+    
+    # Free memory from individual chunks
+    del compressed_chunks
+    gc.collect()
+    print(f"After combining: {get_resource_usage()}")
+    
+    # Merge with base release (unless skipped)
+    base_start_date = None
+    if not args.skip_base:
+        combined, base_start_date = download_and_merge_base_release(combined)
+        
+        # Update filename if we merged with base release and got a start date
+        if base_start_date and not (args.start_date and args.end_date):
+            # Only update filename for daily mode when base was merged
+            output_filename = f"openairframes_adsb_{base_start_date}_{date_str}.csv.gz"
+            print(f"Updated filename to reflect date range: {output_filename}")
+    
+    # Convert list columns to strings for CSV compatibility
+    for col in combined.columns:
+        if combined[col].dtype == pl.List:
+            combined = combined.with_columns(
+                pl.col(col).list.join(",").alias(col)
+            )
+    
+    # Sort by time for consistent output
+    if 'time' in combined.columns:
+        combined = combined.sort('time')
+    
+    # Replace empty strings with null across all string columns to avoid quoted empty strings
+    for col in combined.columns:
+        if combined[col].dtype == pl.Utf8:
+            combined = combined.with_columns(
+                pl.when(pl.col(col) == "").then(None).otherwise(pl.col(col)).alias(col)
+            )
+    
+    # Write final CSV
+    output_path = os.path.join(FINAL_OUTPUT_DIR, output_filename)
+    with gzip.open(output_path, "wb", compresslevel=9) as f:
+        combined.write_csv(f, null_value='', quote_style='necessary')
+    print(f"Wrote {len(combined)} records to {output_path}")
+    
+    # Cleanup
+    if not args.keep_chunks:
+        cleanup_chunks(output_id, chunks_dir)
+    
+    print(f"Done! | {get_resource_usage()}")
+
+
+if __name__ == "__main__":
+    main()
@@ -5,6 +5,23 @@ import polars as pl
 COLUMNS = ['dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category', 'r', 't']


+def deduplicate_by_signature(df: pl.DataFrame) -> pl.DataFrame:
+    """For each icao, keep only the earliest row with each unique signature.
+    
+    This is used for deduplicating across multiple compressed chunks.
+    """
+    # Create signature column
+    df = df.with_columns(
+        pl.concat_str([pl.col(c).cast(pl.Utf8).fill_null("") for c in COLUMNS], separator="|").alias("_signature")
+    )
+    # Group by icao and signature, take first row (earliest due to time sort)
+    df = df.sort("time")
+    df_deduped = df.group_by(["icao", "_signature"]).first()
+    df_deduped = df_deduped.drop("_signature")
+    df_deduped = df_deduped.sort("time")
+    return df_deduped
+
+
 def compress_df_polars(df: pl.DataFrame, icao: str) -> pl.DataFrame:
    """Compress a single ICAO group to its most informative row using Polars."""
    # Create signature string
@@ -82,6 +99,9 @@ def compress_df_polars(df: pl.DataFrame, icao: str) -> pl.DataFrame:
 def compress_multi_icao_df(df: pl.DataFrame, verbose: bool = True) -> pl.DataFrame:
    """Compress a DataFrame with multiple ICAOs to one row per ICAO.
    
+    This is the main entry point for compressing ADS-B data.
+    Used by both daily GitHub Actions runs and historical AWS runs.
+    
    Args:
        df: DataFrame with columns ['time', 'icao'] + COLUMNS
        verbose: Whether to print progress
@@ -100,27 +120,29 @@ def compress_multi_icao_df(df: pl.DataFrame, verbose: bool = True) -> pl.DataFra
        if col in df.columns:
            df = df.with_columns(pl.col(col).cast(pl.Utf8).fill_null(""))
    
-    # Quick deduplication of exact duplicates
+    # First pass: quick deduplication of exact duplicates
    df = df.unique(subset=['icao'] + COLUMNS, keep='first')
    if verbose:
        print(f"After quick dedup: {df.height} records")
    
-    # Compress per ICAO
+    # Second pass: sophisticated compression per ICAO
    if verbose:
        print("Compressing per ICAO...")
    
+    # Process each ICAO group
    icao_groups = df.partition_by('icao', as_dict=True, maintain_order=True)
    compressed_dfs = []
    
    for icao_key, group_df in icao_groups.items():
-        icao = icao_key[0]
+        # partition_by with as_dict=True returns tuple keys, extract first element
+        icao = icao_key[0] if isinstance(icao_key, tuple) else icao_key
        compressed = compress_df_polars(group_df, str(icao))
        compressed_dfs.append(compressed)
    
    if compressed_dfs:
        df_compressed = pl.concat(compressed_dfs)
    else:
-        df_compressed = df.head(0)
+        df_compressed = df.head(0)  # Empty with same schema
    
    if verbose:
        print(f"After compress: {df_compressed.height} records")
@@ -133,22 +155,45 @@ def compress_multi_icao_df(df: pl.DataFrame, verbose: bool = True) -> pl.DataFra
    return df_compressed


-def load_parquet_part(part_id: int, date: str) -> pl.DataFrame:
-    """Load a single parquet part file for a date.
-    
-    Args:
-        part_id: Part ID (e.g., 1, 2, 3)
-        date: Date string in YYYY-MM-DD format
-    
-    Returns:
-        DataFrame with ADS-B data
-    """
+def load_raw_adsb_for_day(day):
+    """Load raw ADS-B data for a day from parquet file."""
+    from datetime import timedelta
    from pathlib import Path
    
-    parquet_file = Path(f"data/output/parquet_output/part_{part_id}_{date}.parquet")
+    start_time = day.replace(hour=0, minute=0, second=0, microsecond=0)
+    
+    # Check for parquet file first
+    version_date = f"v{start_time.strftime('%Y.%m.%d')}"
+    parquet_file = Path(f"data/output/parquet_output/{version_date}.parquet")
    
    if not parquet_file.exists():
-        print(f"Parquet file not found: {parquet_file}")
+        # Try to generate parquet file by calling the download function
+        print(f"  Parquet file not found: {parquet_file}")
+        print(f"  Attempting to download and generate parquet for {start_time.strftime('%Y-%m-%d')}...")
+        
+        from download_adsb_data_to_parquet import create_parquet_for_day
+        result_path = create_parquet_for_day(start_time, keep_folders=False)
+        
+        if result_path:
+            print(f"  Successfully generated parquet file: {result_path}")
+        else:
+            raise Exception("Failed to generate parquet file")
+    
+    if parquet_file.exists():
+        print(f"  Loading from parquet: {parquet_file}")
+        df = pl.read_parquet(
+            parquet_file, 
+            columns=['time', 'icao', 'r', 't', 'dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category']
+        )
+        
+        # Convert to timezone-naive datetime
+        if df["time"].dtype == pl.Datetime:
+            df = df.with_columns(pl.col("time").dt.replace_time_zone(None))
+        
+        return df
+    else:
+        # Return empty DataFrame if parquet file doesn't exist
+        print(f"  No data available for {start_time.strftime('%Y-%m-%d')}")
        return pl.DataFrame(schema={
            'time': pl.Datetime,
            'icao': pl.Utf8,
@@ -160,33 +205,17 @@ def load_parquet_part(part_id: int, date: str) -> pl.DataFrame:
            'desc': pl.Utf8,
            'aircraft_category': pl.Utf8
        })
-    
-    print(f"Loading from parquet: {parquet_file}")
-    df = pl.read_parquet(
-        parquet_file,
-        columns=['time', 'icao', 'r', 't', 'dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category']
-    )
-    
-    # Convert to timezone-naive datetime
-    if df["time"].dtype == pl.Datetime:
-        df = df.with_columns(pl.col("time").dt.replace_time_zone(None))
-    
-    return df


-def compress_parquet_part(part_id: int, date: str) -> pl.DataFrame:
-    """Load and compress a single parquet part file."""
-    df = load_parquet_part(part_id, date)
-    
+def load_historical_for_day(day):
+    """Load and compress historical ADS-B data for a day."""
+    df = load_raw_adsb_for_day(day)
    if df.height == 0:
        return df
-
-    # Filter to rows within the given date (UTC-naive). This is because sometimes adsb.lol export can have rows at 00:00:00 of next day or similar.
-    date_lit = pl.lit(date).str.strptime(pl.Date, "%Y-%m-%d")
-    df = df.filter(pl.col("time").dt.date() == date_lit)
    
-    print(f"Loaded {df.height} raw records for part {part_id}, date {date}")
+    print(f"Loaded {df.height} raw records for {day.strftime('%Y-%m-%d')}")
    
+    # Use shared compression function
    return compress_multi_icao_df(df, verbose=True)


@@ -207,9 +236,8 @@ def concat_compressed_dfs(df_base, df_new):
    icao_groups = df_combined.partition_by('icao', as_dict=True, maintain_order=True)
    compressed_dfs = []
    
-    for icao_key, group_df in icao_groups.items():
-        icao = icao_key[0]
-        compressed = compress_df_polars(group_df, str(icao))
+    for icao, group_df in icao_groups.items():
+        compressed = compress_df_polars(group_df, icao)
        compressed_dfs.append(compressed)
    
    if compressed_dfs:
@@ -1,34 +0,0 @@
-from pathlib import Path
-import polars as pl
-import argparse
-
-OUTPUT_DIR = Path("./outputs")
-
-def main():
-    parser = argparse.ArgumentParser(description="Concatenate compressed parquet files for a single day")
-    parser.add_argument("--date", type=str, required=True, help="Date in YYYY-MM-DD format")
-    args = parser.parse_args()
-
-    compressed_dir = OUTPUT_DIR / "compressed"
-    date_dir = compressed_dir / args.date
-    if not date_dir.is_dir():
-        raise FileNotFoundError(f"No date folder found: {date_dir}")
-
-    parquet_files = sorted(date_dir.glob("*.parquet"))
-    if not parquet_files:
-        raise FileNotFoundError(f"No parquet files found in {date_dir}")
-
-    frames = [pl.read_parquet(p) for p in parquet_files]
-    df = pl.concat(frames, how="vertical", rechunk=True)
-
-    df = df.sort(["time", "icao"])
-    output_path = OUTPUT_DIR / f"openairframes_adsb_{args.date}_{args.date}.parquet"
-    print(f"Writing combined parquet to {output_path} with {df.height} rows")
-    df.write_parquet(output_path)
-
-    csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{args.date}_{args.date}.csv"
-    print(f"Writing combined csv to {csv_output_path} with {df.height} rows")
-    df.write_csv(csv_output_path)
-
-if __name__ == "__main__":
-    main()
@@ -1,33 +1,42 @@
 """
 Downloads adsb.lol data and writes to Parquet files.

-This file contains utility functions for downloading and processing adsb.lol trace data.
-Used by the historical ADS-B processing pipeline.
+Usage:
+    python -m src.process_historical_adsb_data.download_to_parquet 2025-01-01 2025-01-02
+
+This will download trace data for the specified date range and output Parquet files.
+
+This file is self-contained and does not import from other project modules.
 """
-import datetime as dt
+import gc
+import glob
 import gzip
-import os
-import re
 import resource
 import shutil
-import signal
-import subprocess
 import sys
-import urllib.error
+import logging
+import time
+import re
+import signal
+import concurrent.futures
+import subprocess
+import os
+import argparse
+import datetime as dt
+from datetime import datetime, timedelta, timezone
 import urllib.request
-from datetime import datetime
+import urllib.error

 import orjson
 import pyarrow as pa
 import pyarrow.parquet as pq
-from pathlib import Path


 # ============================================================================
 # Configuration
 # ============================================================================

-OUTPUT_DIR = Path("./data/output")
+OUTPUT_DIR = "./data/output"
 os.makedirs(OUTPUT_DIR, exist_ok=True)

 PARQUET_DIR = os.path.join(OUTPUT_DIR, "parquet_output")
@@ -76,7 +85,7 @@ def _fetch_releases_from_repo(year: str, version_date: str) -> list:
    
    while True:
        max_retries = 10
-        retry_delay = 60*5
+        retry_delay = 60
        
        for attempt in range(1, max_retries + 1):
            try:
@@ -395,6 +404,8 @@ COLUMNS = [

 OS_CPU_COUNT = os.cpu_count() or 1
 MAX_WORKERS = OS_CPU_COUNT if OS_CPU_COUNT > 4 else 1
+CHUNK_SIZE = MAX_WORKERS * 500  # Reduced for lower RAM usage
+BATCH_SIZE = 250_000  # Fixed size for predictable memory usage (~500MB per batch)

 # PyArrow schema for efficient Parquet writing
 PARQUET_SCHEMA = pa.schema([
@@ -482,6 +493,217 @@ def collect_trace_files_with_find(root_dir):
    return trace_dict


+def generate_version_dates(start_date: str, end_date: str) -> list:
+    """Generate a list of dates from start_date to end_date inclusive."""
+    start = datetime.strptime(start_date, "%Y-%m-%d")
+    end = datetime.strptime(end_date, "%Y-%m-%d")
+    delta = end - start
+    return [start + timedelta(days=i) for i in range(delta.days + 1)]
+
+
+def safe_process(fp):
+    """Safely process a file, returning empty list on error."""
+    try:
+        return process_file(fp)
+    except Exception as e:
+        logging.error(f"Error processing {fp}: {e}")
+        return []
+
+
+def rows_to_arrow_table(rows: list) -> pa.Table:
+    """Convert list of rows to a PyArrow Table directly (no pandas)."""
+    # Transpose rows into columns
+    columns = list(zip(*rows))
+    
+    # Build arrays for each column according to schema
+    arrays = []
+    for i, field in enumerate(PARQUET_SCHEMA):
+        col_data = list(columns[i]) if i < len(columns) else [None] * len(rows)
+        arrays.append(pa.array(col_data, type=field.type))
+    
+    return pa.Table.from_arrays(arrays, schema=PARQUET_SCHEMA)
+
+
+def write_batch_to_parquet(rows: list, version_date: str, batch_idx: int):
+    """Write a batch of rows to a Parquet file."""
+    if not rows:
+        return
+    
+    table = rows_to_arrow_table(rows)
+    
+    parquet_path = os.path.join(PARQUET_DIR, f"{version_date}_batch_{batch_idx:04d}.parquet")
+    
+    pq.write_table(table, parquet_path, compression='snappy')
+    
+    print(f"Written parquet batch {batch_idx} ({len(rows)} rows) | {get_resource_usage()}")
+
+
+def merge_parquet_files(version_date: str, delete_batches: bool = True):
+    """Merge all batch parquet files for a version_date into a single file using streaming."""
+    pattern = os.path.join(PARQUET_DIR, f"{version_date}_batch_*.parquet")
+    batch_files = sorted(glob.glob(pattern))
+    
+    if not batch_files:
+        print(f"No batch files found for {version_date}")
+        return None
+    
+    print(f"Merging {len(batch_files)} batch files for {version_date} (streaming)...")
+    
+    merged_path = os.path.join(PARQUET_DIR, f"{version_date}.parquet")
+    total_rows = 0
+    
+    # Stream write: read one batch at a time to minimize RAM usage
+    writer = None
+    try:
+        for i, f in enumerate(batch_files):
+            table = pq.read_table(f)
+            total_rows += table.num_rows
+            
+            if writer is None:
+                writer = pq.ParquetWriter(merged_path, table.schema, compression='snappy')
+            
+            writer.write_table(table)
+            
+            # Delete batch file immediately after reading to free disk space
+            if delete_batches:
+                os.remove(f)
+            
+            # Free memory
+            del table
+            if (i + 1) % 10 == 0:
+                gc.collect()
+                print(f"  Merged {i + 1}/{len(batch_files)} batches... | {get_resource_usage()}")
+    finally:
+        if writer is not None:
+            writer.close()
+    
+    print(f"Merged parquet file written to {merged_path} ({total_rows} total rows) | {get_resource_usage()}")
+    
+    if delete_batches:
+        print(f"Deleted {len(batch_files)} batch files during merge")
+    
+    gc.collect()
+    return merged_path
+
+
+def process_version_date(version_date: str, keep_folders: bool = False):
+    """Download, extract, and process trace files for a single version date."""
+    print(f"\nProcessing version_date: {version_date}")
+    extract_dir = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0.tar_0")
+    
+    def collect_trace_files_for_version_date(vd):
+        releases = fetch_releases(vd)
+        if len(releases) == 0:
+            print(f"No releases found for {vd}.")
+            return None
+        
+        # Prefer non-tmp releases; only use tmp if no normal releases exist
+        normal_releases = [r for r in releases if "tmp" not in r["tag_name"]]
+        tmp_releases = [r for r in releases if "tmp" in r["tag_name"]]
+        releases = normal_releases if normal_releases else tmp_releases
+        print(f"Using {'normal' if normal_releases else 'tmp'} releases ({len(releases)} found)")
+        
+        downloaded_files = []
+        for release in releases:
+            tag_name = release["tag_name"]
+            print(f"Processing release: {tag_name}")
+
+            # Only download prod-0 if available, else prod-0tmp
+            assets = release.get("assets", [])
+            normal_assets = [
+                a for a in assets
+                if "planes-readsb-prod-0." in a["name"] and "tmp" not in a["name"]
+            ]
+            tmp_assets = [
+                a for a in assets
+                if "planes-readsb-prod-0tmp" in a["name"]
+            ]
+            use_assets = normal_assets if normal_assets else tmp_assets
+
+            for asset in use_assets:
+                asset_name = asset["name"]
+                asset_url = asset["browser_download_url"]
+                file_path = os.path.join(OUTPUT_DIR, asset_name)
+                result = download_asset(asset_url, file_path)
+                if result:
+                    downloaded_files.append(file_path)
+
+        extract_split_archive(downloaded_files, extract_dir)
+        return collect_trace_files_with_find(extract_dir)
+
+    # Check if files already exist
+    pattern = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0*")
+    matches = [p for p in glob.glob(pattern) if os.path.isfile(p)]
+    
+    if matches:
+        print(f"Found existing files for {version_date}:")
+        # Prefer non-tmp slices when reusing existing files
+        normal_matches = [
+            p for p in matches
+            if "-planes-readsb-prod-0." in os.path.basename(p)
+            and "tmp" not in os.path.basename(p)
+        ]
+        downloaded_files = normal_matches if normal_matches else matches
+        
+        extract_split_archive(downloaded_files, extract_dir)
+        trace_files = collect_trace_files_with_find(extract_dir)
+    else:
+        trace_files = collect_trace_files_for_version_date(version_date)
+    
+    if trace_files is None or len(trace_files) == 0:
+        print(f"No trace files found for version_date: {version_date}")
+        return 0
+    
+    file_list = list(trace_files.values())
+    
+    start_time = time.perf_counter()
+    total_num_rows = 0
+    batch_rows = []
+    batch_idx = 0
+    
+    # Process files in chunks
+    for offset in range(0, len(file_list), CHUNK_SIZE):
+        chunk = file_list[offset:offset + CHUNK_SIZE]
+        with concurrent.futures.ProcessPoolExecutor(max_workers=MAX_WORKERS) as process_executor:
+            for rows in process_executor.map(safe_process, chunk):
+                if not rows:
+                    continue
+                batch_rows.extend(rows)
+                
+                if len(batch_rows) >= BATCH_SIZE:
+                    total_num_rows += len(batch_rows)
+                    write_batch_to_parquet(batch_rows, version_date, batch_idx)
+                    batch_idx += 1
+                    batch_rows = []
+                    
+                    elapsed = time.perf_counter() - start_time
+                    speed = total_num_rows / elapsed if elapsed > 0 else 0
+                    print(f"[{version_date}] processed {total_num_rows} rows in {elapsed:.2f}s ({speed:.2f} rows/s)")
+        
+        gc.collect()
+    
+    # Final batch
+    if batch_rows:
+        total_num_rows += len(batch_rows)
+        write_batch_to_parquet(batch_rows, version_date, batch_idx)
+        elapsed = time.perf_counter() - start_time
+        speed = total_num_rows / elapsed if elapsed > 0 else 0
+        print(f"[{version_date}] processed {total_num_rows} rows in {elapsed:.2f}s ({speed:.2f} rows/s)")
+    
+    print(f"Total rows processed for version_date {version_date}: {total_num_rows}")
+    
+    # Clean up extracted directory immediately after processing (before merging parquet files)
+    if not keep_folders and os.path.isdir(extract_dir):
+        print(f"Deleting extraction directory with 100,000+ files: {extract_dir}")
+        shutil.rmtree(extract_dir)
+        print(f"Successfully deleted extraction directory: {extract_dir} | {get_resource_usage()}")
+    
+    # Merge batch files into a single parquet file
+    merge_parquet_files(version_date, delete_batches=True)
+    
+    return total_num_rows
+
+
 def create_parquet_for_day(day, keep_folders: bool = False):
    """Create parquet file for a single day.
    
@@ -512,3 +734,35 @@ def create_parquet_for_day(day, keep_folders: bool = False):
        return parquet_path
    else:
        return None
+
+
+def main(start_date: str, end_date: str, keep_folders: bool = False):
+    """Main function to download and convert adsb.lol data to Parquet."""
+    version_dates = [f"v{date.strftime('%Y.%m.%d')}" for date in generate_version_dates(start_date, end_date)]
+    print(f"Processing dates: {version_dates}")
+    
+    total_rows_all = 0
+    for version_date in version_dates:
+        rows_processed = process_version_date(version_date, keep_folders)
+        total_rows_all += rows_processed
+    
+    print(f"\n=== Summary ===")
+    print(f"Total dates processed: {len(version_dates)}")
+    print(f"Total rows written to Parquet: {total_rows_all}")
+    print(f"Parquet files location: {PARQUET_DIR}")
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO, stream=sys.stdout, force=True)
+    
+    parser = argparse.ArgumentParser(
+        description="Download adsb.lol data and write to Parquet files"
+    )
+    parser.add_argument("start_date", type=str, help="Start date in YYYY-MM-DD format")
+    parser.add_argument("end_date", type=str, help="End date in YYYY-MM-DD format")
+    parser.add_argument("--keep-folders", action="store_true", 
+                        help="Keep extracted folders after processing")
+    
+    args = parser.parse_args()
+    
+    main(args.start_date, args.end_date, args.keep_folders)
@@ -1,7 +1,9 @@
 """
-Downloads and extracts adsb.lol tar files for a single day, then lists all ICAO folders.
+Downloads and extracts adsb.lol tar files, then lists all ICAO folders.
 This is the first step of the map-reduce pipeline.

+Supports both single-day (daily) and multi-day (historical) modes.
+
 Outputs:
 - Extracted trace files in data/output/{version_date}-planes-readsb-prod-0.tar_0/
 - ICAO manifest at data/output/icao_manifest_{date}.txt
@@ -23,6 +25,11 @@ from src.adsb.download_adsb_data_to_parquet import (
 )


+def get_target_day() -> datetime:
+    """Get yesterday's date (the day we're processing)."""
+    return datetime.utcnow() - timedelta(days=1)
+
+
 def download_and_extract(version_date: str) -> str | None:
    """Download and extract tar files, return extract directory path."""
    extract_dir = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0.tar_0")
@@ -99,6 +106,21 @@ def list_icao_folders(extract_dir: str) -> list[str]:
    return icaos


+def write_manifest(icaos: list[str], manifest_id: str) -> str:
+    """Write ICAO list to manifest file.
+    
+    Args:
+        icaos: List of ICAO codes
+        manifest_id: Identifier for manifest file (date or date range)
+    """
+    manifest_path = os.path.join(OUTPUT_DIR, f"icao_manifest_{manifest_id}.txt")
+    with open(manifest_path, "w") as f:
+        for icao in sorted(icaos):
+            f.write(f"{icao}\n")
+    print(f"Wrote manifest with {len(icaos)} ICAOs to {manifest_path}")
+    return manifest_path
+
+
 def process_single_day(target_day: datetime) -> tuple[str | None, list[str]]:
    """Process a single day: download, extract, list ICAOs.
    
@@ -113,50 +135,82 @@ def process_single_day(target_day: datetime) -> tuple[str | None, list[str]]:
    extract_dir = download_and_extract(version_date)
    if not extract_dir:
        print(f"Failed to download/extract data for {date_str}")
-        raise Exception(f"No data available for {date_str}")
+        return None, []
    
    icaos = list_icao_folders(extract_dir)
    print(f"Found {len(icaos)} ICAOs for {date_str}")
    
    return extract_dir, icaos

-from pathlib import Path
-import tarfile
-NUMBER_PARTS = 16
-def split_folders_into_gzip_archives(extract_dir: Path, tar_output_dir: Path, icaos: list[str], parts = NUMBER_PARTS) -> list[str]:
-    traces_dir = extract_dir / "traces"
-    buckets = sorted(traces_dir.iterdir())
-    tars = []
-    for i in range(parts):
-        tar_path = tar_output_dir / f"{tar_output_dir.name}_part_{i}.tar.gz"
-        tars.append(tarfile.open(tar_path, "w:gz"))
-    for idx, bucket_path in enumerate(buckets):
-        tar_idx = idx % parts
-        tars[tar_idx].add(bucket_path, arcname=bucket_path.name)
-    for tar in tars:
-        tar.close()
+
+def process_date_range(start_date: datetime, end_date: datetime) -> set[str]:
+    """Process multiple days: download, extract, combine ICAO lists.
+    
+    Args:
+        start_date: Start date (inclusive)
+        end_date: End date (inclusive)
+    
+    Returns:
+        Combined set of all ICAOs across the date range
+    """
+    all_icaos: set[str] = set()
+    current = start_date
+    
+    # Both start and end are inclusive
+    while current <= end_date:
+        _, icaos = process_single_day(current)
+        all_icaos.update(icaos)
+        current += timedelta(days=1)
+    
+    return all_icaos


 def main():
-    parser = argparse.ArgumentParser(description="Download and list ICAOs from adsb.lol data for a single day")
+    parser = argparse.ArgumentParser(description="Download and list ICAOs from adsb.lol data")
    parser.add_argument("--date", type=str, help="Single date in YYYY-MM-DD format (default: yesterday)")
+    parser.add_argument("--start-date", type=str, help="Start date for range (YYYY-MM-DD)")
+    parser.add_argument("--end-date", type=str, help="End date for range (YYYY-MM-DD)")
    args = parser.parse_args()
    
-    target_day = datetime.strptime(args.date, "%Y-%m-%d")
-    date_str = target_day.strftime("%Y-%m-%d")
-    tar_output_dir = Path(f"./data/output/adsb_archives/{date_str}")
-    
-    extract_dir, icaos = process_single_day(target_day)
-    extract_dir = Path(extract_dir)
-    print(extract_dir)
-    tar_output_dir.mkdir(parents=True, exist_ok=True)
-    split_folders_into_gzip_archives(extract_dir, tar_output_dir, icaos)
-    if not icaos:
-        print("No ICAOs found")
-        sys.exit(1)
-    
-    print(f"\nDone! Extract dir: {extract_dir}")
-    print(f"Total ICAOs: {len(icaos)}")
+    # Determine mode: single day or date range
+    if args.start_date and args.end_date:
+        # Historical mode: process date range
+        start_date = datetime.strptime(args.start_date, "%Y-%m-%d")
+        end_date = datetime.strptime(args.end_date, "%Y-%m-%d")
+        
+        print(f"Processing date range: {args.start_date} to {args.end_date}")
+        
+        all_icaos = process_date_range(start_date, end_date)
+        
+        if not all_icaos:
+            print("No ICAOs found in date range")
+            sys.exit(1)
+        
+        # Write combined manifest with range identifier
+        manifest_id = f"{args.start_date}_{args.end_date}"
+        write_manifest(list(all_icaos), manifest_id)
+        
+        print(f"\nDone! Total ICAOs: {len(all_icaos)}")
+        
+    else:
+        # Daily mode: single day
+        if args.date:
+            target_day = datetime.strptime(args.date, "%Y-%m-%d")
+        else:
+            target_day = get_target_day()
+        
+        date_str = target_day.strftime("%Y-%m-%d")
+        
+        extract_dir, icaos = process_single_day(target_day)
+        
+        if not icaos:
+            print("No ICAOs found")
+            sys.exit(1)
+        
+        write_manifest(icaos, date_str)
+        
+        print(f"\nDone! Extract dir: {extract_dir}")
+        print(f"Total ICAOs: {len(icaos)}")


 if __name__ == "__main__":
@@ -41,7 +41,7 @@ def main() -> None:
    """Main entry point for GitHub Actions."""
    start_date = os.environ.get("INPUT_START_DATE")
    end_date = os.environ.get("INPUT_END_DATE")
-    chunk_days = int(os.environ.get("INPUT_CHUNK_DAYS", "1"))
+    chunk_days = int(os.environ.get("INPUT_CHUNK_DAYS", "7"))
    
    if not start_date or not end_date:
        print("ERROR: INPUT_START_DATE and INPUT_END_DATE must be set", file=sys.stderr)
@@ -1,37 +0,0 @@
-"""
-Main pipeline for processing ADS-B data from adsb.lol.
-
-Usage:
-    python -m src.adsb.main --date 2026-01-01
-"""
-import argparse
-import subprocess
-import sys
-from datetime import datetime, timedelta
-
-from src.adsb.download_and_list_icaos import NUMBER_PARTS
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Process ADS-B data for a single day")
-    parser.add_argument("--date", type=str, required=True)
-    args = parser.parse_args()
-    
-    date_str = datetime.strptime(args.date, "%Y-%m-%d").strftime("%Y-%m-%d")
-    print(f"Processing day: {date_str}")
-    
-    # Download and split
-    subprocess.run([sys.executable, "-m", "src.adsb.download_and_list_icaos", "--date", date_str], check=True)
-    
-    # Process parts
-    for part_id in range(NUMBER_PARTS):
-        subprocess.run([sys.executable, "-m", "src.adsb.process_icao_chunk", "--part-id", str(part_id), "--date", date_str], check=True)
-    
-    # Concatenate
-    subprocess.run([sys.executable, "src/adsb/concat_parquet_to_final.py", "--date", date_str], check=True)
-    
-    print("Done")
-
-
-if __name__ == "__main__":
-    main()
@@ -1,9 +1,18 @@
 """
-Processes trace files from a single archive part for a single day.
+Processes a chunk of ICAOs from pre-extracted trace files.
 This is the map phase of the map-reduce pipeline.

+Supports both single-day (daily) and multi-day (historical) modes.
+
+Expects extract_dir to already exist with trace files.
+Reads ICAO manifest to determine which ICAOs to process based on chunk-id.
+
 Usage:
-    python -m src.adsb.process_icao_chunk --part-id 1 --date 2026-01-01
+    # Daily mode (single day)
+    python -m src.adsb.process_icao_chunk --chunk-id 0 --total-chunks 4
+    
+    # Historical mode (date range)
+    python -m src.adsb.process_icao_chunk --chunk-id 0 --total-chunks 4 --start-date 2024-01-01 --end-date 2024-01-07
 """
 import gc
 import os
@@ -12,9 +21,6 @@ import argparse
 import time
 import concurrent.futures
 from datetime import datetime, timedelta
-import tarfile
-import tempfile
-import shutil

 import pyarrow as pa
 import pyarrow.parquet as pq
@@ -37,18 +43,66 @@ os.makedirs(CHUNK_OUTPUT_DIR, exist_ok=True)
 # Smaller batch size for memory efficiency
 BATCH_SIZE = 100_000

-def build_trace_file_map(archive_path: str) -> dict[str, str]:
-    """Build a map of ICAO -> trace file path by extracting tar.gz archive."""
-    print(f"Extracting {archive_path}...")
+
+def get_target_day() -> datetime:
+    """Get yesterday's date (the day we're processing)."""
+    return datetime.utcnow() - timedelta(days=1)
+
+
+def read_manifest(manifest_id: str) -> list[str]:
+    """Read ICAO manifest file.
    
-    temp_dir = tempfile.mkdtemp(prefix="adsb_extract_")
+    Args:
+        manifest_id: Either a date string (YYYY-MM-DD) or range string (YYYY-MM-DD_YYYY-MM-DD)
+    """
+    manifest_path = os.path.join(OUTPUT_DIR, f"icao_manifest_{manifest_id}.txt")
+    if not os.path.exists(manifest_path):
+        raise FileNotFoundError(f"Manifest not found: {manifest_path}")
    
-    with tarfile.open(archive_path, 'r:gz') as tar:
-        tar.extractall(path=temp_dir, filter='data')
+    with open(manifest_path, "r") as f:
+        icaos = [line.strip() for line in f if line.strip()]
+    return icaos
+
+
+def deterministic_hash(s: str) -> int:
+    """Return a deterministic hash for a string (unlike Python's hash() which is randomized)."""
+    # Use sum of byte values - simple but deterministic
+    return sum(ord(c) for c in s)
+
+
+def get_chunk_icaos(icaos: list[str], chunk_id: int, total_chunks: int) -> list[str]:
+    """Get the subset of ICAOs for this chunk based on deterministic hash partitioning."""
+    return [icao for icao in icaos if deterministic_hash(icao) % total_chunks == chunk_id]
+
+
+def build_trace_file_map(extract_dir: str) -> dict[str, str]:
+    """Build a map of ICAO -> trace file path using find command."""
+    print(f"Building trace file map from {extract_dir}...")
    
-    trace_map = collect_trace_files_with_find(temp_dir)
+    # Debug: check what's in extract_dir
+    if os.path.isdir(extract_dir):
+        items = os.listdir(extract_dir)[:10]
+        print(f"First 10 items in extract_dir: {items}")
+        # Check if there are subdirectories
+        for item in items[:3]:
+            subpath = os.path.join(extract_dir, item)
+            if os.path.isdir(subpath):
+                subitems = os.listdir(subpath)[:5]
+                print(f"  Contents of {item}/: {subitems}")
+    
+    trace_map = collect_trace_files_with_find(extract_dir)
    print(f"Found {len(trace_map)} trace files")
    
+    if len(trace_map) == 0:
+        # Debug: try manual find
+        import subprocess
+        result = subprocess.run(
+            ['find', extract_dir, '-type', 'f', '-name', 'trace_full_*'],
+            capture_output=True, text=True
+        )
+        print(f"Manual find output (first 500 chars): {result.stdout[:500]}")
+        print(f"Manual find stderr: {result.stderr[:200]}")
+    
    return trace_map


@@ -71,13 +125,42 @@ def rows_to_table(rows: list) -> pa.Table:


 def process_chunk(
-    trace_files: list[str],
-    part_id: int,
-    date_str: str,
+    chunk_id: int,
+    total_chunks: int,
+    trace_map: dict[str, str],
+    icaos: list[str],
+    output_id: str,
 ) -> str | None:
-    """Process trace files and write to a single parquet file."""
+    """Process a chunk of ICAOs and write to parquet.
    
-    output_path = os.path.join(PARQUET_DIR, f"part_{part_id}_{date_str}.parquet")
+    Args:
+        chunk_id: This chunk's ID (0-indexed)
+        total_chunks: Total number of chunks
+        trace_map: Map of ICAO -> trace file path
+        icaos: Full list of ICAOs from manifest
+        output_id: Identifier for output file (date or date range)
+    """
+    chunk_icaos = get_chunk_icaos(icaos, chunk_id, total_chunks)
+    print(f"Chunk {chunk_id}/{total_chunks}: Processing {len(chunk_icaos)} ICAOs")
+    
+    if not chunk_icaos:
+        print(f"Chunk {chunk_id}: No ICAOs to process")
+        return None
+    
+    # Get trace file paths from the map
+    trace_files = []
+    for icao in chunk_icaos:
+        if icao in trace_map:
+            trace_files.append(trace_map[icao])
+    
+    print(f"Chunk {chunk_id}: Found {len(trace_files)} trace files")
+    
+    if not trace_files:
+        print(f"Chunk {chunk_id}: No trace files found")
+        return None
+    
+    # Process files and write parquet in batches
+    output_path = os.path.join(CHUNK_OUTPUT_DIR, f"chunk_{chunk_id}_{output_id}.parquet")
    
    start_time = time.perf_counter()
    total_rows = 0
@@ -85,8 +168,7 @@ def process_chunk(
    writer = None
    
    try:
-        writer = pq.ParquetWriter(output_path, PARQUET_SCHEMA, compression='snappy')
-        
+        # Process in parallel batches
        files_per_batch = MAX_WORKERS * 100
        for offset in range(0, len(trace_files), files_per_batch):
            batch_files = trace_files[offset:offset + files_per_batch]
@@ -96,63 +178,166 @@ def process_chunk(
                    if rows:
                        batch_rows.extend(rows)
                        
+                        # Write when batch is full
                        if len(batch_rows) >= BATCH_SIZE:
-                            writer.write_table(rows_to_table(batch_rows))
+                            table = rows_to_table(batch_rows)
                            total_rows += len(batch_rows)
+                            
+                            if writer is None:
+                                writer = pq.ParquetWriter(output_path, PARQUET_SCHEMA, compression='snappy')
+                            writer.write_table(table)
+                            
                            batch_rows = []
+                            del table
                            gc.collect()
+                            
+                            elapsed = time.perf_counter() - start_time
+                            print(f"Chunk {chunk_id}: {total_rows} rows, {elapsed:.1f}s | {get_resource_usage()}")
+            
            gc.collect()
        
+        # Write remaining rows
        if batch_rows:
-            writer.write_table(rows_to_table(batch_rows))
+            table = rows_to_table(batch_rows)
            total_rows += len(batch_rows)
+            
+            if writer is None:
+                writer = pq.ParquetWriter(output_path, PARQUET_SCHEMA, compression='snappy')
+            writer.write_table(table)
+            del table
    
    finally:
        if writer:
            writer.close()
    
-    print(f"Part {part_id}: Done! {total_rows} rows in {time.perf_counter() - start_time:.1f}s | {get_resource_usage()}")
+    elapsed = time.perf_counter() - start_time
+    print(f"Chunk {chunk_id}: Done! {total_rows} rows in {elapsed:.1f}s | {get_resource_usage()}")
    
-    return output_path if total_rows > 0 else None
+    if total_rows > 0:
+        return output_path
+    return None
+
+
+def process_single_day(
+    chunk_id: int,
+    total_chunks: int,
+    target_day: datetime,
+) -> str | None:
+    """Process a single day for this chunk."""
+    date_str = target_day.strftime("%Y-%m-%d")
+    version_date = f"v{target_day.strftime('%Y.%m.%d')}"
+    
+    extract_dir = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0.tar_0")
+    
+    if not os.path.isdir(extract_dir):
+        print(f"Extract directory not found: {extract_dir}")
+        return None
+    
+    trace_map = build_trace_file_map(extract_dir)
+    if not trace_map:
+        print("No trace files found")
+        return None
+    
+    icaos = read_manifest(date_str)
+    print(f"Total ICAOs in manifest: {len(icaos)}")
+    
+    return process_chunk(chunk_id, total_chunks, trace_map, icaos, date_str)
+
+
+def process_date_range(
+    chunk_id: int,
+    total_chunks: int,
+    start_date: datetime,
+    end_date: datetime,
+) -> str | None:
+    """Process a date range for this chunk.
+    
+    Combines trace files from all days in the range.
+    
+    Args:
+        chunk_id: This chunk's ID (0-indexed)
+        total_chunks: Total number of chunks
+        start_date: Start date (inclusive)
+        end_date: End date (inclusive)
+    """
+    start_str = start_date.strftime("%Y-%m-%d")
+    end_str = end_date.strftime("%Y-%m-%d")
+    manifest_id = f"{start_str}_{end_str}"
+    
+    print(f"Processing date range: {start_str} to {end_str}")
+    
+    # Build combined trace map from all days
+    combined_trace_map: dict[str, str] = {}
+    current = start_date
+    
+    # Both start and end are inclusive
+    while current <= end_date:
+        version_date = f"v{current.strftime('%Y.%m.%d')}"
+        extract_dir = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0.tar_0")
+        
+        if os.path.isdir(extract_dir):
+            trace_map = build_trace_file_map(extract_dir)
+            # Later days override earlier days (use most recent trace file)
+            combined_trace_map.update(trace_map)
+            print(f"  {current.strftime('%Y-%m-%d')}: {len(trace_map)} trace files")
+        else:
+            print(f"  {current.strftime('%Y-%m-%d')}: no extract directory")
+        
+        current += timedelta(days=1)
+    
+    if not combined_trace_map:
+        print("No trace files found in date range")
+        return None
+    
+    print(f"Combined trace map: {len(combined_trace_map)} ICAOs")
+    
+    icaos = read_manifest(manifest_id)
+    print(f"Total ICAOs in manifest: {len(icaos)}")
+    
+    return process_chunk(chunk_id, total_chunks, combined_trace_map, icaos, manifest_id)

-from pathlib import Path

 def main():
-    parser = argparse.ArgumentParser(description="Process a single archive part for a day")
-    parser.add_argument("--part-id", type=int, required=True, help="Part ID (1-indexed)")
-    parser.add_argument("--date", type=str, required=True, help="Date in YYYY-MM-DD format")
+    parser = argparse.ArgumentParser(description="Process a chunk of ICAOs")
+    parser.add_argument("--chunk-id", type=int, required=True, help="Chunk ID (0-indexed)")
+    parser.add_argument("--total-chunks", type=int, required=True, help="Total number of chunks")
+    parser.add_argument("--date", type=str, help="Single date in YYYY-MM-DD format (default: yesterday)")
+    parser.add_argument("--start-date", type=str, help="Start date for range (YYYY-MM-DD)")
+    parser.add_argument("--end-date", type=str, help="End date for range (YYYY-MM-DD)")
    args = parser.parse_args()
    
-    print(f"Processing part {args.part_id} for {args.date}")
+    print(f"Processing chunk {args.chunk_id}/{args.total_chunks}")
+    print(f"OUTPUT_DIR: {OUTPUT_DIR}")
+    print(f"CHUNK_OUTPUT_DIR: {CHUNK_OUTPUT_DIR}")
+    print(f"Resource usage at start: {get_resource_usage()}")
    
-    # Get specific archive file for this part
-    archive_path = os.path.join(OUTPUT_DIR, "adsb_archives", args.date, f"{args.date}_part_{args.part_id}.tar.gz")
+    # Debug: List what's in OUTPUT_DIR
+    print(f"\nContents of {OUTPUT_DIR}:")
+    if os.path.isdir(OUTPUT_DIR):
+        for item in os.listdir(OUTPUT_DIR)[:20]:
+            print(f"  - {item}")
+    else:
+        print(f"  Directory does not exist!")
    
-    # Extract and collect trace files
-    trace_map = build_trace_file_map(archive_path)
-    all_trace_files = list(trace_map.values())
+    # Determine mode: single day or date range
+    if args.start_date and args.end_date:
+        # Historical mode
+        start_date = datetime.strptime(args.start_date, "%Y-%m-%d")
+        end_date = datetime.strptime(args.end_date, "%Y-%m-%d")
+        output_path = process_date_range(args.chunk_id, args.total_chunks, start_date, end_date)
+    else:
+        # Daily mode
+        if args.date:
+            target_day = datetime.strptime(args.date, "%Y-%m-%d")
+        else:
+            target_day = get_target_day()
+        output_path = process_single_day(args.chunk_id, args.total_chunks, target_day)
    
-    print(f"Total trace files: {len(all_trace_files)}")
-    
-    # Process and write output
-    output_path = process_chunk(all_trace_files, args.part_id, args.date)
-    
-    from src.adsb.compress_adsb_to_aircraft_data import compress_parquet_part
-    df_compressed = compress_parquet_part(args.part_id, args.date)
-    
-    # Write parquet
-    df_compressed_output = OUTPUT_DIR / "compressed" / args.date/ f"part_{args.part_id}_{args.date}.parquet"
-    os.makedirs(df_compressed_output.parent, exist_ok=True)
-    df_compressed.write_parquet(df_compressed_output, compression='snappy')
-    
-    # Write CSV
-    csv_output = OUTPUT_DIR / "compressed" / args.date / f"part_{args.part_id}_{args.date}.csv"
-    df_compressed.write_csv(csv_output)
-    
-    print(f"Raw output: {output_path}" if output_path else "No raw output generated")
-    print(f"Compressed parquet: {df_compressed_output}")
-    print(f"Compressed CSV: {csv_output}")
+    if output_path:
+        print(f"Output: {output_path}")
+    else:
+        print("No output generated")


 if __name__ == "__main__":
-    main()
+    main()
@@ -0,0 +1,2 @@
+polars>=1.0
+boto3>=1.34
@@ -0,0 +1,5 @@
+polars>=1.0
+pyarrow>=14.0
+orjson>=3.9
+boto3>=1.34
+zstandard>=0.22
@@ -8,10 +8,10 @@ Usage:
    # Single day (yesterday by default)
    python -m src.adsb.run_local
    
-    # Single day (specific date, processes 2024-01-15 only)
-    python -m src.adsb.run_local 2024-01-15 2024-01-16
+    # Single day (specific date)
+    python -m src.adsb.run_local 2024-01-15
    
-    # Date range (end date is exclusive)
+    # Date range (inclusive)
    python -m src.adsb.run_local 2024-01-01 2024-01-07
 """
 import argparse
@@ -38,12 +38,12 @@ def main():
    parser.add_argument(
        "start_date",
        nargs="?",
-        help="Start date (YYYY-MM-DD, inclusive). Default: yesterday"
+        help="Start date (YYYY-MM-DD). Default: yesterday"
    )
    parser.add_argument(
        "end_date",
        nargs="?",
-        help="End date (YYYY-MM-DD, exclusive). If omitted, processes single day (start_date + 1)"
+        help="End date (YYYY-MM-DD, inclusive). If omitted, processes single day"
    )
    parser.add_argument(
        "--chunks",
@@ -51,17 +51,10 @@ def main():
        default=4,
        help="Number of parallel chunks (default: 4)"
    )
-    parser.add_argument(
-        "--chunk-days",
-        type=int,
-        default=1,
-        help="Days per chunk for date range processing (default: 1)"
-    )
    parser.add_argument(
        "--skip-base",
        action="store_true",
-        default=True,
-        help="Skip downloading and merging with base release (default: True for historical runs)"
+        help="Skip downloading and merging with base release"
    )
    args = parser.parse_args()

@@ -71,81 +64,69 @@ def main():
    else:
        start_date = datetime.utcnow() - timedelta(days=1)
    
+    end_date = None
    if args.end_date:
        end_date = datetime.strptime(args.end_date, "%Y-%m-%d")
-    else:
-        # Default: process single day (end = start + 1 day, exclusive)
-        end_date = start_date + timedelta(days=1)
    
    start_str = start_date.strftime("%Y-%m-%d")
-    end_str = end_date.strftime("%Y-%m-%d")
-    
-    # Generate date chunks
-    date_chunks = []
-    current = start_date
-    while current < end_date:
-        chunk_end = min(current + timedelta(days=args.chunk_days), end_date)
-        date_chunks.append({
-            'start': current.strftime("%Y-%m-%d"),
-            'end': chunk_end.strftime("%Y-%m-%d")
-        })
-        current = chunk_end
+    end_str = end_date.strftime("%Y-%m-%d") if end_date else None
    
    print("=" * 60)
    print("ADS-B Processing Pipeline")
    print("=" * 60)
-    print(f"Date range: {start_str} to {end_str} (exclusive)")
-    print(f"Date chunks: {len(date_chunks)} ({args.chunk_days} days each)")
-    print(f"ICAO chunks: {args.chunks}")
+    if end_str:
+        print(f"Date range: {start_str} to {end_str}")
+    else:
+        print(f"Date: {start_str}")
+    print(f"Chunks: {args.chunks}")
    print("=" * 60)
    
-    # Process each date chunk
-    for idx, date_chunk in enumerate(date_chunks, 1):
-        chunk_start = date_chunk['start']
-        chunk_end = date_chunk['end']
-        
-        # Convert exclusive end date to inclusive for subcommands
-        # download_and_list_icaos and process_icao_chunk treat both dates as inclusive
-        chunk_end_inclusive = (datetime.strptime(chunk_end, "%Y-%m-%d") - timedelta(days=1)).strftime("%Y-%m-%d")
-        
-        print(f"\n{'=' * 60}")
-        print(f"Processing Date Chunk {idx}/{len(date_chunks)}: {chunk_start} to {chunk_end_inclusive} (inclusive)")
-        print('=' * 60)
-        
-        # Step 1: Download and extract
-        print("\n" + "=" * 60)
-        print("Step 1: Download and Extract")
-        print("=" * 60)
-        
+    # Step 1: Download and extract
+    print("\n" + "=" * 60)
+    print("Step 1: Download and Extract")
+    print("=" * 60)
+    
+    if end_str:
        cmd = ["python", "-m", "src.adsb.download_and_list_icaos",
-               "--start-date", chunk_start, "--end-date", chunk_end_inclusive]
-        run_cmd(cmd, "Download and extract")
-        
-        # Step 2: Process chunks
-        print("\n" + "=" * 60)
-        print("Step 2: Process Chunks")
-        print("=" * 60)
-        
-        for chunk_id in range(args.chunks):
-            print(f"\n--- ICAO Chunk {chunk_id + 1}/{args.chunks} ---")
+               "--start-date", start_str, "--end-date", end_str]
+    else:
+        cmd = ["python", "-m", "src.adsb.download_and_list_icaos",
+               "--date", start_str]
+    run_cmd(cmd, "Download and extract")
+    
+    # Step 2: Process chunks
+    print("\n" + "=" * 60)
+    print("Step 2: Process Chunks")
+    print("=" * 60)
+    
+    for chunk_id in range(args.chunks):
+        print(f"\n--- Chunk {chunk_id + 1}/{args.chunks} ---")
+        if end_str:
            cmd = ["python", "-m", "src.adsb.process_icao_chunk",
                   "--chunk-id", str(chunk_id),
                   "--total-chunks", str(args.chunks),
-                   "--start-date", chunk_start,
-                   "--end-date", chunk_end_inclusive]
-            run_cmd(cmd, f"Process ICAO chunk {chunk_id}")
+                   "--start-date", start_str,
+                   "--end-date", end_str]
+        else:
+            cmd = ["python", "-m", "src.adsb.process_icao_chunk",
+                   "--chunk-id", str(chunk_id),
+                   "--total-chunks", str(args.chunks),
+                   "--date", start_str]
+        run_cmd(cmd, f"Process chunk {chunk_id}")
    
-    # Step 3: Combine all chunks to CSV
+    # Step 3: Combine chunks to CSV
    print("\n" + "=" * 60)
-    print("Step 3: Combine All Chunks to CSV")
+    print("Step 3: Combine to CSV")
    print("=" * 60)
    
    chunks_dir = "./data/output/adsb_chunks"
    cmd = ["python", "-m", "src.adsb.combine_chunks_to_csv",
-           "--chunks-dir", chunks_dir,
-           "--start-date", start_str,
-           "--end-date", end_str,
-           "--stream"]
+           "--chunks-dir", chunks_dir]
+    
+    if end_str:
+        cmd.extend(["--start-date", start_str, "--end-date", end_str])
+    else:
+        cmd.extend(["--date", start_str])
    
    if args.skip_base:
        cmd.append("--skip-base")
@@ -158,9 +139,10 @@ def main():
    
    # Show output
    output_dir = "./data/openairframes"
-    # Calculate actual end date for filename (end_date - 1 day since it's exclusive)
-    actual_end = (end_date - timedelta(days=1)).strftime("%Y-%m-%d")
-    output_file = f"openairframes_adsb_{start_str}_{actual_end}.csv.gz"
+    if end_str:
+        output_file = f"openairframes_adsb_{start_str}_{end_str}.csv"
+    else:
+        output_file = f"openairframes_adsb_{start_str}_{start_str}.csv"
    
    output_path = os.path.join(output_dir, output_file)
    if os.path.exists(output_path):
@@ -0,0 +1,84 @@
+from pathlib import Path
+from datetime import datetime, timezone, timedelta
+import sys
+
+import polars as pl
+
+# Add adsb directory to path
+sys.path.insert(0, str(Path(__file__).parent / "adsb")) # TODO: Fix this hacky path manipulation
+
+from adsb.compress_adsb_to_aircraft_data import (
+    load_historical_for_day,
+    concat_compressed_dfs,
+    get_latest_aircraft_adsb_csv_df,
+)
+
+if __name__ == '__main__':
+    # Get yesterday's date (data for the previous day)
+    day = datetime.now(timezone.utc) - timedelta(days=1)
+
+    # Find a day with complete data
+    max_attempts = 2  # Don't look back more than a week
+    for attempt in range(max_attempts):
+        date_str = day.strftime("%Y-%m-%d")
+        print(f"Processing ADS-B data for {date_str}")
+        
+        print("Loading new ADS-B data...")
+        df_new = load_historical_for_day(day)
+        if df_new.height == 0:
+            day = day - timedelta(days=1)
+            continue
+        max_time = df_new['time'].max()
+        if max_time is not None:
+            # Handle timezone
+            max_time_dt = max_time
+            if hasattr(max_time_dt, 'replace'):
+                max_time_dt = max_time_dt.replace(tzinfo=timezone.utc)
+            
+            end_of_day = day.replace(hour=23, minute=59, second=59, tzinfo=timezone.utc) - timedelta(minutes=5)
+            
+            # Convert polars datetime to python datetime if needed
+            if isinstance(max_time_dt, datetime):
+                if max_time_dt.replace(tzinfo=timezone.utc) >= end_of_day:
+                    break
+            else:
+                # Polars returns python datetime already
+                if max_time >= day.replace(hour=23, minute=54, second=59):
+                    break
+        
+        print(f"WARNING: Latest data time is {max_time}, which is more than 5 minutes before end of day.")
+        day = day - timedelta(days=1)
+    else:
+        raise RuntimeError(f"Could not find complete data in the last {max_attempts} days")
+
+    try:
+        # Get the latest release data
+        print("Downloading latest ADS-B release...")
+        df_base, start_date_str = get_latest_aircraft_adsb_csv_df()
+        # Combine with historical data
+        print("Combining with historical data...")
+        df_combined = concat_compressed_dfs(df_base, df_new)
+    except Exception as e:
+        print(f"Error downloading latest ADS-B release: {e}")
+        df_combined = df_new
+        start_date_str = date_str
+
+    # Sort by time for consistent ordering
+    df_combined = df_combined.sort('time')
+    
+    # Convert any list columns to strings for CSV compatibility
+    for col in df_combined.columns:
+        if df_combined[col].dtype == pl.List:
+            df_combined = df_combined.with_columns(
+                pl.col(col).list.join(",").alias(col)
+            )
+
+    # Save the result
+    OUT_ROOT = Path("data/openairframes")
+    OUT_ROOT.mkdir(parents=True, exist_ok=True)
+
+    output_file = OUT_ROOT / f"openairframes_adsb_{start_date_str}_{date_str}.csv.gz"
+    df_combined.write_csv(output_file)
+
+    print(f"Saved: {output_file}")
+    print(f"Total aircraft: {df_combined.height}")
Author	SHA1	Message	Date
copilot-swe-agent[bot]	2b720f3d88	Remove step-level timeouts to allow artifact downloads to complete Co-authored-by: ggman12 <17393221+ggman12@users.noreply.github.com>	2026-02-16 19:38:27 +00:00
copilot-swe-agent[bot]	e65da2cd95	Fix YAML property ordering for timeout-minutes Co-authored-by: ggman12 <17393221+ggman12@users.noreply.github.com>	2026-02-16 17:50:18 +00:00
copilot-swe-agent[bot]	7ea815732c	Add timeout and retry logic to adsb-reduce artifact download step Co-authored-by: ggman12 <17393221+ggman12@users.noreply.github.com>	2026-02-16 17:49:41 +00:00
copilot-swe-agent[bot]	7fd5df8777	Initial plan	2026-02-16 17:44:39 +00:00