Merge pull request #46 psimpson routes

add psimpson routes
2026-07-28 04:30:54 +02:00 · 2026-05-25 18:59:28 -04:00 · 2026-05-25 18:54:57 -04:00 · 2026-03-18 14:31:29 -04:00 · 2026-03-18 14:29:13 -04:00 · 2026-03-10 05:12:11 -04:00
47 changed files with 2435 additions and 1746 deletions
@@ -0,0 +1,182 @@
 name: Historical ADS-B Processing
 on:
  workflow_dispatch:
    inputs:
      date:
        description: 'YYYY-MM-DD'
        required: true
        type: string
      concat_with_latest_csv:
        description: 'Also concatenate with latest CSV from GitHub releases'
        required: false
        type: boolean
        default: false
  workflow_call:
    inputs:
      date:
        description: 'YYYY-MM-DD'
        required: true
        type: string
      concat_with_latest_csv:
        description: 'Also concatenate with latest CSV from GitHub releases'
        required: false
        type: boolean
        default: false
 jobs:
  adsb-extract:
    runs-on: ubuntu-24.04-arm
    steps:
      - name: Checkout
        uses: actions/checkout@v6
      - name: Setup Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.12'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
      - name: Download and split ADS-B data
        env:
          DATE: ${{ inputs.date }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          python -m src.adsb.download_and_list_icaos --date "$DATE"
          ls -lah data/output/adsb_archives/"$DATE" || true
      - name: Upload archive part 0
        uses: actions/upload-artifact@v4
        with:
          name: adsb-archive-${{ inputs.date }}-part-0
          path: data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_0.tar.gz
          retention-days: 1
          compression-level: 0
          if-no-files-found: error
      - name: Upload archive part 1
        uses: actions/upload-artifact@v4
        with:
          name: adsb-archive-${{ inputs.date }}-part-1
          path: data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_1.tar.gz
          retention-days: 1
          compression-level: 0
          if-no-files-found: error
      - name: Upload archive part 2
        uses: actions/upload-artifact@v4
        with:
          name: adsb-archive-${{ inputs.date }}-part-2
          path: data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_2.tar.gz
          retention-days: 1
          compression-level: 0
          if-no-files-found: error
      - name: Upload archive part 3
        uses: actions/upload-artifact@v4
        with:
          name: adsb-archive-${{ inputs.date }}-part-3
          path: data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_3.tar.gz
          retention-days: 1
          compression-level: 0
          if-no-files-found: error
  adsb-map:
    needs: adsb-extract
    runs-on: ubuntu-24.04-arm
    strategy:
      fail-fast: true
      matrix:
        part_id: [0, 1, 2, 3]
    steps:
      - name: Checkout
        uses: actions/checkout@v6
      - name: Setup Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.12'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
      - name: Download archive part
        uses: actions/download-artifact@v4
        with:
          name: adsb-archive-${{ inputs.date }}-part-${{ matrix.part_id }}
          path: data/output/adsb_archives/${{ inputs.date }}
      - name: Verify archive
        run: |
          FILE="data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_${{ matrix.part_id }}.tar.gz"
          ls -lah data/output/adsb_archives/${{ inputs.date }}/
          if [ ! -f "$FILE" ]; then
            echo "::error::Archive not found: $FILE"
            exit 1
          fi
          echo "Verified: $(du -h "$FILE")"
      - name: Process part
        env:
          DATE: ${{ inputs.date }}
        run: |
          python -m src.adsb.process_icao_chunk --part-id ${{ matrix.part_id }} --date "$DATE"
      - name: Upload compressed outputs
        uses: actions/upload-artifact@v4
        with:
          name: adsb-compressed-${{ inputs.date }}-part-${{ matrix.part_id }}
          path: data/output/compressed/${{ inputs.date }}
          retention-days: 1
          compression-level: 0
          if-no-files-found: error
  adsb-reduce:
    needs: adsb-map
    runs-on: ubuntu-24.04-arm
    steps:
      - name: Checkout
        uses: actions/checkout@v6
      - name: Setup Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.12'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
      - name: Download compressed outputs
        uses: actions/download-artifact@v4
        with:
          pattern: adsb-compressed-${{ inputs.date }}-part-*
          path: data/output/compressed/${{ inputs.date }}
          merge-multiple: true
      - name: Concatenate final outputs
        env:
          DATE: ${{ inputs.date }}
          CONCAT_WITH_LATEST_CSV: ${{ inputs.concat_with_latest_csv }}
        run: |
          EXTRA=""
          if [ "$CONCAT_WITH_LATEST_CSV" = "true" ]; then
            EXTRA="--concat_with_latest_csv"
          fi
          python -m src.adsb.concat_parquet_to_final --date "$DATE" $EXTRA
          ls -lah data/output/ || true
      - name: Upload final artifacts
        uses: actions/upload-artifact@v4
        with:
          name: openairframes_adsb-${{ inputs.date }}
          path: data/output/openairframes_adsb_*
          retention-days: 30
          if-no-files-found: error
@@ -0,0 +1,118 @@
 name: adsb-to-aircraft-multiple-day-run
 on:
  workflow_dispatch:
    inputs:
      start_date:
        description: 'YYYY-MM-DD (inclusive)'
        required: true
        type: string
      end_date:
        description: 'YYYY-MM-DD (exclusive)'
        required: true
        type: string
 jobs:
  generate-dates:
    runs-on: ubuntu-24.04-arm
    outputs:
      dates: ${{ steps.generate.outputs.dates }}
    steps:
      - name: Generate date list
        id: generate
        env:
          START_DATE: ${{ inputs.start_date }}
          END_DATE: ${{ inputs.end_date }}
        run: |
          python - <<'PY'
          import json
          import os
          from datetime import datetime, timedelta
          start = datetime.strptime(os.environ["START_DATE"], "%Y-%m-%d")
          end = datetime.strptime(os.environ["END_DATE"], "%Y-%m-%d")
          if end <= start:
            raise SystemExit("end_date must be after start_date")
          dates = []
          cur = start
          while cur < end:
            dates.append(cur.strftime("%Y-%m-%d"))
            cur += timedelta(days=1)
          with open(os.environ["GITHUB_OUTPUT"], "a") as f:
            f.write(f"dates={json.dumps(dates)}\n")
          PY
  adsb-day:
    needs: generate-dates
    strategy:
      fail-fast: true
      matrix:
        date: ${{ fromJson(needs.generate-dates.outputs.dates) }}
    uses: ./.github/workflows/adsb-to-aircraft-for-day.yaml
    with:
      date: ${{ matrix.date }}
  adsb-final:
    needs: adsb-day
    runs-on: ubuntu-24.04-arm
    steps:
      - name: Checkout
        uses: actions/checkout@v6
      - name: Setup Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.12'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
      - name: Download daily CSVs
        uses: actions/download-artifact@v4
        with:
          pattern: openairframes_adsb-*
          path: outputs/daily/
          merge-multiple: true
      - name: Concatenate all days to final CSV
        env:
          START_DATE: ${{ inputs.start_date }}
          END_DATE: ${{ inputs.end_date }}
        run: |
          python - <<'PY'
          import os
          import re
          from pathlib import Path
          import polars as pl
          start = os.environ["START_DATE"]
          end = os.environ["END_DATE"]
          daily_dir = Path("outputs/daily")
          files = sorted(daily_dir.glob("openairframes_adsb_*.csv.gz"))
          if not files:
            raise SystemExit("No daily CSVs found")
          def date_key(path: Path) -> str:
            m = re.match(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_", path.name)
            return m.group(1) if m else path.name
          files = sorted(files, key=date_key)
          frames = [pl.read_csv(p) for p in files]
          df = pl.concat(frames, how="vertical", rechunk=True)
          output_path = Path("outputs") / f"openairframes_adsb_{start}_{end}.csv.gz"
          df.write_csv(output_path, compression="gzip")
          print(f"Wrote {output_path} with {df.height} rows")
          PY
      - name: Upload final CSV
        uses: actions/upload-artifact@v4
        with:
          name: openairframes_adsb-${{ inputs.start_date }}-${{ inputs.end_date }}
          path: outputs/openairframes_adsb_${{ inputs.start_date }}_${{ inputs.end_date }}.csv.gz
          retention-days: 30
 # gh workflow run adsb-to-aircraft-multiple-day-run.yaml --repo ggman12/OpenAirframes --ref jonah/fix-historical-proper -f start_date=2025-12-31 -f end_date=2026-01-02
@@ -1,268 +0,0 @@
 name: Historical ADS-B Processing
 on:
  workflow_dispatch:
    inputs:
      start_date:
        description: 'Start date (YYYY-MM-DD, inclusive)'
        required: true
        type: string
      end_date:
        description: 'End date (YYYY-MM-DD, exclusive)'
        required: true
        type: string
      chunk_days:
        description: 'Days per job chunk (default: 7)'
        required: false
        type: number
        default: 7
 jobs:
  generate-matrix:
    runs-on: ubuntu-latest
    outputs:
      chunks: ${{ steps.generate.outputs.chunks }}
      global_start: ${{ inputs.start_date }}
      global_end: ${{ inputs.end_date }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'
      - name: Generate date chunks
        id: generate
        env:
          INPUT_START_DATE: ${{ inputs.start_date }}
          INPUT_END_DATE: ${{ inputs.end_date }}
          INPUT_CHUNK_DAYS: ${{ inputs.chunk_days }}
        run: python src/adsb/historical_generate_matrix.py
  adsb-extract:
    needs: generate-matrix
    runs-on: ubuntu-24.04-arm
    strategy:
      matrix:
        chunk: ${{ fromJson(needs.generate-matrix.outputs.chunks) }}
      max-parallel: 3
      fail-fast: true
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
      - name: Free disk space
        run: |
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /opt/ghc
          sudo rm -rf /usr/local/share/boost
          df -h
      - name: Download and extract ADS-B data
        env:
          START_DATE: ${{ matrix.chunk.start_date }}
          END_DATE: ${{ matrix.chunk.end_date }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          python -m src.adsb.download_and_list_icaos --start-date "$START_DATE" --end-date "$END_DATE"
          ls -lah data/output/
      - name: Create tar of extracted data and split into chunks
        run: |
          cd data/output
          echo "=== Disk space before tar ==="
          df -h .
          echo "=== Files to tar ==="
          ls -lah *-planes-readsb-prod-0.tar_0 icao_manifest_*.txt 2>/dev/null || echo "No files found"
          # Create tar with explicit error checking
          if ls *-planes-readsb-prod-0.tar_0 1>/dev/null 2>&1; then
            tar -cvf extracted_data.tar *-planes-readsb-prod-0.tar_0 icao_manifest_*.txt
            echo "=== Tar file created ==="
            ls -lah extracted_data.tar
            # Verify tar integrity
            tar -tf extracted_data.tar > /dev/null && echo "Tar integrity check passed" || { echo "Tar integrity check FAILED"; exit 1; }
            # Create checksum of the FULL tar before splitting (for verification after reassembly)
            echo "=== Creating checksum of full tar ==="
            sha256sum extracted_data.tar > full_tar.sha256
            cat full_tar.sha256
            # Split into 500MB chunks to avoid artifact upload issues
            echo "=== Splitting tar into 500MB chunks ==="
            mkdir -p tar_chunks
            split -b 500M extracted_data.tar tar_chunks/extracted_data.tar.part_
            rm extracted_data.tar
            mv full_tar.sha256 tar_chunks/
            echo "=== Chunks created ==="
            ls -lah tar_chunks/
          else
            echo "ERROR: No extracted directories found, cannot create tar"
            exit 1
          fi
      - name: Upload extracted data chunks
        uses: actions/upload-artifact@v4
        with:
          name: adsb-extracted-${{ matrix.chunk.start_date }}-${{ matrix.chunk.end_date }}
          path: data/output/tar_chunks/
          retention-days: 1
          compression-level: 0
          if-no-files-found: warn
  adsb-map:
    needs: [generate-matrix, adsb-extract]
    runs-on: ubuntu-24.04-arm
    strategy:
      fail-fast: true
      matrix:
        chunk: ${{ fromJson(needs.generate-matrix.outputs.chunks) }}
        icao_chunk: [0, 1, 2, 3]
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
      - name: Free disk space
        run: |
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /opt/ghc
          sudo rm -rf /usr/local/share/boost
          df -h
      - name: Download extracted data
        uses: actions/download-artifact@v4
        with:
          name: adsb-extracted-${{ matrix.chunk.start_date }}-${{ matrix.chunk.end_date }}
          path: data/output/tar_chunks/
      - name: Reassemble and extract tar
        id: extract
        run: |
          cd data/output
          if [ -d tar_chunks ] && ls tar_chunks/extracted_data.tar.part_* 1>/dev/null 2>&1; then
            echo "=== Chunk files info ==="
            ls -lah tar_chunks/
            cd tar_chunks
            # Reassemble tar with explicit sorting
            echo "=== Reassembling tar file ==="
            ls -1 extracted_data.tar.part_?? | sort | while read part; do
              echo "Appending $part..."
              cat "$part" >> ../extracted_data.tar
            done
            cd ..
            echo "=== Reassembled tar file info ==="
            ls -lah extracted_data.tar
            # Verify checksum of reassembled tar matches original
            echo "=== Verifying reassembled tar checksum ==="
            echo "Original checksum:"
            cat tar_chunks/full_tar.sha256
            echo "Reassembled checksum:"
            sha256sum extracted_data.tar
            sha256sum -c tar_chunks/full_tar.sha256 || { echo "ERROR: Reassembled tar checksum mismatch - data corrupted during transfer"; exit 1; }
            echo "Checksum verified - data integrity confirmed"
            rm -rf tar_chunks
            echo "=== Extracting ==="
            tar -xvf extracted_data.tar
            rm extracted_data.tar
            echo "has_data=true" >> "$GITHUB_OUTPUT"
            echo "=== Contents of data/output ==="
            ls -lah
          else
            echo "No tar chunks found"
            echo "has_data=false" >> "$GITHUB_OUTPUT"
          fi
      - name: Process ICAO chunk
        if: steps.extract.outputs.has_data == 'true'
        env:
          START_DATE: ${{ matrix.chunk.start_date }}
          END_DATE: ${{ matrix.chunk.end_date }}
        run: |
          python -m src.adsb.process_icao_chunk --chunk-id ${{ matrix.icao_chunk }} --total-chunks 4 --start-date "$START_DATE" --end-date "$END_DATE"
          ls -lah data/output/adsb_chunks/ || echo "No chunks created"
      - name: Upload chunk artifacts
        if: steps.extract.outputs.has_data == 'true'
        uses: actions/upload-artifact@v4
        with:
          name: adsb-map-${{ matrix.chunk.start_date }}-${{ matrix.chunk.end_date }}-chunk-${{ matrix.icao_chunk }}
          path: data/output/adsb_chunks/
          retention-days: 1
          if-no-files-found: ignore
  adsb-reduce:
    needs: [generate-matrix, adsb-map]
    runs-on: ubuntu-24.04-arm
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
      - name: Download all chunk artifacts
        uses: actions/download-artifact@v4
        with:
          pattern: adsb-map-*
          path: data/output/adsb_chunks/
          merge-multiple: true
      - name: Debug downloaded files
        run: |
          echo "=== Disk space before processing ==="
          df -h
          echo "=== Listing data/output/adsb_chunks/ ==="
          find data/output/adsb_chunks/ -type f 2>/dev/null | wc -l
          echo "=== Total parquet size ==="
          du -sh data/output/adsb_chunks/ || echo "No chunks dir"
      - name: Combine chunks to CSV
        env:
          START_DATE: ${{ needs.generate-matrix.outputs.global_start }}
          END_DATE: ${{ needs.generate-matrix.outputs.global_end }}
        run: |
          python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks --start-date "$START_DATE" --end-date "$END_DATE" --skip-base --stream
          ls -lah data/openairframes/
      - name: Upload final artifact
        uses: actions/upload-artifact@v4
        with:
          name: openairframes_adsb-${{ needs.generate-matrix.outputs.global_start }}-${{ needs.generate-matrix.outputs.global_end }}
          path: data/openairframes/*.csv
          retention-days: 30
@@ -1,4 +1,4 @@
-name: OpenAirframes Daily Release
+name: openairframes-daily-release
 on:
  schedule:
@@ -76,159 +76,75 @@ jobs:
            data/faa_releasable/ReleasableAircraft_*.zip
          retention-days: 1
-  adsb-extract:
+  resolve-dates:
-    runs-on: ubuntu-24.04-arm
+    runs-on: ubuntu-latest
    if: github.event_name != 'schedule'
    outputs:
-      manifest-exists: ${{ steps.check.outputs.exists }}
+      date: ${{ steps.out.outputs.date }}
      adsb_date: ${{ steps.out.outputs.adsb_date }}
    steps:
-      - name: Checkout
+      - id: out
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
      - name: Setup Python
        uses: actions/setup-python@v6
        with:
          python-version: "3.14"
      - name: Install dependencies
        run: |
-          python -m pip install --upgrade pip
+          if [ -n "${{ inputs.date }}" ]; then
-          pip install -r requirements.txt
+            echo "date=${{ inputs.date }}" >> "$GITHUB_OUTPUT"
-
+            echo "adsb_date=${{ inputs.date }}" >> "$GITHUB_OUTPUT"
      - name: Download and extract ADS-B data
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          python -m src.adsb.download_and_list_icaos ${{ inputs.date && format('--date {0}', inputs.date) || '' }}
          ls -lah data/output/
      - name: Check manifest exists
        id: check
        run: |
          if ls data/output/icao_manifest_*.txt 1>/dev/null 2>&1; then
            echo "exists=true" >> "$GITHUB_OUTPUT"
          else
-            echo "exists=false" >> "$GITHUB_OUTPUT"
+            echo "date=$(date -u -d 'yesterday' +%Y-%m-%d)" >> "$GITHUB_OUTPUT"
            echo "adsb_date=$(date -u -d 'yesterday' +%Y-%m-%d)" >> "$GITHUB_OUTPUT"
          fi
-      - name: Create tar of extracted data
+  adsb-to-aircraft:
-        run: |
+    needs: resolve-dates
-          cd data/output
+    if: github.event_name != 'schedule'
-          tar -cf extracted_data.tar *-planes-readsb-prod-0.tar_0 icao_manifest_*.txt
+    uses: ./.github/workflows/adsb-to-aircraft-for-day.yaml
-          ls -lah extracted_data.tar
+    with:
-
+      date: ${{ needs.resolve-dates.outputs.adsb_date }}
-      - name: Upload extracted data
+      concat_with_latest_csv: true
        uses: actions/upload-artifact@v4
        with:
          name: adsb-extracted
          path: data/output/extracted_data.tar
          retention-days: 1
          compression-level: 0  # Already compressed trace files
  adsb-map:
    runs-on: ubuntu-24.04-arm
    needs: adsb-extract
    if: github.event_name != 'schedule' && needs.adsb-extract.outputs.manifest-exists == 'true'
    strategy:
      fail-fast: false
      matrix:
        chunk: [0, 1, 2, 3]
    steps:
      - name: Checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
      - name: Setup Python
        uses: actions/setup-python@v6
        with:
          python-version: "3.14"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
      - name: Download extracted data
        uses: actions/download-artifact@v4
        with:
          name: adsb-extracted
          path: data/output/
      - name: Extract tar
        run: |
          cd data/output
          tar -xf extracted_data.tar
          rm extracted_data.tar
          echo "=== Contents of data/output ==="
          ls -lah
          echo "=== Looking for manifest ==="
          cat icao_manifest_*.txt | head -20 || echo "No manifest found"
          echo "=== Looking for extracted dirs ==="
          ls -d *-planes-readsb-prod-0* 2>/dev/null || echo "No extracted dirs"
      - name: Process chunk ${{ matrix.chunk }}
        run: |
          python -m src.adsb.process_icao_chunk --chunk-id ${{ matrix.chunk }} --total-chunks 4 ${{ inputs.date && format('--date {0}', inputs.date) || '' }}
          mkdir -p data/output/adsb_chunks
          ls -lah data/output/adsb_chunks/ || echo "No chunks created"
      - name: Upload chunk artifacts
        uses: actions/upload-artifact@v4
        with:
          name: adsb-chunk-${{ matrix.chunk }}
          path: data/output/adsb_chunks/
          retention-days: 1
  adsb-reduce:
    needs: [resolve-dates, adsb-to-aircraft]
    if: always() && github.event_name != 'schedule' && needs.adsb-to-aircraft.result == 'failure'
    runs-on: ubuntu-24.04-arm
    needs: adsb-map
    if: github.event_name != 'schedule'
    steps:
      - name: Checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
      - name: Setup Python
        uses: actions/setup-python@v6
        with:
-          python-version: "3.14"
+          python-version: '3.12'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
-      - name: Download all chunk artifacts
+      - name: Download compressed outputs
        uses: actions/download-artifact@v4
        with:
-          pattern: adsb-chunk-*
+          pattern: adsb-compressed-${{ needs.resolve-dates.outputs.adsb_date }}-part-*
-          path: data/output/adsb_chunks/
+          path: data/output/compressed/${{ needs.resolve-dates.outputs.adsb_date }}
          merge-multiple: true
-      - name: Debug downloaded files
+      - name: Concatenate final outputs
        env:
          DATE: ${{ needs.resolve-dates.outputs.adsb_date }}
          CONCAT_WITH_LATEST_CSV: true
        run: |
-          echo "=== Listing data/ ==="
+          EXTRA=""
-          find data/ -type f 2>/dev/null | head -50 || echo "No files in data/"
+          if [ "$CONCAT_WITH_LATEST_CSV" = "true" ]; then
-          echo "=== Looking for parquet files ==="
+            EXTRA="--concat_with_latest_csv"
-          find . -name "*.parquet" 2>/dev/null | head -20 || echo "No parquet files found"
+          fi
          python -m src.adsb.concat_parquet_to_final --date "$DATE" $EXTRA
          ls -lah data/output/ || true
-      - name: Combine chunks to CSV
+      - name: Upload final artifacts
        run: |
          mkdir -p data/output/adsb_chunks
          ls -lah data/output/adsb_chunks/ || echo "Directory empty or does not exist"
          python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks ${{ inputs.date && format('--date {0}', inputs.date) || '' }}
          ls -lah data/openairframes/
      - name: Upload ADS-B artifacts
        uses: actions/upload-artifact@v4
        with:
-          name: adsb-release
+          name: openairframes_adsb-${{ needs.resolve-dates.outputs.adsb_date }}
-          path: data/openairframes/openairframes_adsb_*.csv
+          path: data/output/openairframes_adsb_*
-          retention-days: 1
+          retention-days: 30
          if-no-files-found: error
  build-community:
    runs-on: ubuntu-latest
@@ -261,11 +177,70 @@ jobs:
          path: data/openairframes/openairframes_community_*.csv
          retention-days: 1
-  create-release:
+  build-adsbexchange-json:
    runs-on: ubuntu-latest
    needs: [build-faa, adsb-reduce, build-community]
    if: github.event_name != 'schedule'
    steps:
      - name: Checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
      - name: Setup Python
        uses: actions/setup-python@v6
        with:
          python-version: "3.14"
      - name: Run ADS-B Exchange JSON release script
        run: |
          python -m src.contributions.create_daily_adsbexchange_release ${{ inputs.date && format('--date {0}', inputs.date) || '' }}
          ls -lah data/openairframes
      - name: Upload ADS-B Exchange JSON artifact
        uses: actions/upload-artifact@v4
        with:
          name: adsbexchange-json
          path: data/openairframes/basic-ac-db_*.json.gz
          retention-days: 1
  build-mictronics-db:
    runs-on: ubuntu-latest
    if: github.event_name != 'schedule'
    steps:
      - name: Checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
      - name: Setup Python
        uses: actions/setup-python@v6
        with:
          python-version: "3.14"
      - name: Run Mictronics DB release script
        continue-on-error: true
        run: |
          python -m src.contributions.create_daily_microtonics_release ${{ inputs.date && format('--date {0}', inputs.date) || '' }}
          ls -lah data/openairframes
      - name: Upload Mictronics DB artifact
        uses: actions/upload-artifact@v4
        with:
          name: mictronics-db
          path: data/openairframes/mictronics-db_*.zip
          retention-days: 1
          if-no-files-found: ignore
  create-release:
    runs-on: ubuntu-latest
    needs: [resolve-dates, build-faa, adsb-to-aircraft, adsb-reduce, build-community, build-adsbexchange-json, build-mictronics-db]
    if: github.event_name != 'schedule' && !cancelled()
    steps:
      - name: Check ADS-B workflow status
        if: needs.adsb-to-aircraft.result != 'success' && needs.adsb-reduce.result != 'success'
        run: |
          echo "WARNING: ADS-B workflow failed (adsb-to-aircraft='${{ needs.adsb-to-aircraft.result }}', adsb-reduce='${{ needs.adsb-reduce.result }}'), will continue without ADS-B artifacts"
      - name: Checkout for gh CLI
        uses: actions/checkout@v4
        with:
@@ -274,23 +249,38 @@ jobs:
          sparse-checkout-cone-mode: false
      - name: Download FAA artifacts
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v5
        with:
          name: faa-release
          path: artifacts/faa
      - name: Download ADS-B artifacts
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v5
        if: needs.adsb-to-aircraft.result == 'success' || needs.adsb-reduce.result == 'success'
        continue-on-error: true
        with:
-          name: adsb-release
+          name: openairframes_adsb-${{ needs.resolve-dates.outputs.adsb_date }}
          path: artifacts/adsb
      - name: Download Community artifacts
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v5
        with:
          name: community-release
          path: artifacts/community
      - name: Download ADS-B Exchange JSON artifact
        uses: actions/download-artifact@v5
        with:
          name: adsbexchange-json
          path: artifacts/adsbexchange
      - name: Download Mictronics DB artifact
        uses: actions/download-artifact@v5
        continue-on-error: true
        with:
          name: mictronics-db
          path: artifacts/mictronics
      - name: Debug artifact structure
        run: |
          echo "=== Full artifacts tree ==="
@@ -301,6 +291,10 @@ jobs:
          find artifacts/adsb -type f 2>/dev/null || echo "No files found in artifacts/adsb"
          echo "=== Community artifacts ==="
          find artifacts/community -type f 2>/dev/null || echo "No files found in artifacts/community"
          echo "=== ADS-B Exchange JSON artifacts ==="
          find artifacts/adsbexchange -type f 2>/dev/null || echo "No files found in artifacts/adsbexchange"
          echo "=== Mictronics DB artifacts ==="
          find artifacts/mictronics -type f 2>/dev/null || echo "No files found in artifacts/mictronics"
      - name: Prepare release metadata
        id: meta
@@ -317,35 +311,66 @@ jobs:
          # Find files from artifacts using find (handles nested structures)
          CSV_FILE_FAA=$(find artifacts/faa -name "openairframes_faa_*.csv" -type f 2>/dev/null | head -1)
-          CSV_FILE_ADSB=$(find artifacts/adsb -name "openairframes_adsb_*.csv" -type f 2>/dev/null | head -1)
+          # Prefer concatenated file (with date range) over single-day file
          CSV_FILE_ADSB=$(find artifacts/adsb -name "openairframes_adsb_*_*.csv.gz" -type f 2>/dev/null | head -1)
          if [ -z "$CSV_FILE_ADSB" ]; then
            CSV_FILE_ADSB=$(find artifacts/adsb -name "openairframes_adsb_*.csv.gz" -type f 2>/dev/null | head -1)
          fi
          CSV_FILE_COMMUNITY=$(find artifacts/community -name "openairframes_community_*.csv" -type f 2>/dev/null | head -1)
          ZIP_FILE=$(find artifacts/faa -name "ReleasableAircraft_*.zip" -type f 2>/dev/null | head -1)
          JSON_FILE_ADSBX=$(find artifacts/adsbexchange -name "basic-ac-db_*.json.gz" -type f 2>/dev/null | head -1)
          ZIP_FILE_MICTRONICS=$(find artifacts/mictronics -name "mictronics-db_*.zip" -type f 2>/dev/null | head -1)
          # Validate required files exist
          MISSING_FILES=""
          if [ -z "$CSV_FILE_FAA" ] || [ ! -f "$CSV_FILE_FAA" ]; then
            MISSING_FILES="$MISSING_FILES FAA_CSV"
          fi
          if [ -z "$CSV_FILE_ADSB" ] || [ ! -f "$CSV_FILE_ADSB" ]; then
            MISSING_FILES="$MISSING_FILES ADSB_CSV"
          fi
          if [ -z "$ZIP_FILE" ] || [ ! -f "$ZIP_FILE" ]; then
            MISSING_FILES="$MISSING_FILES FAA_ZIP"
          fi
          if [ -z "$JSON_FILE_ADSBX" ] || [ ! -f "$JSON_FILE_ADSBX" ]; then
            MISSING_FILES="$MISSING_FILES ADSBX_JSON"
          fi
          # Optional files - warn but don't fail
          OPTIONAL_MISSING=""
          if [ -z "$CSV_FILE_ADSB" ] || [ ! -f "$CSV_FILE_ADSB" ]; then
            OPTIONAL_MISSING="$OPTIONAL_MISSING ADSB_CSV"
            CSV_FILE_ADSB=""
            CSV_BASENAME_ADSB=""
          fi
          if [ -z "$ZIP_FILE_MICTRONICS" ] || [ ! -f "$ZIP_FILE_MICTRONICS" ]; then
            OPTIONAL_MISSING="$OPTIONAL_MISSING MICTRONICS_ZIP"
            ZIP_FILE_MICTRONICS=""
          fi
          if [ -n "$MISSING_FILES" ]; then
            echo "ERROR: Missing required release files:$MISSING_FILES"
            echo "FAA CSV: $CSV_FILE_FAA"
            echo "ADSB CSV: $CSV_FILE_ADSB"
            echo "ZIP: $ZIP_FILE"
            echo "ADSBX JSON: $JSON_FILE_ADSBX"
            echo "MICTRONICS ZIP: $ZIP_FILE_MICTRONICS"
            exit 1
          fi
          # Get basenames for display
          CSV_BASENAME_FAA=$(basename "$CSV_FILE_FAA")
-          CSV_BASENAME_ADSB=$(basename "$CSV_FILE_ADSB")
+          if [ -n "$CSV_FILE_ADSB" ]; then
            CSV_BASENAME_ADSB=$(basename "$CSV_FILE_ADSB")
          fi
          CSV_BASENAME_COMMUNITY=$(basename "$CSV_FILE_COMMUNITY" 2>/dev/null || echo "")
          ZIP_BASENAME=$(basename "$ZIP_FILE")
          JSON_BASENAME_ADSBX=$(basename "$JSON_FILE_ADSBX")
          ZIP_BASENAME_MICTRONICS=""
          if [ -n "$ZIP_FILE_MICTRONICS" ]; then
            ZIP_BASENAME_MICTRONICS=$(basename "$ZIP_FILE_MICTRONICS")
          fi
          if [ -n "$OPTIONAL_MISSING" ]; then
            echo "WARNING: Optional files missing:$OPTIONAL_MISSING (will continue without them)"
          fi
          echo "date=$DATE" >> "$GITHUB_OUTPUT"
          echo "tag=$TAG" >> "$GITHUB_OUTPUT"
@@ -357,6 +382,10 @@ jobs:
          echo "csv_basename_community=$CSV_BASENAME_COMMUNITY" >> "$GITHUB_OUTPUT"
          echo "zip_file=$ZIP_FILE" >> "$GITHUB_OUTPUT"
          echo "zip_basename=$ZIP_BASENAME" >> "$GITHUB_OUTPUT"
          echo "json_file_adsbx=$JSON_FILE_ADSBX" >> "$GITHUB_OUTPUT"
          echo "json_basename_adsbx=$JSON_BASENAME_ADSBX" >> "$GITHUB_OUTPUT"
          echo "zip_file_mictronics=$ZIP_FILE_MICTRONICS" >> "$GITHUB_OUTPUT"
          echo "zip_basename_mictronics=$ZIP_BASENAME_MICTRONICS" >> "$GITHUB_OUTPUT"
          echo "name=OpenAirframes snapshot ($DATE)${BRANCH_SUFFIX}" >> "$GITHUB_OUTPUT"
          echo "Found files:"
@@ -364,6 +393,8 @@ jobs:
          echo "  ADSB CSV: $CSV_FILE_ADSB"
          echo "  Community CSV: $CSV_FILE_COMMUNITY"
          echo "  ZIP: $ZIP_FILE"
          echo "  ADSBX JSON: $JSON_FILE_ADSBX"
          echo "  MICTRONICS ZIP: $ZIP_FILE_MICTRONICS"
      - name: Delete existing release if exists
        run: |
@@ -377,19 +408,23 @@ jobs:
        with:
          tag_name: ${{ steps.meta.outputs.tag }}
          name: ${{ steps.meta.outputs.name }}
-          fail_on_unmatched_files: true
+          fail_on_unmatched_files: false
          body: |
            Automated daily snapshot generated at 06:00 UTC for ${{ steps.meta.outputs.date }}.
            Assets:
            - ${{ steps.meta.outputs.csv_basename_faa }}
-            - ${{ steps.meta.outputs.csv_basename_adsb }}
+            ${{ steps.meta.outputs.csv_basename_adsb && format('- {0}', steps.meta.outputs.csv_basename_adsb) || '' }}
            - ${{ steps.meta.outputs.csv_basename_community }}
            - ${{ steps.meta.outputs.zip_basename }}
            - ${{ steps.meta.outputs.json_basename_adsbx }}
            ${{ steps.meta.outputs.zip_basename_mictronics && format('- {0}', steps.meta.outputs.zip_basename_mictronics) || '' }}
          files: |
            ${{ steps.meta.outputs.csv_file_faa }}
            ${{ steps.meta.outputs.csv_file_adsb }}
            ${{ steps.meta.outputs.csv_file_community }}
            ${{ steps.meta.outputs.zip_file }}
            ${{ steps.meta.outputs.json_file_adsbx }}
            ${{ steps.meta.outputs.zip_file_mictronics }}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -281,4 +281,7 @@ read*lock
 .nx/
 # jsii-rosetta files
-type-fingerprints.txt
+type-fingerprints.txt
 notebooks/whatever.ipynb
 .snapshots/
@@ -16,11 +16,19 @@ A daily release is created at **06:00 UTC** and includes:
 - **openairframes_community.csv**  
  All community submissions
 - **openairframes_adsb.csv**  
  Airframes dataset derived from ADSB.lol network data. For each UTC day, a row is created for every icao observed in that day’s ADS-B messages, using registration data from [tar1090-db](https://github.com/wiedehopf/tar1090-db) (ADSBExchange & Mictronics).
 Example Usage:
 ```python
 import pandas as pd
 url = "https://github.com/PlaneQuery/OpenAirframes/releases/download/openairframes-2026-03-18-main/openairframes_adsb_2024-01-01_2026-03-17.csv.gz" # 1GB
 df = pd.read_csv(url)
 df
 ```
 ![](docs/images/df_adsb_example_0.png)
 - **openairframes_faa.csv**  
  All [FAA registration data](https://www.faa.gov/licenses_certificates/aircraft_certification/aircraft_registry/releasable_aircraft_download) from 2023-08-16 to present (~260 MB)
 - **openairframes_adsb.csv**  
  Airframe information derived from ADS-B messages on the [ADSB.lol](https://www.adsb.lol/) network, from 2026-02-12 to present (will be from 2024-01-01 soon). The airframe information originates from [mictronics aircraft database](https://www.mictronics.de/aircraft-database/) (~5 MB).
 - **ReleasableAircraft_{date}.zip**  
  A daily snapshot of the FAA database, which updates at **05:30 UTC**
@@ -0,0 +1,36 @@
 TAP50Y  lis lhr
 EXS96WT man ibz
 baw837  dbv lhr
 exs6yr  nce lba
 tom1lx  ncl ibz
 exs62vc edi pmi
 tom35j  boj lgw
 tom509  dlm lgw
 afr902  cdg ndj nsi cdg
 tom71a  spc man
 tom8ke  man her
 nsz3868 bll opo
 exs95wl mah ncl
 exs18rk stn reu
 tom9db  mah bhx
 tom2bw  reu bhx
 kac113  kwi man
 tom18e  ibz gla
 ocn8k   snn fra
 tfl365  ams cur bon ams
 exs29y  zth bhx
 exs79cf olb man
 asl508  beg yyz
 tom4nw  pmi man
 exs3uq  zth ema
 exs23ml her man
 gfa003  bah lhr
 baw703  bjv lhr
 tom2fb  mme pmi
 tom7el  ibz lgw
 tom7bd  lba pmi
 ual967  nap ewr
 ein4ec  dub cfu
 tom78v  lgw lca
 eva067  tpe bkk lhr
 ezy85xv nce lpl
@@ -0,0 +1,31 @@
 efw979y klx lgw
 ezy74wg ayt lgw
 ezy95yg ibz sen
 exs65lg kgs bhx
 tom5ky  her lgw
 tom213  dlm man
 jbu1990 sju ewr
 exs68pv pmi stn
 ice48p  kef cdg
 exs45ra man spu
 klm741  ams bog ctg ams
 exs42nu man olb
 ein55g  lys dub
 baw538  lhr bds
 uae74w  lgw dxb
 ely312  ltn tlv
 tfl757  ams puj cur ams
 wja41   lgw yhx
 tom7pj  reu man
 ryr817l bzr stn
 ein429  psa dub
 exs3lf  olb bhx
 ezy38en lrh lgw
 ezy85wd rmu man
 apo7579 lgw los
 tom13a  mah man
 baw2279 lgw yvr
 exs406p gro edi
 tom5jl  pmi stn
 ein42m  dub vce
@@ -0,0 +1,30 @@
 klm1045 ams bhx
 dhk591  hkg del ema
 sht22a  lhr gci
 etd75f  auh lhr
 tom92g  pmi ema
 tom767  nbe brs
 qtr28u  doh lhr
 tom56m  pmi ncl
 aca883  nap yul
 tsc691  ath yul
 srr902  hgh nvi bhx bll
 kac109  kwi lhr
 cfe4ed  ibz lcy
 exs628  dbv ema
 tom581  nbe ema
 exs86j  ema puy
 exs67am skg lgw
 tom37d  kva bhx
 tom9dy  pmi bhx
 qtr72b  doh stn
 exs52cj efl brs
 ezy2816 pvk brs
 tom2bk  mah ncl
 exs86pf jsi bhx
 exs39yr jsi brs
 exs17j  mah lpl
 qtr2c   doh dub
 cfe979  pmi lcy
 sht21b  gci lhr
 exs12lf spu bhx
@@ -0,0 +1,32 @@
 tom8ax  kva man
 tom6nk  cfu man
 gfa003  bah lhr
 sxs7by  adb dub
 tom5gk  ext kgs
 tom62w  efl brs
 exs77j  stn zth
 tom4lw  boh her
 exs916  spu man
 tom54y  zth man
 tom34g  brs pfo
 exs3th  nap gla
 exs9dw  nap man
 tom24m  kgs lgw
 tom748  ema sid
 exs718d nte edi
 exs53ru brs kgs
 exs9eh  pvk brs
 etd71m  lhr auh
 exs29wk zth bhx
 exs46qw kgs stn
 wuk369  ltn pmi
 tom5dc  her brs
 wuk9768 ltn jmk
 tom5gl  lba pmi
 exs21dw bhx klx
 tom5ka  cwl lca
 ein46p  dub cta
 tom73e  efl stn
 ely316  lhr tlv
 efw26pp lgw mah
 qtr47y  lhr doh
@@ -0,0 +1,30 @@
 kmm3118 mla lgw
 baw539  bds lhr
 tom29k  zth brs
 tom32x  rho bhx
 ezy71zj lgw pvk
 baw536  lhr bds
 ent429  lgw pvk
 tom7cl  brs cfu
 qtr1f   doh lhr
 sxs5mq  man ayt
 klm767  ams aua bon ams
 vlg5ml  lcg lhr
 exs92se puy ema
 tom850  man nbe
 exs5sq  ncl pmi
 apo7576 abv lhr
 tom1an  stn her
 exs1kp  gla pmi
 ryr1794 ibz stn
 kmm3119 lgw mla
 isr116  ltn tlv
 sht9f   edi lhr
 baw9cj  lhr bru
 ezy93wm brs pmo
 ezy42eu ltn bsl
 bbc201  dac zyl lhr
 bbc202  lhr zyl dac
 tom3lw  rho lgw
 sxs7fz  ayt stn
 exs71mf stn pmi
@@ -0,0 +1,33 @@
 qtr33w  doh lhr
 tom86d  cwl her
 ezy49zc lgw bjv
 cpa008  lhr cdg hkg
 ely317  tlv lhr
 tom4ej  bhx cfu
 tom93j  gla ibz
 sva117  jed lhr
 ely313  tlv ltn
 qtr67h  lhr doh
 box442  fra yyz ord
 baw710c lhr lca
 wuk784  pmi ltn
 tom23m  efl man
 baw455  ibz lhr
 baw595  olb lhr
 baw621  peg lhr
 azg394  bhx gyd
 tom47x  zth bhx
 ezy45rl bsl ltn
 tom10y  ibz man
 baw663  zth lhr
 tom6en  bhx pmi
 efw74v  kgs lgw
 sva118  lhr jed
 ezy38xg bod bhx
 ein463  cta dub
 baw537  bds lhr
 exs689l pmi ncl
 tom43j  kgs ext
 tom9gx  pmi lgw
 apo7577 lhr abv
 ely318  lhr tlv
@@ -0,0 +1,33 @@
 exs42m  pmi stn
 exs51nw ibz man
 tom68h  pfo brs
 qtr61c  doh lgg ord
 wja51   lgw yyt
 exs93pk gro ema
 uae34y  dxb man
 cfe38z  lcy fao
 tom33j  pmi ema
 tom82k  mah stn
 tom8ya  pmi bhx
 exs1386 puy bhx
 sva119  jed lhr
 tom25a  mah man
 etd75f  auh lhr
 tom6ev  mah bhx
 efw16yk cag lgw
 ajt8620 bru mia
 uae9j   dxb stn
 tom429  nbe cwl
 sxs9gg  ayt bhx
 baw947l spu lhr
 tom7dm  cfu ema
 tom3nh  skg brs
 tom5hy  ibz man
 tom7hk  pmi gla
 tom9jw  boj cwl
 tom8be  bud bhx
 exs32y  brs zth
 tom7an  spu man
 tom84y  pmi ncl
 exs5qd  efl lba
 tom58h  bhx zth
@@ -0,0 +1,38 @@
 exs3uq  zth ema
 efw67a  lgw ayt
 tom657  nbe gla
 tom7cd  cfu gla
 exs79ue pmi man
 tom3lk  nap ema
 exs732  zth edi
 cfe12g  olb lcy
 tom2xj  jsi lgw
 gfa003  bah lhr
 gfa006  lhr bah
 tfl4mh  ams lpa
 exs9dw  nap man
 tom6ym  cwl cfu
 cfe316  ibz lcy
 qtr2c   doh dub
 exs48rz jsi man
 afr018  cdg lax ppt
 ewg8gj  str lgw str
 exs6yr  nce lba
 ely313  tlv ltn
 ely317  tlv lhr
 wuk13gw ltn tia
 baw58xp mxp lhr
 noz38w  aes lgw
 exs98dm ema fao
 eju15uv lpl mxp
 tom15x  cfu man
 ezy81qh ltn ibz
 wuk784  pmi ltn
 exs1898 zth brs
 baw841  dbv lhr
 sht6d   lhr gla
 tom2bg  cfu cwl
 exs45yk stn pmi
 ezy36ep pmi lgw
 tom9yg  zth bhx
 tom30w  spu lgw
@@ -0,0 +1,31 @@
 baw693  jtr lhr
 baw58xp mxp lhr
 uae9393 dwc lgg ord
 exs91au ncl pmi
 baw699w her lhr
 cfe91g  mah gla
 vlg49uc lhr lcg
 tom2nh  pmi man
 ezy56rd spu ltn
 tom3fa  reu bhx
 eag8sb  bhd sou
 cfe31y  pmi gla
 cfe92y  pmi edi
 ibs18my lgw mad
 exs1418 spu stn
 exs41m  vrn stn
 tom2wt  ibz nwi
 baw661  efl lhr
 wuk2818 zth ltn
 eag9st  sou bhd
 tom2ga  kgs brs
 exs25db pmi edi
 dhk812  bah lej ema
 baw15   lhr sin syd
 baw16   syd sin lhr
 tom59a  jtr man
 exs45yk stn pmi
 apo7577 lhr abv
 tom6aw  man pmi
 baw675  pvk lhr
@@ -0,0 +1,40 @@
 [
  {
    "contributor_name": "JohnSmith.com",
    "contributor_uuid": "2981c3ee-8712-5f96-84bf-732eda515a3f",
    "creation_timestamp": "2026-02-18T22:18:11.349009+00:00",
    "registration_number": "ZM146",
    "tags": {
      "citation_0": "https://assets.publishing.service.gov.uk/media/5c07a65f40f0b6705f11cf37/10389.pdf",
      "icao_aircraft_type": "L1J",
      "manufacturer_icao": "LOCKHEED MARTIN",
      "manufacturer_name": "Lockheed-martin",
      "model": "F-35B Lightning II",
      "operator": "Royal Air Force",
      "operator_callsign": "RAFAIR",
      "operator_icao": "RFR",
      "serial_number": "BK-12",
      "type_code": "VF35"
    },
    "transponder_code_hex": "43C81C"
  },
  {
    "contributor_name": "JohnSmith.com",
    "contributor_uuid": "2981c3ee-8712-5f96-84bf-732eda515a3f",
    "creation_timestamp": "2026-02-18T22:18:11.349009+00:00",
    "registration_number": "ZM148",
    "tags": {
      "citation_0": "https://assets.publishing.service.gov.uk/media/5c07a65f40f0b6705f11cf37/10389.pdf",
      "icao_aircraft_type": "L1J",
      "manufacturer_icao": "LOCKHEED MARTIN",
      "manufacturer_name": "Lockheed-martin",
      "model": "F-35B Lightning II",
      "operator": "Royal Air Force",
      "operator_callsign": "RAFAIR",
      "operator_icao": "RFR",
      "serial_number": "BK-14",
      "type_code": "VF35"
    },
    "transponder_code_hex": "43C811"
  }
 ]
@@ -54,7 +54,38 @@
      "additionalProperties": {
        "$ref": "#/$defs/tagValue"
      },
-      "properties": {}
+      "properties": {
        "citation_0": {
          "type": "string"
        },
        "icao_aircraft_type": {
          "type": "string"
        },
        "manufacturer_icao": {
          "type": "string"
        },
        "manufacturer_name": {
          "type": "string"
        },
        "model": {
          "type": "string"
        },
        "operator": {
          "type": "string"
        },
        "operator_callsign": {
          "type": "string"
        },
        "operator_icao": {
          "type": "string"
        },
        "serial_number": {
          "type": "string"
        },
        "type_code": {
          "type": "string"
        }
      }
    }
  },
  "allOf": [
@@ -0,0 +1,49 @@
 #!/usr/bin/env python3
 import re
 from pathlib import Path
 import polars as pl
 # Find all CSV.gz files in the downloaded artifacts
 artifacts_dir = Path("downloads/adsb_artifacts")
 files = sorted(artifacts_dir.glob("*/openairframes_adsb_*.csv.gz"))
 if not files:
    raise SystemExit("No CSV.gz files found in downloads/adsb_artifacts/")
 print(f"Found {len(files)} files to concatenate")
 # Extract dates from filenames to determine range
 def extract_dates(path: Path) -> tuple[str, str]:
    """Extract start and end dates from filename"""
    m = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})\.csv\.gz", path.name)
    if m:
        return m.group(1), m.group(2)
    return None, None
 # Collect all dates
 all_dates = []
 for f in files:
    start, end = extract_dates(f)
    if start and end:
        all_dates.extend([start, end])
        print(f"  {f.name}: {start} to {end}")
 if not all_dates:
    raise SystemExit("Could not extract dates from filenames")
 # Find earliest and latest dates
 earliest = min(all_dates)
 latest = max(all_dates)
 print(f"\nDate range: {earliest} to {latest}")
 # Read and concatenate all files
 print("\nReading and concatenating files...")
 frames = [pl.read_csv(f) for f in files]
 df = pl.concat(frames, how="vertical", rechunk=True)
 # Write output
 output_path = Path("downloads") / f"openairframes_adsb_{earliest}_{latest}.csv.gz"
 output_path.parent.mkdir(parents=True, exist_ok=True)
 df.write_csv(output_path, compression="gzip")
 print(f"\nWrote {output_path} with {df.height:,} rows")
@@ -0,0 +1,40 @@
 #!/bin/bash
 # Create download directory
 mkdir -p downloads/adsb_artifacts
 # Repository from the workflow comment
 REPO="ggman12/OpenAirframes"
 # Get last 15 runs of the workflow and download matching artifacts
 gh run list \
  --repo "$REPO" \
  --workflow adsb-to-aircraft-multiple-day-run.yaml \
  --limit 15 \
  --json databaseId \
  --jq '.[].databaseId' | while read -r run_id; do
  echo "Checking run ID: $run_id"
  # List artifacts for this run using the API
  # Match pattern: openairframes_adsb-YYYY-MM-DD-YYYY-MM-DD (with second date)
  gh api \
    --paginate \
    "repos/$REPO/actions/runs/$run_id/artifacts" \
    --jq '.artifacts[] | select(.name | test("^openairframes_adsb-[0-9]{4}-[0-9]{2}-[0-9]{2}-[0-9]{4}-[0-9]{2}-[0-9]{2}$")) | .name' | while read -r artifact_name; do
    # Check if artifact directory already exists and has files
    if [ -d "downloads/adsb_artifacts/$artifact_name" ] && [ -n "$(ls -A "downloads/adsb_artifacts/$artifact_name" 2>/dev/null)" ]; then
      echo "  Skipping (already exists): $artifact_name"
      continue
    fi
    echo "  Downloading: $artifact_name"
    gh run download "$run_id" \
      --repo "$REPO" \
      --name "$artifact_name" \
      --dir "downloads/adsb_artifacts/$artifact_name"
  done
 done
 echo "Download complete! Files saved to downloads/adsb_artifacts/"
@@ -0,0 +1,182 @@
 #!/usr/bin/env python3
 """
 Download and concatenate artifacts from a specific set of workflow runs.
 Usage:
    python scripts/download_and_concat_runs.py triggered_runs_20260216_123456.json
 """
 import argparse
 import json
 import os
 import subprocess
 import sys
 from pathlib import Path
 def download_run_artifact(run_id, output_dir):
    """Download artifact from a specific workflow run."""
    print(f"  Downloading artifacts from run {run_id}...")
    cmd = [
        'gh', 'run', 'download', str(run_id),
        '--pattern', 'openairframes_adsb-*',
        '--dir', output_dir
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode == 0:
        print(f"  ✓ Downloaded")
        return True
    else:
        if "no artifacts" in result.stderr.lower():
            print(f"  ⚠ No artifacts found (workflow may still be running)")
        else:
            print(f"  ✗ Failed: {result.stderr}")
        return False
 def find_csv_files(download_dir):
    """Find all CSV.gz files in the download directory."""
    csv_files = []
    for root, dirs, files in os.walk(download_dir):
        for file in files:
            if file.endswith('.csv.gz'):
                csv_files.append(os.path.join(root, file))
    return sorted(csv_files)
 def concatenate_csv_files(csv_files, output_file):
    """Concatenate CSV files in order, preserving headers."""
    import gzip
    print(f"\nConcatenating {len(csv_files)} CSV files...")
    with gzip.open(output_file, 'wt') as outf:
        header_written = False
        for i, csv_file in enumerate(csv_files, 1):
            print(f"  [{i}/{len(csv_files)}] Processing {os.path.basename(csv_file)}")
            with gzip.open(csv_file, 'rt') as inf:
                lines = inf.readlines()
                if not header_written:
                    # Write header from first file
                    outf.writelines(lines)
                    header_written = True
                else:
                    # Skip header for subsequent files
                    outf.writelines(lines[1:])
    print(f"\n✓ Concatenated CSV saved to: {output_file}")
    # Show file size
    size_mb = os.path.getsize(output_file) / (1024 * 1024)
    print(f"  Size: {size_mb:.1f} MB")
 def main():
    parser = argparse.ArgumentParser(
        description='Download and concatenate artifacts from workflow runs'
    )
    parser.add_argument(
        'runs_file',
        help='JSON file containing run IDs (from run_historical_adsb_action.py)'
    )
    parser.add_argument(
        '--output-dir',
        default='./downloads/historical_concat',
        help='Directory for downloads (default: ./downloads/historical_concat)'
    )
    parser.add_argument(
        '--wait',
        action='store_true',
        help='Wait for workflows to complete before downloading'
    )
    args = parser.parse_args()
    # Load run IDs
    if not os.path.exists(args.runs_file):
        print(f"Error: File not found: {args.runs_file}")
        sys.exit(1)
    with open(args.runs_file, 'r') as f:
        data = json.load(f)
    runs = data['runs']
    start_date = data['start_date']
    end_date = data['end_date']
    print("=" * 60)
    print("Download and Concatenate Historical Artifacts")
    print("=" * 60)
    print(f"Date range: {start_date} to {end_date}")
    print(f"Workflow runs: {len(runs)}")
    print(f"Output directory: {args.output_dir}")
    print("=" * 60)
    # Create output directory
    os.makedirs(args.output_dir, exist_ok=True)
    # Wait for workflows to complete if requested
    if args.wait:
        print("\nWaiting for workflows to complete...")
        for run_info in runs:
            run_id = run_info['run_id']
            print(f"  Checking run {run_id}...")
            cmd = ['gh', 'run', 'watch', str(run_id)]
            subprocess.run(cmd)
    # Download artifacts
    print("\nDownloading artifacts...")
    successful_downloads = 0
    for i, run_info in enumerate(runs, 1):
        run_id = run_info['run_id']
        print(f"\n[{i}/{len(runs)}] Run {run_id} ({run_info['start']} to {run_info['end']})")
        if download_run_artifact(run_id, args.output_dir):
            successful_downloads += 1
    print(f"\n\nDownload Summary: {successful_downloads}/{len(runs)} artifacts downloaded")
    if successful_downloads == 0:
        print("\nNo artifacts downloaded. Workflows may still be running.")
        print("Use --wait to wait for completion, or try again later.")
        sys.exit(1)
    # Find all CSV files
    csv_files = find_csv_files(args.output_dir)
    if not csv_files:
        print("\nError: No CSV files found in download directory")
        sys.exit(1)
    print(f"\nFound {len(csv_files)} CSV file(s):")
    for csv_file in csv_files:
        print(f"  - {os.path.basename(csv_file)}")
    # Concatenate
    # Calculate actual end date for filename (end_date - 1 day since it's exclusive)
    from datetime import datetime, timedelta
    end_dt = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=1)
    actual_end = end_dt.strftime('%Y-%m-%d')
    output_file = os.path.join(
        args.output_dir,
        f"openairframes_adsb_{start_date}_{actual_end}.csv.gz"
    )
    concatenate_csv_files(csv_files, output_file)
    print("\n" + "=" * 60)
    print("Done!")
    print("=" * 60)
 if __name__ == '__main__':
    main()
@@ -0,0 +1,215 @@
 #!/usr/bin/env python3
 """
 Script to trigger adsb-to-aircraft-multiple-day-run workflow runs in monthly chunks.
 Usage:
    python scripts/run_historical_adsb_action.py --start-date 2025-01-01 --end-date 2025-06-01
 """
 import argparse
 import subprocess
 import sys
 from datetime import datetime, timedelta
 from calendar import monthrange
 def generate_monthly_chunks(start_date_str, end_date_str):
    """Generate date ranges in monthly chunks from start to end date.
    End dates are exclusive (e.g., to process Jan 1-31, end_date should be Feb 1).
    """
    start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
    end_date = datetime.strptime(end_date_str, '%Y-%m-%d')
    chunks = []
    current = start_date
    while current < end_date:
        # Get the first day of the next month (exclusive end)
        _, days_in_month = monthrange(current.year, current.month)
        month_end = current.replace(day=days_in_month)
        next_month_start = month_end + timedelta(days=1)
        # Don't go past the global end date
        chunk_end = min(next_month_start, end_date)
        chunks.append({
            'start': current.strftime('%Y-%m-%d'),
            'end': chunk_end.strftime('%Y-%m-%d')
        })
        # Move to first day of next month
        if next_month_start >= end_date:
            break
        current = next_month_start
    return chunks
 def trigger_workflow(start_date, end_date, repo='ggman12/OpenAirframes', branch='main', dry_run=False):
    """Trigger the adsb-to-aircraft-multiple-day-run workflow via GitHub CLI."""
    cmd = [
        'gh', 'workflow', 'run', 'adsb-to-aircraft-multiple-day-run.yaml',
        '--repo', repo,
        '--ref', branch,
        '-f', f'start_date={start_date}',
        '-f', f'end_date={end_date}'
    ]
    if dry_run:
        print(f"[DRY RUN] Would run: {' '.join(cmd)}")
        return True, None
    print(f"Triggering workflow: {start_date} to {end_date} (on {branch})")
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode == 0:
        print(f"✓ Successfully triggered workflow for {start_date} to {end_date}")
        # Get the run ID of the workflow we just triggered
        # Wait a moment for it to appear
        import time
        time.sleep(2)
        # Get the most recent run (should be the one we just triggered)
        list_cmd = [
            'gh', 'run', 'list',
            '--repo', repo,
            '--workflow', 'adsb-to-aircraft-multiple-day-run.yaml',
            '--branch', branch,
            '--limit', '1',
            '--json', 'databaseId',
            '--jq', '.[0].databaseId'
        ]
        list_result = subprocess.run(list_cmd, capture_output=True, text=True)
        run_id = list_result.stdout.strip() if list_result.returncode == 0 else None
        return True, run_id
    else:
        print(f"✗ Failed to trigger workflow for {start_date} to {end_date}")
        print(f"Error: {result.stderr}")
        return False, None
 def main():
    parser = argparse.ArgumentParser(
        description='Trigger adsb-to-aircraft-multiple-day-run workflow runs in monthly chunks'
    )
    parser.add_argument(
        '--start-date', '--start_date',
        dest='start_date',
        required=True,
        help='Start date in YYYY-MM-DD format (inclusive)'
    )
    parser.add_argument(
        '--end-date', '--end_date',
        dest='end_date',
        required=True,
        help='End date in YYYY-MM-DD format (exclusive)'
    )
    parser.add_argument(
        '--repo',
        type=str,
        default='ggman12/OpenAirframes',
        help='GitHub repository (default: ggman12/OpenAirframes)'
    )
    parser.add_argument(
        '--branch',
        type=str,
        default='main',
        help='Branch to run the workflow on (default: main)'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Print commands without executing them'
    )
    parser.add_argument(
        '--delay',
        type=int,
        default=5,
        help='Delay in seconds between workflow triggers (default: 5)'
    )
    args = parser.parse_args()
    # Validate dates
    try:
        start = datetime.strptime(args.start_date, '%Y-%m-%d')
        end = datetime.strptime(args.end_date, '%Y-%m-%d')
        if start > end:
            print("Error: start_date must be before or equal to end_date")
            sys.exit(1)
    except ValueError as e:
        print(f"Error: Invalid date format - {e}")
        sys.exit(1)
    # Generate monthly chunks
    chunks = generate_monthly_chunks(args.start_date, args.end_date)
    print(f"\nGenerating {len(chunks)} monthly workflow runs on branch '{args.branch}' (repo: {args.repo}):")
    for i, chunk in enumerate(chunks, 1):
        print(f"  {i}. {chunk['start']} to {chunk['end']}")
    if not args.dry_run:
        response = input(f"\nProceed with triggering {len(chunks)} workflows on '{args.branch}'? [y/N]: ")
        if response.lower() != 'y':
            print("Cancelled.")
            sys.exit(0)
    print()
    # Trigger workflows
    import time
    success_count = 0
    triggered_runs = []
    for i, chunk in enumerate(chunks, 1):
        print(f"\n[{i}/{len(chunks)}] ", end='')
        success, run_id = trigger_workflow(
            chunk['start'],
            chunk['end'],
            repo=args.repo,
            branch=args.branch,
            dry_run=args.dry_run
        )
        if success:
            success_count += 1
            if run_id:
                triggered_runs.append({
                    'run_id': run_id,
                    'start': chunk['start'],
                    'end': chunk['end']
                })
        # Add delay between triggers (except for last one)
        if i < len(chunks) and not args.dry_run:
            time.sleep(args.delay)
    print(f"\n\nSummary: {success_count}/{len(chunks)} workflows triggered successfully")
    # Save triggered run IDs to a file
    if triggered_runs and not args.dry_run:
        import json
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        runs_file = f"./output/triggered_runs_{timestamp}.json"
        with open(runs_file, 'w') as f:
            json.dump({
                'start_date': args.start_date,
                'end_date': args.end_date,
                'repo': args.repo,
                'branch': args.branch,
                'runs': triggered_runs
            }, f, indent=2)
        print(f"\nRun IDs saved to: {runs_file}")
        print(f"\nTo download and concatenate these artifacts, run:")
        print(f"  python scripts/download_and_concat_runs.py {runs_file}")
    if success_count < len(chunks):
        sys.exit(1)
 if __name__ == '__main__':
    main()
@@ -0,0 +1,82 @@
 #!/usr/bin/env python3
 """
 Run src.adsb.main in an isolated git worktree so edits in the main
 working tree won't affect subprocess imports during the run.
 Usage:
    python scripts/run_main_isolated.py 2026-01-01
    python scripts/run_main_isolated.py --start_date 2026-01-01 --end_date 2026-01-03
 """
 import argparse
 import os
 import shutil
 import subprocess
 import sys
 from datetime import datetime, timezone
 from pathlib import Path
 def run(
    cmd: list[str],
    *,
    cwd: Path | None = None,
    check: bool = True,
 ) -> subprocess.CompletedProcess:
    print(f"\n>>> {' '.join(cmd)}")
    return subprocess.run(cmd, cwd=cwd, check=check)
 def main() -> int:
    parser = argparse.ArgumentParser(description="Run src.adsb.main in an isolated worktree")
    parser.add_argument("date", nargs="?", help="Single date to process (YYYY-MM-DD)")
    parser.add_argument("--start_date", help="Start date (inclusive, YYYY-MM-DD)")
    parser.add_argument("--end_date", help="End date (exclusive, YYYY-MM-DD)")
    parser.add_argument("--concat_with_latest_csv", action="store_true", help="Also concatenate with latest CSV from GitHub releases")
    args = parser.parse_args()
    if args.date and (args.start_date or args.end_date):
        raise SystemExit("Use a single date or --start_date/--end_date, not both.")
    if args.date:
        datetime.strptime(args.date, "%Y-%m-%d")
        main_args = ["--date", args.date]
    else:
        if not args.start_date or not args.end_date:
            raise SystemExit("Provide --start_date and --end_date, or a single date.")
        datetime.strptime(args.start_date, "%Y-%m-%d")
        datetime.strptime(args.end_date, "%Y-%m-%d")
        main_args = ["--start_date", args.start_date, "--end_date", args.end_date]
    if args.concat_with_latest_csv:
        main_args.append("--concat_with_latest_csv")
    repo_root = Path(__file__).resolve().parents[1]
    snapshots_root = repo_root / ".snapshots"
    snapshots_root.mkdir(exist_ok=True)
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    snapshot_root = snapshots_root / f"run_{timestamp}"
    snapshot_src = snapshot_root / "src"
    exit_code = 0
    try:
        shutil.copytree(repo_root / "src", snapshot_src)
        runner = (
            "import sys, runpy; "
            f"sys.path.insert(0, {repr(str(snapshot_root))}); "
            f"sys.argv = ['src.adsb.main'] + {main_args!r}; "
            "runpy.run_module('src.adsb.main', run_name='__main__')"
        )
        cmd = [sys.executable, "-c", runner]
        run(cmd, cwd=repo_root)
    except subprocess.CalledProcessError as exc:
        exit_code = exc.returncode
    finally:
        shutil.rmtree(snapshot_root, ignore_errors=True)
    return exit_code
 if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,242 @@
 #!/usr/bin/env python3
 """
 Parse TheAirTraffic Database CSV and produce community_submission.v1 JSON.
 Source: "TheAirTraffic Database - Aircraft 2.csv"
 Output: community/YYYY-MM-DD/theairtraffic_<date>_<hash>.json
 Categories in the spreadsheet columns (paired: name, registrations, separator):
  Col  1-3:  Business
  Col  4-6:  Government
  Col  7-9:  People
  Col 10-12: Sports
  Col 13-15: Celebrity
  Col 16-18: State Govt./Law
  Col 19-21: Other
  Col 22-24: Test Aircraft
  Col 25-27: YouTubers
  Col 28-30: Formula 1 VIP's
  Col 31-33: Active GII's and GIII's  (test/demo aircraft)
  Col 34-37: Russia & Ukraine          (extra col for old/new)
  Col 38-40: Helicopters & Blimps
  Col 41-43: Unique Reg's
  Col 44-46: Saudi & UAE
  Col 47-49: Schools
  Col 50-52: Special Charter
  Col 53-55: Unknown Owners
  Col 56-59: Frequent Flyers           (extra cols: name, aircraft, logged, hours)
 """
 import csv
 import json
 import hashlib
 import re
 import sys
 import uuid
 from datetime import datetime, timezone
 from pathlib import Path
 # ── Category mapping ────────────────────────────────────────────────────────
 # Each entry: (name_col, reg_col, owner_category_tags)
 # owner_category_tags is a dict of tag keys to add beyond "owner"
 CATEGORY_COLUMNS = [
    # (name_col, reg_col, {tag_key: tag_value, ...})
    (1,  2,  {"owner_category_0": "business"}),
    (4,  5,  {"owner_category_0": "government"}),
    (7,  8,  {"owner_category_0": "celebrity"}),
    (10, 11, {"owner_category_0": "sports"}),
    (13, 14, {"owner_category_0": "celebrity"}),
    (16, 17, {"owner_category_0": "government", "owner_category_1": "law_enforcement"}),
    (19, 20, {"owner_category_0": "other"}),
    (22, 23, {"owner_category_0": "test_aircraft"}),
    (25, 26, {"owner_category_0": "youtuber", "owner_category_1": "celebrity"}),
    (28, 29, {"owner_category_0": "celebrity", "owner_category_1": "motorsport"}),
    (31, 32, {"owner_category_0": "test_aircraft"}),
    # Russia & Ukraine: col 34=name, col 35 or 36 may have reg
    (34, 35, {"owner_category_0": "russia_ukraine"}),
    (38, 39, {"owner_category_0": "celebrity", "category": "helicopter_or_blimp"}),
    (41, 42, {"owner_category_0": "other"}),
    (44, 45, {"owner_category_0": "government", "owner_category_1": "royal_family"}),
    (47, 48, {"owner_category_0": "education"}),
    (50, 51, {"owner_category_0": "charter"}),
    (53, 54, {"owner_category_0": "unknown"}),
    (56, 57, {"owner_category_0": "celebrity"}),   # Frequent Flyers name col, aircraft col
 ]
 # First data row index (0-based) in the CSV
 DATA_START_ROW = 4
 # ── Contributor info ────────────────────────────────────────────────────────
 CONTRIBUTOR_NAME = "TheAirTraffic"
 # Deterministic UUID v5 from contributor name
 CONTRIBUTOR_UUID = str(uuid.uuid5(uuid.NAMESPACE_URL, "https://theairtraffic.com"))
 # Citation
 CITATION = "https://docs.google.com/spreadsheets/d/1JHhfJBnJPNBA6TgiSHjkXFkHBdVTTz_nXxaUDRWcHpk"
 def looks_like_military_serial(reg: str) -> bool:
    """
    Detect military-style serials like 92-9000, 82-8000, 98-0001
    or pure numeric IDs like 929000, 828000, 980001.
    These aren't standard civil registrations; use openairframes_id.
    """
    # Pattern: NN-NNNN
    if re.match(r'^\d{2}-\d{4}$', reg):
        return True
    # Pure 6-digit numbers (likely ICAO hex or military mode-S)
    if re.match(r'^\d{6}$', reg):
        return True
    # Short numeric-only (1-5 digits) like "01", "02", "676"
    if re.match(r'^\d{1,5}$', reg):
        return True
    return False
 def normalize_reg(raw: str) -> str:
    """Clean up a registration string."""
    reg = raw.strip().rstrip(',').strip()
    # Remove carriage returns and other whitespace
    reg = reg.replace('\r', '').replace('\n', '').strip()
    return reg
 def parse_regs(cell_value: str) -> list[str]:
    """
    Parse a cell that may contain one or many registrations,
    separated by commas, possibly wrapped in quotes.
    """
    if not cell_value or not cell_value.strip():
        return []
    # Some cells have ADS-B exchange URLs – skip those
    if 'globe.adsbexchange.com' in cell_value:
        return []
    if cell_value.strip() in ('.', ',', ''):
        return []
    results = []
    # Split on comma
    parts = cell_value.split(',')
    for part in parts:
        reg = normalize_reg(part)
        if not reg:
            continue
        # Skip URLs, section labels, etc.
        if reg.startswith('http') or reg.startswith('Link') or reg == 'Section 1':
            continue
        # Skip if it's just whitespace or dots
        if reg in ('.', '..', '...'):
            continue
        results.append(reg)
    return results
 def make_submission(
    reg: str,
    owner: str,
    category_tags: dict[str, str],
 ) -> dict:
    """Build a single community_submission.v1 object."""
    entry: dict = {}
    # Decide identifier field
    if looks_like_military_serial(reg):
        entry["openairframes_id"] = reg
    else:
        entry["registration_number"] = reg
    # Tags
    tags: dict = {
        "citation_0": CITATION,
    }
    if owner:
        tags["owner"] = owner.strip()
    tags.update(category_tags)
    entry["tags"] = tags
    return entry
 def main():
    csv_path = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(
        "/Users/jonahgoode/Downloads/TheAirTraffic Database - Aircraft 2.csv"
    )
    if not csv_path.exists():
        print(f"ERROR: CSV not found at {csv_path}", file=sys.stderr)
        sys.exit(1)
    # Read CSV
    with open(csv_path, 'r', encoding='utf-8-sig') as f:
        reader = csv.reader(f)
        rows = list(reader)
    print(f"Read {len(rows)} rows from {csv_path.name}")
    date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    submissions: list[dict] = []
    seen: set[tuple] = set()  # (reg, owner) dedup
    for row_idx in range(DATA_START_ROW, len(rows)):
        row = rows[row_idx]
        if len(row) < 3:
            continue
        for name_col, reg_col, cat_tags in CATEGORY_COLUMNS:
            if reg_col >= len(row) or name_col >= len(row):
                continue
            owner_raw = row[name_col].strip().rstrip(',').strip()
            reg_raw = row[reg_col]
            # Clean owner name
            owner = owner_raw.replace('\r', '').replace('\n', '').strip()
            if not owner or owner in ('.', ',', 'Section 1'):
                continue
            # Skip header-like values
            if owner.startswith('http') or owner.startswith('Link '):
                continue
            regs = parse_regs(reg_raw)
            if not regs:
                # For Russia & Ukraine, try the next column too (col 35 might have old reg, col 36 new)
                if name_col == 34 and reg_col + 1 < len(row):
                    regs = parse_regs(row[reg_col + 1])
            for reg in regs:
                key = (reg, owner)
                if key in seen:
                    continue
                seen.add(key)
                submissions.append(make_submission(reg, owner, cat_tags))
    print(f"Generated {len(submissions)} submissions")
    # Write output
    proj_root = Path(__file__).resolve().parent.parent
    out_dir = proj_root / "community" / date_str
    out_dir.mkdir(parents=True, exist_ok=True)
    out_file = out_dir / f"theairtraffic_{date_str}.json"
    with open(out_file, 'w', encoding='utf-8') as f:
        json.dump(submissions, f, indent=2, ensure_ascii=False)
    print(f"Written to {out_file}")
    print(f"Sample entry:\n{json.dumps(submissions[0], indent=2)}")
    # Quick stats
    cats = {}
    for s in submissions:
        c = s['tags'].get('owner_category_0', 'NONE')
        cats[c] = cats.get(c, 0) + 1
    print("\nCategory breakdown:")
    for c, n in sorted(cats.items(), key=lambda x: -x[1]):
        print(f"  {c}: {n}")
 if __name__ == "__main__":
    main()
@@ -0,0 +1,69 @@
 #!/usr/bin/env python3
 """Validate the generated theairtraffic JSON output."""
 import json
 import glob
 import sys
 # Find the latest output
 files = sorted(glob.glob("community/2026-02-*/theairtraffic_*.json"))
 if not files:
    print("No output files found!")
    sys.exit(1)
 path = files[-1]
 print(f"Validating: {path}")
 with open(path) as f:
    data = json.load(f)
 print(f"Total entries: {len(data)}")
 # Check military serial handling
 mil = [d for d in data if "openairframes_id" in d]
 print(f"\nEntries using openairframes_id: {len(mil)}")
 for m in mil[:10]:
    print(f"  {m['openairframes_id']} -> owner: {m['tags'].get('owner','?')}")
 # Check youtuber entries
 yt = [d for d in data if d["tags"].get("owner_category_0") == "youtuber"]
 print(f"\nYouTuber entries: {len(yt)}")
 for y in yt[:5]:
    reg = y.get("registration_number", y.get("openairframes_id"))
    c0 = y["tags"].get("owner_category_0")
    c1 = y["tags"].get("owner_category_1")
    print(f"  {reg} -> owner: {y['tags']['owner']}, cat0: {c0}, cat1: {c1}")
 # Check US Govt / military
 gov = [d for d in data if d["tags"].get("owner") == "United States of America 747/757"]
 print(f"\nUSA 747/757 entries: {len(gov)}")
 for g in gov:
    oid = g.get("openairframes_id", g.get("registration_number"))
    print(f"  {oid}")
 # Schema validation
 issues = 0
 for i, d in enumerate(data):
    has_id = any(k in d for k in ["registration_number", "transponder_code_hex", "openairframes_id"])
    if not has_id:
        print(f"  Entry {i}: no identifier!")
        issues += 1
    if "tags" not in d:
        print(f"  Entry {i}: no tags!")
        issues += 1
    # Check tag key format
    for k in d.get("tags", {}):
        import re
        if not re.match(r"^[a-z][a-z0-9_]{0,63}$", k):
            print(f"  Entry {i}: invalid tag key '{k}'")
            issues += 1
 print(f"\nSchema issues: {issues}")
 # Category breakdown
 cats = {}
 for s in data:
    c = s["tags"].get("owner_category_0", "NONE")
    cats[c] = cats.get(c, 0) + 1
 print("\nCategory breakdown:")
 for c, n in sorted(cats.items(), key=lambda x: -x[1]):
    print(f"  {c}: {n}")
@@ -1,11 +0,0 @@
 FROM --platform=linux/arm64 python:3.12-slim
 WORKDIR /app
 COPY requirements.reducer.txt requirements.txt
 RUN pip install --no-cache-dir -r requirements.txt
 COPY compress_adsb_to_aircraft_data.py .
 COPY reducer.py .
 CMD ["python", "-u", "reducer.py"]
@@ -1,12 +0,0 @@
 FROM --platform=linux/arm64 python:3.12-slim
 WORKDIR /app
 COPY requirements.worker.txt requirements.txt
 RUN pip install --no-cache-dir -r requirements.txt
 COPY compress_adsb_to_aircraft_data.py .
 COPY download_adsb_data_to_parquet.py .
 COPY worker.py .
 CMD ["python", "-u", "worker.py"]
@@ -1,250 +0,0 @@
 """
 Combines chunk parquet files and compresses to final aircraft CSV.
 This is the reduce phase of the map-reduce pipeline.
 Supports both single-day (daily) and multi-day (historical) modes.
 Memory-efficient: processes each chunk separately, compresses, then combines.
 Usage:
    # Daily mode
    python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks
    # Historical mode
    python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks --start-date 2024-01-01 --end-date 2024-01-07 --skip-base
 """
 import gc
 import os
 import sys
 import glob
 import argparse
 from datetime import datetime, timedelta
 import polars as pl
 from src.adsb.download_adsb_data_to_parquet import OUTPUT_DIR, get_resource_usage
 from src.adsb.compress_adsb_to_aircraft_data import compress_multi_icao_df, COLUMNS
 DEFAULT_CHUNK_DIR = os.path.join(OUTPUT_DIR, "adsb_chunks")
 FINAL_OUTPUT_DIR = "./data/openairframes"
 os.makedirs(FINAL_OUTPUT_DIR, exist_ok=True)
 def get_target_day() -> datetime:
    """Get yesterday's date (the day we're processing)."""
    return datetime.utcnow() - timedelta(days=1)
 def process_single_chunk(chunk_path: str, delete_after_load: bool = False) -> pl.DataFrame:
    """Load and compress a single chunk parquet file.
    Args:
        chunk_path: Path to parquet file
        delete_after_load: If True, delete the parquet file after loading to free disk space
    """
    print(f"Processing {os.path.basename(chunk_path)}... | {get_resource_usage()}")
    # Load chunk - only columns we need
    needed_columns = ['time', 'icao'] + COLUMNS
    df = pl.read_parquet(chunk_path, columns=needed_columns)
    print(f"  Loaded {len(df)} rows")
    # Delete file immediately after loading to free disk space
    if delete_after_load:
        try:
            os.remove(chunk_path)
            print(f"  Deleted {chunk_path} to free disk space")
        except Exception as e:
            print(f"  Warning: Failed to delete {chunk_path}: {e}")
    # Compress to aircraft records (one per ICAO) using shared function
    compressed = compress_multi_icao_df(df, verbose=True)
    print(f"  Compressed to {len(compressed)} aircraft records")
    del df
    gc.collect()
    return compressed
 def combine_compressed_chunks(compressed_dfs: list[pl.DataFrame]) -> pl.DataFrame:
    """Combine multiple compressed DataFrames.
    Since chunks are partitioned by ICAO hash, each ICAO only appears in one chunk.
    No deduplication needed here - just concatenate.
    """
    print(f"Combining {len(compressed_dfs)} compressed chunks... | {get_resource_usage()}")
    # Concat all
    combined = pl.concat(compressed_dfs)
    print(f"Combined: {len(combined)} records")
    return combined
 def download_and_merge_base_release(compressed_df: pl.DataFrame) -> pl.DataFrame:
    """Download base release and merge with new data."""
    from src.get_latest_release import download_latest_aircraft_adsb_csv
    print("Downloading base ADS-B release...")
    try:
        base_path = download_latest_aircraft_adsb_csv(
            output_dir="./data/openairframes_base"
        )
        print(f"Download returned: {base_path}")
        if base_path and os.path.exists(str(base_path)):
            print(f"Loading base release from {base_path}")
            base_df = pl.read_csv(base_path)
            print(f"Base release has {len(base_df)} records")
            # Ensure columns match
            base_cols = set(base_df.columns)
            new_cols = set(compressed_df.columns)
            print(f"Base columns: {sorted(base_cols)}")
            print(f"New columns: {sorted(new_cols)}")
            # Add missing columns
            for col in new_cols - base_cols:
                base_df = base_df.with_columns(pl.lit(None).alias(col))
            for col in base_cols - new_cols:
                compressed_df = compressed_df.with_columns(pl.lit(None).alias(col))
            # Reorder columns to match
            compressed_df = compressed_df.select(base_df.columns)
            # Concat and deduplicate by icao (keep new data - it comes last)
            combined = pl.concat([base_df, compressed_df])
            print(f"After concat: {len(combined)} records")
            deduplicated = combined.unique(subset=["icao"], keep="last")
            print(f"Combined with base: {len(combined)} -> {len(deduplicated)} after dedup")
            del base_df, combined
            gc.collect()
            return deduplicated
        else:
            print(f"No base release found at {base_path}, using only new data")
            return compressed_df
    except Exception as e:
        import traceback
        print(f"Failed to download base release: {e}")
        traceback.print_exc()
        return compressed_df
 def cleanup_chunks(output_id: str, chunks_dir: str):
    """Delete chunk parquet files after successful merge."""
    pattern = os.path.join(chunks_dir, f"chunk_*_{output_id}.parquet")
    chunk_files = glob.glob(pattern)
    for f in chunk_files:
        try:
            os.remove(f)
            print(f"Deleted {f}")
        except Exception as e:
            print(f"Failed to delete {f}: {e}")
 def find_chunk_files(chunks_dir: str, output_id: str) -> list[str]:
    """Find chunk parquet files matching the output ID."""
    pattern = os.path.join(chunks_dir, f"chunk_*_{output_id}.parquet")
    chunk_files = sorted(glob.glob(pattern))
    if not chunk_files:
        # Try recursive search for historical mode with merged artifacts
        pattern = os.path.join(chunks_dir, "**", "*.parquet")
        chunk_files = sorted(glob.glob(pattern, recursive=True))
    return chunk_files
 def main():
    parser = argparse.ArgumentParser(description="Combine chunk parquets to final CSV")
    parser.add_argument("--date", type=str, help="Single date in YYYY-MM-DD format (default: yesterday)")
    parser.add_argument("--start-date", type=str, help="Start date for range (YYYY-MM-DD)")
    parser.add_argument("--end-date", type=str, help="End date for range (YYYY-MM-DD)")
    parser.add_argument("--chunks-dir", type=str, default=DEFAULT_CHUNK_DIR, help="Directory containing chunk parquet files")
    parser.add_argument("--skip-base", action="store_true", help="Skip downloading and merging base release")
    parser.add_argument("--keep-chunks", action="store_true", help="Keep chunk files after merging")
    parser.add_argument("--stream", action="store_true", help="Delete parquet files immediately after loading to save disk space")
    args = parser.parse_args()
    # Determine output ID and filename based on mode
    if args.start_date and args.end_date:
        # Historical mode
        output_id = f"{args.start_date}_{args.end_date}"
        output_filename = f"openairframes_adsb_{args.start_date}_{args.end_date}.csv"
        print(f"Combining chunks for date range: {args.start_date} to {args.end_date}")
    else:
        # Daily mode - use same date for start and end
        if args.date:
            target_day = datetime.strptime(args.date, "%Y-%m-%d")
        else:
            target_day = get_target_day()
        date_str = target_day.strftime("%Y-%m-%d")
        output_id = date_str
        output_filename = f"openairframes_adsb_{date_str}_{date_str}.csv"
        print(f"Combining chunks for {date_str}")
    chunks_dir = args.chunks_dir
    print(f"Chunks directory: {chunks_dir}")
    print(f"Resource usage at start: {get_resource_usage()}")
    # Find chunk files
    chunk_files = find_chunk_files(chunks_dir, output_id)
    if not chunk_files:
        print(f"No chunk files found in: {chunks_dir}")
        sys.exit(1)
    print(f"Found {len(chunk_files)} chunk files")
    # Process each chunk separately to save memory
    # With --stream, delete parquet files immediately after loading to save disk space
    compressed_chunks = []
    for chunk_path in chunk_files:
        compressed = process_single_chunk(chunk_path, delete_after_load=args.stream)
        compressed_chunks.append(compressed)
        gc.collect()
    # Combine all compressed chunks
    combined = combine_compressed_chunks(compressed_chunks)
    # Free memory from individual chunks
    del compressed_chunks
    gc.collect()
    print(f"After combining: {get_resource_usage()}")
    # Merge with base release (unless skipped)
    if not args.skip_base:
        combined = download_and_merge_base_release(combined)
    # Convert list columns to strings for CSV compatibility
    for col in combined.columns:
        if combined[col].dtype == pl.List:
            combined = combined.with_columns(
                pl.col(col).list.join(",").alias(col)
            )
    # Sort by time for consistent output
    if 'time' in combined.columns:
        combined = combined.sort('time')
    # Write final CSV
    output_path = os.path.join(FINAL_OUTPUT_DIR, output_filename)
    combined.write_csv(output_path)
    print(f"Wrote {len(combined)} records to {output_path}")
    # Cleanup
    if not args.keep_chunks:
        cleanup_chunks(output_id, chunks_dir)
    print(f"Done! | {get_resource_usage()}")
 if __name__ == "__main__":
    main()
@@ -5,23 +5,6 @@ import polars as pl
 COLUMNS = ['dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category', 'r', 't']
 def deduplicate_by_signature(df: pl.DataFrame) -> pl.DataFrame:
    """For each icao, keep only the earliest row with each unique signature.
    This is used for deduplicating across multiple compressed chunks.
    """
    # Create signature column
    df = df.with_columns(
        pl.concat_str([pl.col(c).cast(pl.Utf8).fill_null("") for c in COLUMNS], separator="|").alias("_signature")
    )
    # Group by icao and signature, take first row (earliest due to time sort)
    df = df.sort("time")
    df_deduped = df.group_by(["icao", "_signature"]).first()
    df_deduped = df_deduped.drop("_signature")
    df_deduped = df_deduped.sort("time")
    return df_deduped
 def compress_df_polars(df: pl.DataFrame, icao: str) -> pl.DataFrame:
    """Compress a single ICAO group to its most informative row using Polars."""
    # Create signature string
@@ -99,9 +82,6 @@ def compress_df_polars(df: pl.DataFrame, icao: str) -> pl.DataFrame:
 def compress_multi_icao_df(df: pl.DataFrame, verbose: bool = True) -> pl.DataFrame:
    """Compress a DataFrame with multiple ICAOs to one row per ICAO.
    This is the main entry point for compressing ADS-B data.
    Used by both daily GitHub Actions runs and historical AWS runs.
    Args:
        df: DataFrame with columns ['time', 'icao'] + COLUMNS
        verbose: Whether to print progress
@@ -120,29 +100,27 @@ def compress_multi_icao_df(df: pl.DataFrame, verbose: bool = True) -> pl.DataFra
        if col in df.columns:
            df = df.with_columns(pl.col(col).cast(pl.Utf8).fill_null(""))
-    # First pass: quick deduplication of exact duplicates
+    # Quick deduplication of exact duplicates
    df = df.unique(subset=['icao'] + COLUMNS, keep='first')
    if verbose:
        print(f"After quick dedup: {df.height} records")
-    # Second pass: sophisticated compression per ICAO
+    # Compress per ICAO
    if verbose:
        print("Compressing per ICAO...")
    # Process each ICAO group
    icao_groups = df.partition_by('icao', as_dict=True, maintain_order=True)
    compressed_dfs = []
    for icao_key, group_df in icao_groups.items():
-        # partition_by with as_dict=True returns tuple keys, extract first element
+        icao = icao_key[0]
        icao = icao_key[0] if isinstance(icao_key, tuple) else icao_key
        compressed = compress_df_polars(group_df, str(icao))
        compressed_dfs.append(compressed)
    if compressed_dfs:
        df_compressed = pl.concat(compressed_dfs)
    else:
-        df_compressed = df.head(0)  # Empty with same schema
+        df_compressed = df.head(0)
    if verbose:
        print(f"After compress: {df_compressed.height} records")
@@ -155,45 +133,22 @@ def compress_multi_icao_df(df: pl.DataFrame, verbose: bool = True) -> pl.DataFra
    return df_compressed
-def load_raw_adsb_for_day(day):
+def load_parquet_part(part_id: int, date: str) -> pl.DataFrame:
-    """Load raw ADS-B data for a day from parquet file."""
+    """Load a single parquet part file for a date.
-    from datetime import timedelta
+    
    Args:
        part_id: Part ID (e.g., 1, 2, 3)
        date: Date string in YYYY-MM-DD format
    Returns:
        DataFrame with ADS-B data
    """
    from pathlib import Path
-    start_time = day.replace(hour=0, minute=0, second=0, microsecond=0)
+    parquet_file = Path(f"data/output/parquet_output/part_{part_id}_{date}.parquet")
    # Check for parquet file first
    version_date = f"v{start_time.strftime('%Y.%m.%d')}"
    parquet_file = Path(f"data/output/parquet_output/{version_date}.parquet")
    if not parquet_file.exists():
-        # Try to generate parquet file by calling the download function
+        print(f"Parquet file not found: {parquet_file}")
        print(f"  Parquet file not found: {parquet_file}")
        print(f"  Attempting to download and generate parquet for {start_time.strftime('%Y-%m-%d')}...")
        from download_adsb_data_to_parquet import create_parquet_for_day
        result_path = create_parquet_for_day(start_time, keep_folders=False)
        if result_path:
            print(f"  Successfully generated parquet file: {result_path}")
        else:
            raise Exception("Failed to generate parquet file")
    if parquet_file.exists():
        print(f"  Loading from parquet: {parquet_file}")
        df = pl.read_parquet(
            parquet_file, 
            columns=['time', 'icao', 'r', 't', 'dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category']
        )
        # Convert to timezone-naive datetime
        if df["time"].dtype == pl.Datetime:
            df = df.with_columns(pl.col("time").dt.replace_time_zone(None))
        return df
    else:
        # Return empty DataFrame if parquet file doesn't exist
        print(f"  No data available for {start_time.strftime('%Y-%m-%d')}")
        return pl.DataFrame(schema={
            'time': pl.Datetime,
            'icao': pl.Utf8,
@@ -205,17 +160,33 @@ def load_raw_adsb_for_day(day):
            'desc': pl.Utf8,
            'aircraft_category': pl.Utf8
        })
    print(f"Loading from parquet: {parquet_file}")
    df = pl.read_parquet(
        parquet_file,
        columns=['time', 'icao', 'r', 't', 'dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category']
    )
    # Convert to timezone-naive datetime
    if df["time"].dtype == pl.Datetime:
        df = df.with_columns(pl.col("time").dt.replace_time_zone(None))
    os.remove(parquet_file)
    return df
-def load_historical_for_day(day):
+def compress_parquet_part(part_id: int, date: str) -> pl.DataFrame:
-    """Load and compress historical ADS-B data for a day."""
+    """Load and compress a single parquet part file."""
-    df = load_raw_adsb_for_day(day)
+    df = load_parquet_part(part_id, date)
    if df.height == 0:
        return df
    # Filter to rows within the given date (UTC-naive). This is because sometimes adsb.lol export can have rows at 00:00:00 of next day or similar.
    date_lit = pl.lit(date).str.strptime(pl.Date, "%Y-%m-%d")
    df = df.filter(pl.col("time").dt.date() == date_lit)
-    print(f"Loaded {df.height} raw records for {day.strftime('%Y-%m-%d')}")
+    print(f"Loaded {df.height} raw records for part {part_id}, date {date}")
    # Use shared compression function
    return compress_multi_icao_df(df, verbose=True)
@@ -223,52 +194,4 @@ def concat_compressed_dfs(df_base, df_new):
    """Concatenate base and new compressed dataframes, keeping the most informative row per ICAO."""
    # Combine both dataframes
    df_combined = pl.concat([df_base, df_new])
-    
+    return df_combined
    # Sort by ICAO and time
    df_combined = df_combined.sort(['icao', 'time'])
    # Fill null values
    for col in COLUMNS:
        if col in df_combined.columns:
            df_combined = df_combined.with_columns(pl.col(col).fill_null(""))
    # Apply compression logic per ICAO to get the best row
    icao_groups = df_combined.partition_by('icao', as_dict=True, maintain_order=True)
    compressed_dfs = []
    for icao, group_df in icao_groups.items():
        compressed = compress_df_polars(group_df, icao)
        compressed_dfs.append(compressed)
    if compressed_dfs:
        df_compressed = pl.concat(compressed_dfs)
    else:
        df_compressed = df_combined.head(0)
    # Sort by time
    df_compressed = df_compressed.sort('time')
    return df_compressed
 def get_latest_aircraft_adsb_csv_df():
    """Download and load the latest ADS-B CSV from GitHub releases."""
    from get_latest_release import download_latest_aircraft_adsb_csv
    import re
    csv_path = download_latest_aircraft_adsb_csv()
    df = pl.read_csv(csv_path, null_values=[""])
    # Fill nulls with empty strings
    for col in df.columns:
        if df[col].dtype == pl.Utf8:
            df = df.with_columns(pl.col(col).fill_null(""))
    # Extract start date from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv
    match = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path))
    if not match:
        raise ValueError(f"Could not extract date from filename: {csv_path.name}")
    date_str = match.group(1)
    return df, date_str
@@ -0,0 +1,67 @@
 from pathlib import Path
 import polars as pl
 import argparse
 import os
 OUTPUT_DIR = Path("./data/output")
 CORRECT_ORDER_OF_COLUMNS = ["time", "icao", "r", "t", "dbFlags", "ownOp", "year", "desc", "aircraft_category"]
 def main():
    parser = argparse.ArgumentParser(description="Concatenate compressed parquet files for a single day")
    parser.add_argument("--date", type=str, required=True, help="Date in YYYY-MM-DD format")
    parser.add_argument("--concat_with_latest_csv", action="store_true", help="Whether to also concatenate with the latest CSV from GitHub releases")
    args = parser.parse_args()
    compressed_dir = OUTPUT_DIR / "compressed"
    date_dir = compressed_dir / args.date
    parquet_files = sorted(date_dir.glob("*.parquet"))
    df = None
    if parquet_files: # TODO: This logic could be updated slightly.
        print(f"No parquet files found in {date_dir}")
        frames = [pl.read_parquet(p) for p in parquet_files]
        df = pl.concat(frames, how="vertical", rechunk=True)
        df = df.sort(["time", "icao"])
        df = df.select(CORRECT_ORDER_OF_COLUMNS)
        output_path = OUTPUT_DIR / f"openairframes_adsb_{args.date}.parquet"
        print(f"Writing combined parquet to {output_path} with {df.height} rows")
        df.write_parquet(output_path)
        csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{args.date}.csv.gz"
        print(f"Writing combined csv.gz to {csv_output_path} with {df.height} rows")
        df.write_csv(csv_output_path, compression="gzip")
    if args.concat_with_latest_csv:
        print("Loading latest CSV from GitHub releases to concatenate with...")
        from src.get_latest_release import get_latest_aircraft_adsb_csv_df
        from datetime import datetime
        df_latest_csv, csv_start_date, csv_end_date = get_latest_aircraft_adsb_csv_df()
        # Compare dates: end_date is exclusive, so if csv_end_date > args.date, 
        # the latest CSV already includes this day's data
        csv_end_dt = datetime.strptime(csv_end_date, "%Y-%m-%d")
        args_dt = datetime.strptime(args.date, "%Y-%m-%d")
        if df is None or csv_end_dt >= args_dt:
            print(f"Latest CSV already includes data through {args.date} (end_date={csv_end_date} is exclusive)")
            print("Writing latest CSV directly without concatenation to avoid duplicates")
            os.makedirs(OUTPUT_DIR, exist_ok=True)
            final_csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{csv_start_date}_{csv_end_date}.csv.gz"
            df_latest_csv = df_latest_csv.select(CORRECT_ORDER_OF_COLUMNS)
            df_latest_csv.write_csv(final_csv_output_path, compression="gzip")
        else:
            print(f"Concatenating latest CSV (through {csv_end_date}) with new data ({args.date})")
            # Ensure column order matches before concatenating
            df_latest_csv = df_latest_csv.select(CORRECT_ORDER_OF_COLUMNS)
            from src.adsb.compress_adsb_to_aircraft_data import concat_compressed_dfs
            df_final = concat_compressed_dfs(df_latest_csv, df)
            df_final = df_final.select(CORRECT_ORDER_OF_COLUMNS)
            final_csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{csv_start_date}_{args.date}.csv.gz"
            df_final.write_csv(final_csv_output_path, compression="gzip")
        print(f"Final CSV written to {final_csv_output_path}")
 if __name__ == "__main__":
    main()
@@ -1,42 +1,33 @@
 """
 Downloads adsb.lol data and writes to Parquet files.
-Usage:
+This file contains utility functions for downloading and processing adsb.lol trace data.
-    python -m src.process_historical_adsb_data.download_to_parquet 2025-01-01 2025-01-02
+Used by the historical ADS-B processing pipeline.
 This will download trace data for the specified date range and output Parquet files.
 This file is self-contained and does not import from other project modules.
 """
-import gc
+import datetime as dt
 import glob
 import gzip
 import os
 import re
 import resource
 import shutil
 import sys
 import logging
 import time
 import re
 import signal
 import concurrent.futures
 import subprocess
-import os
+import sys
 import argparse
 import datetime as dt
 from datetime import datetime, timedelta, timezone
 import urllib.request
 import urllib.error
-
+import urllib.request
 from datetime import datetime
 import time
 import orjson
 import pyarrow as pa
 import pyarrow.parquet as pq
 from pathlib import Path
 # ============================================================================
 # Configuration
 # ============================================================================
-OUTPUT_DIR = "./data/output"
+OUTPUT_DIR = Path("./data/output")
 os.makedirs(OUTPUT_DIR, exist_ok=True)
 PARQUET_DIR = os.path.join(OUTPUT_DIR, "parquet_output")
@@ -76,20 +67,16 @@ def timeout_handler(signum, frame):
    raise DownloadTimeoutException("Download timed out after 40 seconds")
-def fetch_releases(version_date: str) -> list:
+def _fetch_releases_from_repo(year: str, version_date: str) -> list:
-    """Fetch GitHub releases for a given version date from adsblol."""
+    """Fetch GitHub releases for a given version date from a specific year's adsblol repo."""
    year = version_date.split('.')[0][1:]
    if version_date == "v2024.12.31":
        year = "2025"
    BASE_URL = f"https://api.github.com/repos/adsblol/globe_history_{year}/releases"
-    # Match exact release name, exclude tmp releases
+    PATTERN = rf"^{re.escape(version_date)}-planes-readsb-prod-\d+(tmp)?$"
    PATTERN = rf"^{re.escape(version_date)}-planes-readsb-prod-\d+$"
    releases = []
    page = 1
    while True:
        max_retries = 10
-        retry_delay = 60
+        retry_delay = 60*5
        for attempt in range(1, max_retries + 1):
            try:
@@ -101,7 +88,7 @@ def fetch_releases(version_date: str) -> list:
                    else:
                        print(f"Failed to fetch releases (attempt {attempt}/{max_retries}): {response.status} {response.reason}")
                        if attempt < max_retries:
-                            print(f"Waiting {retry_delay} seconds before retry...")
+                            print(f"Waiting {retry_delay} seconds before retry")
                            time.sleep(retry_delay)
                        else:
                            print(f"Giving up after {max_retries} attempts")
@@ -109,7 +96,7 @@ def fetch_releases(version_date: str) -> list:
            except Exception as e:
                print(f"Request exception (attempt {attempt}/{max_retries}): {e}")
                if attempt < max_retries:
-                    print(f"Waiting {retry_delay} seconds before retry...")
+                    print(f"Waiting {retry_delay} seconds before retry")
                    time.sleep(retry_delay)
                else:
                    print(f"Giving up after {max_retries} attempts")
@@ -123,41 +110,118 @@ def fetch_releases(version_date: str) -> list:
    return releases
-def download_asset(asset_url: str, file_path: str) -> bool:
+def fetch_releases(version_date: str) -> list:
-    """Download a single release asset."""
+    """Fetch GitHub releases for a given version date from adsblol.
    For Dec 31 dates, if no releases are found in the current year's repo,
    also checks the next year's repo (adsblol sometimes publishes Dec 31
    data in the following year's repository).
    """
    year = version_date.split('.')[0][1:]
    releases = _fetch_releases_from_repo(year, version_date)
    # For last day of year, also check next year's repo if nothing found
    if not releases and version_date.endswith(".12.31"):
        next_year = str(int(year) + 1)
        print(f"No releases found for {version_date} in {year} repo, checking {next_year} repo")
        releases = _fetch_releases_from_repo(next_year, version_date)
    return releases
 def download_asset(asset_url: str, file_path: str, expected_size: int | None = None) -> bool:
    """Download a single release asset with size verification.
    Args:
        asset_url: URL to download from
        file_path: Local path to save to
        expected_size: Expected file size in bytes (for verification)
    Returns:
        True if download succeeded and size matches (if provided), False otherwise
    """
    os.makedirs(os.path.dirname(file_path) or OUTPUT_DIR, exist_ok=True)
    # Check if file exists and has correct size
    if os.path.exists(file_path):
-        print(f"[SKIP] {file_path} already downloaded.")
+        if expected_size is not None:
-        return True
+            actual_size = os.path.getsize(file_path)
-    
+            if actual_size == expected_size:
-    print(f"Downloading {asset_url}...")
+                print(f"[SKIP] {file_path} already downloaded and verified ({actual_size} bytes).")
    try:
        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(40)  # 40-second timeout
        req = urllib.request.Request(asset_url, headers=HEADERS)
        with urllib.request.urlopen(req) as response:
            signal.alarm(0)
            if response.status == 200:
                with open(file_path, "wb") as file:
                    while True:
                        chunk = response.read(8192)
                        if not chunk:
                            break
                        file.write(chunk)
                print(f"Saved {file_path}")
                return True
            else:
-                print(f"Failed to download {asset_url}: {response.status} {response.msg}")
+                print(f"[WARN] {file_path} exists but size mismatch (expected {expected_size}, got {actual_size}). Re-downloading.")
                os.remove(file_path)
        else:
            print(f"[SKIP] {file_path} already downloaded.")
            return True
    max_retries = 2
    retry_delay = 30
    timeout_seconds = 140
    for attempt in range(1, max_retries + 1):
        print(f"Downloading {asset_url} (attempt {attempt}/{max_retries})")
        try:
            req = urllib.request.Request(asset_url, headers=HEADERS)
            with urllib.request.urlopen(req, timeout=timeout_seconds) as response:
                if response.status == 200:
                    with open(file_path, "wb") as file:
                        while True:
                            chunk = response.read(8192)
                            if not chunk:
                                break
                            file.write(chunk)
                    # Verify file size if expected_size was provided
                    if expected_size is not None:
                        actual_size = os.path.getsize(file_path)
                        if actual_size != expected_size:
                            print(f"[ERROR] Size mismatch for {file_path}: expected {expected_size} bytes, got {actual_size} bytes")
                            os.remove(file_path)
                            if attempt < max_retries:
                                print(f"Waiting {retry_delay} seconds before retry")
                                time.sleep(retry_delay)
                                continue
                            return False
                        print(f"Saved {file_path} ({actual_size} bytes, verified)")
                    else:
                        print(f"Saved {file_path}")
                    return True
                else:
                    print(f"Failed to download {asset_url}: {response.status} {response.msg}")
                    if attempt < max_retries:
                        print(f"Waiting {retry_delay} seconds before retry")
                        time.sleep(retry_delay)
                    else:
                        return False
        except urllib.error.HTTPError as e:
            if e.code == 404:
                print(f"404 Not Found: {asset_url}")
                raise Exception(f"Asset not found (404): {asset_url}")
            else:
                print(f"HTTP error occurred (attempt {attempt}/{max_retries}): {e.code} {e.reason}")
                if attempt < max_retries:
                    print(f"Waiting {retry_delay} seconds before retry")
                    time.sleep(retry_delay)
                else:
                    return False
        except urllib.error.URLError as e:
            print(f"URL/Timeout error (attempt {attempt}/{max_retries}): {e}")
            if attempt < max_retries:
                print(f"Waiting {retry_delay} seconds before retry")
                time.sleep(retry_delay)
            else:
                return False
-    except DownloadTimeoutException as e:
+        except Exception as e:
-        print(f"Download aborted for {asset_url}: {e}")
+            print(f"An error occurred (attempt {attempt}/{max_retries}): {e}")
-        return False
+            if attempt < max_retries:
-    except Exception as e:
+                print(f"Waiting {retry_delay} seconds before retry")
-        print(f"An error occurred while downloading {asset_url}: {e}")
+                time.sleep(retry_delay)
-        return False
+            else:
                return False
    return False
 def extract_split_archive(file_paths: list, extract_dir: str) -> bool:
@@ -196,7 +260,6 @@ def extract_split_archive(file_paths: list, extract_dir: str) -> bool:
            stdin=cat_proc.stdout,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True
        )
        cat_proc.stdout.close()
        cat_stderr = cat_proc.stderr.read().decode() if cat_proc.stderr else ""
@@ -205,6 +268,24 @@ def extract_split_archive(file_paths: list, extract_dir: str) -> bool:
        if cat_stderr:
            print(f"cat stderr: {cat_stderr}")
        tar_stderr = result.stderr.decode() if result.stderr else ""
        if result.returncode != 0:
            # GNU tar exits non-zero for format issues that BSD tar silently
            # tolerates (e.g. trailing junk after the last valid entry).
            # Check whether files were actually extracted before giving up.
            extracted_items = os.listdir(extract_dir)
            if extracted_items:
                print(f"[WARN] tar exited {result.returncode} but extracted "
                      f"{len(extracted_items)} items — treating as success")
                if tar_stderr:
                    print(f"tar stderr: {tar_stderr}")
            else:
                print(f"Failed to extract split archive (tar exit {result.returncode})")
                if tar_stderr:
                    print(f"tar stderr: {tar_stderr}")
                shutil.rmtree(extract_dir, ignore_errors=True)
                return False
        print(f"Successfully extracted archive to {extract_dir}")
        # Delete tar files immediately after extraction
@@ -221,11 +302,9 @@ def extract_split_archive(file_paths: list, extract_dir: str) -> bool:
        print(f"Disk space after tar deletion: {free_gb:.1f}GB free")
        return True
-    except subprocess.CalledProcessError as e:
+    except Exception as e:
        stderr_output = e.stderr.decode() if e.stderr else ""
        print(f"Failed to extract split archive: {e}")
-        if stderr_output:
+        shutil.rmtree(extract_dir, ignore_errors=True)
            print(f"tar stderr: {stderr_output}")
        return False
@@ -389,8 +468,6 @@ COLUMNS = [
 OS_CPU_COUNT = os.cpu_count() or 1
 MAX_WORKERS = OS_CPU_COUNT if OS_CPU_COUNT > 4 else 1
 CHUNK_SIZE = MAX_WORKERS * 500  # Reduced for lower RAM usage
 BATCH_SIZE = 250_000  # Fixed size for predictable memory usage (~500MB per batch)
 # PyArrow schema for efficient Parquet writing
 PARQUET_SCHEMA = pa.schema([
@@ -478,211 +555,6 @@ def collect_trace_files_with_find(root_dir):
    return trace_dict
 def generate_version_dates(start_date: str, end_date: str) -> list:
    """Generate a list of dates from start_date to end_date inclusive."""
    start = datetime.strptime(start_date, "%Y-%m-%d")
    end = datetime.strptime(end_date, "%Y-%m-%d")
    delta = end - start
    return [start + timedelta(days=i) for i in range(delta.days + 1)]
 def safe_process(fp):
    """Safely process a file, returning empty list on error."""
    try:
        return process_file(fp)
    except Exception as e:
        logging.error(f"Error processing {fp}: {e}")
        return []
 def rows_to_arrow_table(rows: list) -> pa.Table:
    """Convert list of rows to a PyArrow Table directly (no pandas)."""
    # Transpose rows into columns
    columns = list(zip(*rows))
    # Build arrays for each column according to schema
    arrays = []
    for i, field in enumerate(PARQUET_SCHEMA):
        col_data = list(columns[i]) if i < len(columns) else [None] * len(rows)
        arrays.append(pa.array(col_data, type=field.type))
    return pa.Table.from_arrays(arrays, schema=PARQUET_SCHEMA)
 def write_batch_to_parquet(rows: list, version_date: str, batch_idx: int):
    """Write a batch of rows to a Parquet file."""
    if not rows:
        return
    table = rows_to_arrow_table(rows)
    parquet_path = os.path.join(PARQUET_DIR, f"{version_date}_batch_{batch_idx:04d}.parquet")
    pq.write_table(table, parquet_path, compression='snappy')
    print(f"Written parquet batch {batch_idx} ({len(rows)} rows) | {get_resource_usage()}")
 def merge_parquet_files(version_date: str, delete_batches: bool = True):
    """Merge all batch parquet files for a version_date into a single file using streaming."""
    pattern = os.path.join(PARQUET_DIR, f"{version_date}_batch_*.parquet")
    batch_files = sorted(glob.glob(pattern))
    if not batch_files:
        print(f"No batch files found for {version_date}")
        return None
    print(f"Merging {len(batch_files)} batch files for {version_date} (streaming)...")
    merged_path = os.path.join(PARQUET_DIR, f"{version_date}.parquet")
    total_rows = 0
    # Stream write: read one batch at a time to minimize RAM usage
    writer = None
    try:
        for i, f in enumerate(batch_files):
            table = pq.read_table(f)
            total_rows += table.num_rows
            if writer is None:
                writer = pq.ParquetWriter(merged_path, table.schema, compression='snappy')
            writer.write_table(table)
            # Delete batch file immediately after reading to free disk space
            if delete_batches:
                os.remove(f)
            # Free memory
            del table
            if (i + 1) % 10 == 0:
                gc.collect()
                print(f"  Merged {i + 1}/{len(batch_files)} batches... | {get_resource_usage()}")
    finally:
        if writer is not None:
            writer.close()
    print(f"Merged parquet file written to {merged_path} ({total_rows} total rows) | {get_resource_usage()}")
    if delete_batches:
        print(f"Deleted {len(batch_files)} batch files during merge")
    gc.collect()
    return merged_path
 def process_version_date(version_date: str, keep_folders: bool = False):
    """Download, extract, and process trace files for a single version date."""
    print(f"\nProcessing version_date: {version_date}")
    extract_dir = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0.tar_0")
    def collect_trace_files_for_version_date(vd):
        releases = fetch_releases(vd)
        if len(releases) == 0:
            print(f"No releases found for {vd}.")
            return None
        downloaded_files = []
        for release in releases:
            tag_name = release["tag_name"]
            print(f"Processing release: {tag_name}")
            # Only download prod-0 if available, else prod-0tmp
            assets = release.get("assets", [])
            normal_assets = [
                a for a in assets
                if "planes-readsb-prod-0." in a["name"] and "tmp" not in a["name"]
            ]
            tmp_assets = [
                a for a in assets
                if "planes-readsb-prod-0tmp" in a["name"]
            ]
            use_assets = normal_assets if normal_assets else tmp_assets
            for asset in use_assets:
                asset_name = asset["name"]
                asset_url = asset["browser_download_url"]
                file_path = os.path.join(OUTPUT_DIR, asset_name)
                result = download_asset(asset_url, file_path)
                if result:
                    downloaded_files.append(file_path)
        extract_split_archive(downloaded_files, extract_dir)
        return collect_trace_files_with_find(extract_dir)
    # Check if files already exist
    pattern = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0*")
    matches = [p for p in glob.glob(pattern) if os.path.isfile(p)]
    if matches:
        print(f"Found existing files for {version_date}:")
        # Prefer non-tmp slices when reusing existing files
        normal_matches = [
            p for p in matches
            if "-planes-readsb-prod-0." in os.path.basename(p)
            and "tmp" not in os.path.basename(p)
        ]
        downloaded_files = normal_matches if normal_matches else matches
        extract_split_archive(downloaded_files, extract_dir)
        trace_files = collect_trace_files_with_find(extract_dir)
    else:
        trace_files = collect_trace_files_for_version_date(version_date)
    if trace_files is None or len(trace_files) == 0:
        print(f"No trace files found for version_date: {version_date}")
        return 0
    file_list = list(trace_files.values())
    start_time = time.perf_counter()
    total_num_rows = 0
    batch_rows = []
    batch_idx = 0
    # Process files in chunks
    for offset in range(0, len(file_list), CHUNK_SIZE):
        chunk = file_list[offset:offset + CHUNK_SIZE]
        with concurrent.futures.ProcessPoolExecutor(max_workers=MAX_WORKERS) as process_executor:
            for rows in process_executor.map(safe_process, chunk):
                if not rows:
                    continue
                batch_rows.extend(rows)
                if len(batch_rows) >= BATCH_SIZE:
                    total_num_rows += len(batch_rows)
                    write_batch_to_parquet(batch_rows, version_date, batch_idx)
                    batch_idx += 1
                    batch_rows = []
                    elapsed = time.perf_counter() - start_time
                    speed = total_num_rows / elapsed if elapsed > 0 else 0
                    print(f"[{version_date}] processed {total_num_rows} rows in {elapsed:.2f}s ({speed:.2f} rows/s)")
        gc.collect()
    # Final batch
    if batch_rows:
        total_num_rows += len(batch_rows)
        write_batch_to_parquet(batch_rows, version_date, batch_idx)
        elapsed = time.perf_counter() - start_time
        speed = total_num_rows / elapsed if elapsed > 0 else 0
        print(f"[{version_date}] processed {total_num_rows} rows in {elapsed:.2f}s ({speed:.2f} rows/s)")
    print(f"Total rows processed for version_date {version_date}: {total_num_rows}")
    # Clean up extracted directory immediately after processing (before merging parquet files)
    if not keep_folders and os.path.isdir(extract_dir):
        print(f"Deleting extraction directory with 100,000+ files: {extract_dir}")
        shutil.rmtree(extract_dir)
        print(f"Successfully deleted extraction directory: {extract_dir} | {get_resource_usage()}")
    # Merge batch files into a single parquet file
    merge_parquet_files(version_date, delete_batches=True)
    return total_num_rows
 def create_parquet_for_day(day, keep_folders: bool = False):
    """Create parquet file for a single day.
@@ -706,42 +578,10 @@ def create_parquet_for_day(day, keep_folders: bool = False):
        print(f"Parquet file already exists: {parquet_path}")
        return parquet_path
-    print(f"Creating parquet for {version_date}...")
+    print(f"Creating parquet for {version_date}")
    rows_processed = process_version_date(version_date, keep_folders)
    if rows_processed > 0 and parquet_path.exists():
        return parquet_path
    else:
        return None
 def main(start_date: str, end_date: str, keep_folders: bool = False):
    """Main function to download and convert adsb.lol data to Parquet."""
    version_dates = [f"v{date.strftime('%Y.%m.%d')}" for date in generate_version_dates(start_date, end_date)]
    print(f"Processing dates: {version_dates}")
    total_rows_all = 0
    for version_date in version_dates:
        rows_processed = process_version_date(version_date, keep_folders)
        total_rows_all += rows_processed
    print(f"\n=== Summary ===")
    print(f"Total dates processed: {len(version_dates)}")
    print(f"Total rows written to Parquet: {total_rows_all}")
    print(f"Parquet files location: {PARQUET_DIR}")
 if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, stream=sys.stdout, force=True)
    parser = argparse.ArgumentParser(
        description="Download adsb.lol data and write to Parquet files"
    )
    parser.add_argument("start_date", type=str, help="Start date in YYYY-MM-DD format")
    parser.add_argument("end_date", type=str, help="End date in YYYY-MM-DD format")
    parser.add_argument("--keep-folders", action="store_true", 
                        help="Keep extracted folders after processing")
    args = parser.parse_args()
    main(args.start_date, args.end_date, args.keep_folders)
@@ -1,9 +1,7 @@
 """
-Downloads and extracts adsb.lol tar files, then lists all ICAO folders.
+Downloads and extracts adsb.lol tar files for a single day, then lists all ICAO folders.
 This is the first step of the map-reduce pipeline.
 Supports both single-day (daily) and multi-day (historical) modes.
 Outputs:
 - Extracted trace files in data/output/{version_date}-planes-readsb-prod-0.tar_0/
 - ICAO manifest at data/output/icao_manifest_{date}.txt
@@ -25,11 +23,6 @@ from src.adsb.download_adsb_data_to_parquet import (
 )
 def get_target_day() -> datetime:
    """Get yesterday's date (the day we're processing)."""
    return datetime.utcnow() - timedelta(days=1)
 def download_and_extract(version_date: str) -> str | None:
    """Download and extract tar files, return extract directory path."""
    extract_dir = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0.tar_0")
@@ -59,6 +52,12 @@ def download_and_extract(version_date: str) -> str | None:
            print(f"No releases found for {version_date}")
            return None
        # Prefer non-tmp releases; only use tmp if no normal releases exist
        normal_releases = [r for r in releases if "tmp" not in r["tag_name"]]
        tmp_releases = [r for r in releases if "tmp" in r["tag_name"]]
        releases = normal_releases if normal_releases else tmp_releases
        print(f"Using {'normal' if normal_releases else 'tmp'} releases ({len(releases)} found)")
        downloaded_files = []
        for release in releases:
            tag_name = release["tag_name"]
@@ -78,8 +77,9 @@ def download_and_extract(version_date: str) -> str | None:
            for asset in use_assets:
                asset_name = asset["name"]
                asset_url = asset["browser_download_url"]
                asset_size = asset.get("size")  # Get expected file size
                file_path = os.path.join(OUTPUT_DIR, asset_name)
-                if download_asset(asset_url, file_path):
+                if download_asset(asset_url, file_path, expected_size=asset_size):
                    downloaded_files.append(file_path)
    if not downloaded_files:
@@ -100,21 +100,6 @@ def list_icao_folders(extract_dir: str) -> list[str]:
    return icaos
 def write_manifest(icaos: list[str], manifest_id: str) -> str:
    """Write ICAO list to manifest file.
    Args:
        icaos: List of ICAO codes
        manifest_id: Identifier for manifest file (date or date range)
    """
    manifest_path = os.path.join(OUTPUT_DIR, f"icao_manifest_{manifest_id}.txt")
    with open(manifest_path, "w") as f:
        for icao in sorted(icaos):
            f.write(f"{icao}\n")
    print(f"Wrote manifest with {len(icaos)} ICAOs to {manifest_path}")
    return manifest_path
 def process_single_day(target_day: datetime) -> tuple[str | None, list[str]]:
    """Process a single day: download, extract, list ICAOs.
@@ -129,82 +114,50 @@ def process_single_day(target_day: datetime) -> tuple[str | None, list[str]]:
    extract_dir = download_and_extract(version_date)
    if not extract_dir:
        print(f"Failed to download/extract data for {date_str}")
-        return None, []
+        raise Exception(f"No data available for {date_str}")
    icaos = list_icao_folders(extract_dir)
    print(f"Found {len(icaos)} ICAOs for {date_str}")
    return extract_dir, icaos
-
+from pathlib import Path
-def process_date_range(start_date: datetime, end_date: datetime) -> set[str]:
+import tarfile
-    """Process multiple days: download, extract, combine ICAO lists.
+NUMBER_PARTS = 4
-    
+def split_folders_into_gzip_archives(extract_dir: Path, tar_output_dir: Path, icaos: list[str], parts = NUMBER_PARTS) -> list[str]:
-    Args:
+    traces_dir = extract_dir / "traces"
-        start_date: Start date (inclusive)
+    buckets = sorted(traces_dir.iterdir())
-        end_date: End date (inclusive)
+    tars = []
-    
+    for i in range(parts):
-    Returns:
+        tar_path = tar_output_dir / f"{tar_output_dir.name}_part_{i}.tar.gz"
-        Combined set of all ICAOs across the date range
+        tars.append(tarfile.open(tar_path, "w:gz"))
-    """
+    for idx, bucket_path in enumerate(buckets):
-    all_icaos: set[str] = set()
+        tar_idx = idx % parts
-    current = start_date
+        tars[tar_idx].add(bucket_path, arcname=bucket_path.name)
-    
+    for tar in tars:
-    # Both start and end are inclusive
+        tar.close()
    while current <= end_date:
        _, icaos = process_single_day(current)
        all_icaos.update(icaos)
        current += timedelta(days=1)
    return all_icaos
 def main():
-    parser = argparse.ArgumentParser(description="Download and list ICAOs from adsb.lol data")
+    parser = argparse.ArgumentParser(description="Download and list ICAOs from adsb.lol data for a single day")
    parser.add_argument("--date", type=str, help="Single date in YYYY-MM-DD format (default: yesterday)")
    parser.add_argument("--start-date", type=str, help="Start date for range (YYYY-MM-DD)")
    parser.add_argument("--end-date", type=str, help="End date for range (YYYY-MM-DD)")
    args = parser.parse_args()
-    # Determine mode: single day or date range
+    target_day = datetime.strptime(args.date, "%Y-%m-%d")
-    if args.start_date and args.end_date:
+    date_str = target_day.strftime("%Y-%m-%d")
-        # Historical mode: process date range
+    tar_output_dir = Path(f"./data/output/adsb_archives/{date_str}")
-        start_date = datetime.strptime(args.start_date, "%Y-%m-%d")
+    
-        end_date = datetime.strptime(args.end_date, "%Y-%m-%d")
+    extract_dir, icaos = process_single_day(target_day)
-        
+    extract_dir = Path(extract_dir)
-        print(f"Processing date range: {args.start_date} to {args.end_date}")
+    print(extract_dir)
-        
+    tar_output_dir.mkdir(parents=True, exist_ok=True)
-        all_icaos = process_date_range(start_date, end_date)
+    split_folders_into_gzip_archives(extract_dir, tar_output_dir, icaos)
-        
+    if not icaos:
-        if not all_icaos:
+        print("No ICAOs found")
-            print("No ICAOs found in date range")
+        sys.exit(1)
-            sys.exit(1)
+    
-        
+    print(f"\nDone! Extract dir: {extract_dir}")
-        # Write combined manifest with range identifier
+    print(f"Total ICAOs: {len(icaos)}")
        manifest_id = f"{args.start_date}_{args.end_date}"
        write_manifest(list(all_icaos), manifest_id)
        print(f"\nDone! Total ICAOs: {len(all_icaos)}")
    else:
        # Daily mode: single day
        if args.date:
            target_day = datetime.strptime(args.date, "%Y-%m-%d")
        else:
            target_day = get_target_day()
        date_str = target_day.strftime("%Y-%m-%d")
        extract_dir, icaos = process_single_day(target_day)
        if not icaos:
            print("No ICAOs found")
            sys.exit(1)
        write_manifest(icaos, date_str)
        print(f"\nDone! Extract dir: {extract_dir}")
        print(f"Total ICAOs: {len(icaos)}")
 if __name__ == "__main__":
@@ -41,7 +41,7 @@ def main() -> None:
    """Main entry point for GitHub Actions."""
    start_date = os.environ.get("INPUT_START_DATE")
    end_date = os.environ.get("INPUT_END_DATE")
-    chunk_days = int(os.environ.get("INPUT_CHUNK_DAYS", "7"))
+    chunk_days = int(os.environ.get("INPUT_CHUNK_DAYS", "1"))
    if not start_date or not end_date:
        print("ERROR: INPUT_START_DATE and INPUT_END_DATE must be set", file=sys.stderr)
@@ -0,0 +1,78 @@
 """
 Main pipeline for processing ADS-B data from adsb.lol.
 Usage:
    python -m src.adsb.main --date 2026-01-01
    python -m src.adsb.main --start_date 2026-01-01 --end_date 2026-01-03
 """
 import argparse
 import subprocess
 import sys
 from datetime import datetime, timedelta
 import polars as pl
 from src.adsb.download_and_list_icaos import NUMBER_PARTS
 def main():
    parser = argparse.ArgumentParser(description="Process ADS-B data for a single day or date range")
    parser.add_argument("--date", type=str, help="Single date in YYYY-MM-DD format")
    parser.add_argument("--start_date", type=str, help="Start date (inclusive, YYYY-MM-DD)")
    parser.add_argument("--end_date", type=str, help="End date (exclusive, YYYY-MM-DD)")
    parser.add_argument("--concat_with_latest_csv", action="store_true", help="Also concatenate with latest CSV from GitHub releases")
    args = parser.parse_args()
    if args.date and (args.start_date or args.end_date):
        raise SystemExit("Use --date or --start_date/--end_date, not both.")
    if args.date:
        start_date = datetime.strptime(args.date, "%Y-%m-%d")
        end_date = start_date + timedelta(days=1)
    else:
        if not args.start_date or not args.end_date:
            raise SystemExit("Provide --start_date and --end_date, or use --date.")
        start_date = datetime.strptime(args.start_date, "%Y-%m-%d")
        end_date = datetime.strptime(args.end_date, "%Y-%m-%d")
    current = start_date
    while current < end_date:
        date_str = current.strftime("%Y-%m-%d")
        print(f"Processing day: {date_str}")
        # Download and split
        subprocess.run([sys.executable, "-m", "src.adsb.download_and_list_icaos", "--date", date_str], check=True)
        # Process parts
        for part_id in range(NUMBER_PARTS):
            subprocess.run([sys.executable, "-m", "src.adsb.process_icao_chunk", "--part-id", str(part_id), "--date", date_str], check=True)
        # Concatenate
        concat_cmd = [sys.executable, "-m", "src.adsb.concat_parquet_to_final", "--date", date_str]
        if args.concat_with_latest_csv:
            concat_cmd.append("--concat_with_latest_csv")
        subprocess.run(concat_cmd, check=True)
        current += timedelta(days=1)
    if end_date - start_date > timedelta(days=1):
        dates = []
        cur = start_date
        while cur < end_date:
            dates.append(cur.strftime("%Y-%m-%d"))
            cur += timedelta(days=1)
        csv_files = [
            f"data/outputs/openairframes_adsb_{d}_{d}.csv"
            for d in dates
        ]
        frames = [pl.read_csv(p) for p in csv_files]
        df = pl.concat(frames, how="vertical", rechunk=True)
        output_path = f"data/outputs/openairframes_adsb_{start_date.strftime('%Y-%m-%d')}_{end_date.strftime('%Y-%m-%d')}.csv"
        df.write_csv(output_path)
        print(f"Wrote combined CSV: {output_path}")
    print("Done")
 if __name__ == "__main__":
    main()
@@ -1,18 +1,9 @@
 """
-Processes a chunk of ICAOs from pre-extracted trace files.
+Processes trace files from a single archive part for a single day.
 This is the map phase of the map-reduce pipeline.
 Supports both single-day (daily) and multi-day (historical) modes.
 Expects extract_dir to already exist with trace files.
 Reads ICAO manifest to determine which ICAOs to process based on chunk-id.
 Usage:
-    # Daily mode (single day)
+    python -m src.adsb.process_icao_chunk --part-id 1 --date 2026-01-01
    python -m src.adsb.process_icao_chunk --chunk-id 0 --total-chunks 4
    # Historical mode (date range)
    python -m src.adsb.process_icao_chunk --chunk-id 0 --total-chunks 4 --start-date 2024-01-01 --end-date 2024-01-07
 """
 import gc
 import os
@@ -21,6 +12,9 @@ import argparse
 import time
 import concurrent.futures
 from datetime import datetime, timedelta
 import tarfile
 import tempfile
 import shutil
 import pyarrow as pa
 import pyarrow.parquet as pq
@@ -37,72 +31,21 @@ from src.adsb.download_adsb_data_to_parquet import (
 )
 CHUNK_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "adsb_chunks")
 os.makedirs(CHUNK_OUTPUT_DIR, exist_ok=True)
 # Smaller batch size for memory efficiency
 BATCH_SIZE = 100_000
-
+def build_trace_file_map(archive_path: str) -> dict[str, str]:
-def get_target_day() -> datetime:
+    """Build a map of ICAO -> trace file path by extracting tar.gz archive."""
-    """Get yesterday's date (the day we're processing)."""
+    print(f"Extracting {archive_path}...")
    return datetime.utcnow() - timedelta(days=1)
 def read_manifest(manifest_id: str) -> list[str]:
    """Read ICAO manifest file.
-    Args:
+    temp_dir = tempfile.mkdtemp(prefix="adsb_extract_")
        manifest_id: Either a date string (YYYY-MM-DD) or range string (YYYY-MM-DD_YYYY-MM-DD)
    """
    manifest_path = os.path.join(OUTPUT_DIR, f"icao_manifest_{manifest_id}.txt")
    if not os.path.exists(manifest_path):
        raise FileNotFoundError(f"Manifest not found: {manifest_path}")
-    with open(manifest_path, "r") as f:
+    with tarfile.open(archive_path, 'r:gz') as tar:
-        icaos = [line.strip() for line in f if line.strip()]
+        tar.extractall(path=temp_dir, filter='data')
    return icaos
 def deterministic_hash(s: str) -> int:
    """Return a deterministic hash for a string (unlike Python's hash() which is randomized)."""
    # Use sum of byte values - simple but deterministic
    return sum(ord(c) for c in s)
 def get_chunk_icaos(icaos: list[str], chunk_id: int, total_chunks: int) -> list[str]:
    """Get the subset of ICAOs for this chunk based on deterministic hash partitioning."""
    return [icao for icao in icaos if deterministic_hash(icao) % total_chunks == chunk_id]
 def build_trace_file_map(extract_dir: str) -> dict[str, str]:
    """Build a map of ICAO -> trace file path using find command."""
    print(f"Building trace file map from {extract_dir}...")
-    # Debug: check what's in extract_dir
+    trace_map = collect_trace_files_with_find(temp_dir)
    if os.path.isdir(extract_dir):
        items = os.listdir(extract_dir)[:10]
        print(f"First 10 items in extract_dir: {items}")
        # Check if there are subdirectories
        for item in items[:3]:
            subpath = os.path.join(extract_dir, item)
            if os.path.isdir(subpath):
                subitems = os.listdir(subpath)[:5]
                print(f"  Contents of {item}/: {subitems}")
    trace_map = collect_trace_files_with_find(extract_dir)
    print(f"Found {len(trace_map)} trace files")
    if len(trace_map) == 0:
        # Debug: try manual find
        import subprocess
        result = subprocess.run(
            ['find', extract_dir, '-type', 'f', '-name', 'trace_full_*'],
            capture_output=True, text=True
        )
        print(f"Manual find output (first 500 chars): {result.stdout[:500]}")
        print(f"Manual find stderr: {result.stderr[:200]}")
    return trace_map
@@ -125,42 +68,13 @@ def rows_to_table(rows: list) -> pa.Table:
 def process_chunk(
-    chunk_id: int,
+    trace_files: list[str],
-    total_chunks: int,
+    part_id: int,
-    trace_map: dict[str, str],
+    date_str: str,
    icaos: list[str],
    output_id: str,
 ) -> str | None:
-    """Process a chunk of ICAOs and write to parquet.
+    """Process trace files and write to a single parquet file."""
-    Args:
+    output_path = os.path.join(PARQUET_DIR, f"part_{part_id}_{date_str}.parquet")
        chunk_id: This chunk's ID (0-indexed)
        total_chunks: Total number of chunks
        trace_map: Map of ICAO -> trace file path
        icaos: Full list of ICAOs from manifest
        output_id: Identifier for output file (date or date range)
    """
    chunk_icaos = get_chunk_icaos(icaos, chunk_id, total_chunks)
    print(f"Chunk {chunk_id}/{total_chunks}: Processing {len(chunk_icaos)} ICAOs")
    if not chunk_icaos:
        print(f"Chunk {chunk_id}: No ICAOs to process")
        return None
    # Get trace file paths from the map
    trace_files = []
    for icao in chunk_icaos:
        if icao in trace_map:
            trace_files.append(trace_map[icao])
    print(f"Chunk {chunk_id}: Found {len(trace_files)} trace files")
    if not trace_files:
        print(f"Chunk {chunk_id}: No trace files found")
        return None
    # Process files and write parquet in batches
    output_path = os.path.join(CHUNK_OUTPUT_DIR, f"chunk_{chunk_id}_{output_id}.parquet")
    start_time = time.perf_counter()
    total_rows = 0
@@ -168,7 +82,8 @@ def process_chunk(
    writer = None
    try:
-        # Process in parallel batches
+        writer = pq.ParquetWriter(output_path, PARQUET_SCHEMA, compression='snappy')
        files_per_batch = MAX_WORKERS * 100
        for offset in range(0, len(trace_files), files_per_batch):
            batch_files = trace_files[offset:offset + files_per_batch]
@@ -178,166 +93,72 @@ def process_chunk(
                    if rows:
                        batch_rows.extend(rows)
                        # Write when batch is full
                        if len(batch_rows) >= BATCH_SIZE:
-                            table = rows_to_table(batch_rows)
+                            writer.write_table(rows_to_table(batch_rows))
                            total_rows += len(batch_rows)
                            if writer is None:
                                writer = pq.ParquetWriter(output_path, PARQUET_SCHEMA, compression='snappy')
                            writer.write_table(table)
                            batch_rows = []
                            del table
                            gc.collect()
                            elapsed = time.perf_counter() - start_time
                            print(f"Chunk {chunk_id}: {total_rows} rows, {elapsed:.1f}s | {get_resource_usage()}")
            gc.collect()
        # Write remaining rows
        if batch_rows:
-            table = rows_to_table(batch_rows)
+            writer.write_table(rows_to_table(batch_rows))
            total_rows += len(batch_rows)
            if writer is None:
                writer = pq.ParquetWriter(output_path, PARQUET_SCHEMA, compression='snappy')
            writer.write_table(table)
            del table
    finally:
        if writer:
            writer.close()
-    elapsed = time.perf_counter() - start_time
+    print(f"Part {part_id}: Done! {total_rows} rows in {time.perf_counter() - start_time:.1f}s | {get_resource_usage()}")
    print(f"Chunk {chunk_id}: Done! {total_rows} rows in {elapsed:.1f}s | {get_resource_usage()}")
-    if total_rows > 0:
+    return output_path if total_rows > 0 else None
        return output_path
    return None
 def process_single_day(
    chunk_id: int,
    total_chunks: int,
    target_day: datetime,
 ) -> str | None:
    """Process a single day for this chunk."""
    date_str = target_day.strftime("%Y-%m-%d")
    version_date = f"v{target_day.strftime('%Y.%m.%d')}"
    extract_dir = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0.tar_0")
    if not os.path.isdir(extract_dir):
        print(f"Extract directory not found: {extract_dir}")
        return None
    trace_map = build_trace_file_map(extract_dir)
    if not trace_map:
        print("No trace files found")
        return None
    icaos = read_manifest(date_str)
    print(f"Total ICAOs in manifest: {len(icaos)}")
    return process_chunk(chunk_id, total_chunks, trace_map, icaos, date_str)
 def process_date_range(
    chunk_id: int,
    total_chunks: int,
    start_date: datetime,
    end_date: datetime,
 ) -> str | None:
    """Process a date range for this chunk.
    Combines trace files from all days in the range.
    Args:
        chunk_id: This chunk's ID (0-indexed)
        total_chunks: Total number of chunks
        start_date: Start date (inclusive)
        end_date: End date (inclusive)
    """
    start_str = start_date.strftime("%Y-%m-%d")
    end_str = end_date.strftime("%Y-%m-%d")
    manifest_id = f"{start_str}_{end_str}"
    print(f"Processing date range: {start_str} to {end_str}")
    # Build combined trace map from all days
    combined_trace_map: dict[str, str] = {}
    current = start_date
    # Both start and end are inclusive
    while current <= end_date:
        version_date = f"v{current.strftime('%Y.%m.%d')}"
        extract_dir = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0.tar_0")
        if os.path.isdir(extract_dir):
            trace_map = build_trace_file_map(extract_dir)
            # Later days override earlier days (use most recent trace file)
            combined_trace_map.update(trace_map)
            print(f"  {current.strftime('%Y-%m-%d')}: {len(trace_map)} trace files")
        else:
            print(f"  {current.strftime('%Y-%m-%d')}: no extract directory")
        current += timedelta(days=1)
    if not combined_trace_map:
        print("No trace files found in date range")
        return None
    print(f"Combined trace map: {len(combined_trace_map)} ICAOs")
    icaos = read_manifest(manifest_id)
    print(f"Total ICAOs in manifest: {len(icaos)}")
    return process_chunk(chunk_id, total_chunks, combined_trace_map, icaos, manifest_id)
 from pathlib import Path
 def main():
-    parser = argparse.ArgumentParser(description="Process a chunk of ICAOs")
+    parser = argparse.ArgumentParser(description="Process a single archive part for a day")
-    parser.add_argument("--chunk-id", type=int, required=True, help="Chunk ID (0-indexed)")
+    parser.add_argument("--part-id", type=int, required=True, help="Part ID (1-indexed)")
-    parser.add_argument("--total-chunks", type=int, required=True, help="Total number of chunks")
+    parser.add_argument("--date", type=str, required=True, help="Date in YYYY-MM-DD format")
    parser.add_argument("--date", type=str, help="Single date in YYYY-MM-DD format (default: yesterday)")
    parser.add_argument("--start-date", type=str, help="Start date for range (YYYY-MM-DD)")
    parser.add_argument("--end-date", type=str, help="End date for range (YYYY-MM-DD)")
    args = parser.parse_args()
-    print(f"Processing chunk {args.chunk_id}/{args.total_chunks}")
+    print(f"Processing part {args.part_id} for {args.date}")
    print(f"OUTPUT_DIR: {OUTPUT_DIR}")
    print(f"CHUNK_OUTPUT_DIR: {CHUNK_OUTPUT_DIR}")
    print(f"Resource usage at start: {get_resource_usage()}")
-    # Debug: List what's in OUTPUT_DIR
+    # Get specific archive file for this part
-    print(f"\nContents of {OUTPUT_DIR}:")
+    archive_dir = os.path.join(OUTPUT_DIR, "adsb_archives", args.date)
-    if os.path.isdir(OUTPUT_DIR):
+    archive_path = os.path.join(archive_dir, f"{args.date}_part_{args.part_id}.tar.gz")
        for item in os.listdir(OUTPUT_DIR)[:20]:
            print(f"  - {item}")
    else:
        print(f"  Directory does not exist!")
-    # Determine mode: single day or date range
+    if not os.path.isfile(archive_path):
-    if args.start_date and args.end_date:
+        print(f"ERROR: Archive not found: {archive_path}")
-        # Historical mode
+        if os.path.isdir(archive_dir):
-        start_date = datetime.strptime(args.start_date, "%Y-%m-%d")
+            print(f"Files in {archive_dir}: {os.listdir(archive_dir)}")
        end_date = datetime.strptime(args.end_date, "%Y-%m-%d")
        output_path = process_date_range(args.chunk_id, args.total_chunks, start_date, end_date)
    else:
        # Daily mode
        if args.date:
            target_day = datetime.strptime(args.date, "%Y-%m-%d")
        else:
-            target_day = get_target_day()
+            print(f"Directory does not exist: {archive_dir}")
-        output_path = process_single_day(args.chunk_id, args.total_chunks, target_day)
+        sys.exit(1)
-    if output_path:
+    # Extract and collect trace files
-        print(f"Output: {output_path}")
+    trace_map = build_trace_file_map(archive_path)
-    else:
+    all_trace_files = list(trace_map.values())
-        print("No output generated")
+    
    print(f"Total trace files: {len(all_trace_files)}")
    # Process and write output
    output_path = process_chunk(all_trace_files, args.part_id, args.date)
    from src.adsb.compress_adsb_to_aircraft_data import compress_parquet_part
    df_compressed = compress_parquet_part(args.part_id, args.date)
    # Write parquet
    df_compressed_output = OUTPUT_DIR / "compressed" / args.date/ f"part_{args.part_id}_{args.date}.parquet"
    os.makedirs(df_compressed_output.parent, exist_ok=True)
    df_compressed.write_parquet(df_compressed_output, compression='snappy')
    # Write CSV
    csv_output = OUTPUT_DIR / "compressed" / args.date / f"part_{args.part_id}_{args.date}.csv"
    df_compressed.write_csv(csv_output)
    print(f"Raw output: {output_path}" if output_path else "No raw output generated")
    print(f"Compressed parquet: {df_compressed_output}")
    print(f"Compressed CSV: {csv_output}")
 if __name__ == "__main__":
-    main()
+    main()
@@ -1,97 +0,0 @@
 """
 Reduce step: downloads all chunk CSVs from S3, combines them,
 deduplicates across the full dataset, and uploads the final result.
 Environment variables:
  S3_BUCKET         — bucket with intermediate results
  RUN_ID            — run identifier matching the map workers
  GLOBAL_START_DATE — overall start date for output filename
  GLOBAL_END_DATE   — overall end date for output filename
 """
 import gzip
 import os
 import shutil
 from pathlib import Path
 import boto3
 import polars as pl
 from compress_adsb_to_aircraft_data import COLUMNS, deduplicate_by_signature
 def main():
    s3_bucket = os.environ["S3_BUCKET"]
    run_id = os.environ.get("RUN_ID", "default")
    global_start = os.environ["GLOBAL_START_DATE"]
    global_end = os.environ["GLOBAL_END_DATE"]
    s3 = boto3.client("s3")
    prefix = f"intermediate/{run_id}/"
    # List all chunk files for this run
    paginator = s3.get_paginator("list_objects_v2")
    chunk_keys = []
    for page in paginator.paginate(Bucket=s3_bucket, Prefix=prefix):
        for obj in page.get("Contents", []):
            if obj["Key"].endswith(".csv.gz"):
                chunk_keys.append(obj["Key"])
    chunk_keys.sort()
    print(f"Found {len(chunk_keys)} chunks to combine")
    if not chunk_keys:
        print("No chunks found — nothing to reduce.")
        return
    # Download and concatenate all chunks
    download_dir = Path("/tmp/chunks")
    download_dir.mkdir(parents=True, exist_ok=True)
    dfs = []
    for key in chunk_keys:
        gz_path = download_dir / Path(key).name
        csv_path = gz_path.with_suffix("")  # Remove .gz
        print(f"Downloading {key}...")
        s3.download_file(s3_bucket, key, str(gz_path))
        # Decompress
        with gzip.open(gz_path, 'rb') as f_in:
            with open(csv_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        gz_path.unlink()
        df_chunk = pl.read_csv(csv_path)
        print(f"  Loaded {df_chunk.height} rows from {csv_path.name}")
        dfs.append(df_chunk)
        # Free disk space after loading
        csv_path.unlink()
    df_accumulated = pl.concat(dfs) if dfs else pl.DataFrame()
    print(f"Combined: {df_accumulated.height} rows before dedup")
    # Final global deduplication
    df_accumulated = deduplicate_by_signature(df_accumulated)
    print(f"After dedup: {df_accumulated.height} rows")
    # Write and upload final result
    output_name = f"openairframes_adsb_{global_start}_{global_end}.csv.gz"
    csv_output = Path(f"/tmp/openairframes_adsb_{global_start}_{global_end}.csv")
    gz_output = Path(f"/tmp/{output_name}")
    df_accumulated.write_csv(csv_output)
    with open(csv_output, 'rb') as f_in:
        with gzip.open(gz_output, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    csv_output.unlink()
    final_key = f"final/{output_name}"
    print(f"Uploading to s3://{s3_bucket}/{final_key}")
    s3.upload_file(str(gz_output), s3_bucket, final_key)
    print(f"Final output: {df_accumulated.height} records -> {final_key}")
 if __name__ == "__main__":
    main()
@@ -1,2 +0,0 @@
 polars>=1.0
 boto3>=1.34
@@ -1,5 +0,0 @@
 polars>=1.0
 pyarrow>=14.0
 orjson>=3.9
 boto3>=1.34
 zstandard>=0.22
@@ -1,89 +0,0 @@
 """
 Map worker: processes a date range chunk, uploads result to S3.
 Environment variables:
  START_DATE  — inclusive, YYYY-MM-DD
  END_DATE    — exclusive, YYYY-MM-DD
  S3_BUCKET   — bucket for intermediate results
  RUN_ID      — unique run identifier for namespacing S3 keys
 """
 import os
 import sys
 from datetime import datetime, timedelta
 from pathlib import Path
 import boto3
 import polars as pl
 from compress_adsb_to_aircraft_data import (
    load_historical_for_day,
    deduplicate_by_signature,
    COLUMNS,
 )
 def main():
    start_date_str = os.environ["START_DATE"]
    end_date_str = os.environ["END_DATE"]
    s3_bucket = os.environ["S3_BUCKET"]
    run_id = os.environ.get("RUN_ID", "default")
    start_date = datetime.strptime(start_date_str, "%Y-%m-%d")
    end_date = datetime.strptime(end_date_str, "%Y-%m-%d")
    total_days = (end_date - start_date).days
    print(f"Worker: processing {total_days} days [{start_date_str}, {end_date_str})")
    dfs = []
    current_date = start_date
    while current_date < end_date:
        day_str = current_date.strftime("%Y-%m-%d")
        print(f"  Loading {day_str}...")
        df_compressed = load_historical_for_day(current_date)
        if df_compressed.height == 0:
            raise RuntimeError(f"No data found for {day_str}")
        dfs.append(df_compressed)
        total_rows = sum(df.height for df in dfs)
        print(f"  +{df_compressed.height} rows (total: {total_rows})")
        # Delete local cache after each day to save disk in container
        cache_dir = Path("data/adsb")
        if cache_dir.exists():
            import shutil
            shutil.rmtree(cache_dir)
        current_date += timedelta(days=1)
    # Concatenate all days
    df_accumulated = pl.concat(dfs) if dfs else pl.DataFrame()
    # Deduplicate within this chunk
    df_accumulated = deduplicate_by_signature(df_accumulated)
    print(f"After dedup: {df_accumulated.height} rows")
    # Write to local file then upload to S3
    local_path = Path(f"/tmp/chunk_{start_date_str}_{end_date_str}.csv")
    df_accumulated.write_csv(local_path)
    # Compress with gzip
    import gzip
    import shutil
    gz_path = Path(f"/tmp/chunk_{start_date_str}_{end_date_str}.csv.gz")
    with open(local_path, 'rb') as f_in:
        with gzip.open(gz_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    local_path.unlink()  # Remove uncompressed file
    s3_key = f"intermediate/{run_id}/chunk_{start_date_str}_{end_date_str}.csv.gz"
    print(f"Uploading to s3://{s3_bucket}/{s3_key}")
    s3 = boto3.client("s3")
    s3.upload_file(str(gz_path), s3_bucket, s3_key)
    print("Done.")
 if __name__ == "__main__":
    main()
@@ -246,6 +246,20 @@ def process_submission(
    if schema_updated:
        schema_note = f"\n**Schema Updated:** Added new tags: `{', '.join(new_tags)}`\n"
    # Truncate JSON preview to stay under GitHub's 65536 char body limit
    max_json_preview = 50000
    if len(content_json) > max_json_preview:
        # Show first few entries as a preview
        preview_entries = submissions[:10]
        preview_json = json.dumps(preview_entries, indent=2, sort_keys=True)
        json_section = (
            f"### Submissions (showing 10 of {len(submissions)})\n"
            f"```json\n{preview_json}\n```\n\n"
            f"*Full submission ({len(submissions)} entries, {len(content_json):,} chars) is in the committed file.*"
        )
    else:
        json_section = f"### Submissions\n```json\n{content_json}\n```"
    pr_body = f"""## Community Submission
 Adds {len(submissions)} submission(s) from @{author_username}.
@@ -257,10 +271,7 @@ Closes #{issue_number}
 ---
-### Submissions
+{json_section}"""
 ```json
 {content_json}
 ```"""
    pr = create_pull_request(
        title=f"Community submission: {filename}",
@@ -0,0 +1,40 @@
 #!/usr/bin/env python3
 """
 Download ADS-B Exchange basic-ac-db.json.gz.
 Usage:
    python -m src.contributions.create_daily_adsbexchange_release [--date YYYY-MM-DD]
 """
 from __future__ import annotations
 import argparse
 import shutil
 from datetime import datetime, timezone
 from pathlib import Path
 from urllib.request import Request, urlopen
 URL = "https://downloads.adsbexchange.com/downloads/basic-ac-db.json.gz"
 OUT_ROOT = Path("data/openairframes")
 def main() -> None:
    parser = argparse.ArgumentParser(description="Create daily ADS-B Exchange JSON release")
    parser.add_argument("--date", type=str, help="Date to process (YYYY-MM-DD format, default: today UTC)")
    args = parser.parse_args()
    date_str = args.date or datetime.now(timezone.utc).strftime("%Y-%m-%d")
    OUT_ROOT.mkdir(parents=True, exist_ok=True)
    gz_path = OUT_ROOT / f"basic-ac-db_{date_str}.json.gz"
    print(f"Downloading {URL}...")
    req = Request(URL, headers={"User-Agent": "openairframes-downloader/1.0"}, method="GET")
    with urlopen(req, timeout=300) as r, gz_path.open("wb") as f:
        shutil.copyfileobj(r, f)
    print(f"Wrote: {gz_path}")
 if __name__ == "__main__":
    main()
@@ -24,7 +24,7 @@ def read_all_submissions(community_dir: Path) -> list[dict]:
    """Read all JSON submissions from the community directory."""
    all_submissions = []
-    for json_file in sorted(community_dir.glob("*.json")):
+    for json_file in sorted(community_dir.glob("**/*.json")):
        try:
            with open(json_file) as f:
                data = json.load(f)
@@ -0,0 +1,55 @@
 #!/usr/bin/env python3
 """
 Download Mictronics aircraft database zip.
 Usage:
    python -m src.contributions.create_daily_microtonics_release [--date YYYY-MM-DD]
 """
 from __future__ import annotations
 import argparse
 import shutil
 import sys
 import time
 from datetime import datetime, timezone
 from pathlib import Path
 from urllib.error import URLError
 from urllib.request import Request, urlopen
 URL = "https://www.mictronics.de/aircraft-database/indexedDB_old.php"
 OUT_ROOT = Path("data/openairframes")
 MAX_RETRIES = 3
 RETRY_DELAY = 30  # seconds
 def main() -> None:
    parser = argparse.ArgumentParser(description="Create daily Mictronics database release")
    parser.add_argument("--date", type=str, help="Date to process (YYYY-MM-DD format, default: today UTC)")
    args = parser.parse_args()
    date_str = args.date or datetime.now(timezone.utc).strftime("%Y-%m-%d")
    OUT_ROOT.mkdir(parents=True, exist_ok=True)
    zip_path = OUT_ROOT / f"mictronics-db_{date_str}.zip"
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            print(f"Downloading {URL} (attempt {attempt}/{MAX_RETRIES})...")
            req = Request(URL, headers={"User-Agent": "Mozilla/5.0 (compatible; openairframes-downloader/1.0)"}, method="GET")
            with urlopen(req, timeout=120) as r, zip_path.open("wb") as f:
                shutil.copyfileobj(r, f)
            print(f"Wrote: {zip_path}")
            return
        except (URLError, TimeoutError) as e:
            print(f"Attempt {attempt} failed: {e}")
            if attempt < MAX_RETRIES:
                print(f"Retrying in {RETRY_DELAY} seconds...")
                time.sleep(RETRY_DELAY)
            else:
                print("All retries exhausted. Mictronics download failed.")
                sys.exit(1)
 if __name__ == "__main__":
    main()
@@ -36,6 +36,52 @@ def get_latest_schema_version() -> int:
    return max_version
 def _is_balanced_json(text: str) -> bool:
    """
    Check if JSON has balanced brackets/braces.
    This is a simple check to ensure we captured complete JSON.
    Ignores brackets/braces inside strings.
    Args:
        text: JSON text to check
    Returns:
        True if balanced, False otherwise
    """
    in_string = False
    escape = False
    stack = []
    pairs = {'[': ']', '{': '}'}
    for char in text:
        if escape:
            escape = False
            continue
        if char == '\\':
            escape = True
            continue
        if char == '"' and not escape:
            in_string = not in_string
            continue
        if in_string:
            continue
        if char in pairs:
            stack.append(char)
        elif char in pairs.values():
            if not stack:
                return False
            if pairs[stack[-1]] != char:
                return False
            stack.pop()
    return len(stack) == 0 and not in_string
 def get_schema_path(version: int | None = None) -> Path:
    """
    Get path to a specific schema version, or latest if version is None.
@@ -162,10 +208,14 @@ def extract_json_from_issue_body(body: str) -> str | None:
        return match.group(1).strip()
    # Try: Raw JSON after "### Submission JSON" until next section or end
-    pattern_raw = r"### Submission JSON\s*\n\s*([\[{][\s\S]*?[\]}])(?=\n###|\n\n###|$)"
+    # Use greedy matching since we have a clear boundary (next ### or end)
    pattern_raw = r"### Submission JSON\s*\n\s*([\[{][\s\S]*[\]}])(?=\s*\n###|\s*$)"
    match = re.search(pattern_raw, body)
    if match:
-        return match.group(1).strip()
+        candidate = match.group(1).strip()
        # Validate it's complete JSON by checking balanced brackets
        if _is_balanced_json(candidate):
            return candidate
    # Try: Any JSON object/array in the body (fallback)
    pattern_any = r"([\[{][\s\S]*?[\]}])"
@@ -219,7 +269,19 @@ def parse_and_validate(json_str: str, schema: dict | None = None) -> tuple[list
    try:
        data = json.loads(json_str)
    except json.JSONDecodeError as e:
-        return None, [f"Invalid JSON: {e}"]
+        # Provide detailed error context
        error_msg = f"Invalid JSON: {e}"
        # Show context around the error position
        if hasattr(e, 'pos') and e.pos is not None:
            start = max(0, e.pos - 50)
            end = min(len(json_str), e.pos + 50)
            context = json_str[start:end]
            # Escape for readability
            context_escaped = repr(context)
            error_msg += f"\n\nContext around position {e.pos}: {context_escaped}"
        return None, [error_msg]
    errors = validate_submission(data, schema)
    return data, errors
@@ -1,84 +0,0 @@
 from pathlib import Path
 from datetime import datetime, timezone, timedelta
 import sys
 import polars as pl
 # Add adsb directory to path
 sys.path.insert(0, str(Path(__file__).parent / "adsb")) # TODO: Fix this hacky path manipulation
 from adsb.compress_adsb_to_aircraft_data import (
    load_historical_for_day,
    concat_compressed_dfs,
    get_latest_aircraft_adsb_csv_df,
 )
 if __name__ == '__main__':
    # Get yesterday's date (data for the previous day)
    day = datetime.now(timezone.utc) - timedelta(days=1)
    # Find a day with complete data
    max_attempts = 2  # Don't look back more than a week
    for attempt in range(max_attempts):
        date_str = day.strftime("%Y-%m-%d")
        print(f"Processing ADS-B data for {date_str}")
        print("Loading new ADS-B data...")
        df_new = load_historical_for_day(day)
        if df_new.height == 0:
            day = day - timedelta(days=1)
            continue
        max_time = df_new['time'].max()
        if max_time is not None:
            # Handle timezone
            max_time_dt = max_time
            if hasattr(max_time_dt, 'replace'):
                max_time_dt = max_time_dt.replace(tzinfo=timezone.utc)
            end_of_day = day.replace(hour=23, minute=59, second=59, tzinfo=timezone.utc) - timedelta(minutes=5)
            # Convert polars datetime to python datetime if needed
            if isinstance(max_time_dt, datetime):
                if max_time_dt.replace(tzinfo=timezone.utc) >= end_of_day:
                    break
            else:
                # Polars returns python datetime already
                if max_time >= day.replace(hour=23, minute=54, second=59):
                    break
        print(f"WARNING: Latest data time is {max_time}, which is more than 5 minutes before end of day.")
        day = day - timedelta(days=1)
    else:
        raise RuntimeError(f"Could not find complete data in the last {max_attempts} days")
    try:
        # Get the latest release data
        print("Downloading latest ADS-B release...")
        df_base, start_date_str = get_latest_aircraft_adsb_csv_df()
        # Combine with historical data
        print("Combining with historical data...")
        df_combined = concat_compressed_dfs(df_base, df_new)
    except Exception as e:
        print(f"Error downloading latest ADS-B release: {e}")
        df_combined = df_new
        start_date_str = date_str
    # Sort by time for consistent ordering
    df_combined = df_combined.sort('time')
    # Convert any list columns to strings for CSV compatibility
    for col in df_combined.columns:
        if df_combined[col].dtype == pl.List:
            df_combined = df_combined.with_columns(
                pl.col(col).list.join(",").alias(col)
            )
    # Save the result
    OUT_ROOT = Path("data/openairframes")
    OUT_ROOT.mkdir(parents=True, exist_ok=True)
    output_file = OUT_ROOT / f"openairframes_adsb_{start_date_str}_{date_str}.csv"
    df_combined.write_csv(output_file)
    print(f"Saved: {output_file}")
    print(f"Total aircraft: {df_combined.height}")
@@ -47,6 +47,9 @@ def convert_faa_master_txt_to_df(zip_path: Path, date: str):
    # Convert all NaN to empty strings
    df = df.fillna("")
    # The FAA parser can produce the literal string "None" for missing values;
    # replace those so they match the empty-string convention used everywhere else.
    df = df.replace("None", "")
    return df
@@ -84,8 +87,8 @@ def concat_faa_historical_df(df_base, df_new):
            # Convert to string
            val_str = str(val).strip()
-            # Handle empty strings
+            # Handle empty strings and null-like literals
-            if val_str == "" or val_str == "nan":
+            if val_str == "" or val_str == "nan" or val_str == "None":
                return ""
            # Check if it looks like a list representation (starts with [ )
@@ -27,6 +27,33 @@ def _http_get_json(url: str, headers: dict[str, str]) -> dict:
    return json.loads(data.decode("utf-8"))
 def get_releases(repo: str = REPO, github_token: Optional[str] = None, per_page: int = 30) -> list[dict]:
    """Get a list of releases from the repository."""
    url = f"https://api.github.com/repos/{repo}/releases?per_page={per_page}"
    headers = {
        "Accept": "application/vnd.github+json",
        "User-Agent": "openairframes-downloader/1.0",
    }
    if github_token:
        headers["Authorization"] = f"Bearer {github_token}"
    return _http_get_json(url, headers=headers)
 def get_release_assets_from_release_data(release_data: dict) -> list[ReleaseAsset]:
    """Extract assets from a release data dictionary."""
    assets = []
    for a in release_data.get("assets", []):
        assets.append(
            ReleaseAsset(
                name=a["name"],
                download_url=a["browser_download_url"],
                size=int(a.get("size", 0)),
            )
        )
    return assets
 def get_latest_release_assets(repo: str = REPO, github_token: Optional[str] = None) -> list[ReleaseAsset]:
    url = f"https://api.github.com/repos/{repo}/releases/latest"
    headers = {
@@ -37,16 +64,7 @@ def get_latest_release_assets(repo: str = REPO, github_token: Optional[str] = No
        headers["Authorization"] = f"Bearer {github_token}"
    payload = _http_get_json(url, headers=headers)
-    assets = []
+    return get_release_assets_from_release_data(payload)
    for a in payload.get("assets", []):
        assets.append(
            ReleaseAsset(
                name=a["name"],
                download_url=a["browser_download_url"],
                size=int(a.get("size", 0)),
            )
        )
    return assets
 def pick_asset(
@@ -119,6 +137,7 @@ def download_latest_aircraft_csv(
    Returns:
        Path to the downloaded file
    """
    output_dir = Path(output_dir)
    assets = get_latest_release_assets(repo, github_token=github_token)
    try:
        asset = pick_asset(assets, name_regex=r"^openairframes_faa_.*\.csv$")
@@ -154,7 +173,8 @@ def download_latest_aircraft_adsb_csv(
    repo: str = REPO,
 ) -> Path:
    """
-    Download the latest openairframes_adsb_*.csv file from the latest GitHub release.
+    Download the latest openairframes_adsb_*.csv file from GitHub releases.
    If the latest release doesn't have the file, searches previous releases.
    Args:
        output_dir: Directory to save the downloaded file (default: "downloads")
@@ -164,26 +184,70 @@ def download_latest_aircraft_adsb_csv(
    Returns:
        Path to the downloaded file
    """
-    assets = get_latest_release_assets(repo, github_token=github_token)
+    output_dir = Path(output_dir)
    asset = pick_asset(assets, name_regex=r"^openairframes_adsb_.*\.csv$")
    saved_to = download_asset(asset, output_dir / asset.name, github_token=github_token)
    print(f"Downloaded: {asset.name} ({asset.size} bytes) -> {saved_to}")
    return saved_to
 def get_latest_aircraft_adsb_csv_df():
    csv_path = download_latest_aircraft_adsb_csv()
    import pandas as pd
    df = pd.read_csv(csv_path)
    df = df.fillna("")
    # Extract start date from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv
    match = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path))
    if not match:
        raise ValueError(f"Could not extract date from filename: {csv_path.name}")
-    date_str = match.group(1)
+    # Get multiple releases
-    return df, date_str
+    releases = get_releases(repo, github_token=github_token, per_page=30)
    # Try each release until we find one with the matching asset
    for release in releases:
        assets = get_release_assets_from_release_data(release)
        try:
            asset = pick_asset(assets, name_regex=r"^openairframes_adsb_.*\.csv(\.gz)?$")
            saved_to = download_asset(asset, output_dir / asset.name, github_token=github_token)
            print(f"Downloaded: {asset.name} ({asset.size} bytes) -> {saved_to}")
            return saved_to
        except FileNotFoundError:
            # This release doesn't have the matching asset, try the next one
            continue
    raise FileNotFoundError(
        f"No release in the last 30 releases has an asset matching 'openairframes_adsb_.*\\.csv(\\.gz)?$'"
    )
 import polars as pl
 def get_latest_aircraft_adsb_csv_df():
    """Download and load the latest ADS-B CSV from GitHub releases.
    Returns:
        tuple: (df, start_date, end_date) where dates are in YYYY-MM-DD format
    """
    import re
    csv_path = download_latest_aircraft_adsb_csv()
    df = pl.read_csv(csv_path, null_values=[""])
    # Parse time column: values like "2025-12-31T00:00:00.040" or "2025-05-11T15:15:50.540+0000"
    # Try with timezone first (convert to naive), then without timezone
    df = df.with_columns(
        pl.col("time").str.strptime(pl.Datetime("ms"), "%Y-%m-%dT%H:%M:%S%.f%z", strict=False)
            .dt.replace_time_zone(None)  # Convert to naive datetime first
            .fill_null(pl.col("time").str.strptime(pl.Datetime("ms"), "%Y-%m-%dT%H:%M:%S%.f", strict=False))
    )
    # Cast dbFlags and year to strings to match the schema used in compress functions
    for col in ['dbFlags', 'year']:
        if col in df.columns:
            df = df.with_columns(pl.col(col).cast(pl.Utf8))
    # Fill nulls with empty strings for string columns
    for col in df.columns:
        if df[col].dtype == pl.Utf8:
            df = df.with_columns(pl.col(col).fill_null(""))
    # Extract start and end dates from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv[.gz]
    match = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})\.csv", str(csv_path))
    if not match:
        raise ValueError(f"Could not extract dates from filename: {csv_path.name}")
    start_date = match.group(1)
    end_date = match.group(2)
    print(df.columns)
    print(df.dtypes)
    return df, start_date, end_date
 if __name__ == "__main__":
    download_latest_aircraft_csv()
    download_latest_aircraft_adsb_csv()
Author	SHA1	Message	Date
JG	68cc4d89b3	Merge pull request #46 psimpson routes add psimpson routes	2026-05-25 18:59:28 -04:00
ggman12	b653c3a844	add psimpson routes	2026-05-25 18:54:57 -04:00
JG	2829e5fb6e	Merge pull request #35 from PlaneQuery/develop update readme.md	2026-03-18 14:31:29 -04:00
ggman12	9c744b0baf	update readme.md	2026-03-18 14:29:13 -04:00
JG	ebda04767f	Merge pull request #34 from PlaneQuery/develop Develop to main: theairtraffic google sheet	2026-03-10 05:12:11 -04:00
ggman12	3fdf443894	add russia_ukraine	2026-03-10 05:08:19 -04:00
ggman12	24313603c5	works	2026-03-10 05:08:19 -04:00
JG	2bb0a5eac3	Merge pull request #33 from PlaneQuery/develop Develop to Main: Handle ADSB when ADSB.lol has not released any data for day. Just rerelease latest adsb	2026-02-26 15:32:59 -05:00
ggman12	b54f33aa56	Handle ADSB when ADSB.lol has not released any data for day. Just rerelease latest adsb	2026-02-26 15:31:47 -05:00
JG	2dda3d341c	Merge pull request #32 from PlaneQuery/develop Develop to Main: Fix Community Submission export. Fix CSV concatenation logic to prevent duplicates when there is no new ADSB.lol data.	2026-02-24 15:37:54 -05:00
ggman12	b0526f0a95	Fix Community Submission export. Fix CSV concatenation logic to prevent duplicates when there is no new ADSB.lol data.	2026-02-24 15:36:10 -05:00
JG	4b6a043a9d	Merge pull request #31 from PlaneQuery/develop Develop to Main Fix adsb asset retrival to be more fault tolerant. Fix download issue	2026-02-24 02:17:08 -05:00
ggman12	55c464aad7	Fix adsb asset retrival to be more fault tolerant. Fix download issue for 2024-07-03	2026-02-24 02:12:55 -05:00
ggman12	aa509e8560	attempt to fix download issue for 2024-07-03	2026-02-19 17:51:49 -05:00
ggman12	82d11d8d24	try less strict tar extract for 2025-10-15 and other days that fail	2026-02-19 00:20:03 -05:00
ggman12	76a217ad14	src/contributions/approve_submission.py handle big json files	2026-02-18 23:18:19 -05:00
ggman12	ec2d1a1291	update download.sh	2026-02-18 23:18:19 -05:00
ggman12	97284c69a9	verify downlaod asssets	2026-02-18 23:18:19 -05:00
JG	892ffa78af	Merge pull request #28 from PlaneQuery/community-submission-27 Community submission: ggman12_2026-02-18_5ddbb8bd.json	2026-02-18 17:18:49 -05:00
github-actions[bot]	f77a91db2c	Update schema with new tags: manufacturer_icao, manufacturer_name, model, type_code, serial_number, icao_aircraft_type, operator, operator_callsign, operator_icao, citation_0	2026-02-18 22:18:12 +00:00
github-actions[bot]	b3bd654998	Add community submission from @ggman12 (closes #27 )	2026-02-18 22:18:12 +00:00
ggman12	302be8b8dc	update checker for arrays issue	2026-02-18 17:11:14 -05:00
ggman12	b61dc0f5e5	provide more error	2026-02-18 17:08:43 -05:00
ggman12	1ff17cc6a8	allow adsb to fail for when adsb.lol hasen't uploaded file yet.	2026-02-18 16:49:02 -05:00
ggman12	d216ea9329	Daily ADSB and Histoircal updates. Update readme.md	2026-02-18 16:34:06 -05:00