Merge pull request #35 from PlaneQuery/develop

update readme.md
2026-06-17 18:20:06 +02:00 · 2026-03-18 14:31:29 -04:00 · 2026-03-18 14:29:13 -04:00 · 2026-03-10 05:12:11 -04:00 · 2026-03-10 05:08:19 -04:00 · 2026-03-10 05:08:19 -04:00
54 changed files with 2747 additions and 2997 deletions
@@ -8,13 +8,13 @@ body:
  - type: markdown
    attributes:
      value: |
-        Submit **one object** or an **array of objects** that matches the community submission schema.
-
+        Submit **one object** or an **array of objects** that matches the community submission [schema](https://github.com/PlaneQuery/OpenAirframes/blob/main/schemas/community_submission.v1.schema.json). Reuse existing tags from the schema when possible.
+        
        **Rules (enforced on review/automation):**
        - Each object must include **at least one** of:
          - `registration_number`
          - `transponder_code_hex` (6 uppercase hex chars, e.g., `ABC123`)
-          - `planequery_airframe_id`
+          - `openairframes_id`
        - Your contributor name (entered below) will be applied to all objects.
        - `contributor_uuid` is derived from your GitHub account automatically.
        - `creation_timestamp` is created by the system (you may omit it).
@@ -27,7 +27,7 @@ body:
        ```json
        {
            "registration_number": "N12345",
-            "tags": {"owner": "John Doe"},
+            "tags": {"owner": "John Doe", "photo": "https://example.com/photo.jpg"},
            "start_date": "2025-01-01"
        }
        ```
@@ -77,6 +77,5 @@ body:
    id: notes
    attributes:
      label: Notes (optional)
-      description: Any context, sources, or links that help validate your submission.
    validations:
      required: false
@@ -0,0 +1,182 @@
+name: Historical ADS-B Processing
+
+on:
+  workflow_dispatch:
+    inputs:
+      date:
+        description: 'YYYY-MM-DD'
+        required: true
+        type: string
+      concat_with_latest_csv:
+        description: 'Also concatenate with latest CSV from GitHub releases'
+        required: false
+        type: boolean
+        default: false
+  workflow_call:
+    inputs:
+      date:
+        description: 'YYYY-MM-DD'
+        required: true
+        type: string
+      concat_with_latest_csv:
+        description: 'Also concatenate with latest CSV from GitHub releases'
+        required: false
+        type: boolean
+        default: false
+
+jobs:
+  adsb-extract:
+    runs-on: ubuntu-24.04-arm
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v6
+
+      - name: Setup Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: '3.12'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Download and split ADS-B data
+        env:
+          DATE: ${{ inputs.date }}
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          python -m src.adsb.download_and_list_icaos --date "$DATE"
+          ls -lah data/output/adsb_archives/"$DATE" || true
+
+      - name: Upload archive part 0
+        uses: actions/upload-artifact@v4
+        with:
+          name: adsb-archive-${{ inputs.date }}-part-0
+          path: data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_0.tar.gz
+          retention-days: 1
+          compression-level: 0
+          if-no-files-found: error
+
+      - name: Upload archive part 1
+        uses: actions/upload-artifact@v4
+        with:
+          name: adsb-archive-${{ inputs.date }}-part-1
+          path: data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_1.tar.gz
+          retention-days: 1
+          compression-level: 0
+          if-no-files-found: error
+
+      - name: Upload archive part 2
+        uses: actions/upload-artifact@v4
+        with:
+          name: adsb-archive-${{ inputs.date }}-part-2
+          path: data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_2.tar.gz
+          retention-days: 1
+          compression-level: 0
+          if-no-files-found: error
+
+      - name: Upload archive part 3
+        uses: actions/upload-artifact@v4
+        with:
+          name: adsb-archive-${{ inputs.date }}-part-3
+          path: data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_3.tar.gz
+          retention-days: 1
+          compression-level: 0
+          if-no-files-found: error
+
+  adsb-map:
+    needs: adsb-extract
+    runs-on: ubuntu-24.04-arm
+    strategy:
+      fail-fast: true
+      matrix:
+        part_id: [0, 1, 2, 3]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v6
+
+      - name: Setup Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: '3.12'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Download archive part
+        uses: actions/download-artifact@v4
+        with:
+          name: adsb-archive-${{ inputs.date }}-part-${{ matrix.part_id }}
+          path: data/output/adsb_archives/${{ inputs.date }}
+
+      # Confirms the archive part downloaded for this matrix entry exists before
+      # processing, and prints its size for the job log.
+      - name: Verify archive
+        env:
+          # Passed through env (not interpolated into the script) so a crafted
+          # date value cannot inject shell commands; matches the other steps.
+          DATE: ${{ inputs.date }}
+          PART_ID: ${{ matrix.part_id }}
+        run: |
+          ARCHIVE_DIR="data/output/adsb_archives/$DATE"
+          FILE="$ARCHIVE_DIR/${DATE}_part_${PART_ID}.tar.gz"
+          ls -lah "$ARCHIVE_DIR/"
+          if [ ! -f "$FILE" ]; then
+            echo "::error::Archive not found: $FILE"
+            exit 1
+          fi
+          echo "Verified: $(du -h "$FILE")"
+
+      - name: Process part
+        env:
+          DATE: ${{ inputs.date }}
+        run: |
+          python -m src.adsb.process_icao_chunk --part-id ${{ matrix.part_id }} --date "$DATE"
+
+      - name: Upload compressed outputs
+        uses: actions/upload-artifact@v4
+        with:
+          name: adsb-compressed-${{ inputs.date }}-part-${{ matrix.part_id }}
+          path: data/output/compressed/${{ inputs.date }}
+          retention-days: 1
+          compression-level: 0
+          if-no-files-found: error
+
+  adsb-reduce:
+    needs: adsb-map
+    runs-on: ubuntu-24.04-arm
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v6
+
+      - name: Setup Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: '3.12'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Download compressed outputs
+        uses: actions/download-artifact@v4
+        with:
+          pattern: adsb-compressed-${{ inputs.date }}-part-*
+          path: data/output/compressed/${{ inputs.date }}
+          merge-multiple: true
+
+      - name: Concatenate final outputs
+        env:
+          DATE: ${{ inputs.date }}
+          CONCAT_WITH_LATEST_CSV: ${{ inputs.concat_with_latest_csv }}
+        run: |
+          EXTRA=""
+          if [ "$CONCAT_WITH_LATEST_CSV" = "true" ]; then
+            EXTRA="--concat_with_latest_csv"
+          fi
+          python -m src.adsb.concat_parquet_to_final --date "$DATE" $EXTRA
+          ls -lah data/output/ || true
+
+      - name: Upload final artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: openairframes_adsb-${{ inputs.date }}
+          path: data/output/openairframes_adsb_*
+          retention-days: 30
+          if-no-files-found: error
@@ -0,0 +1,118 @@
+name: adsb-to-aircraft-multiple-day-run
+
+on:
+  workflow_dispatch:
+    inputs:
+      start_date:
+        description: 'YYYY-MM-DD (inclusive)'
+        required: true
+        type: string
+      end_date:
+        description: 'YYYY-MM-DD (exclusive)'
+        required: true
+        type: string
+
+jobs:
+  generate-dates:
+    runs-on: ubuntu-24.04-arm
+    outputs:
+      dates: ${{ steps.generate.outputs.dates }}
+    steps:
+      - name: Generate date list
+        id: generate
+        env:
+          START_DATE: ${{ inputs.start_date }}
+          END_DATE: ${{ inputs.end_date }}
+        run: |
+          python - <<'PY'
+          import json
+          import os
+          from datetime import datetime, timedelta
+
+          start = datetime.strptime(os.environ["START_DATE"], "%Y-%m-%d")
+          end = datetime.strptime(os.environ["END_DATE"], "%Y-%m-%d")
+          if end <= start:
+            raise SystemExit("end_date must be after start_date")
+
+          dates = []
+          cur = start
+          while cur < end:
+            dates.append(cur.strftime("%Y-%m-%d"))
+            cur += timedelta(days=1)
+
+          with open(os.environ["GITHUB_OUTPUT"], "a") as f:
+            f.write(f"dates={json.dumps(dates)}\n")
+          PY
+
+  adsb-day:
+    needs: generate-dates
+    strategy:
+      fail-fast: true
+      matrix:
+        date: ${{ fromJson(needs.generate-dates.outputs.dates) }}
+    uses: ./.github/workflows/adsb-to-aircraft-for-day.yaml
+    with:
+      date: ${{ matrix.date }}
+
+  adsb-final:
+    needs: adsb-day
+    runs-on: ubuntu-24.04-arm
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v6
+
+      - name: Setup Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: '3.12'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Download daily CSVs
+        uses: actions/download-artifact@v4
+        with:
+          pattern: openairframes_adsb-*
+          path: outputs/daily/
+          merge-multiple: true
+
+      # Merges every file matching outputs/daily/openairframes_adsb_*.csv.gz into a
+      # single gzip CSV at outputs/openairframes_adsb_<start_date>_<end_date>.csv.gz.
+      # Files are ordered by the YYYY-MM-DD date that follows the
+      # "openairframes_adsb_" prefix (falling back to the whole filename when the
+      # name does not contain such a date followed by "_"), then stacked vertically.
+      # The step exits with "No daily CSVs found" when nothing matches the glob.
+      # NOTE(review): every daily frame is read into memory before the concat —
+      # confirm runner memory is sufficient for long date ranges.
+      # NOTE(review): write_csv(compression="gzip") needs a polars release that
+      # accepts that keyword — confirm against the version pinned in requirements.txt.
+      - name: Concatenate all days to final CSV
+        env:
+          START_DATE: ${{ inputs.start_date }}
+          END_DATE: ${{ inputs.end_date }}
+        run: |
+          python - <<'PY'
+          import os
+          import re
+          from pathlib import Path
+          import polars as pl
+
+          start = os.environ["START_DATE"]
+          end = os.environ["END_DATE"]
+          daily_dir = Path("outputs/daily")
+          files = sorted(daily_dir.glob("openairframes_adsb_*.csv.gz"))
+          if not files:
+            raise SystemExit("No daily CSVs found")
+
+          def date_key(path: Path) -> str:
+            m = re.match(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_", path.name)
+            return m.group(1) if m else path.name
+
+          files = sorted(files, key=date_key)
+          frames = [pl.read_csv(p) for p in files]
+          df = pl.concat(frames, how="vertical", rechunk=True)
+
+          output_path = Path("outputs") / f"openairframes_adsb_{start}_{end}.csv.gz"
+          df.write_csv(output_path, compression="gzip")
+          print(f"Wrote {output_path} with {df.height} rows")
+          PY
+
+      # Publishes the concatenated multi-day CSV produced by the previous step.
+      - name: Upload final CSV
+        uses: actions/upload-artifact@v4
+        with:
+          name: openairframes_adsb-${{ inputs.start_date }}-${{ inputs.end_date }}
+          path: outputs/openairframes_adsb_${{ inputs.start_date }}_${{ inputs.end_date }}.csv.gz
+          retention-days: 30
+          # The file is already gzip-compressed; skip the artifact zip compression,
+          # as the other uploads of .tar.gz archives in these workflows do.
+          compression-level: 0
+          # Fail the job instead of finishing green with no artifact (default: warn),
+          # consistent with the per-day workflow's final upload.
+          if-no-files-found: error
+# gh workflow run adsb-to-aircraft-multiple-day-run.yaml --repo ggman12/OpenAirframes --ref jonah/fix-historical-proper -f start_date=2025-12-31 -f end_date=2026-01-02
@@ -1,230 +0,0 @@
-name: Historical ADS-B Processing
-
-on:
-  workflow_dispatch:
-    inputs:
-      start_date:
-        description: 'Start date (YYYY-MM-DD, inclusive)'
-        required: true
-        type: string
-      end_date:
-        description: 'End date (YYYY-MM-DD, exclusive)'
-        required: true
-        type: string
-      chunk_days:
-        description: 'Days per job chunk (default: 7)'
-        required: false
-        type: number
-        default: 7
-
-jobs:
-  generate-matrix:
-    runs-on: ubuntu-latest
-    outputs:
-      chunks: ${{ steps.generate.outputs.chunks }}
-      global_start: ${{ inputs.start_date }}
-      global_end: ${{ inputs.end_date }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-
-      - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.12'
-
-      - name: Generate date chunks
-        id: generate
-        env:
-          INPUT_START_DATE: ${{ inputs.start_date }}
-          INPUT_END_DATE: ${{ inputs.end_date }}
-          INPUT_CHUNK_DAYS: ${{ inputs.chunk_days }}
-        run: python src/adsb/historical_generate_matrix.py
-
-  adsb-extract:
-    needs: generate-matrix
-    runs-on: ubuntu-24.04-arm
-    strategy:
-      matrix:
-        chunk: ${{ fromJson(needs.generate-matrix.outputs.chunks) }}
-      max-parallel: 3
-      fail-fast: false
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-
-      - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.12'
-
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install -r requirements.txt
-
-      - name: Free disk space
-        run: |
-          sudo rm -rf /usr/share/dotnet
-          sudo rm -rf /opt/ghc
-          sudo rm -rf /usr/local/share/boost
-          df -h
-
-      - name: Download and extract ADS-B data
-        env:
-          START_DATE: ${{ matrix.chunk.start_date }}
-          END_DATE: ${{ matrix.chunk.end_date }}
-        run: |
-          python -m src.adsb.download_and_list_icaos --start-date "$START_DATE" --end-date "$END_DATE"
-          ls -lah data/output/
-
-      - name: Create tar of extracted data
-        run: |
-          cd data/output
-          echo "=== Disk space before tar ==="
-          df -h .
-          echo "=== Files to tar ==="
-          ls -lah *-planes-readsb-prod-0.tar_0 icao_manifest_*.txt 2>/dev/null || echo "No files found"
-          
-          # Create tar with explicit error checking
-          if ls *-planes-readsb-prod-0.tar_0 1>/dev/null 2>&1; then
-            tar -cvf extracted_data.tar *-planes-readsb-prod-0.tar_0 icao_manifest_*.txt
-            echo "=== Tar file created ==="
-            ls -lah extracted_data.tar
-            # Verify tar integrity
-            tar -tf extracted_data.tar > /dev/null && echo "Tar integrity check passed" || { echo "Tar integrity check FAILED"; exit 1; }
-          else
-            echo "ERROR: No extracted directories found, cannot create tar"
-            exit 1
-          fi
-
-      - name: Upload extracted data
-        uses: actions/upload-artifact@v4
-        with:
-          name: adsb-extracted-${{ matrix.chunk.start_date }}-${{ matrix.chunk.end_date }}
-          path: data/output/extracted_data.tar
-          retention-days: 1
-          compression-level: 0
-          if-no-files-found: warn
-
-  adsb-map:
-    needs: [generate-matrix, adsb-extract]
-    runs-on: ubuntu-24.04-arm
-    strategy:
-      fail-fast: true
-      matrix:
-        chunk: ${{ fromJson(needs.generate-matrix.outputs.chunks) }}
-        icao_chunk: [0, 1, 2, 3]
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-
-      - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.12'
-
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install -r requirements.txt
-
-      - name: Free disk space
-        run: |
-          sudo rm -rf /usr/share/dotnet
-          sudo rm -rf /opt/ghc
-          sudo rm -rf /usr/local/share/boost
-          df -h
-
-      - name: Download extracted data
-        uses: actions/download-artifact@v4
-        with:
-          name: adsb-extracted-${{ matrix.chunk.start_date }}-${{ matrix.chunk.end_date }}
-          path: data/output/
-        continue-on-error: true
-
-      - name: Extract tar
-        id: extract
-        run: |
-          cd data/output
-          if [ -f extracted_data.tar ]; then
-            echo "=== Tar file info ==="
-            ls -lah extracted_data.tar
-            echo "=== Verifying tar integrity ==="
-            tar -tf extracted_data.tar > /dev/null || { echo "ERROR: Tar file is corrupted"; exit 1; }
-            echo "=== Extracting ==="
-            tar -xvf extracted_data.tar
-            rm extracted_data.tar
-            echo "has_data=true" >> "$GITHUB_OUTPUT"
-            echo "=== Contents of data/output ==="
-            ls -lah
-          else
-            echo "No extracted_data.tar found"
-            echo "has_data=false" >> "$GITHUB_OUTPUT"
-          fi
-
-      - name: Process ICAO chunk
-        if: steps.extract.outputs.has_data == 'true'
-        env:
-          START_DATE: ${{ matrix.chunk.start_date }}
-          END_DATE: ${{ matrix.chunk.end_date }}
-        run: |
-          python -m src.adsb.process_icao_chunk --chunk-id ${{ matrix.icao_chunk }} --total-chunks 4 --start-date "$START_DATE" --end-date "$END_DATE"
-          ls -lah data/output/adsb_chunks/ || echo "No chunks created"
-
-      - name: Upload chunk artifacts
-        if: steps.extract.outputs.has_data == 'true'
-        uses: actions/upload-artifact@v4
-        with:
-          name: adsb-map-${{ matrix.chunk.start_date }}-${{ matrix.chunk.end_date }}-chunk-${{ matrix.icao_chunk }}
-          path: data/output/adsb_chunks/
-          retention-days: 1
-          if-no-files-found: ignore
-
-  adsb-reduce:
-    needs: [generate-matrix, adsb-map]
-    runs-on: ubuntu-24.04-arm
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-
-      - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.12'
-
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install -r requirements.txt
-
-      - name: Download all chunk artifacts
-        uses: actions/download-artifact@v4
-        with:
-          pattern: adsb-map-*
-          path: data/output/adsb_chunks/
-          merge-multiple: true
-
-      - name: Debug downloaded files
-        run: |
-          echo "=== Disk space before processing ==="
-          df -h
-          echo "=== Listing data/output/adsb_chunks/ ==="
-          find data/output/adsb_chunks/ -type f 2>/dev/null | wc -l
-          echo "=== Total parquet size ==="
-          du -sh data/output/adsb_chunks/ || echo "No chunks dir"
-
-      - name: Combine chunks to CSV
-        env:
-          START_DATE: ${{ needs.generate-matrix.outputs.global_start }}
-          END_DATE: ${{ needs.generate-matrix.outputs.global_end }}
-        run: |
-          python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks --start-date "$START_DATE" --end-date "$END_DATE" --skip-base --stream
-          ls -lah data/planequery_aircraft/
-
-      - name: Upload final artifact
-        uses: actions/upload-artifact@v4
-        with:
-          name: planequery_aircraft_adsb-${{ needs.generate-matrix.outputs.global_start }}-${{ needs.generate-matrix.outputs.global_end }}
-          path: data/planequery_aircraft/*.csv
-          retention-days: 30
@@ -0,0 +1,430 @@
+name: openairframes-daily-release
+
+on:
+  schedule:
+    # 06:00 UTC every day - runs on the default branch and dispatches both the main and develop releases
+    - cron: "0 06 * * *"
+  workflow_dispatch:
+    inputs:
+      date:
+        description: 'Date to process (YYYY-MM-DD format, default: yesterday)'
+        required: false
+        type: string
+
+permissions:
+  contents: write
+  actions: write
+
+jobs:
+  trigger-releases:
+    runs-on: ubuntu-latest
+    if: github.event_name == 'schedule'
+    steps:
+      - name: Trigger main branch release
+        uses: actions/github-script@v7
+        with:
+          script: |
+            await github.rest.actions.createWorkflowDispatch({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              workflow_id: 'openairframes-daily-release.yaml',
+              ref: 'main'
+            });
+      
+      - name: Trigger develop branch release
+        uses: actions/github-script@v7
+        with:
+          script: |
+            await github.rest.actions.createWorkflowDispatch({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              workflow_id: 'openairframes-daily-release.yaml',
+              ref: 'develop'
+            });
+
+  build-faa:
+    runs-on: ubuntu-24.04-arm
+    if: github.event_name != 'schedule'
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+
+      - name: Setup Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: "3.14"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      # Builds the FAA release files, then lists the two output directories for
+      # the job log. --date is forwarded only when the dispatch input is set.
+      - name: Run FAA release script
+        env:
+          # Read from env rather than expanding the expression inside the script,
+          # so the input value is never parsed as shell syntax.
+          INPUT_DATE: ${{ inputs.date }}
+        run: |
+          if [ -n "$INPUT_DATE" ]; then
+            python src/create_daily_faa_release.py --date "$INPUT_DATE"
+          else
+            python src/create_daily_faa_release.py
+          fi
+          ls -lah data/faa_releasable
+          ls -lah data/openairframes
+
+      - name: Upload FAA artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: faa-release
+          path: |
+            data/openairframes/openairframes_faa_*.csv
+            data/faa_releasable/ReleasableAircraft_*.zip
+          retention-days: 1
+
+  resolve-dates:
+    runs-on: ubuntu-latest
+    if: github.event_name != 'schedule'
+    outputs:
+      date: ${{ steps.out.outputs.date }}
+      adsb_date: ${{ steps.out.outputs.adsb_date }}
+    steps:
+      # Emits the `date` and `adsb_date` job outputs: the dispatch input when one
+      # was supplied, otherwise yesterday in UTC.
+      - id: out
+        env:
+          # Read from env rather than interpolated into the script, so the input
+          # value is never parsed as shell syntax.
+          INPUT_DATE: ${{ inputs.date }}
+        run: |
+          if [ -n "$INPUT_DATE" ]; then
+            RESOLVED="$INPUT_DATE"
+          else
+            # Computed once so both outputs agree even if the step runs across
+            # midnight UTC.
+            RESOLVED="$(date -u -d 'yesterday' +%Y-%m-%d)"
+          fi
+          # The value ends up in artifact names, paths and $GITHUB_OUTPUT, so
+          # reject anything that is not a plain YYYY-MM-DD string.
+          if ! [[ "$RESOLVED" =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}$ ]]; then
+            echo "::error::date must be in YYYY-MM-DD format, got: $RESOLVED"
+            exit 1
+          fi
+          echo "date=$RESOLVED" >> "$GITHUB_OUTPUT"
+          echo "adsb_date=$RESOLVED" >> "$GITHUB_OUTPUT"
+
+  adsb-to-aircraft:
+    needs: resolve-dates
+    if: github.event_name != 'schedule'
+    uses: ./.github/workflows/adsb-to-aircraft-for-day.yaml
+    with:
+      date: ${{ needs.resolve-dates.outputs.adsb_date }}
+      concat_with_latest_csv: true
+
+  adsb-reduce:
+    needs: [resolve-dates, adsb-to-aircraft]
+    if: always() && github.event_name != 'schedule' && needs.adsb-to-aircraft.result == 'failure'
+    runs-on: ubuntu-24.04-arm
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v6
+
+      - name: Setup Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: '3.12'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Download compressed outputs
+        uses: actions/download-artifact@v4
+        with:
+          pattern: adsb-compressed-${{ needs.resolve-dates.outputs.adsb_date }}-part-*
+          path: data/output/compressed/${{ needs.resolve-dates.outputs.adsb_date }}
+          merge-multiple: true
+
+      - name: Concatenate final outputs
+        env:
+          DATE: ${{ needs.resolve-dates.outputs.adsb_date }}
+          CONCAT_WITH_LATEST_CSV: true
+        run: |
+          EXTRA=""
+          if [ "$CONCAT_WITH_LATEST_CSV" = "true" ]; then
+            EXTRA="--concat_with_latest_csv"
+          fi
+          python -m src.adsb.concat_parquet_to_final --date "$DATE" $EXTRA
+          ls -lah data/output/ || true
+
+      - name: Upload final artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: openairframes_adsb-${{ needs.resolve-dates.outputs.adsb_date }}
+          path: data/output/openairframes_adsb_*
+          retention-days: 30
+          if-no-files-found: error
+
+  build-community:
+    runs-on: ubuntu-latest
+    if: github.event_name != 'schedule'
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+
+      - name: Setup Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: "3.14"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install pandas
+
+      - name: Run Community release script
+        run: |
+          python -m src.contributions.create_daily_community_release
+          ls -lah data/openairframes
+
+      - name: Upload Community artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: community-release
+          path: data/openairframes/openairframes_community_*.csv
+          retention-days: 1
+
+  build-adsbexchange-json:
+    runs-on: ubuntu-latest
+    if: github.event_name != 'schedule'
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+
+      - name: Setup Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: "3.14"
+
+      # Builds the ADS-B Exchange JSON release file, then lists the output
+      # directory. --date is forwarded only when the dispatch input is set.
+      - name: Run ADS-B Exchange JSON release script
+        env:
+          # Read from env rather than expanding the expression inside the script,
+          # so the input value is never parsed as shell syntax.
+          INPUT_DATE: ${{ inputs.date }}
+        run: |
+          if [ -n "$INPUT_DATE" ]; then
+            python -m src.contributions.create_daily_adsbexchange_release --date "$INPUT_DATE"
+          else
+            python -m src.contributions.create_daily_adsbexchange_release
+          fi
+          ls -lah data/openairframes
+
+      - name: Upload ADS-B Exchange JSON artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: adsbexchange-json
+          path: data/openairframes/basic-ac-db_*.json.gz
+          retention-days: 1
+
+  build-mictronics-db:
+    runs-on: ubuntu-latest
+    if: github.event_name != 'schedule'
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+
+      - name: Setup Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: "3.14"
+
+      # Builds the Mictronics DB release file, then lists the output directory.
+      # --date is forwarded only when the dispatch input is set. The step is
+      # allowed to fail (continue-on-error) and the following upload ignores a
+      # missing file.
+      - name: Run Mictronics DB release script
+        continue-on-error: true
+        env:
+          # Read from env rather than expanding the expression inside the script,
+          # so the input value is never parsed as shell syntax.
+          INPUT_DATE: ${{ inputs.date }}
+        run: |
+          if [ -n "$INPUT_DATE" ]; then
+            python -m src.contributions.create_daily_microtonics_release --date "$INPUT_DATE"
+          else
+            python -m src.contributions.create_daily_microtonics_release
+          fi
+          ls -lah data/openairframes
+
+      - name: Upload Mictronics DB artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: mictronics-db
+          path: data/openairframes/mictronics-db_*.zip
+          retention-days: 1
+          if-no-files-found: ignore
+
+  create-release:
+    runs-on: ubuntu-latest
+    needs: [resolve-dates, build-faa, adsb-to-aircraft, adsb-reduce, build-community, build-adsbexchange-json, build-mictronics-db]
+    if: github.event_name != 'schedule' && !cancelled()
+    steps:
+      - name: Check ADS-B workflow status
+        if: needs.adsb-to-aircraft.result != 'success' && needs.adsb-reduce.result != 'success'
+        run: |
+          echo "WARNING: ADS-B workflow failed (adsb-to-aircraft='${{ needs.adsb-to-aircraft.result }}', adsb-reduce='${{ needs.adsb-reduce.result }}'), will continue without ADS-B artifacts"
+
+      - name: Checkout for gh CLI
+        uses: actions/checkout@v4
+        with:
+          sparse-checkout: |
+            .github
+          sparse-checkout-cone-mode: false
+
+      - name: Download FAA artifacts
+        uses: actions/download-artifact@v5
+        with:
+          name: faa-release
+          path: artifacts/faa
+
+      - name: Download ADS-B artifacts
+        uses: actions/download-artifact@v5
+        if: needs.adsb-to-aircraft.result == 'success' || needs.adsb-reduce.result == 'success'
+        continue-on-error: true
+        with:
+          name: openairframes_adsb-${{ needs.resolve-dates.outputs.adsb_date }}
+          path: artifacts/adsb
+
+      - name: Download Community artifacts
+        uses: actions/download-artifact@v5
+        with:
+          name: community-release
+          path: artifacts/community
+
+      - name: Download ADS-B Exchange JSON artifact
+        uses: actions/download-artifact@v5
+        with:
+          name: adsbexchange-json
+          path: artifacts/adsbexchange
+
+      - name: Download Mictronics DB artifact
+        uses: actions/download-artifact@v5
+        continue-on-error: true
+        with:
+          name: mictronics-db
+          path: artifacts/mictronics
+
+      - name: Debug artifact structure
+        run: |
+          echo "=== Full artifacts tree ==="
+          find artifacts -type f 2>/dev/null || echo "No files found in artifacts"
+          echo "=== FAA artifacts ==="
+          find artifacts/faa -type f 2>/dev/null || echo "No files found in artifacts/faa"
+          echo "=== ADS-B artifacts ==="
+          find artifacts/adsb -type f 2>/dev/null || echo "No files found in artifacts/adsb"
+          echo "=== Community artifacts ==="
+          find artifacts/community -type f 2>/dev/null || echo "No files found in artifacts/community"
+          echo "=== ADS-B Exchange JSON artifacts ==="
+          find artifacts/adsbexchange -type f 2>/dev/null || echo "No files found in artifacts/adsbexchange"
+          echo "=== Mictronics DB artifacts ==="
+          find artifacts/mictronics -type f 2>/dev/null || echo "No files found in artifacts/mictronics"
+
+      - name: Prepare release metadata
+        id: meta
+        run: |
+          DATE=$(date -u +"%Y-%m-%d")
+          BRANCH_NAME="${GITHUB_REF#refs/heads/}"
+          BRANCH_SUFFIX=""
+          if [ "$BRANCH_NAME" = "main" ]; then
+            BRANCH_SUFFIX="-main"
+          elif [ "$BRANCH_NAME" = "develop" ]; then
+            BRANCH_SUFFIX="-develop"
+          fi
+          TAG="openairframes-${DATE}${BRANCH_SUFFIX}"
+          
+          # Find files from artifacts using find (handles nested structures)
+          CSV_FILE_FAA=$(find artifacts/faa -name "openairframes_faa_*.csv" -type f 2>/dev/null | head -1)
+          # Prefer concatenated file (with date range) over single-day file
+          CSV_FILE_ADSB=$(find artifacts/adsb -name "openairframes_adsb_*_*.csv.gz" -type f 2>/dev/null | head -1)
+          if [ -z "$CSV_FILE_ADSB" ]; then
+            CSV_FILE_ADSB=$(find artifacts/adsb -name "openairframes_adsb_*.csv.gz" -type f 2>/dev/null | head -1)
+          fi
+          CSV_FILE_COMMUNITY=$(find artifacts/community -name "openairframes_community_*.csv" -type f 2>/dev/null | head -1)
+          ZIP_FILE=$(find artifacts/faa -name "ReleasableAircraft_*.zip" -type f 2>/dev/null | head -1)
+          JSON_FILE_ADSBX=$(find artifacts/adsbexchange -name "basic-ac-db_*.json.gz" -type f 2>/dev/null | head -1)
+          ZIP_FILE_MICTRONICS=$(find artifacts/mictronics -name "mictronics-db_*.zip" -type f 2>/dev/null | head -1)
+          
+          # Validate required files exist
+          MISSING_FILES=""
+          if [ -z "$CSV_FILE_FAA" ] || [ ! -f "$CSV_FILE_FAA" ]; then
+            MISSING_FILES="$MISSING_FILES FAA_CSV"
+          fi
+          if [ -z "$ZIP_FILE" ] || [ ! -f "$ZIP_FILE" ]; then
+            MISSING_FILES="$MISSING_FILES FAA_ZIP"
+          fi
+          if [ -z "$JSON_FILE_ADSBX" ] || [ ! -f "$JSON_FILE_ADSBX" ]; then
+            MISSING_FILES="$MISSING_FILES ADSBX_JSON"
+          fi
+          
+          # Optional files - warn but don't fail
+          OPTIONAL_MISSING=""
+          if [ -z "$CSV_FILE_ADSB" ] || [ ! -f "$CSV_FILE_ADSB" ]; then
+            OPTIONAL_MISSING="$OPTIONAL_MISSING ADSB_CSV"
+            CSV_FILE_ADSB=""
+            CSV_BASENAME_ADSB=""
+          fi
+          if [ -z "$ZIP_FILE_MICTRONICS" ] || [ ! -f "$ZIP_FILE_MICTRONICS" ]; then
+            OPTIONAL_MISSING="$OPTIONAL_MISSING MICTRONICS_ZIP"
+            ZIP_FILE_MICTRONICS=""
+          fi
+          
+          if [ -n "$MISSING_FILES" ]; then
+            echo "ERROR: Missing required release files:$MISSING_FILES"
+            echo "FAA CSV: $CSV_FILE_FAA"
+            echo "ADSB CSV: $CSV_FILE_ADSB"
+            echo "ZIP: $ZIP_FILE"
+            echo "ADSBX JSON: $JSON_FILE_ADSBX"
+            echo "MICTRONICS ZIP: $ZIP_FILE_MICTRONICS"
+            exit 1
+          fi
+          
+          # Get basenames for display
+          CSV_BASENAME_FAA=$(basename "$CSV_FILE_FAA")
+          if [ -n "$CSV_FILE_ADSB" ]; then
+            CSV_BASENAME_ADSB=$(basename "$CSV_FILE_ADSB")
+          fi
+          CSV_BASENAME_COMMUNITY=$(basename "$CSV_FILE_COMMUNITY" 2>/dev/null || echo "")
+          ZIP_BASENAME=$(basename "$ZIP_FILE")
+          JSON_BASENAME_ADSBX=$(basename "$JSON_FILE_ADSBX")
+          ZIP_BASENAME_MICTRONICS=""
+          if [ -n "$ZIP_FILE_MICTRONICS" ]; then
+            ZIP_BASENAME_MICTRONICS=$(basename "$ZIP_FILE_MICTRONICS")
+          fi
+          
+          if [ -n "$OPTIONAL_MISSING" ]; then
+            echo "WARNING: Optional files missing:$OPTIONAL_MISSING (will continue without them)"
+          fi
+          
+          echo "date=$DATE" >> "$GITHUB_OUTPUT"
+          echo "tag=$TAG" >> "$GITHUB_OUTPUT"
+          echo "csv_file_faa=$CSV_FILE_FAA" >> "$GITHUB_OUTPUT"
+          echo "csv_basename_faa=$CSV_BASENAME_FAA" >> "$GITHUB_OUTPUT"
+          echo "csv_file_adsb=$CSV_FILE_ADSB" >> "$GITHUB_OUTPUT"
+          echo "csv_basename_adsb=$CSV_BASENAME_ADSB" >> "$GITHUB_OUTPUT"
+          echo "csv_file_community=$CSV_FILE_COMMUNITY" >> "$GITHUB_OUTPUT"
+          echo "csv_basename_community=$CSV_BASENAME_COMMUNITY" >> "$GITHUB_OUTPUT"
+          echo "zip_file=$ZIP_FILE" >> "$GITHUB_OUTPUT"
+          echo "zip_basename=$ZIP_BASENAME" >> "$GITHUB_OUTPUT"
+          echo "json_file_adsbx=$JSON_FILE_ADSBX" >> "$GITHUB_OUTPUT"
+          echo "json_basename_adsbx=$JSON_BASENAME_ADSBX" >> "$GITHUB_OUTPUT"
+          echo "zip_file_mictronics=$ZIP_FILE_MICTRONICS" >> "$GITHUB_OUTPUT"
+          echo "zip_basename_mictronics=$ZIP_BASENAME_MICTRONICS" >> "$GITHUB_OUTPUT"
+          echo "name=OpenAirframes snapshot ($DATE)${BRANCH_SUFFIX}" >> "$GITHUB_OUTPUT"
+          
+          echo "Found files:"
+          echo "  FAA CSV: $CSV_FILE_FAA"
+          echo "  ADSB CSV: $CSV_FILE_ADSB"
+          echo "  Community CSV: $CSV_FILE_COMMUNITY"
+          echo "  ZIP: $ZIP_FILE"
+          echo "  ADSBX JSON: $JSON_FILE_ADSBX"
+          echo "  MICTRONICS ZIP: $ZIP_FILE_MICTRONICS"
+
+      - name: Delete existing release if exists
+        run: |
+          echo "Attempting to delete release: ${{ steps.meta.outputs.tag }}"
+          gh release delete "${{ steps.meta.outputs.tag }}" --yes --cleanup-tag || echo "No existing release to delete"
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Create GitHub Release and upload assets
+        uses: softprops/action-gh-release@v2
+        with:
+          tag_name: ${{ steps.meta.outputs.tag }}
+          name: ${{ steps.meta.outputs.name }}
+          fail_on_unmatched_files: false
+          body: |
+            Automated daily snapshot generated at 06:00 UTC for ${{ steps.meta.outputs.date }}.
+
+            Assets:
+            - ${{ steps.meta.outputs.csv_basename_faa }}
+            ${{ steps.meta.outputs.csv_basename_adsb && format('- {0}', steps.meta.outputs.csv_basename_adsb) || '' }}
+            - ${{ steps.meta.outputs.csv_basename_community }}
+            - ${{ steps.meta.outputs.zip_basename }}
+            - ${{ steps.meta.outputs.json_basename_adsbx }}
+            ${{ steps.meta.outputs.zip_basename_mictronics && format('- {0}', steps.meta.outputs.zip_basename_mictronics) || '' }}
+          files: |
+            ${{ steps.meta.outputs.csv_file_faa }}
+            ${{ steps.meta.outputs.csv_file_adsb }}
+            ${{ steps.meta.outputs.csv_file_community }}
+            ${{ steps.meta.outputs.zip_file }}
+            ${{ steps.meta.outputs.json_file_adsbx }}
+            ${{ steps.meta.outputs.zip_file_mictronics }}
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -1,350 +0,0 @@
-name: planequery-aircraft Daily Release
-
-on:
-  schedule:
-    # 6:00pm UTC every day - runs on default branch, triggers both
-    - cron: "0 06 * * *"
-  workflow_dispatch:
-
-permissions:
-  contents: write
-  actions: write
-
-jobs:
-  trigger-releases:
-    runs-on: ubuntu-latest
-    if: github.event_name == 'schedule'
-    steps:
-      - name: Trigger main branch release
-        uses: actions/github-script@v7
-        with:
-          script: |
-            await github.rest.actions.createWorkflowDispatch({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              workflow_id: 'planequery-aircraft-daily-release.yaml',
-              ref: 'main'
-            });
-      
-      - name: Trigger develop branch release
-        uses: actions/github-script@v7
-        with:
-          script: |
-            await github.rest.actions.createWorkflowDispatch({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              workflow_id: 'planequery-aircraft-daily-release.yaml',
-              ref: 'develop'
-            });
-
-  build-faa:
-    runs-on: ubuntu-24.04-arm
-    if: github.event_name != 'schedule'
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-
-      - name: Setup Python
-        uses: actions/setup-python@v6
-        with:
-          python-version: "3.14"
-
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install -r requirements.txt
-
-      - name: Run FAA release script
-        run: |
-          python src/create_daily_planequery_aircraft_release.py
-          ls -lah data/faa_releasable
-          ls -lah data/planequery_aircraft
-
-      - name: Upload FAA artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          name: faa-release
-          path: |
-            data/planequery_aircraft/planequery_aircraft_faa_*.csv
-            data/faa_releasable/ReleasableAircraft_*.zip
-          retention-days: 1
-
-  adsb-extract:
-    runs-on: ubuntu-24.04-arm
-    if: github.event_name != 'schedule'
-    outputs:
-      manifest-exists: ${{ steps.check.outputs.exists }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-
-      - name: Setup Python
-        uses: actions/setup-python@v6
-        with:
-          python-version: "3.14"
-
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install -r requirements.txt
-
-      - name: Download and extract ADS-B data
-        run: |
-          python -m src.adsb.download_and_list_icaos
-          ls -lah data/output/
-
-      - name: Check manifest exists
-        id: check
-        run: |
-          if ls data/output/icao_manifest_*.txt 1>/dev/null 2>&1; then
-            echo "exists=true" >> "$GITHUB_OUTPUT"
-          else
-            echo "exists=false" >> "$GITHUB_OUTPUT"
-          fi
-
-      - name: Create tar of extracted data
-        run: |
-          cd data/output
-          tar -cf extracted_data.tar *-planes-readsb-prod-0.tar_0 icao_manifest_*.txt
-          ls -lah extracted_data.tar
-
-      - name: Upload extracted data
-        uses: actions/upload-artifact@v4
-        with:
-          name: adsb-extracted
-          path: data/output/extracted_data.tar
-          retention-days: 1
-          compression-level: 0  # Already compressed trace files
-
-  adsb-map:
-    runs-on: ubuntu-24.04-arm
-    needs: adsb-extract
-    if: github.event_name != 'schedule' && needs.adsb-extract.outputs.manifest-exists == 'true'
-    strategy:
-      fail-fast: false
-      matrix:
-        chunk: [0, 1, 2, 3]
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-
-      - name: Setup Python
-        uses: actions/setup-python@v6
-        with:
-          python-version: "3.14"
-
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install -r requirements.txt
-
-      - name: Download extracted data
-        uses: actions/download-artifact@v4
-        with:
-          name: adsb-extracted
-          path: data/output/
-
-      - name: Extract tar
-        run: |
-          cd data/output
-          tar -xf extracted_data.tar
-          rm extracted_data.tar
-          echo "=== Contents of data/output ==="
-          ls -lah
-          echo "=== Looking for manifest ==="
-          cat icao_manifest_*.txt | head -20 || echo "No manifest found"
-          echo "=== Looking for extracted dirs ==="
-          ls -d *-planes-readsb-prod-0* 2>/dev/null || echo "No extracted dirs"
-
-      - name: Process chunk ${{ matrix.chunk }}
-        run: |
-          python -m src.adsb.process_icao_chunk --chunk-id ${{ matrix.chunk }} --total-chunks 4
-          mkdir -p data/output/adsb_chunks
-          ls -lah data/output/adsb_chunks/ || echo "No chunks created"
-
-      - name: Upload chunk artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          name: adsb-chunk-${{ matrix.chunk }}
-          path: data/output/adsb_chunks/
-          retention-days: 1
-
-  adsb-reduce:
-    runs-on: ubuntu-24.04-arm
-    needs: adsb-map
-    if: github.event_name != 'schedule'
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-
-      - name: Setup Python
-        uses: actions/setup-python@v6
-        with:
-          python-version: "3.14"
-
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install -r requirements.txt
-
-      - name: Download all chunk artifacts
-        uses: actions/download-artifact@v4
-        with:
-          pattern: adsb-chunk-*
-          path: data/output/adsb_chunks/
-          merge-multiple: true
-
-      - name: Debug downloaded files
-        run: |
-          echo "=== Listing data/ ==="
-          find data/ -type f 2>/dev/null | head -50 || echo "No files in data/"
-          echo "=== Looking for parquet files ==="
-          find . -name "*.parquet" 2>/dev/null | head -20 || echo "No parquet files found"
-
-      - name: Combine chunks to CSV
-        run: |
-          mkdir -p data/output/adsb_chunks
-          ls -lah data/output/adsb_chunks/ || echo "Directory empty or does not exist"
-          python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks
-          ls -lah data/planequery_aircraft/
-
-      - name: Upload ADS-B artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          name: adsb-release
-          path: data/planequery_aircraft/planequery_aircraft_adsb_*.csv
-          retention-days: 1
-
-  build-community:
-    runs-on: ubuntu-latest
-    if: github.event_name != 'schedule'
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-
-      - name: Setup Python
-        uses: actions/setup-python@v6
-        with:
-          python-version: "3.14"
-
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install pandas
-
-      - name: Run Community release script
-        run: |
-          python -m src.contributions.create_daily_community_release
-          ls -lah data/planequery_aircraft
-
-      - name: Upload Community artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          name: community-release
-          path: data/planequery_aircraft/planequery_aircraft_community_*.csv
-          retention-days: 1
-
-  create-release:
-    runs-on: ubuntu-latest
-    needs: [build-faa, adsb-reduce, build-community]
-    if: github.event_name != 'schedule'
-    steps:
-      - name: Download FAA artifacts
-        uses: actions/download-artifact@v4
-        with:
-          name: faa-release
-          path: artifacts/faa
-
-      - name: Download ADS-B artifacts
-        uses: actions/download-artifact@v4
-        with:
-          name: adsb-release
-          path: artifacts/adsb
-
-      - name: Download Community artifacts
-        uses: actions/download-artifact@v4
-        with:
-          name: community-release
-          path: artifacts/community
-
-      - name: Debug artifact structure
-        run: |
-          echo "=== FAA artifacts ==="
-          find artifacts/faa -type f 2>/dev/null || echo "No files found in artifacts/faa"
-          echo "=== ADS-B artifacts ==="
-          find artifacts/adsb -type f 2>/dev/null || echo "No files found in artifacts/adsb"
-          echo "=== Community artifacts ==="
-          find artifacts/community -type f 2>/dev/null || echo "No files found in artifacts/community"
-
-      - name: Prepare release metadata
-        id: meta
-        run: |
-          DATE=$(date -u +"%Y-%m-%d")
-          BRANCH_NAME="${GITHUB_REF#refs/heads/}"
-          BRANCH_SUFFIX=""
-          if [ "$BRANCH_NAME" = "main" ]; then
-            BRANCH_SUFFIX="-main"
-          elif [ "$BRANCH_NAME" = "develop" ]; then
-            BRANCH_SUFFIX="-develop"
-          fi
-          TAG="planequery-aircraft-${DATE}${BRANCH_SUFFIX}"
-          
-          # Find files from artifacts
-          CSV_FILE_FAA=$(ls artifacts/faa/data/planequery_aircraft/planequery_aircraft_faa_*.csv | head -1)
-          CSV_BASENAME_FAA=$(basename "$CSV_FILE_FAA")
-          CSV_FILE_ADSB=$(ls artifacts/adsb/planequery_aircraft_adsb_*.csv | head -1)
-          CSV_BASENAME_ADSB=$(basename "$CSV_FILE_ADSB")
-          CSV_FILE_COMMUNITY=$(ls artifacts/community/planequery_aircraft_community_*.csv 2>/dev/null | head -1 || echo "")
-          CSV_BASENAME_COMMUNITY=$(basename "$CSV_FILE_COMMUNITY" 2>/dev/null || echo "")
-          ZIP_FILE=$(ls artifacts/faa/data/faa_releasable/ReleasableAircraft_*.zip | head -1)
-          ZIP_BASENAME=$(basename "$ZIP_FILE")
-          
-          echo "date=$DATE" >> "$GITHUB_OUTPUT"
-          echo "tag=$TAG" >> "$GITHUB_OUTPUT"
-          echo "csv_file_faa=$CSV_FILE_FAA" >> "$GITHUB_OUTPUT"
-          echo "csv_basename_faa=$CSV_BASENAME_FAA" >> "$GITHUB_OUTPUT"
-          echo "csv_file_adsb=$CSV_FILE_ADSB" >> "$GITHUB_OUTPUT"
-          echo "csv_basename_adsb=$CSV_BASENAME_ADSB" >> "$GITHUB_OUTPUT"
-          echo "csv_file_community=$CSV_FILE_COMMUNITY" >> "$GITHUB_OUTPUT"
-          echo "csv_basename_community=$CSV_BASENAME_COMMUNITY" >> "$GITHUB_OUTPUT"
-          echo "zip_file=$ZIP_FILE" >> "$GITHUB_OUTPUT"
-          echo "zip_basename=$ZIP_BASENAME" >> "$GITHUB_OUTPUT"
-          echo "name=planequery-aircraft snapshot ($DATE)${BRANCH_SUFFIX}" >> "$GITHUB_OUTPUT"
-
-      - name: Delete existing release if exists
-        run: |
-          gh release delete "${{ steps.meta.outputs.tag }}" --yes 2>/dev/null || true
-          git push --delete origin "refs/tags/${{ steps.meta.outputs.tag }}" 2>/dev/null || true
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Create GitHub Release and upload assets
-        uses: softprops/action-gh-release@v2
-        with:
-          tag_name: ${{ steps.meta.outputs.tag }}
-          name: ${{ steps.meta.outputs.name }}
-          body: |
-            Automated daily snapshot generated at 06:00 UTC for ${{ steps.meta.outputs.date }}.
-
-            Assets:
-            - ${{ steps.meta.outputs.csv_basename_faa }}
-            - ${{ steps.meta.outputs.csv_basename_adsb }}
-            - ${{ steps.meta.outputs.csv_basename_community }}
-            - ${{ steps.meta.outputs.zip_basename }}
-          files: |
-            ${{ steps.meta.outputs.csv_file_faa }}
-            ${{ steps.meta.outputs.csv_file_adsb }}
-            ${{ steps.meta.outputs.csv_file_community }}
-            ${{ steps.meta.outputs.zip_file }}
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -0,0 +1,171 @@
+name: Process Historical FAA Data
+
+on:
+  workflow_dispatch:  # Manual trigger
+
+jobs:
+  # Builds the job matrix consumed by `process-chunk`: a JSON list of
+  # {"since": "YYYY-MM-DD", "until": "YYYY-MM-DD"} objects covering
+  # 2023-08-16 through 2026-01-01 in 4-day steps.
+  generate-matrix:
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    steps:
+      - name: Generate date ranges
+        id: set-matrix
+        run: |
+          python3 << 'EOF'
+          import json
+          import os
+          from datetime import datetime, timedelta
+
+          start = datetime(2023, 8, 16)
+          end = datetime(2026, 1, 1)
+          step = timedelta(days=4)
+
+          ranges = []
+          current = start
+
+          # Each chunk's "until" is the next chunk's "since"; the final chunk
+          # is clamped so it never runs past the end date.
+          while current < end:
+            chunk_end = min(current + step, end)
+            ranges.append({
+              "since": current.strftime("%Y-%m-%d"),
+              "until": chunk_end.strftime("%Y-%m-%d")
+            })
+            current = chunk_end
+
+          # Step outputs are written by appending "name=value" to the file
+          # named by $GITHUB_OUTPUT; the older `::set-output` stdout command
+          # is deprecated. json.dumps emits a single line, so no multi-line
+          # delimiter is needed.
+          with open(os.environ["GITHUB_OUTPUT"], "a", encoding="utf-8") as output_file:
+            output_file.write(f"matrix={json.dumps(ranges)}\n")
+          EOF
+
+  # Ensures a clone of the FAA scrape repository exists in the actions/cache
+  # entry that `process-chunk` restores.
+  clone-faa-repo:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Cache FAA repository
+        id: cache-faa-repo
+        uses: actions/cache@v4
+        with:
+          path: data/scrape-faa-releasable-aircraft
+          # Fixed key: a cache hit reuses whatever clone was saved under it.
+          key: faa-repo-v1
+          
+      # Clone only when the cache step above did not report a hit.
+      - name: Clone FAA repository
+        if: steps.cache-faa-repo.outputs.cache-hit != 'true'
+        run: |
+          mkdir -p data
+          git clone https://github.com/simonw/scrape-faa-releasable-aircraft data/scrape-faa-releasable-aircraft
+          echo "Repository cloned successfully"
+
+  # Runs src/get_historical_faa.py once per date range produced by
+  # `generate-matrix` and uploads the resulting CSVs as a per-range artifact.
+  process-chunk:
+    needs: [generate-matrix, clone-faa-repo]
+    runs-on: ubuntu-latest
+    strategy:
+      max-parallel: 5  # Process 5 chunks at a time
+      matrix:
+        # One matrix entry per {"since", "until"} object in the JSON output.
+        range: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+      
+      # Restore-only: uses the same path/key as `clone-faa-repo` and fails the
+      # job if the cache entry is missing.
+      - name: Restore FAA repository cache
+        uses: actions/cache/restore@v4
+        with:
+          path: data/scrape-faa-releasable-aircraft
+          key: faa-repo-v1
+          fail-on-cache-miss: true
+      
+      - name: Install dependencies
+        run: |
+          pip install -r requirements.txt
+      
+      # NOTE(review): whether "until" is treated as inclusive or exclusive is
+      # decided inside the script and is not visible here.
+      - name: Process chunk ${{ matrix.range.since }} to ${{ matrix.range.until }}
+        run: |
+          python src/get_historical_faa.py "${{ matrix.range.since }}" "${{ matrix.range.until }}"
+      
+      - name: Upload CSV artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: csv-${{ matrix.range.since }}-to-${{ matrix.range.until }}
+          path: data/faa_releasable_historical/*.csv
+          retention-days: 1
+
+  # Publishes every per-chunk CSV produced by `process-chunk` as assets of a
+  # single release tagged with the workflow run number.
+  create-release:
+    needs: process-chunk
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+    steps:
+      # With no `name` given, every artifact of this run is downloaded, each
+      # into its own sub-directory of `artifacts/`.
+      - name: Download all artifacts
+        uses: actions/download-artifact@v4
+        with:
+          path: artifacts
+
+      # Flatten the per-artifact directories into one folder for upload.
+      - name: Prepare release files
+        run: |
+          mkdir -p release-files
+          find artifacts -name "*.csv" -exec cp {} release-files/ \;
+          ls -lh release-files/
+
+      # Capture the actual generation time; `github.event.repository.updated_at`
+      # is the repository's own timestamp, not the time of this run.
+      - name: Record generation time
+        id: stamp
+        run: echo "generated=$(date -u +%Y-%m-%dT%H:%M:%SZ)" >> "$GITHUB_OUTPUT"
+
+      # Same major version as the daily release workflow.
+      - name: Create Release
+        uses: softprops/action-gh-release@v2
+        with:
+          tag_name: historical-faa-${{ github.run_number }}
+          name: Historical FAA Data Release ${{ github.run_number }}
+          body: |
+            Automated release of historical FAA aircraft data
+            Processing period: 2023-08-16 to 2026-01-01
+            Generated: ${{ steps.stamp.outputs.generated }}
+          files: release-files/*.csv
+          draft: false
+          prerelease: false
+
+  # Gathers every per-chunk CSV, runs scripts/concat_csvs.py over them, and
+  # publishes the CSVs found under data/openairframes/ as a combined release.
+  concatenate-and-release:
+    needs: process-chunk
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      - name: Install dependencies
+        run: |
+          pip install -r requirements.txt
+
+      # With no `name` given, every artifact of this run is downloaded, each
+      # into its own sub-directory of `artifacts/`.
+      - name: Download all artifacts
+        uses: actions/download-artifact@v4
+        with:
+          path: artifacts
+
+      # Flatten the per-artifact directories into the folder the chunk jobs
+      # originally wrote to.
+      - name: Prepare CSVs for concatenation
+        run: |
+          mkdir -p data/faa_releasable_historical
+          find artifacts -name "*.csv" -exec cp {} data/faa_releasable_historical/ \;
+          ls -lh data/faa_releasable_historical/
+
+      # NOTE(review): the release below uploads data/openairframes/*.csv, so
+      # the script presumably writes its output there - confirm.
+      - name: Concatenate all CSVs
+        run: |
+          python scripts/concat_csvs.py
+
+      # Capture the actual generation time; `github.event.repository.updated_at`
+      # is the repository's own timestamp, not the time of this run.
+      - name: Record generation time
+        id: stamp
+        run: echo "generated=$(date -u +%Y-%m-%dT%H:%M:%SZ)" >> "$GITHUB_OUTPUT"
+
+      # Same major version as the daily release workflow.
+      - name: Create Combined Release
+        uses: softprops/action-gh-release@v2
+        with:
+          tag_name: historical-faa-combined-${{ github.run_number }}
+          name: Historical FAA Data Combined Release ${{ github.run_number }}
+          body: |
+            Combined historical FAA aircraft data (all chunks concatenated)
+            Processing period: 2023-08-16 to 2026-01-01
+            Generated: ${{ steps.stamp.outputs.generated }}
+          files: data/openairframes/*.csv
+          draft: false
+          prerelease: false
@@ -48,29 +48,52 @@ jobs:
            git fetch origin "$branch_name"
            git checkout "$branch_name"
            
-            # Merge main into PR branch
            git config user.name "github-actions[bot]"
            git config user.email "github-actions[bot]@users.noreply.github.com"
            
-            if git merge origin/main -m "Merge main to update schema"; then
-              # Regenerate schema for this PR's submission (adds any new tags)
-              python -m src.contributions.regenerate_pr_schema || true
-              
-              # If there are changes, commit and push
-              if [ -n "$(git status --porcelain schemas/)" ]; then
-                git add schemas/
-                git commit -m "Update schema with new tags"
-                git push origin "$branch_name"
-                echo "  Updated PR #$pr_number with schema changes"
-              else
-                git push origin "$branch_name"
-                echo "  Merged main into PR #$pr_number"
+            # Get the community submission file(s) and schema from this branch
+            community_files=$(git diff --name-only origin/main...HEAD -- 'community/' 'schemas/')
+            
+            if [ -z "$community_files" ]; then
+              echo "  No community/schema files found in PR #$pr_number, skipping"
+              git checkout main
+              continue
+            fi
+            
+            echo "  Files to preserve: $community_files"
+            
+            # Save the community files content
+            mkdir -p /tmp/pr_files
+            for file in $community_files; do
+              if [ -f "$file" ]; then
+                mkdir -p "/tmp/pr_files/$(dirname "$file")"
+                cp "$file" "/tmp/pr_files/$file"
              fi
+            done
+            
+            # Reset branch to main (clean slate)
+            git reset --hard origin/main
+            
+            # Restore the community files
+            for file in $community_files; do
+              if [ -f "/tmp/pr_files/$file" ]; then
+                mkdir -p "$(dirname "$file")"
+                cp "/tmp/pr_files/$file" "$file"
+              fi
+            done
+            rm -rf /tmp/pr_files
+            
+            # Regenerate schema with current main + this submission's tags
+            python -m src.contributions.regenerate_pr_schema || true
+            
+            # Stage and commit all changes
+            git add community/ schemas/
+            if ! git diff --cached --quiet; then
+              git commit -m "Community submission (rebased on main)"
+              git push --force origin "$branch_name"
+              echo "  Rebased PR #$pr_number onto main"
            else
-              echo "  Merge conflict in PR #$pr_number, adding comment"
-              gh pr comment "$pr_number" --body $'⚠️ **Merge Conflict**\n\nAnother community submission was merged and this PR has conflicts.\n\nA maintainer may need to:\n1. Close this PR\n2. Remove the `approved` label from the original issue\n3. Re-add the `approved` label to regenerate the PR'
-              git merge --abort
-              fi
+              echo "  No changes needed for PR #$pr_number"
            fi
            
            git checkout main
@@ -281,4 +281,7 @@ read*lock
 .nx/

 # jsii-rosetta files
-type-fingerprints.txt
+type-fingerprints.txt
+
+notebooks/whatever.ipynb
+.snapshots/
@@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2026 PlaneQuery
+Copyright (c) 2026 OpenAirframes

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -1 +1,58 @@
-Downloads [`https://registry.faa.gov/database/ReleasableAircraft.zip`](https://registry.faa.gov/database/ReleasableAircraft.zip). Creates a daily GitHub Release at 06:00 UTC containing the unaltered `ReleasableAircraft.zip` and a derived CSV file with all data from FAA database since 2023-08-16. The FAA database updates daily at 05:30 UTC.
+# OpenAirframes.org
+
+OpenAirframes.org is an open-source, community-driven airframes database.
+
+The data includes:
+- Registration information from Civil Aviation Authorities (FAA)
+- Airline data (e.g., Air France)
+- Community contributions such as ownership details, military aircraft info, photos, and more
+
+---
+
+## For Users
+
+A daily release is created at **06:00 UTC** and includes:
+
+- **openairframes_community.csv**  
+  All community submissions
+
+- **openairframes_adsb.csv**  
+  Airframes dataset derived from ADSB.lol network data. For each UTC day, a row is created for every icao observed in that day’s ADS-B messages, using registration data from [tar1090-db](https://github.com/wiedehopf/tar1090-db) (ADSBExchange & Mictronics).
+Example Usage:
+```python
+import pandas as pd
+url = "https://github.com/PlaneQuery/OpenAirframes/releases/download/openairframes-2026-03-18-main/openairframes_adsb_2024-01-01_2026-03-17.csv.gz" # 1GB
+df = pd.read_csv(url)
+df
+```
+![](docs/images/df_adsb_example_0.png)
+- **openairframes_faa.csv**  
+  All [FAA registration data](https://www.faa.gov/licenses_certificates/aircraft_certification/aircraft_registry/releasable_aircraft_download) from 2023-08-16 to present (~260 MB)
+
+
+- **ReleasableAircraft_{date}.zip**  
+  A daily snapshot of the FAA database, which updates at **05:30 UTC**
+
+---
+
+## For Contributors
+
+Submit data via a [GitHub Issue](https://github.com/PlaneQuery/OpenAirframes/issues/new?template=community_submission.yaml) with your preferred attribution. Once approved, it will appear in the daily release. A leaderboard will be available in the future.
+All data is valuable. Examples include:
+- Celebrity ownership (with citations)
+- Photos
+- Internet capability
+- Military aircraft information
+- Unique facts (e.g., an airframe that crashed, performs aerobatics, etc.)
+
+Please try to follow the submission formatting guidelines. If you are struggling with them, that is fine—submit your data anyway and it will be formatted for you.
+
+---
+
+## For Developers
+All code, compute (GitHub Actions), and storage (releases) are in this GitHub repository. Improvements are welcome. Potential features include:
+- Web UI for data
+- Web UI for contributors
+- Additional export formats in the daily release
+- Data fusion from multiple sources in the daily release
+- Automated airframe data connectors, including (but not limited to) civil aviation authorities and airline APIs
@@ -1,19 +0,0 @@
-[
-  {
-    "contributor_uuid": "2981c3ee-8712-5f96-84bf-732eda515a3f",
-    "creation_timestamp": "2026-02-12T20:52:47.207684+00:00",
-    "registration_number": "N12345",
-    "tags": {
-      "internet": "starlink"
-    }
-  },
-  {
-    "contributor_uuid": "2981c3ee-8712-5f96-84bf-732eda515a3f",
-    "creation_timestamp": "2026-02-12T20:52:47.207684+00:00",
-    "tags": {
-      "internet": "viasat",
-      "owner": "John Doe"
-    },
-    "transponder_code_hex": "ABC123"
-  }
-]
@@ -0,0 +1,40 @@
+[
+  {
+    "contributor_name": "JohnSmith.com",
+    "contributor_uuid": "2981c3ee-8712-5f96-84bf-732eda515a3f",
+    "creation_timestamp": "2026-02-18T22:18:11.349009+00:00",
+    "registration_number": "ZM146",
+    "tags": {
+      "citation_0": "https://assets.publishing.service.gov.uk/media/5c07a65f40f0b6705f11cf37/10389.pdf",
+      "icao_aircraft_type": "L1J",
+      "manufacturer_icao": "LOCKHEED MARTIN",
+      "manufacturer_name": "Lockheed-martin",
+      "model": "F-35B Lightning II",
+      "operator": "Royal Air Force",
+      "operator_callsign": "RAFAIR",
+      "operator_icao": "RFR",
+      "serial_number": "BK-12",
+      "type_code": "VF35"
+    },
+    "transponder_code_hex": "43C81C"
+  },
+  {
+    "contributor_name": "JohnSmith.com",
+    "contributor_uuid": "2981c3ee-8712-5f96-84bf-732eda515a3f",
+    "creation_timestamp": "2026-02-18T22:18:11.349009+00:00",
+    "registration_number": "ZM148",
+    "tags": {
+      "citation_0": "https://assets.publishing.service.gov.uk/media/5c07a65f40f0b6705f11cf37/10389.pdf",
+      "icao_aircraft_type": "L1J",
+      "manufacturer_icao": "LOCKHEED MARTIN",
+      "manufacturer_name": "Lockheed-martin",
+      "model": "F-35B Lightning II",
+      "operator": "Royal Air Force",
+      "operator_callsign": "RAFAIR",
+      "operator_icao": "RFR",
+      "serial_number": "BK-14",
+      "type_code": "VF35"
+    },
+    "transponder_code_hex": "43C811"
+  }
+]
@@ -1,11 +0,0 @@
-#!/usr/bin/env python3
-import os
-import aws_cdk as cdk
-from stack import AdsbProcessingStack
-
-app = cdk.App()
-AdsbProcessingStack(app, "AdsbProcessingStack", env=cdk.Environment(
-    account=os.environ["CDK_DEFAULT_ACCOUNT"],
-    region=os.environ["CDK_DEFAULT_REGION"],
-))
-app.synth()
@@ -1,3 +0,0 @@
-{
-  "app": "python3 app.py"
-}
@@ -1,2 +0,0 @@
-aws-cdk-lib>=2.170.0
-constructs>=10.0.0
@@ -1,213 +0,0 @@
-import aws_cdk as cdk
-from aws_cdk import (
-    Stack,
-    Duration,
-    RemovalPolicy,
-    aws_s3 as s3,
-    aws_ecs as ecs,
-    aws_ec2 as ec2,
-    aws_ecr_assets,
-    aws_iam as iam,
-    aws_logs as logs,
-    aws_stepfunctions as sfn,
-    aws_stepfunctions_tasks as sfn_tasks,
-)
-from constructs import Construct
-from pathlib import Path
-
-
-class AdsbProcessingStack(Stack):
-    def __init__(self, scope: Construct, id: str, **kwargs):
-        super().__init__(scope, id, **kwargs)
-
-        # --- S3 bucket for intermediate and final results ---
-        bucket = s3.Bucket(
-            self, "ResultsBucket",
-            bucket_name="planequery-aircraft-dev",
-            removal_policy=RemovalPolicy.DESTROY,
-            auto_delete_objects=True,
-            lifecycle_rules=[
-                s3.LifecycleRule(
-                    prefix="intermediate/",
-                    expiration=Duration.days(7),
-                )
-            ],
-        )
-
-        # --- Use default VPC (no additional cost) ---
-        vpc = ec2.Vpc.from_lookup(
-            self, "Vpc",
-            is_default=True,
-        )
-
-        # --- ECS Cluster ---
-        cluster = ecs.Cluster(
-            self, "Cluster",
-            vpc=vpc,
-            container_insights=True,
-        )
-
-        # --- Log group ---
-        log_group = logs.LogGroup(
-            self, "LogGroup",
-            log_group_name="/adsb-processing",
-            removal_policy=RemovalPolicy.DESTROY,
-            retention=logs.RetentionDays.TWO_WEEKS,
-        )
-
-        # --- Docker images (built from local Dockerfiles) ---
-        adsb_dir = str(Path(__file__).parent.parent / "src" / "adsb")
-
-        worker_image = ecs.ContainerImage.from_asset(
-            adsb_dir,
-            file="Dockerfile.worker",
-            platform=cdk.aws_ecr_assets.Platform.LINUX_ARM64,
-        )
-        reducer_image = ecs.ContainerImage.from_asset(
-            adsb_dir,
-            file="Dockerfile.reducer",
-            platform=cdk.aws_ecr_assets.Platform.LINUX_ARM64,
-        )
-
-        # --- Task role (shared) ---
-        task_role = iam.Role(
-            self, "TaskRole",
-            assumed_by=iam.ServicePrincipal("ecs-tasks.amazonaws.com"),
-        )
-        bucket.grant_read_write(task_role)
-
-        # --- MAP: worker task definition ---
-        map_task_def = ecs.FargateTaskDefinition(
-            self, "MapTaskDef",
-            cpu=4096,           # 4 vCPU
-            memory_limit_mib=30720,  # 30 GB
-            task_role=task_role,
-            runtime_platform=ecs.RuntimePlatform(
-                cpu_architecture=ecs.CpuArchitecture.ARM64,
-                operating_system_family=ecs.OperatingSystemFamily.LINUX,
-            ),
-        )
-        map_container = map_task_def.add_container(
-            "worker",
-            image=worker_image,
-            logging=ecs.LogDrivers.aws_logs(
-                stream_prefix="map",
-                log_group=log_group,
-            ),
-            environment={
-                "S3_BUCKET": bucket.bucket_name,
-            },
-        )
-
-        # --- REDUCE: reducer task definition ---
-        reduce_task_def = ecs.FargateTaskDefinition(
-            self, "ReduceTaskDef",
-            cpu=4096,            # 4 vCPU
-            memory_limit_mib=30720,  # 30 GB — must hold full year in memory
-            task_role=task_role,
-            runtime_platform=ecs.RuntimePlatform(
-                cpu_architecture=ecs.CpuArchitecture.ARM64,
-                operating_system_family=ecs.OperatingSystemFamily.LINUX,
-            ),
-        )
-        reduce_container = reduce_task_def.add_container(
-            "reducer",
-            image=reducer_image,
-            logging=ecs.LogDrivers.aws_logs(
-                stream_prefix="reduce",
-                log_group=log_group,
-            ),
-            environment={
-                "S3_BUCKET": bucket.bucket_name,
-            },
-        )
-
-        # --- Step Functions ---
-
-        # Map task: run ECS Fargate for each date chunk
-        map_ecs_task = sfn_tasks.EcsRunTask(
-            self, "ProcessChunk",
-            integration_pattern=sfn.IntegrationPattern.RUN_JOB,
-            cluster=cluster,
-            task_definition=map_task_def,
-            launch_target=sfn_tasks.EcsFargateLaunchTarget(
-                platform_version=ecs.FargatePlatformVersion.LATEST,
-            ),
-            container_overrides=[
-                sfn_tasks.ContainerOverride(
-                    container_definition=map_container,
-                    environment=[
-                        sfn_tasks.TaskEnvironmentVariable(
-                            name="START_DATE",
-                            value=sfn.JsonPath.string_at("$.start_date"),
-                        ),
-                        sfn_tasks.TaskEnvironmentVariable(
-                            name="END_DATE",
-                            value=sfn.JsonPath.string_at("$.end_date"),
-                        ),
-                        sfn_tasks.TaskEnvironmentVariable(
-                            name="RUN_ID",
-                            value=sfn.JsonPath.string_at("$.run_id"),
-                        ),
-                    ],
-                )
-            ],
-            assign_public_ip=True,
-            subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC),
-            result_path="$.task_result",
-        )
-
-        # Map state — max 3 concurrent workers
-        map_state = sfn.Map(
-            self, "FanOutChunks",
-            items_path="$.chunks",
-            max_concurrency=3,
-            result_path="$.map_results",
-        )
-        map_state.item_processor(map_ecs_task)
-
-        # Reduce task: combine all chunk CSVs
-        reduce_ecs_task = sfn_tasks.EcsRunTask(
-            self, "ReduceResults",
-            integration_pattern=sfn.IntegrationPattern.RUN_JOB,
-            cluster=cluster,
-            task_definition=reduce_task_def,
-            launch_target=sfn_tasks.EcsFargateLaunchTarget(
-                platform_version=ecs.FargatePlatformVersion.LATEST,
-            ),
-            container_overrides=[
-                sfn_tasks.ContainerOverride(
-                    container_definition=reduce_container,
-                    environment=[
-                        sfn_tasks.TaskEnvironmentVariable(
-                            name="RUN_ID",
-                            value=sfn.JsonPath.string_at("$.run_id"),
-                        ),
-                        sfn_tasks.TaskEnvironmentVariable(
-                            name="GLOBAL_START_DATE",
-                            value=sfn.JsonPath.string_at("$.global_start_date"),
-                        ),
-                        sfn_tasks.TaskEnvironmentVariable(
-                            name="GLOBAL_END_DATE",
-                            value=sfn.JsonPath.string_at("$.global_end_date"),
-                        ),
-                    ],
-                )
-            ],
-            assign_public_ip=True,
-            subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC),
-        )
-
-        # Chain: fan-out map → reduce
-        definition = map_state.next(reduce_ecs_task)
-
-        sfn.StateMachine(
-            self, "Pipeline",
-            state_machine_name="adsb-map-reduce",
-            definition_body=sfn.DefinitionBody.from_chainable(definition),
-            timeout=Duration.hours(48),
-        )
-
-        # --- Outputs ---
-        cdk.CfnOutput(self, "BucketName", value=bucket.bucket_name)
-        cdk.CfnOutput(self, "StateMachineName", value="adsb-map-reduce")
@@ -1,640 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "06ae0319",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import clickhouse_connect\n",
-    "client = clickhouse_connect.get_client(\n",
-    "    host=os.environ[\"CLICKHOUSE_HOST\"],\n",
-    "    username=os.environ[\"CLICKHOUSE_USERNAME\"],\n",
-    "    password=os.environ[\"CLICKHOUSE_PASSWORD\"],\n",
-    "    secure=True,\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "779710f0",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df = client.query_df(\"SELECT time, icao,r,t,dbFlags,ownOp,year,desc,aircraft FROM adsb_messages Where time > '2024-01-01 00:00:00' AND time < '2024-01-02 00:00:00'\")\n",
-    "df_copy = df.copy()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "bf024da8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# -- military = dbFlags & 1; interesting = dbFlags & 2; PIA = dbFlags & 4; LADD = dbFlags & 8;"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "270607b5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df = load_raw_adsb_for_day(datetime(2024,1,1))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ac06a30e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df['aircraft']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "91edab3e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "COLUMNS = ['dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category', 'r', 't']\n",
-    "def compress_df(df):\n",
-    "    icao = df.name\n",
-    "    df[\"_signature\"] = df[COLUMNS].astype(str).agg('|'.join, axis=1)\n",
-    "    original_df = df.copy()\n",
-    "    df = df.groupby(\"_signature\", as_index=False).last() # check if it works with both last and first.\n",
-    "    # For each row, create a dict of non-empty column values. This is using sets and subsets...\n",
-    "    def get_non_empty_dict(row):\n",
-    "        return {col: row[col] for col in COLUMNS if row[col] != ''}\n",
-    "    \n",
-    "    df['_non_empty_dict'] = df.apply(get_non_empty_dict, axis=1)\n",
-    "    df['_non_empty_count'] = df['_non_empty_dict'].apply(len)\n",
-    "    \n",
-    "    # Check if row i's non-empty values are a subset of row j's non-empty values\n",
-    "    def is_subset_of_any(idx):\n",
-    "        row_dict = df.loc[idx, '_non_empty_dict']\n",
-    "        row_count = df.loc[idx, '_non_empty_count']\n",
-    "        \n",
-    "        for other_idx in df.index:\n",
-    "            if idx == other_idx:\n",
-    "                continue\n",
-    "            other_dict = df.loc[other_idx, '_non_empty_dict']\n",
-    "            other_count = df.loc[other_idx, '_non_empty_count']\n",
-    "            \n",
-    "            # Check if all non-empty values in current row match those in other row\n",
-    "            if all(row_dict.get(k) == other_dict.get(k) for k in row_dict.keys()):\n",
-    "                # If they match and other has more defined columns, current row is redundant\n",
-    "                if other_count > row_count:\n",
-    "                    return True\n",
-    "        return False\n",
-    "    \n",
-    "    # Keep rows that are not subsets of any other row\n",
-    "    keep_mask = ~df.index.to_series().apply(is_subset_of_any)\n",
-    "    df = df[keep_mask]\n",
-    "\n",
-    "    if len(df) > 1:\n",
-    "        original_df = original_df[original_df['_signature'].isin(df['_signature'])]\n",
-    "        value_counts = original_df[\"_signature\"].value_counts()\n",
-    "        max_signature = value_counts.idxmax()\n",
-    "        df = df[df['_signature'] == max_signature]\n",
-    "\n",
-    "    df['icao'] = icao\n",
-    "    df = df.drop(columns=['_non_empty_dict', '_non_empty_count', '_signature'])\n",
-    "    return df\n",
-    "\n",
-    "# df = df_copy\n",
-    "# df = df_copy.iloc[0:100000]\n",
-    "# df = df[df['r'] == \"N4131T\"]\n",
-    "# df = df[(df['icao'] == \"008081\")]\n",
-    "# df = df.iloc[0:500]\n",
-    "df['aircraft_category'] = df['aircraft'].apply(lambda x: x.get('category') if isinstance(x, dict) else None)\n",
-    "df = df.drop(columns=['aircraft'])\n",
-    "df = df.sort_values(['icao', 'time'])\n",
-    "df[COLUMNS] = df[COLUMNS].fillna('')\n",
-    "ORIGINAL_COLUMNS = df.columns.tolist()\n",
-    "df_compressed = df.groupby('icao',group_keys=False).apply(compress_df)\n",
-    "cols = df_compressed.columns.tolist()\n",
-    "cols.remove(\"icao\")\n",
-    "cols.insert(1, \"icao\")\n",
-    "df_compressed = df_compressed[cols]\n",
-    "df_compressed"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "efdfcb2c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df['aircraft_category'] = df['aircraft'].apply(lambda x: x.get('category') if isinstance(x, dict) else None)\n",
-    "df[~df['aircraft_category'].isna()]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "495c5025",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# SOME KIND OF MAP REDUCE SYSTEM\n",
-    "import os\n",
-    "\n",
-    "COLUMNS = ['dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category', 'r', 't']\n",
-    "def compress_df(df):\n",
-    "    icao = df.name\n",
-    "    df[\"_signature\"] = df[COLUMNS].astype(str).agg('|'.join, axis=1)\n",
-    "    \n",
-    "    # Compute signature counts before grouping (avoid copy)\n",
-    "    signature_counts = df[\"_signature\"].value_counts()\n",
-    "    \n",
-    "    df = df.groupby(\"_signature\", as_index=False).first() # check if it works with both last and first.\n",
-    "    # For each row, create a dict of non-empty column values. This is using sets and subsets...\n",
-    "    def get_non_empty_dict(row):\n",
-    "        return {col: row[col] for col in COLUMNS if row[col] != ''}\n",
-    "    \n",
-    "    df['_non_empty_dict'] = df.apply(get_non_empty_dict, axis=1)\n",
-    "    df['_non_empty_count'] = df['_non_empty_dict'].apply(len)\n",
-    "    \n",
-    "    # Check if row i's non-empty values are a subset of row j's non-empty values\n",
-    "    def is_subset_of_any(idx):\n",
-    "        row_dict = df.loc[idx, '_non_empty_dict']\n",
-    "        row_count = df.loc[idx, '_non_empty_count']\n",
-    "        \n",
-    "        for other_idx in df.index:\n",
-    "            if idx == other_idx:\n",
-    "                continue\n",
-    "            other_dict = df.loc[other_idx, '_non_empty_dict']\n",
-    "            other_count = df.loc[other_idx, '_non_empty_count']\n",
-    "            \n",
-    "            # Check if all non-empty values in current row match those in other row\n",
-    "            if all(row_dict.get(k) == other_dict.get(k) for k in row_dict.keys()):\n",
-    "                # If they match and other has more defined columns, current row is redundant\n",
-    "                if other_count > row_count:\n",
-    "                    return True\n",
-    "        return False\n",
-    "    \n",
-    "    # Keep rows that are not subsets of any other row\n",
-    "    keep_mask = ~df.index.to_series().apply(is_subset_of_any)\n",
-    "    df = df[keep_mask]\n",
-    "\n",
-    "    if len(df) > 1:\n",
-    "        # Use pre-computed signature counts instead of original_df\n",
-    "        remaining_sigs = df['_signature']\n",
-    "        sig_counts = signature_counts[remaining_sigs]\n",
-    "        max_signature = sig_counts.idxmax()\n",
-    "        df = df[df['_signature'] == max_signature]\n",
-    "\n",
-    "    df['icao'] = icao\n",
-    "    df = df.drop(columns=['_non_empty_dict', '_non_empty_count', '_signature'])\n",
-    "    return df\n",
-    "\n",
-    "# names of releases something like\n",
-    "# planequery_aircraft_adsb_2024-06-01T00-00-00Z.csv.gz\n",
-    "\n",
-    "# Let's build historical first. \n",
-    "\n",
-    "_ch_client = None\n",
-    "\n",
-    "def _get_clickhouse_client():\n",
-    "    \"\"\"Return a reusable ClickHouse client, with retry/backoff for transient DNS or connection errors.\"\"\"\n",
-    "    global _ch_client\n",
-    "    if _ch_client is not None:\n",
-    "        return _ch_client\n",
-    "\n",
-    "    import clickhouse_connect\n",
-    "    import time\n",
-    "\n",
-    "    max_retries = 5\n",
-    "    for attempt in range(1, max_retries + 1):\n",
-    "        try:\n",
-    "            _ch_client = clickhouse_connect.get_client(\n",
-    "                host=os.environ[\"CLICKHOUSE_HOST\"],\n",
-    "                username=os.environ[\"CLICKHOUSE_USERNAME\"],\n",
-    "                password=os.environ[\"CLICKHOUSE_PASSWORD\"],\n",
-    "                secure=True,\n",
-    "            )\n",
-    "            return _ch_client\n",
-    "        except Exception as e:\n",
-    "            wait = min(2 ** attempt, 30)\n",
-    "            print(f\"  ClickHouse connect attempt {attempt}/{max_retries} failed: {e}\")\n",
-    "            if attempt == max_retries:\n",
-    "                raise\n",
-    "            print(f\"  Retrying in {wait}s...\")\n",
-    "            time.sleep(wait)\n",
-    "\n",
-    "\n",
-    "def load_raw_adsb_for_day(day):\n",
-    "    \"\"\"Load raw ADS-B data for a day from cache or ClickHouse.\"\"\"\n",
-    "    from datetime import timedelta\n",
-    "    from pathlib import Path\n",
-    "    import pandas as pd\n",
-    "    import time\n",
-    "    \n",
-    "    start_time = day.replace(hour=0, minute=0, second=0, microsecond=0)\n",
-    "    end_time = start_time + timedelta(days=1)\n",
-    "    \n",
-    "    # Set up caching\n",
-    "    cache_dir = Path(\"data/adsb\")\n",
-    "    cache_dir.mkdir(parents=True, exist_ok=True)\n",
-    "    cache_file = cache_dir / f\"adsb_raw_{start_time.strftime('%Y-%m-%d')}.csv.zst\"\n",
-    "    \n",
-    "    # Check if cache exists\n",
-    "    if cache_file.exists():\n",
-    "        print(f\"  Loading from cache: {cache_file}\")\n",
-    "        df = pd.read_csv(cache_file, compression='zstd')\n",
-    "        df['time'] = pd.to_datetime(df['time'])\n",
-    "    else:\n",
-    "        # Format dates for the query\n",
-    "        start_str = start_time.strftime('%Y-%m-%d %H:%M:%S')\n",
-    "        end_str = end_time.strftime('%Y-%m-%d %H:%M:%S')\n",
-    "        \n",
-    "        max_retries = 3\n",
-    "        for attempt in range(1, max_retries + 1):\n",
-    "            try:\n",
-    "                client = _get_clickhouse_client()\n",
-    "                print(f\"  Querying ClickHouse for {start_time.strftime('%Y-%m-%d')}\")\n",
-    "                df = client.query_df(f\"SELECT time, icao,r,t,dbFlags,ownOp,year,desc,aircraft FROM adsb_messages Where time > '{start_str}' AND time < '{end_str}'\")\n",
-    "                break\n",
-    "            except Exception as e:\n",
-    "                wait = min(2 ** attempt, 30)\n",
-    "                print(f\"  Query attempt {attempt}/{max_retries} failed: {e}\")\n",
-    "                if attempt == max_retries:\n",
-    "                    raise\n",
-    "                # Reset client in case connection is stale\n",
-    "                global _ch_client\n",
-    "                _ch_client = None\n",
-    "                print(f\"  Retrying in {wait}s...\")\n",
-    "                time.sleep(wait)\n",
-    "        \n",
-    "        # Save to cache\n",
-    "        df.to_csv(cache_file, index=False, compression='zstd')\n",
-    "        print(f\"  Saved to cache: {cache_file}\")\n",
-    "    \n",
-    "    return df\n",
-    "\n",
-    "def load_historical_for_day(day):\n",
-    "    from pathlib import Path\n",
-    "    import pandas as pd\n",
-    "    \n",
-    "    df = load_raw_adsb_for_day(day)\n",
-    "    print(df)\n",
-    "    df['aircraft_category'] = df['aircraft'].apply(lambda x: x.get('category') if isinstance(x, dict) else None)\n",
-    "    df = df.drop(columns=['aircraft'])\n",
-    "    df = df.sort_values(['icao', 'time'])\n",
-    "    df[COLUMNS] = df[COLUMNS].fillna('')\n",
-    "    df_compressed = df.groupby('icao',group_keys=False).apply(compress_df)\n",
-    "    cols = df_compressed.columns.tolist()\n",
-    "    cols.remove('time')\n",
-    "    cols.insert(0, 'time')\n",
-    "    cols.remove(\"icao\")\n",
-    "    cols.insert(1, \"icao\")\n",
-    "    df_compressed = df_compressed[cols]\n",
-    "    return df_compressed\n",
-    "\n",
-    "\n",
-    "def concat_compressed_dfs(df_base, df_new):\n",
-    "    \"\"\"Concatenate base and new compressed dataframes, keeping the most informative row per ICAO.\"\"\"\n",
-    "    import pandas as pd\n",
-    "    \n",
-    "    # Combine both dataframes\n",
-    "    df_combined = pd.concat([df_base, df_new], ignore_index=True)\n",
-    "    \n",
-    "    # Sort by ICAO and time\n",
-    "    df_combined = df_combined.sort_values(['icao', 'time'])\n",
-    "    \n",
-    "    # Fill NaN values\n",
-    "    df_combined[COLUMNS] = df_combined[COLUMNS].fillna('')\n",
-    "    \n",
-    "    # Apply compression logic per ICAO to get the best row\n",
-    "    df_compressed = df_combined.groupby('icao', group_keys=False).apply(compress_df)\n",
-    "    \n",
-    "    # Sort by time\n",
-    "    df_compressed = df_compressed.sort_values('time')\n",
-    "    \n",
-    "    return df_compressed\n",
-    "\n",
-    "\n",
-    "def get_latest_aircraft_adsb_csv_df():\n",
-    "    \"\"\"Download and load the latest ADS-B CSV from GitHub releases.\"\"\"\n",
-    "    from get_latest_planequery_aircraft_release import download_latest_aircraft_adsb_csv\n",
-    "    \n",
-    "    import pandas as pd\n",
-    "    import re\n",
-    "    \n",
-    "    csv_path = download_latest_aircraft_adsb_csv()\n",
-    "    df = pd.read_csv(csv_path)\n",
-    "    df = df.fillna(\"\")\n",
-    "    \n",
-    "    # Extract start date from filename pattern: planequery_aircraft_adsb_{start_date}_{end_date}.csv\n",
-    "    match = re.search(r\"planequery_aircraft_adsb_(\\d{4}-\\d{2}-\\d{2})_\", str(csv_path))\n",
-    "    if not match:\n",
-    "        raise ValueError(f\"Could not extract date from filename: {csv_path.name}\")\n",
-    "    \n",
-    "    date_str = match.group(1)\n",
-    "    return df, date_str\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7f66acf7",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# SOME KIND OF MAP REDUCE SYSTEM\n",
-    "\n",
-    "\n",
-    "COLUMNS = ['dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category', 'r', 't']\n",
-    "def compress_df(df):\n",
-    "    icao = df.name\n",
-    "    df[\"_signature\"] = df[COLUMNS].astype(str).agg('|'.join, axis=1)\n",
-    "    original_df = df.copy()\n",
-    "    df = df.groupby(\"_signature\", as_index=False).first() # check if it works with both last and first.\n",
-    "    # For each row, create a dict of non-empty column values. This is using sets and subsets...\n",
-    "    def get_non_empty_dict(row):\n",
-    "        return {col: row[col] for col in COLUMNS if row[col] != ''}\n",
-    "    \n",
-    "    df['_non_empty_dict'] = df.apply(get_non_empty_dict, axis=1)\n",
-    "    df['_non_empty_count'] = df['_non_empty_dict'].apply(len)\n",
-    "    \n",
-    "    # Check if row i's non-empty values are a subset of row j's non-empty values\n",
-    "    def is_subset_of_any(idx):\n",
-    "        row_dict = df.loc[idx, '_non_empty_dict']\n",
-    "        row_count = df.loc[idx, '_non_empty_count']\n",
-    "        \n",
-    "        for other_idx in df.index:\n",
-    "            if idx == other_idx:\n",
-    "                continue\n",
-    "            other_dict = df.loc[other_idx, '_non_empty_dict']\n",
-    "            other_count = df.loc[other_idx, '_non_empty_count']\n",
-    "            \n",
-    "            # Check if all non-empty values in current row match those in other row\n",
-    "            if all(row_dict.get(k) == other_dict.get(k) for k in row_dict.keys()):\n",
-    "                # If they match and other has more defined columns, current row is redundant\n",
-    "                if other_count > row_count:\n",
-    "                    return True\n",
-    "        return False\n",
-    "    \n",
-    "    # Keep rows that are not subsets of any other row\n",
-    "    keep_mask = ~df.index.to_series().apply(is_subset_of_any)\n",
-    "    df = df[keep_mask]\n",
-    "\n",
-    "    if len(df) > 1:\n",
-    "        original_df = original_df[original_df['_signature'].isin(df['_signature'])]\n",
-    "        value_counts = original_df[\"_signature\"].value_counts()\n",
-    "        max_signature = value_counts.idxmax()\n",
-    "        df = df[df['_signature'] == max_signature]\n",
-    "\n",
-    "    df['icao'] = icao\n",
-    "    df = df.drop(columns=['_non_empty_dict', '_non_empty_count', '_signature'])\n",
-    "    return df\n",
-    "\n",
-    "# names of releases something like\n",
-    "# planequery_aircraft_adsb_2024-06-01T00-00-00Z.csv.gz\n",
-    "\n",
-    "# Let's build historical first. \n",
-    "\n",
-    "def load_raw_adsb_for_day(day):\n",
-    "    \"\"\"Load raw ADS-B data for a day from cache or ClickHouse.\"\"\"\n",
-    "    from datetime import timedelta\n",
-    "    import clickhouse_connect\n",
-    "    from pathlib import Path\n",
-    "    import pandas as pd\n",
-    "    \n",
-    "    start_time = day.replace(hour=0, minute=0, second=0, microsecond=0)\n",
-    "    end_time = start_time + timedelta(days=1)\n",
-    "    \n",
-    "    # Set up caching\n",
-    "    cache_dir = Path(\"data/adsb\")\n",
-    "    cache_dir.mkdir(parents=True, exist_ok=True)\n",
-    "    cache_file = cache_dir / f\"adsb_raw_{start_time.strftime('%Y-%m-%d')}.csv.zst\"\n",
-    "    \n",
-    "    # Check if cache exists\n",
-    "    if cache_file.exists():\n",
-    "        print(f\"  Loading from cache: {cache_file}\")\n",
-    "        df = pd.read_csv(cache_file, compression='zstd')\n",
-    "        df['time'] = pd.to_datetime(df['time'])\n",
-    "    else:\n",
-    "        # Format dates for the query\n",
-    "        start_str = start_time.strftime('%Y-%m-%d %H:%M:%S')\n",
-    "        end_str = end_time.strftime('%Y-%m-%d %H:%M:%S')\n",
-    "        \n",
-    "        client = clickhouse_connect.get_client(\n",
-    "            host=os.environ[\"CLICKHOUSE_HOST\"],\n",
-    "            username=os.environ[\"CLICKHOUSE_USERNAME\"],\n",
-    "            password=os.environ[\"CLICKHOUSE_PASSWORD\"],\n",
-    "            secure=True,\n",
-    "        )\n",
-    "        print(f\"  Querying ClickHouse for {start_time.strftime('%Y-%m-%d')}\")\n",
-    "        df = client.query_df(f\"SELECT time, icao,r,t,dbFlags,ownOp,year,desc,aircraft FROM adsb_messages Where time > '{start_str}' AND time < '{end_str}'\")\n",
-    "        \n",
-    "        # Save to cache\n",
-    "        df.to_csv(cache_file, index=False, compression='zstd')\n",
-    "        print(f\"  Saved to cache: {cache_file}\")\n",
-    "    \n",
-    "    return df\n",
-    "\n",
-    "def load_historical_for_day(day):\n",
-    "    from pathlib import Path\n",
-    "    import pandas as pd\n",
-    "    \n",
-    "    df = load_raw_adsb_for_day(day)\n",
-    "    \n",
-    "    df['aircraft_category'] = df['aircraft'].apply(lambda x: x.get('category') if isinstance(x, dict) else None)\n",
-    "    df = df.drop(columns=['aircraft'])\n",
-    "    df = df.sort_values(['icao', 'time'])\n",
-    "    df[COLUMNS] = df[COLUMNS].fillna('')\n",
-    "    df_compressed = df.groupby('icao',group_keys=False).apply(compress_df)\n",
-    "    cols = df_compressed.columns.tolist()\n",
-    "    cols.remove('time')\n",
-    "    cols.insert(0, 'time')\n",
-    "    cols.remove(\"icao\")\n",
-    "    cols.insert(1, \"icao\")\n",
-    "    df_compressed = df_compressed[cols]\n",
-    "    return df_compressed\n",
-    "\n",
-    "\n",
-    "def concat_compressed_dfs(df_base, df_new):\n",
-    "    \"\"\"Concatenate base and new compressed dataframes, keeping the most informative row per ICAO.\"\"\"\n",
-    "    import pandas as pd\n",
-    "    \n",
-    "    # Combine both dataframes\n",
-    "    df_combined = pd.concat([df_base, df_new], ignore_index=True)\n",
-    "    \n",
-    "    # Sort by ICAO and time\n",
-    "    df_combined = df_combined.sort_values(['icao', 'time'])\n",
-    "    \n",
-    "    # Fill NaN values\n",
-    "    df_combined[COLUMNS] = df_combined[COLUMNS].fillna('')\n",
-    "    \n",
-    "    # Apply compression logic per ICAO to get the best row\n",
-    "    df_compressed = df_combined.groupby('icao', group_keys=False).apply(compress_df)\n",
-    "    \n",
-    "    # Sort by time\n",
-    "    df_compressed = df_compressed.sort_values('time')\n",
-    "    \n",
-    "    return df_compressed\n",
-    "\n",
-    "\n",
-    "def get_latest_aircraft_adsb_csv_df():\n",
-    "    \"\"\"Download and load the latest ADS-B CSV from GitHub releases.\"\"\"\n",
-    "    from get_latest_planequery_aircraft_release import download_latest_aircraft_adsb_csv\n",
-    "    \n",
-    "    import pandas as pd\n",
-    "    import re\n",
-    "    \n",
-    "    csv_path = download_latest_aircraft_adsb_csv()\n",
-    "    df = pd.read_csv(csv_path)\n",
-    "    df = df.fillna(\"\")\n",
-    "    \n",
-    "    # Extract start date from filename pattern: planequery_aircraft_adsb_{start_date}_{end_date}.csv\n",
-    "    match = re.search(r\"planequery_aircraft_adsb_(\\d{4}-\\d{2}-\\d{2})_\", str(csv_path))\n",
-    "    if not match:\n",
-    "        raise ValueError(f\"Could not extract date from filename: {csv_path.name}\")\n",
-    "    \n",
-    "    date_str = match.group(1)\n",
-    "    return df, date_str\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e14c8363",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from datetime import datetime\n",
-    "df = load_historical_for_day(datetime(2024,1,1))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3874ba4d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "len(df)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "bcae50ad",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df[(df['icao'] == \"008081\")]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "50921c86",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df[df['icao'] == \"a4e1d2\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "8194d9aa",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df[df['r'] == \"N4131T\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1e3b7aa2",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df_compressed[df_compressed['icao'].duplicated(keep=False)]\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "40613bc1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import gzip\n",
-    "import json\n",
-    "\n",
-    "path = \"/Users/jonahgoode/Downloads/test_extract/traces/fb/trace_full_acbbfb.json\"\n",
-    "\n",
-    "with gzip.open(path, \"rt\", encoding=\"utf-8\") as f:\n",
-    "    data = json.load(f)\n",
-    "\n",
-    "print(type(data))\n",
-    "# use `data` here\n",
-    "import json\n",
-    "print(json.dumps(data, indent=2)[:2000])\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "320109b2",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# First, load the JSON to inspect its structure\n",
-    "import json\n",
-    "with open(\"/Users/jonahgoode/Documents/PlaneQuery/Other-Code/readsb-protobuf/webapp/src/db/aircrafts.json\", 'r') as f:\n",
-    "    data = json.load(f)\n",
-    "\n",
-    "# Check the structure\n",
-    "print(type(data))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "590134f4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "data['AC97E3']"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": ".venv",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.12.10"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
@@ -1,6 +1,6 @@
 {
  "$schema": "https://json-schema.org/draft/2020-12/schema",
-  "title": "PlaneQuery Aircraft Community Submission (v1)",
+  "title": "OpenAirframes Community Submission (v1)",
  "type": "object",
  "additionalProperties": false,
  "properties": {
@@ -12,7 +12,7 @@
      "type": "string",
      "pattern": "^[0-9A-F]{6}$"
    },
-    "planequery_airframe_id": {
+    "openairframes_id": {
      "type": "string",
      "minLength": 1
    },
@@ -46,21 +46,45 @@
    },
    "tags": {
      "type": "object",
-      "description": "Community-defined tags. New tags can be added, but must use consistent types.",
+      "description": "Additional community-defined tags as key/value pairs (values may be scalar, array, or object).",
      "propertyNames": {
        "type": "string",
        "pattern": "^[a-z][a-z0-9_]{0,63}$"
      },
-      "properties": {
-        "internet": {
-          "type": "string"
-        },
-        "owner": {
-          "type": "string"
-        }
-      },
      "additionalProperties": {
        "$ref": "#/$defs/tagValue"
+      },
+      "properties": {
+        "citation_0": {
+          "type": "string"
+        },
+        "icao_aircraft_type": {
+          "type": "string"
+        },
+        "manufacturer_icao": {
+          "type": "string"
+        },
+        "manufacturer_name": {
+          "type": "string"
+        },
+        "model": {
+          "type": "string"
+        },
+        "operator": {
+          "type": "string"
+        },
+        "operator_callsign": {
+          "type": "string"
+        },
+        "operator_icao": {
+          "type": "string"
+        },
+        "serial_number": {
+          "type": "string"
+        },
+        "type_code": {
+          "type": "string"
+        }
      }
    }
  },
@@ -79,7 +103,7 @@
        },
        {
          "required": [
-            "planequery_airframe_id"
+            "openairframes_id"
          ]
        }
      ]
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+import re
+from pathlib import Path
+import polars as pl
+
+# Locate every per-run CSV.gz inside the downloaded artifact directories
+artifacts_dir = Path("downloads/adsb_artifacts")
+files = sorted(artifacts_dir.glob("*/openairframes_adsb_*.csv.gz"))
+
+if not files:
+    raise SystemExit("No CSV.gz files found in downloads/adsb_artifacts/")
+
+print(f"Found {len(files)} files to concatenate")
+
+# Expected name: openairframes_adsb_<start>_<end>.csv.gz; the annotation is quoted so it is never evaluated at import time
+def extract_dates(path: Path) -> "tuple[str | None, str | None]":
+    """Return the (start, end) YYYY-MM-DD strings parsed from the filename, or (None, None) if it does not match."""
+    m = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})\.csv\.gz", path.name)
+    if m:
+        return m.group(1), m.group(2)
+    return None, None
+
+# Gather the start/end dates of every file whose name matches the pattern
+all_dates = []
+for f in files:
+    start, end = extract_dates(f)
+    if start and end:
+        all_dates.extend([start, end])
+        print(f"  {f.name}: {start} to {end}")
+
+if not all_dates:
+    raise SystemExit("Could not extract dates from filenames")
+
+# YYYY-MM-DD strings order chronologically under plain string comparison
+earliest = min(all_dates)
+latest = max(all_dates)
+print(f"\nDate range: {earliest} to {latest}")
+
+# Read every discovered file (including any whose name yielded no dates) and stack them
+print("\nReading and concatenating files...")
+frames = [pl.read_csv(f) for f in files]
+df = pl.concat(frames, how="vertical", rechunk=True)
+
+# Name the combined file after the overall date range and write it
+output_path = Path("downloads") / f"openairframes_adsb_{earliest}_{latest}.csv.gz"
+output_path.parent.mkdir(parents=True, exist_ok=True)
+df.write_csv(output_path, compression="gzip")
+
+print(f"\nWrote {output_path} with {df.height:,} rows")
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+# Destination root for every downloaded artifact
+OUT_DIR="downloads/adsb_artifacts"
+mkdir -p "$OUT_DIR"
+
+# Repository whose workflow runs are inspected
+REPO="ggman12/OpenAirframes"
+
+# Walk the 15 most recent runs of the multi-day workflow, one run ID per line
+gh run list \
+  --repo "$REPO" \
+  --workflow adsb-to-aircraft-multiple-day-run.yaml \
+  --limit 15 \
+  --json databaseId \
+  --jq '.[].databaseId' | while read -r run_id; do
+
+  echo "Checking run ID: $run_id"
+
+  # Ask the API for this run's artifacts and keep only names shaped like
+  # openairframes_adsb-YYYY-MM-DD-YYYY-MM-DD (i.e. carrying a second date)
+  gh api \
+    --paginate \
+    "repos/$REPO/actions/runs/$run_id/artifacts" \
+    --jq '.artifacts[] | select(.name | test("^openairframes_adsb-[0-9]{4}-[0-9]{2}-[0-9]{2}-[0-9]{4}-[0-9]{2}-[0-9]{2}$")) | .name' | while read -r artifact_name; do
+
+    target="$OUT_DIR/$artifact_name"
+    # A non-empty directory for this artifact means it was already fetched
+    if [[ -d "$target" && -n "$(ls -A "$target" 2>/dev/null)" ]]; then
+      echo "  Skipping (already exists): $artifact_name"
+      continue
+    fi
+
+    echo "  Downloading: $artifact_name"
+    gh run download "$run_id" --repo "$REPO" \
+      --name "$artifact_name" --dir "$target"
+  done
+done
+
+echo "Download complete! Files saved to downloads/adsb_artifacts/"
@@ -0,0 +1,182 @@
+#!/usr/bin/env python3
+"""
+Download and concatenate artifacts from a specific set of workflow runs.
+
+Usage:
+    python scripts/download_and_concat_runs.py triggered_runs_20260216_123456.json
+"""
+
+import argparse
+import json
+import os
+import subprocess
+import sys
+from pathlib import Path
+
+
+def download_run_artifact(run_id, output_dir):
+    """Download artifact from a specific workflow run."""
+    print(f"  Downloading artifacts from run {run_id}...")
+    
+    cmd = [
+        'gh', 'run', 'download', str(run_id),
+        '--pattern', 'openairframes_adsb-*',
+        '--dir', output_dir
+    ]
+    
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    
+    if result.returncode == 0:
+        print(f"  ✓ Downloaded")
+        return True
+    else:
+        if "no artifacts" in result.stderr.lower():
+            print(f"  ⚠ No artifacts found (workflow may still be running)")
+        else:
+            print(f"  ✗ Failed: {result.stderr}")
+        return False
+
+
+def find_csv_files(download_dir):
+    """Find all CSV.gz files in the download directory."""
+    csv_files = []
+    for root, dirs, files in os.walk(download_dir):
+        for file in files:
+            if file.endswith('.csv.gz'):
+                csv_files.append(os.path.join(root, file))
+    return sorted(csv_files)
+
+
+def concatenate_csv_files(csv_files, output_file):
+    """Concatenate CSV files in order, preserving headers."""
+    import gzip
+    
+    print(f"\nConcatenating {len(csv_files)} CSV files...")
+    
+    with gzip.open(output_file, 'wt') as outf:
+        header_written = False
+        
+        for i, csv_file in enumerate(csv_files, 1):
+            print(f"  [{i}/{len(csv_files)}] Processing {os.path.basename(csv_file)}")
+            
+            with gzip.open(csv_file, 'rt') as inf:
+                lines = inf.readlines()
+                
+                if not header_written:
+                    # Write header from first file
+                    outf.writelines(lines)
+                    header_written = True
+                else:
+                    # Skip header for subsequent files
+                    outf.writelines(lines[1:])
+    
+    print(f"\n✓ Concatenated CSV saved to: {output_file}")
+    
+    # Show file size
+    size_mb = os.path.getsize(output_file) / (1024 * 1024)
+    print(f"  Size: {size_mb:.1f} MB")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Download and concatenate artifacts from workflow runs'
+    )
+    parser.add_argument(
+        'runs_file',
+        help='JSON file containing run IDs (from run_historical_adsb_action.py)'
+    )
+    parser.add_argument(
+        '--output-dir',
+        default='./downloads/historical_concat',
+        help='Directory for downloads (default: ./downloads/historical_concat)'
+    )
+    parser.add_argument(
+        '--wait',
+        action='store_true',
+        help='Wait for workflows to complete before downloading'
+    )
+    
+    args = parser.parse_args()
+    
+    # Load run IDs
+    if not os.path.exists(args.runs_file):
+        print(f"Error: File not found: {args.runs_file}")
+        sys.exit(1)
+    
+    with open(args.runs_file, 'r') as f:
+        data = json.load(f)
+    
+    runs = data['runs']
+    start_date = data['start_date']
+    end_date = data['end_date']
+    
+    print("=" * 60)
+    print("Download and Concatenate Historical Artifacts")
+    print("=" * 60)
+    print(f"Date range: {start_date} to {end_date}")
+    print(f"Workflow runs: {len(runs)}")
+    print(f"Output directory: {args.output_dir}")
+    print("=" * 60)
+    
+    # Create output directory
+    os.makedirs(args.output_dir, exist_ok=True)
+    
+    # Wait for workflows to complete if requested
+    if args.wait:
+        print("\nWaiting for workflows to complete...")
+        for run_info in runs:
+            run_id = run_info['run_id']
+            print(f"  Checking run {run_id}...")
+            
+            cmd = ['gh', 'run', 'watch', str(run_id)]
+            subprocess.run(cmd)
+    
+    # Download artifacts
+    print("\nDownloading artifacts...")
+    successful_downloads = 0
+    
+    for i, run_info in enumerate(runs, 1):
+        run_id = run_info['run_id']
+        print(f"\n[{i}/{len(runs)}] Run {run_id} ({run_info['start']} to {run_info['end']})")
+        
+        if download_run_artifact(run_id, args.output_dir):
+            successful_downloads += 1
+    
+    print(f"\n\nDownload Summary: {successful_downloads}/{len(runs)} artifacts downloaded")
+    
+    if successful_downloads == 0:
+        print("\nNo artifacts downloaded. Workflows may still be running.")
+        print("Use --wait to wait for completion, or try again later.")
+        sys.exit(1)
+    
+    # Find all CSV files
+    csv_files = find_csv_files(args.output_dir)
+    
+    if not csv_files:
+        print("\nError: No CSV files found in download directory")
+        sys.exit(1)
+    
+    print(f"\nFound {len(csv_files)} CSV file(s):")
+    for csv_file in csv_files:
+        print(f"  - {os.path.basename(csv_file)}")
+    
+    # Concatenate
+    # Calculate actual end date for filename (end_date - 1 day since it's exclusive)
+    from datetime import datetime, timedelta
+    end_dt = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=1)
+    actual_end = end_dt.strftime('%Y-%m-%d')
+    
+    output_file = os.path.join(
+        args.output_dir,
+        f"openairframes_adsb_{start_date}_{actual_end}.csv.gz"
+    )
+    
+    concatenate_csv_files(csv_files, output_file)
+    
+    print("\n" + "=" * 60)
+    print("Done!")
+    print("=" * 60)
+
+
+if __name__ == '__main__':
+    main()
@@ -0,0 +1,215 @@
+#!/usr/bin/env python3
+"""
+Script to trigger adsb-to-aircraft-multiple-day-run workflow runs in monthly chunks.
+
+Usage:
+    python scripts/run_historical_adsb_action.py --start-date 2025-01-01 --end-date 2025-06-01
+"""
+
+import argparse
+import subprocess
+import sys
+from datetime import datetime, timedelta
+from calendar import monthrange
+
+
+def generate_monthly_chunks(start_date_str, end_date_str):
+    """Generate date ranges in monthly chunks from start to end date.
+    
+    End dates are exclusive (e.g., to process Jan 1-31, end_date should be Feb 1).
+    """
+    start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
+    end_date = datetime.strptime(end_date_str, '%Y-%m-%d')
+    
+    chunks = []
+    current = start_date
+    
+    while current < end_date:
+        # Get the first day of the next month (exclusive end)
+        _, days_in_month = monthrange(current.year, current.month)
+        month_end = current.replace(day=days_in_month)
+        next_month_start = month_end + timedelta(days=1)
+        
+        # Don't go past the global end date
+        chunk_end = min(next_month_start, end_date)
+        
+        chunks.append({
+            'start': current.strftime('%Y-%m-%d'),
+            'end': chunk_end.strftime('%Y-%m-%d')
+        })
+        
+        # Move to first day of next month
+        if next_month_start >= end_date:
+            break
+        current = next_month_start
+    
+    return chunks
+
+
+def trigger_workflow(start_date, end_date, repo='ggman12/OpenAirframes', branch='main', dry_run=False):
+    """Trigger the adsb-to-aircraft-multiple-day-run workflow via GitHub CLI."""
+    cmd = [
+        'gh', 'workflow', 'run', 'adsb-to-aircraft-multiple-day-run.yaml',
+        '--repo', repo,
+        '--ref', branch,
+        '-f', f'start_date={start_date}',
+        '-f', f'end_date={end_date}'
+    ]
+    
+    if dry_run:
+        print(f"[DRY RUN] Would run: {' '.join(cmd)}")
+        return True, None
+    
+    print(f"Triggering workflow: {start_date} to {end_date} (on {branch})")
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    
+    if result.returncode == 0:
+        print(f"✓ Successfully triggered workflow for {start_date} to {end_date}")
+        
+        # Get the run ID of the workflow we just triggered
+        # Wait a moment for it to appear
+        import time
+        time.sleep(2)
+        
+        # Get the most recent run (should be the one we just triggered)
+        list_cmd = [
+            'gh', 'run', 'list',
+            '--repo', repo,
+            '--workflow', 'adsb-to-aircraft-multiple-day-run.yaml',
+            '--branch', branch,
+            '--limit', '1',
+            '--json', 'databaseId',
+            '--jq', '.[0].databaseId'
+        ]
+        list_result = subprocess.run(list_cmd, capture_output=True, text=True)
+        run_id = list_result.stdout.strip() if list_result.returncode == 0 else None
+        
+        return True, run_id
+    else:
+        print(f"✗ Failed to trigger workflow for {start_date} to {end_date}")
+        print(f"Error: {result.stderr}")
+        return False, None
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Trigger adsb-to-aircraft-multiple-day-run workflow runs in monthly chunks'
+    )
+    parser.add_argument(
+        '--start-date', '--start_date',
+        dest='start_date',
+        required=True,
+        help='Start date in YYYY-MM-DD format (inclusive)'
+    )
+    parser.add_argument(
+        '--end-date', '--end_date',
+        dest='end_date',
+        required=True,
+        help='End date in YYYY-MM-DD format (exclusive)'
+    )
+    parser.add_argument(
+        '--repo',
+        type=str,
+        default='ggman12/OpenAirframes',
+        help='GitHub repository (default: ggman12/OpenAirframes)'
+    )
+    parser.add_argument(
+        '--branch',
+        type=str,
+        default='main',
+        help='Branch to run the workflow on (default: main)'
+    )
+    parser.add_argument(
+        '--dry-run',
+        action='store_true',
+        help='Print commands without executing them'
+    )
+    parser.add_argument(
+        '--delay',
+        type=int,
+        default=5,
+        help='Delay in seconds between workflow triggers (default: 5)'
+    )
+    
+    args = parser.parse_args()
+    
+    # Validate dates
+    try:
+        start = datetime.strptime(args.start_date, '%Y-%m-%d')
+        end = datetime.strptime(args.end_date, '%Y-%m-%d')
+        if start > end:
+            print("Error: start_date must be before or equal to end_date")
+            sys.exit(1)
+    except ValueError as e:
+        print(f"Error: Invalid date format - {e}")
+        sys.exit(1)
+    
+    # Generate monthly chunks
+    chunks = generate_monthly_chunks(args.start_date, args.end_date)
+    
+    print(f"\nGenerating {len(chunks)} monthly workflow runs on branch '{args.branch}' (repo: {args.repo}):")
+    for i, chunk in enumerate(chunks, 1):
+        print(f"  {i}. {chunk['start']} to {chunk['end']}")
+    
+    if not args.dry_run:
+        response = input(f"\nProceed with triggering {len(chunks)} workflows on '{args.branch}'? [y/N]: ")
+        if response.lower() != 'y':
+            print("Cancelled.")
+            sys.exit(0)
+    
+    print()
+    
+    # Trigger workflows
+    import time
+    success_count = 0
+    triggered_runs = []
+    
+    for i, chunk in enumerate(chunks, 1):
+        print(f"\n[{i}/{len(chunks)}] ", end='')
+        
+        success, run_id = trigger_workflow(
+            chunk['start'],
+            chunk['end'],
+            repo=args.repo,
+            branch=args.branch,
+            dry_run=args.dry_run
+        )
+        
+        if success:
+            success_count += 1
+            if run_id:
+                triggered_runs.append({
+                    'run_id': run_id,
+                    'start': chunk['start'],
+                    'end': chunk['end']
+                })
+        
+        # Add delay between triggers (except for last one)
+        if i < len(chunks) and not args.dry_run:
+            time.sleep(args.delay)
+    
+    print(f"\n\nSummary: {success_count}/{len(chunks)} workflows triggered successfully")
+    
+    # Save triggered run IDs to a file
+    if triggered_runs and not args.dry_run:
+        import json
+        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+        runs_file = f"./output/triggered_runs_{timestamp}.json"
+        with open(runs_file, 'w') as f:
+            json.dump({
+                'start_date': args.start_date,
+                'end_date': args.end_date,
+                'repo': args.repo,
+                'branch': args.branch,
+                'runs': triggered_runs
+            }, f, indent=2)
+        print(f"\nRun IDs saved to: {runs_file}")
+        print(f"\nTo download and concatenate these artifacts, run:")
+        print(f"  python scripts/download_and_concat_runs.py {runs_file}")
+    
+    if success_count < len(chunks):
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+"""
+Run src.adsb.main from a temporary snapshot copy of src/ (under .snapshots/)
+so edits in the main working tree won't affect subprocess imports during the run.
+
+Usage:
+    python scripts/run_main_isolated.py 2026-01-01
+    python scripts/run_main_isolated.py --start_date 2026-01-01 --end_date 2026-01-03
+"""
+import argparse
+import os
+import shutil
+import subprocess
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+
+def run(
+    cmd: list[str],
+    *,
+    cwd: Path | None = None,
+    check: bool = True,
+) -> subprocess.CompletedProcess:
+    print(f"\n>>> {' '.join(cmd)}")
+    return subprocess.run(cmd, cwd=cwd, check=check)
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Run src.adsb.main in an isolated worktree")
+    parser.add_argument("date", nargs="?", help="Single date to process (YYYY-MM-DD)")
+    parser.add_argument("--start_date", help="Start date (inclusive, YYYY-MM-DD)")
+    parser.add_argument("--end_date", help="End date (exclusive, YYYY-MM-DD)")
+    parser.add_argument("--concat_with_latest_csv", action="store_true", help="Also concatenate with latest CSV from GitHub releases")
+    args = parser.parse_args()
+
+    if args.date and (args.start_date or args.end_date):
+        raise SystemExit("Use a single date or --start_date/--end_date, not both.")
+
+    if args.date:
+        datetime.strptime(args.date, "%Y-%m-%d")
+        main_args = ["--date", args.date]
+    else:
+        if not args.start_date or not args.end_date:
+            raise SystemExit("Provide --start_date and --end_date, or a single date.")
+        datetime.strptime(args.start_date, "%Y-%m-%d")
+        datetime.strptime(args.end_date, "%Y-%m-%d")
+        main_args = ["--start_date", args.start_date, "--end_date", args.end_date]
+
+    if args.concat_with_latest_csv:
+        main_args.append("--concat_with_latest_csv")
+
+    repo_root = Path(__file__).resolve().parents[1]
+    snapshots_root = repo_root / ".snapshots"
+    snapshots_root.mkdir(exist_ok=True)
+
+    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
+    snapshot_root = snapshots_root / f"run_{timestamp}"
+    snapshot_src = snapshot_root / "src"
+
+    exit_code = 0
+    try:
+        shutil.copytree(repo_root / "src", snapshot_src)
+
+        runner = (
+            "import sys, runpy; "
+            f"sys.path.insert(0, {repr(str(snapshot_root))}); "
+            f"sys.argv = ['src.adsb.main'] + {main_args!r}; "
+            "runpy.run_module('src.adsb.main', run_name='__main__')"
+        )
+        cmd = [sys.executable, "-c", runner]
+        run(cmd, cwd=repo_root)
+    except subprocess.CalledProcessError as exc:
+        exit_code = exc.returncode
+    finally:
+        shutil.rmtree(snapshot_root, ignore_errors=True)
+
+    return exit_code
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
@@ -0,0 +1,242 @@
+#!/usr/bin/env python3
+"""
+Parse TheAirTraffic Database CSV and produce community_submission.v1 JSON.
+
+Source: "TheAirTraffic Database - Aircraft 2.csv"
+Output: community/YYYY-MM-DD/theairtraffic_<date>.json  (date = UTC day the script runs)
+
+Categories in the spreadsheet columns (paired: name, registrations, separator):
+  Col  1-3:  Business
+  Col  4-6:  Government
+  Col  7-9:  People
+  Col 10-12: Sports
+  Col 13-15: Celebrity
+  Col 16-18: State Govt./Law
+  Col 19-21: Other
+  Col 22-24: Test Aircraft
+  Col 25-27: YouTubers
+  Col 28-30: Formula 1 VIP's
+  Col 31-33: Active GII's and GIII's  (test/demo aircraft)
+  Col 34-37: Russia & Ukraine          (extra col for old/new)
+  Col 38-40: Helicopters & Blimps
+  Col 41-43: Unique Reg's
+  Col 44-46: Saudi & UAE
+  Col 47-49: Schools
+  Col 50-52: Special Charter
+  Col 53-55: Unknown Owners
+  Col 56-59: Frequent Flyers           (extra cols: name, aircraft, logged, hours)
+"""
+
+import csv
+import json
+import hashlib
+import re
+import sys
+import uuid
+from datetime import datetime, timezone
+from pathlib import Path
+
+# ── Category mapping ────────────────────────────────────────────────────────
+# Each entry: (name_col, reg_col, owner_category_tags)
+# name_col / reg_col are 0-based indexes into a CSV row: the owner-name cell
+# and the cell holding that owner's registrations.
+# owner_category_tags is a dict of tag keys to add beyond "owner"
+CATEGORY_COLUMNS = [
+    # (name_col, reg_col, {tag_key: tag_value, ...})
+    (1,  2,  {"owner_category_0": "business"}),
+    (4,  5,  {"owner_category_0": "government"}),
+    (7,  8,  {"owner_category_0": "celebrity"}),
+    (10, 11, {"owner_category_0": "sports"}),
+    (13, 14, {"owner_category_0": "celebrity"}),
+    (16, 17, {"owner_category_0": "government", "owner_category_1": "law_enforcement"}),
+    (19, 20, {"owner_category_0": "other"}),
+    (22, 23, {"owner_category_0": "test_aircraft"}),
+    (25, 26, {"owner_category_0": "youtuber", "owner_category_1": "celebrity"}),
+    (28, 29, {"owner_category_0": "celebrity", "owner_category_1": "motorsport"}),
+    (31, 32, {"owner_category_0": "test_aircraft"}),
+    # Russia & Ukraine: col 34=name, col 35 or 36 may have reg
+    (34, 35, {"owner_category_0": "russia_ukraine"}),
+    (38, 39, {"owner_category_0": "celebrity", "category": "helicopter_or_blimp"}),
+    (41, 42, {"owner_category_0": "other"}),
+    (44, 45, {"owner_category_0": "government", "owner_category_1": "royal_family"}),
+    (47, 48, {"owner_category_0": "education"}),
+    (50, 51, {"owner_category_0": "charter"}),
+    (53, 54, {"owner_category_0": "unknown"}),
+    (56, 57, {"owner_category_0": "celebrity"}),   # Frequent Flyers name col, aircraft col
+]
+
+# First data row index (0-based) in the CSV; earlier rows are never read
+DATA_START_ROW = 4
+
+# ── Contributor info ────────────────────────────────────────────────────────
+CONTRIBUTOR_NAME = "TheAirTraffic"
+# Deterministic UUID v5 derived from the site URL below (NAMESPACE_URL),
+# not from CONTRIBUTOR_NAME
+CONTRIBUTOR_UUID = str(uuid.uuid5(uuid.NAMESPACE_URL, "https://theairtraffic.com"))
+
+# Citation: source spreadsheet URL, stored on every entry as tag "citation_0"
+CITATION = "https://docs.google.com/spreadsheets/d/1JHhfJBnJPNBA6TgiSHjkXFkHBdVTTz_nXxaUDRWcHpk"
+
+
+def looks_like_military_serial(reg: str) -> bool:
+    """
+    Detect military-style serials like 92-9000, 82-8000, 98-0001
+    or pure numeric IDs like 929000, 828000, 980001.
+    These aren't standard civil registrations; use openairframes_id.
+    """
+    # Pattern: NN-NNNN
+    if re.match(r'^\d{2}-\d{4}$', reg):
+        return True
+    # Pure 6-digit numbers (likely ICAO hex or military mode-S)
+    if re.match(r'^\d{6}$', reg):
+        return True
+    # Short numeric-only (1-5 digits) like "01", "02", "676"
+    if re.match(r'^\d{1,5}$', reg):
+        return True
+    return False
+
+
+def normalize_reg(raw: str) -> str:
+    """Clean up a registration string."""
+    reg = raw.strip().rstrip(',').strip()
+    # Remove carriage returns and other whitespace
+    reg = reg.replace('\r', '').replace('\n', '').strip()
+    return reg
+
+
+def parse_regs(cell_value: str) -> list[str]:
+    """
+    Parse a cell that may contain one or many registrations,
+    separated by commas, possibly wrapped in quotes.
+    """
+    if not cell_value or not cell_value.strip():
+        return []
+
+    # Some cells have ADS-B exchange URLs – skip those
+    if 'globe.adsbexchange.com' in cell_value:
+        return []
+    if cell_value.strip() in ('.', ',', ''):
+        return []
+
+    results = []
+    # Split on comma
+    parts = cell_value.split(',')
+    for part in parts:
+        reg = normalize_reg(part)
+        if not reg:
+            continue
+        # Skip URLs, section labels, etc.
+        if reg.startswith('http') or reg.startswith('Link') or reg == 'Section 1':
+            continue
+        # Skip if it's just whitespace or dots
+        if reg in ('.', '..', '...'):
+            continue
+        results.append(reg)
+    return results
+
+
+def make_submission(
+    reg: str,
+    owner: str,
+    category_tags: dict[str, str],
+) -> dict:
+    """Build a single community_submission.v1 object."""
+
+    entry: dict = {}
+
+    # Decide identifier field
+    if looks_like_military_serial(reg):
+        entry["openairframes_id"] = reg
+    else:
+        entry["registration_number"] = reg
+
+    # Tags
+    tags: dict = {
+        "citation_0": CITATION,
+    }
+    if owner:
+        tags["owner"] = owner.strip()
+    tags.update(category_tags)
+    entry["tags"] = tags
+
+    return entry
+
+
+def main():
+    csv_path = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(
+        "/Users/jonahgoode/Downloads/TheAirTraffic Database - Aircraft 2.csv"
+    )
+
+    if not csv_path.exists():
+        print(f"ERROR: CSV not found at {csv_path}", file=sys.stderr)
+        sys.exit(1)
+
+    # Read CSV
+    with open(csv_path, 'r', encoding='utf-8-sig') as f:
+        reader = csv.reader(f)
+        rows = list(reader)
+
+    print(f"Read {len(rows)} rows from {csv_path.name}")
+
+    date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
+
+    submissions: list[dict] = []
+    seen: set[tuple] = set()  # (reg, owner) dedup
+
+    for row_idx in range(DATA_START_ROW, len(rows)):
+        row = rows[row_idx]
+        if len(row) < 3:
+            continue
+
+        for name_col, reg_col, cat_tags in CATEGORY_COLUMNS:
+            if reg_col >= len(row) or name_col >= len(row):
+                continue
+
+            owner_raw = row[name_col].strip().rstrip(',').strip()
+            reg_raw = row[reg_col]
+
+            # Clean owner name
+            owner = owner_raw.replace('\r', '').replace('\n', '').strip()
+            if not owner or owner in ('.', ',', 'Section 1'):
+                continue
+            # Skip header-like values
+            if owner.startswith('http') or owner.startswith('Link '):
+                continue
+
+            regs = parse_regs(reg_raw)
+            if not regs:
+                # For Russia & Ukraine, try the next column too (col 35 might have old reg, col 36 new)
+                if name_col == 34 and reg_col + 1 < len(row):
+                    regs = parse_regs(row[reg_col + 1])
+
+            for reg in regs:
+                key = (reg, owner)
+                if key in seen:
+                    continue
+                seen.add(key)
+                submissions.append(make_submission(reg, owner, cat_tags))
+
+    print(f"Generated {len(submissions)} submissions")
+
+    # Write output
+    proj_root = Path(__file__).resolve().parent.parent
+    out_dir = proj_root / "community" / date_str
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    out_file = out_dir / f"theairtraffic_{date_str}.json"
+
+    with open(out_file, 'w', encoding='utf-8') as f:
+        json.dump(submissions, f, indent=2, ensure_ascii=False)
+
+    print(f"Written to {out_file}")
+    print(f"Sample entry:\n{json.dumps(submissions[0], indent=2)}")
+
+    # Quick stats
+    cats = {}
+    for s in submissions:
+        c = s['tags'].get('owner_category_0', 'NONE')
+        cats[c] = cats.get(c, 0) + 1
+    print("\nCategory breakdown:")
+    for c, n in sorted(cats.items(), key=lambda x: -x[1]):
+        print(f"  {c}: {n}")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+"""Validate the generated theairtraffic JSON output."""
+import json
+import glob
+import sys
+
+# Find the latest output
+files = sorted(glob.glob("community/2026-02-*/theairtraffic_*.json"))
+if not files:
+    print("No output files found!")
+    sys.exit(1)
+
+path = files[-1]
+print(f"Validating: {path}")
+
+with open(path) as f:
+    data = json.load(f)
+
+print(f"Total entries: {len(data)}")
+
+# Check military serial handling
+mil = [d for d in data if "openairframes_id" in d]
+print(f"\nEntries using openairframes_id: {len(mil)}")
+for m in mil[:10]:
+    print(f"  {m['openairframes_id']} -> owner: {m['tags'].get('owner','?')}")
+
+# Check youtuber entries
+yt = [d for d in data if d["tags"].get("owner_category_0") == "youtuber"]
+print(f"\nYouTuber entries: {len(yt)}")
+for y in yt[:5]:
+    reg = y.get("registration_number", y.get("openairframes_id"))
+    c0 = y["tags"].get("owner_category_0")
+    c1 = y["tags"].get("owner_category_1")
+    print(f"  {reg} -> owner: {y['tags']['owner']}, cat0: {c0}, cat1: {c1}")
+
+# Check US Govt / military
+gov = [d for d in data if d["tags"].get("owner") == "United States of America 747/757"]
+print(f"\nUSA 747/757 entries: {len(gov)}")
+for g in gov:
+    oid = g.get("openairframes_id", g.get("registration_number"))
+    print(f"  {oid}")
+
+# Schema validation
+issues = 0
+for i, d in enumerate(data):
+    has_id = any(k in d for k in ["registration_number", "transponder_code_hex", "openairframes_id"])
+    if not has_id:
+        print(f"  Entry {i}: no identifier!")
+        issues += 1
+    if "tags" not in d:
+        print(f"  Entry {i}: no tags!")
+        issues += 1
+    # Check tag key format
+    for k in d.get("tags", {}):
+        import re
+        if not re.match(r"^[a-z][a-z0-9_]{0,63}$", k):
+            print(f"  Entry {i}: invalid tag key '{k}'")
+            issues += 1
+
+print(f"\nSchema issues: {issues}")
+
+# Category breakdown
+cats = {}
+for s in data:
+    c = s["tags"].get("owner_category_0", "NONE")
+    cats[c] = cats.get(c, 0) + 1
+print("\nCategory breakdown:")
+for c, n in sorted(cats.items(), key=lambda x: -x[1]):
+    print(f"  {c}: {n}")
@@ -1,11 +0,0 @@
-# Reducer image: installs the reducer requirements and runs reducer.py.
-# The base image platform is pinned to linux/arm64.
-FROM --platform=linux/arm64 python:3.12-slim
-
-WORKDIR /app
-
-# Dependencies are installed before the application files are copied.
-COPY requirements.reducer.txt requirements.txt
-RUN pip install --no-cache-dir -r requirements.txt
-
-# Application sources, copied into /app.
-COPY compress_adsb_to_aircraft_data.py .
-COPY reducer.py .
-
-# -u runs Python with unbuffered stdout/stderr.
-CMD ["python", "-u", "reducer.py"]
@@ -1,12 +0,0 @@
-# Worker image: installs the worker requirements and runs worker.py.
-# The base image platform is pinned to linux/arm64.
-FROM --platform=linux/arm64 python:3.12-slim
-
-WORKDIR /app
-
-# Dependencies are installed before the application files are copied.
-COPY requirements.worker.txt requirements.txt
-RUN pip install --no-cache-dir -r requirements.txt
-
-# Application sources, copied into /app.
-COPY compress_adsb_to_aircraft_data.py .
-COPY download_adsb_data_to_parquet.py .
-COPY worker.py .
-
-# -u runs Python with unbuffered stdout/stderr.
-CMD ["python", "-u", "worker.py"]
@@ -1,250 +0,0 @@
-"""
-Combines chunk parquet files and compresses to final aircraft CSV.
-This is the reduce phase of the map-reduce pipeline.
-
-Supports both single-day (daily) and multi-day (historical) modes.
-
-Memory-efficient: processes each chunk separately, compresses, then combines.
-
-Usage:
-    # Daily mode
-    python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks
-    
-    # Historical mode
-    python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks --start-date 2024-01-01 --end-date 2024-01-07 --skip-base
-"""
-import gc
-import os
-import sys
-import glob
-import argparse
-from datetime import datetime, timedelta
-
-import polars as pl
-
-from src.adsb.download_adsb_data_to_parquet import OUTPUT_DIR, get_resource_usage
-from src.adsb.compress_adsb_to_aircraft_data import compress_multi_icao_df, COLUMNS
-
-
-DEFAULT_CHUNK_DIR = os.path.join(OUTPUT_DIR, "adsb_chunks")
-FINAL_OUTPUT_DIR = "./data/planequery_aircraft"
-os.makedirs(FINAL_OUTPUT_DIR, exist_ok=True)
-
-
-def get_target_day() -> datetime:
-    """Get yesterday's date (the day we're processing)."""
-    return datetime.utcnow() - timedelta(days=1)
-
-
-def process_single_chunk(chunk_path: str, delete_after_load: bool = False) -> pl.DataFrame:
-    """Load and compress a single chunk parquet file.
-    
-    Args:
-        chunk_path: Path to parquet file
-        delete_after_load: If True, delete the parquet file after loading to free disk space
-    """
-    print(f"Processing {os.path.basename(chunk_path)}... | {get_resource_usage()}")
-    
-    # Load chunk - only columns we need
-    needed_columns = ['time', 'icao'] + COLUMNS
-    df = pl.read_parquet(chunk_path, columns=needed_columns)
-    print(f"  Loaded {len(df)} rows")
-    
-    # Delete file immediately after loading to free disk space
-    if delete_after_load:
-        try:
-            os.remove(chunk_path)
-            print(f"  Deleted {chunk_path} to free disk space")
-        except Exception as e:
-            print(f"  Warning: Failed to delete {chunk_path}: {e}")
-    
-    # Compress to aircraft records (one per ICAO) using shared function
-    compressed = compress_multi_icao_df(df, verbose=True)
-    print(f"  Compressed to {len(compressed)} aircraft records")
-    
-    del df
-    gc.collect()
-    
-    return compressed
-
-
-def combine_compressed_chunks(compressed_dfs: list[pl.DataFrame]) -> pl.DataFrame:
-    """Combine multiple compressed DataFrames.
-    
-    Since chunks are partitioned by ICAO hash, each ICAO only appears in one chunk.
-    No deduplication needed here - just concatenate.
-    """
-    print(f"Combining {len(compressed_dfs)} compressed chunks... | {get_resource_usage()}")
-    
-    # Concat all
-    combined = pl.concat(compressed_dfs)
-    print(f"Combined: {len(combined)} records")
-    
-    return combined
-
-
-def download_and_merge_base_release(compressed_df: pl.DataFrame) -> pl.DataFrame:
-    """Download base release and merge with new data.
-
-    Args:
-        compressed_df: Newly compressed aircraft records.
-
-    Returns:
-        Base release and new records concatenated and deduplicated on
-        ``icao`` (the last occurrence wins, i.e. the new data). When no base
-        file is available or any step raises, ``compressed_df`` is returned
-        instead of propagating the error.
-    """
-    # Imported here rather than at module level.
-    from src.get_latest_planequery_aircraft_release import download_latest_aircraft_adsb_csv
-    
-    print("Downloading base ADS-B release...")
-    try:
-        base_path = download_latest_aircraft_adsb_csv(
-            output_dir="./data/planequery_aircraft_base"
-        )
-        print(f"Download returned: {base_path}")
-        
-        if base_path and os.path.exists(str(base_path)):
-            print(f"Loading base release from {base_path}")
-            base_df = pl.read_csv(base_path)
-            print(f"Base release has {len(base_df)} records")
-            
-            # Ensure columns match
-            base_cols = set(base_df.columns)
-            new_cols = set(compressed_df.columns)
-            print(f"Base columns: {sorted(base_cols)}")
-            print(f"New columns: {sorted(new_cols)}")
-            
-            # Add missing columns
-            # NOTE(review): the filler columns are built from pl.lit(None);
-            # confirm their dtype is compatible with the real column on the
-            # other side when the frames are concatenated below.
-            for col in new_cols - base_cols:
-                base_df = base_df.with_columns(pl.lit(None).alias(col))
-            for col in base_cols - new_cols:
-                compressed_df = compressed_df.with_columns(pl.lit(None).alias(col))
-            
-            # Reorder columns to match
-            compressed_df = compressed_df.select(base_df.columns)
-            
-            # Concat and deduplicate by icao (keep new data - it comes last)
-            combined = pl.concat([base_df, compressed_df])
-            print(f"After concat: {len(combined)} records")
-            
-            deduplicated = combined.unique(subset=["icao"], keep="last")
-            
-            print(f"Combined with base: {len(combined)} -> {len(deduplicated)} after dedup")
-            
-            del base_df, combined
-            gc.collect()
-            
-            return deduplicated
-        else:
-            print(f"No base release found at {base_path}, using only new data")
-            return compressed_df
-    except Exception as e:
-        # Best effort: any failure falls back to the new data only.
-        # compressed_df may already have been rebound above, so the returned
-        # frame can carry the added filler columns / base column order.
-        import traceback
-        print(f"Failed to download base release: {e}")
-        traceback.print_exc()
-        return compressed_df
-
-
-def cleanup_chunks(output_id: str, chunks_dir: str):
-    """Delete chunk parquet files after successful merge."""
-    pattern = os.path.join(chunks_dir, f"chunk_*_{output_id}.parquet")
-    chunk_files = glob.glob(pattern)
-    for f in chunk_files:
-        try:
-            os.remove(f)
-            print(f"Deleted {f}")
-        except Exception as e:
-            print(f"Failed to delete {f}: {e}")
-
-
-def find_chunk_files(chunks_dir: str, output_id: str) -> list[str]:
-    """Find chunk parquet files matching the output ID."""
-    pattern = os.path.join(chunks_dir, f"chunk_*_{output_id}.parquet")
-    chunk_files = sorted(glob.glob(pattern))
-    
-    if not chunk_files:
-        # Try recursive search for historical mode with merged artifacts
-        pattern = os.path.join(chunks_dir, "**", "*.parquet")
-        chunk_files = sorted(glob.glob(pattern, recursive=True))
-    
-    return chunk_files
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Combine chunk parquets to final CSV")
-    parser.add_argument("--date", type=str, help="Single date in YYYY-MM-DD format (default: yesterday)")
-    parser.add_argument("--start-date", type=str, help="Start date for range (YYYY-MM-DD)")
-    parser.add_argument("--end-date", type=str, help="End date for range (YYYY-MM-DD)")
-    parser.add_argument("--chunks-dir", type=str, default=DEFAULT_CHUNK_DIR, help="Directory containing chunk parquet files")
-    parser.add_argument("--skip-base", action="store_true", help="Skip downloading and merging base release")
-    parser.add_argument("--keep-chunks", action="store_true", help="Keep chunk files after merging")
-    parser.add_argument("--stream", action="store_true", help="Delete parquet files immediately after loading to save disk space")
-    args = parser.parse_args()
-    
-    # Determine output ID and filename based on mode
-    if args.start_date and args.end_date:
-        # Historical mode
-        output_id = f"{args.start_date}_{args.end_date}"
-        output_filename = f"planequery_aircraft_adsb_{args.start_date}_{args.end_date}.csv"
-        print(f"Combining chunks for date range: {args.start_date} to {args.end_date}")
-    else:
-        # Daily mode
-        if args.date:
-            target_day = datetime.strptime(args.date, "%Y-%m-%d")
-        else:
-            target_day = get_target_day()
-        
-        date_str = target_day.strftime("%Y-%m-%d")
-        output_id = date_str
-        output_filename = f"planequery_aircraft_adsb_{date_str}.csv"
-        print(f"Combining chunks for {date_str}")
-    
-    chunks_dir = args.chunks_dir
-    print(f"Chunks directory: {chunks_dir}")
-    print(f"Resource usage at start: {get_resource_usage()}")
-    
-    # Find chunk files
-    chunk_files = find_chunk_files(chunks_dir, output_id)
-    
-    if not chunk_files:
-        print(f"No chunk files found in: {chunks_dir}")
-        sys.exit(1)
-    
-    print(f"Found {len(chunk_files)} chunk files")
-    
-    # Process each chunk separately to save memory
-    # With --stream, delete parquet files immediately after loading to save disk space
-    compressed_chunks = []
-    for chunk_path in chunk_files:
-        compressed = process_single_chunk(chunk_path, delete_after_load=args.stream)
-        compressed_chunks.append(compressed)
-        gc.collect()
-    
-    # Combine all compressed chunks
-    combined = combine_compressed_chunks(compressed_chunks)
-    
-    # Free memory from individual chunks
-    del compressed_chunks
-    gc.collect()
-    print(f"After combining: {get_resource_usage()}")
-    
-    # Merge with base release (unless skipped)
-    if not args.skip_base:
-        combined = download_and_merge_base_release(combined)
-    
-    # Convert list columns to strings for CSV compatibility
-    for col in combined.columns:
-        if combined[col].dtype == pl.List:
-            combined = combined.with_columns(
-                pl.col(col).list.join(",").alias(col)
-            )
-    
-    # Sort by time for consistent output
-    if 'time' in combined.columns:
-        combined = combined.sort('time')
-    
-    # Write final CSV
-    output_path = os.path.join(FINAL_OUTPUT_DIR, output_filename)
-    combined.write_csv(output_path)
-    print(f"Wrote {len(combined)} records to {output_path}")
-    
-    # Cleanup
-    if not args.keep_chunks:
-        cleanup_chunks(output_id, chunks_dir)
-    
-    print(f"Done! | {get_resource_usage()}")
-
-
-if __name__ == "__main__":
-    main()
@@ -5,23 +5,6 @@ import polars as pl
 COLUMNS = ['dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category', 'r', 't']


-def deduplicate_by_signature(df: pl.DataFrame) -> pl.DataFrame:
-    """For each icao, keep only the earliest row with each unique signature.
-    
-    This is used for deduplicating across multiple compressed chunks.
-    """
-    # Create signature column
-    df = df.with_columns(
-        pl.concat_str([pl.col(c).cast(pl.Utf8).fill_null("") for c in COLUMNS], separator="|").alias("_signature")
-    )
-    # Group by icao and signature, take first row (earliest due to time sort)
-    df = df.sort("time")
-    df_deduped = df.group_by(["icao", "_signature"]).first()
-    df_deduped = df_deduped.drop("_signature")
-    df_deduped = df_deduped.sort("time")
-    return df_deduped
-
-
 def compress_df_polars(df: pl.DataFrame, icao: str) -> pl.DataFrame:
    """Compress a single ICAO group to its most informative row using Polars."""
    # Create signature string
@@ -99,9 +82,6 @@ def compress_df_polars(df: pl.DataFrame, icao: str) -> pl.DataFrame:
 def compress_multi_icao_df(df: pl.DataFrame, verbose: bool = True) -> pl.DataFrame:
    """Compress a DataFrame with multiple ICAOs to one row per ICAO.
    
-    This is the main entry point for compressing ADS-B data.
-    Used by both daily GitHub Actions runs and historical AWS runs.
-    
    Args:
        df: DataFrame with columns ['time', 'icao'] + COLUMNS
        verbose: Whether to print progress
@@ -120,29 +100,27 @@ def compress_multi_icao_df(df: pl.DataFrame, verbose: bool = True) -> pl.DataFra
        if col in df.columns:
            df = df.with_columns(pl.col(col).cast(pl.Utf8).fill_null(""))
    
-    # First pass: quick deduplication of exact duplicates
+    # Quick deduplication of exact duplicates
    df = df.unique(subset=['icao'] + COLUMNS, keep='first')
    if verbose:
        print(f"After quick dedup: {df.height} records")
    
-    # Second pass: sophisticated compression per ICAO
+    # Compress per ICAO
    if verbose:
        print("Compressing per ICAO...")
    
-    # Process each ICAO group
    icao_groups = df.partition_by('icao', as_dict=True, maintain_order=True)
    compressed_dfs = []
    
    for icao_key, group_df in icao_groups.items():
-        # partition_by with as_dict=True returns tuple keys, extract first element
-        icao = icao_key[0] if isinstance(icao_key, tuple) else icao_key
+        icao = icao_key[0]
        compressed = compress_df_polars(group_df, str(icao))
        compressed_dfs.append(compressed)
    
    if compressed_dfs:
        df_compressed = pl.concat(compressed_dfs)
    else:
-        df_compressed = df.head(0)  # Empty with same schema
+        df_compressed = df.head(0)
    
    if verbose:
        print(f"After compress: {df_compressed.height} records")
@@ -155,45 +133,22 @@ def compress_multi_icao_df(df: pl.DataFrame, verbose: bool = True) -> pl.DataFra
    return df_compressed


-def load_raw_adsb_for_day(day):
-    """Load raw ADS-B data for a day from parquet file."""
-    from datetime import timedelta
+def load_parquet_part(part_id: int, date: str) -> pl.DataFrame:
+    """Load a single parquet part file for a date.
+    
+    Args:
+        part_id: Part ID (e.g., 1, 2, 3)
+        date: Date string in YYYY-MM-DD format
+    
+    Returns:
+        DataFrame with ADS-B data
+    """
    from pathlib import Path
    
-    start_time = day.replace(hour=0, minute=0, second=0, microsecond=0)
-    
-    # Check for parquet file first
-    version_date = f"v{start_time.strftime('%Y.%m.%d')}"
-    parquet_file = Path(f"data/output/parquet_output/{version_date}.parquet")
+    parquet_file = Path(f"data/output/parquet_output/part_{part_id}_{date}.parquet")
    
    if not parquet_file.exists():
-        # Try to generate parquet file by calling the download function
-        print(f"  Parquet file not found: {parquet_file}")
-        print(f"  Attempting to download and generate parquet for {start_time.strftime('%Y-%m-%d')}...")
-        
-        from download_adsb_data_to_parquet import create_parquet_for_day
-        result_path = create_parquet_for_day(start_time, keep_folders=False)
-        
-        if result_path:
-            print(f"  Successfully generated parquet file: {result_path}")
-        else:
-            raise Exception("Failed to generate parquet file")
-    
-    if parquet_file.exists():
-        print(f"  Loading from parquet: {parquet_file}")
-        df = pl.read_parquet(
-            parquet_file, 
-            columns=['time', 'icao', 'r', 't', 'dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category']
-        )
-        
-        # Convert to timezone-naive datetime
-        if df["time"].dtype == pl.Datetime:
-            df = df.with_columns(pl.col("time").dt.replace_time_zone(None))
-        
-        return df
-    else:
-        # Return empty DataFrame if parquet file doesn't exist
-        print(f"  No data available for {start_time.strftime('%Y-%m-%d')}")
+        print(f"Parquet file not found: {parquet_file}")
        return pl.DataFrame(schema={
            'time': pl.Datetime,
            'icao': pl.Utf8,
@@ -205,17 +160,33 @@ def load_raw_adsb_for_day(day):
            'desc': pl.Utf8,
            'aircraft_category': pl.Utf8
        })
+    
+    print(f"Loading from parquet: {parquet_file}")
+    df = pl.read_parquet(
+        parquet_file,
+        columns=['time', 'icao', 'r', 't', 'dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category']
+    )
+    
+    # Convert to timezone-naive datetime
+    if df["time"].dtype == pl.Datetime:
+        df = df.with_columns(pl.col("time").dt.replace_time_zone(None))
+    os.remove(parquet_file)
+    return df


-def load_historical_for_day(day):
-    """Load and compress historical ADS-B data for a day."""
-    df = load_raw_adsb_for_day(day)
+def compress_parquet_part(part_id: int, date: str) -> pl.DataFrame:
+    """Load and compress a single parquet part file."""
+    df = load_parquet_part(part_id, date)
+    
    if df.height == 0:
        return df
+
+    # Filter to rows within the given date (UTC-naive). This is because sometimes adsb.lol export can have rows at 00:00:00 of next day or similar.
+    date_lit = pl.lit(date).str.strptime(pl.Date, "%Y-%m-%d")
+    df = df.filter(pl.col("time").dt.date() == date_lit)
    
-    print(f"Loaded {df.height} raw records for {day.strftime('%Y-%m-%d')}")
+    print(f"Loaded {df.height} raw records for part {part_id}, date {date}")
    
-    # Use shared compression function
    return compress_multi_icao_df(df, verbose=True)


@@ -223,52 +194,4 @@ def concat_compressed_dfs(df_base, df_new):
    """Concatenate base and new compressed dataframes, keeping the most informative row per ICAO."""
    # Combine both dataframes
    df_combined = pl.concat([df_base, df_new])
-    
-    # Sort by ICAO and time
-    df_combined = df_combined.sort(['icao', 'time'])
-    
-    # Fill null values
-    for col in COLUMNS:
-        if col in df_combined.columns:
-            df_combined = df_combined.with_columns(pl.col(col).fill_null(""))
-    
-    # Apply compression logic per ICAO to get the best row
-    icao_groups = df_combined.partition_by('icao', as_dict=True, maintain_order=True)
-    compressed_dfs = []
-    
-    for icao, group_df in icao_groups.items():
-        compressed = compress_df_polars(group_df, icao)
-        compressed_dfs.append(compressed)
-    
-    if compressed_dfs:
-        df_compressed = pl.concat(compressed_dfs)
-    else:
-        df_compressed = df_combined.head(0)
-    
-    # Sort by time
-    df_compressed = df_compressed.sort('time')
-    
-    return df_compressed
-
-
-def get_latest_aircraft_adsb_csv_df():
-    """Download and load the latest ADS-B CSV from GitHub releases."""
-    from get_latest_planequery_aircraft_release import download_latest_aircraft_adsb_csv
-    import re
-    
-    csv_path = download_latest_aircraft_adsb_csv()
-    df = pl.read_csv(csv_path, null_values=[""])
-    
-    # Fill nulls with empty strings
-    for col in df.columns:
-        if df[col].dtype == pl.Utf8:
-            df = df.with_columns(pl.col(col).fill_null(""))
-    
-    # Extract start date from filename pattern: planequery_aircraft_adsb_{start_date}_{end_date}.csv
-    match = re.search(r"planequery_aircraft_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path))
-    if not match:
-        raise ValueError(f"Could not extract date from filename: {csv_path.name}")
-    
-    date_str = match.group(1)
-    return df, date_str
-
+    return df_combined
@@ -0,0 +1,67 @@
+from pathlib import Path
+import polars as pl
+import argparse
+import os
+OUTPUT_DIR = Path("./data/output")
+CORRECT_ORDER_OF_COLUMNS = ["time", "icao", "r", "t", "dbFlags", "ownOp", "year", "desc", "aircraft_category"]
+
+def main():
+    """Merge one day's compressed parquet parts; optionally combine them with the latest released CSV."""
+    parser = argparse.ArgumentParser(description="Concatenate compressed parquet files for a single day")
+    parser.add_argument("--date", type=str, required=True, help="Date in YYYY-MM-DD format")
+    parser.add_argument("--concat_with_latest_csv", action="store_true", help="Whether to also concatenate with the latest CSV from GitHub releases")
+    args = parser.parse_args()
+
+    date_dir = OUTPUT_DIR / "compressed" / args.date
+    parquet_files = sorted(date_dir.glob("*.parquet"))
+    df = None  # stays None when the day has no compressed parts
+    if not parquet_files:
+        print(f"No parquet files found in {date_dir}")
+    else:
+        frames = [pl.read_parquet(p) for p in parquet_files]
+        df = pl.concat(frames, how="vertical", rechunk=True)
+        df = df.sort(["time", "icao"]).select(CORRECT_ORDER_OF_COLUMNS)
+
+        output_path = OUTPUT_DIR / f"openairframes_adsb_{args.date}.parquet"
+        print(f"Writing combined parquet to {output_path} with {df.height} rows")
+        df.write_parquet(output_path)
+
+        csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{args.date}.csv.gz"
+        print(f"Writing combined csv.gz to {csv_output_path} with {df.height} rows")
+        df.write_csv(csv_output_path, compression="gzip")
+
+    if args.concat_with_latest_csv:
+        print("Loading latest CSV from GitHub releases to concatenate with...")
+        from src.get_latest_release import get_latest_aircraft_adsb_csv_df
+        from datetime import datetime
+
+        df_latest_csv, csv_start_date, csv_end_date = get_latest_aircraft_adsb_csv_df()
+        # Both branches below write the release data in the canonical column order.
+        df_latest_csv = df_latest_csv.select(CORRECT_ORDER_OF_COLUMNS)
+        os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+        # Parse both dates so a malformed value fails here rather than being compared as text.
+        csv_end_dt = datetime.strptime(csv_end_date, "%Y-%m-%d")
+        args_dt = datetime.strptime(args.date, "%Y-%m-%d")
+
+        # NOTE(review): the earlier comment called csv_end_date exclusive and described a `>` test,
+        # but the code compares with `>=`; the comparison is kept as-is - confirm which is intended.
+        if df is None or csv_end_dt >= args_dt:
+            if df is None:
+                print(f"No new data for {args.date}; nothing to concatenate with the latest CSV")
+            else:
+                print(f"Latest CSV already includes data through {args.date} (end_date={csv_end_date} is exclusive)")
+            print("Writing latest CSV directly without concatenation to avoid duplicates")
+            final_csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{csv_start_date}_{csv_end_date}.csv.gz"
+            df_latest_csv.write_csv(final_csv_output_path, compression="gzip")
+        else:
+            print(f"Concatenating latest CSV (through {csv_end_date}) with new data ({args.date})")
+            from src.adsb.compress_adsb_to_aircraft_data import concat_compressed_dfs
+            df_final = concat_compressed_dfs(df_latest_csv, df)
+            df_final = df_final.select(CORRECT_ORDER_OF_COLUMNS)
+            final_csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{csv_start_date}_{args.date}.csv.gz"
+            df_final.write_csv(final_csv_output_path, compression="gzip")
+        print(f"Final CSV written to {final_csv_output_path}")
+
+if __name__ == "__main__":
+    main()
@@ -1,42 +1,33 @@
 """
 Downloads adsb.lol data and writes to Parquet files.

-Usage:
-    python -m src.process_historical_adsb_data.download_to_parquet 2025-01-01 2025-01-02
-
-This will download trace data for the specified date range and output Parquet files.
-
-This file is self-contained and does not import from other project modules.
+This file contains utility functions for downloading and processing adsb.lol trace data.
+Used by the historical ADS-B processing pipeline.
 """
-import gc
-import glob
+import datetime as dt
 import gzip
+import os
+import re
 import resource
 import shutil
-import sys
-import logging
-import time
-import re
 import signal
-import concurrent.futures
 import subprocess
-import os
-import argparse
-import datetime as dt
-from datetime import datetime, timedelta, timezone
-import urllib.request
+import sys
 import urllib.error
-
+import urllib.request
+from datetime import datetime
+import time
 import orjson
 import pyarrow as pa
 import pyarrow.parquet as pq
+from pathlib import Path


 # ============================================================================
 # Configuration
 # ============================================================================

-OUTPUT_DIR = "./data/output"
+OUTPUT_DIR = Path("./data/output")
 os.makedirs(OUTPUT_DIR, exist_ok=True)

 PARQUET_DIR = os.path.join(OUTPUT_DIR, "parquet_output")
@@ -76,19 +67,16 @@ def timeout_handler(signum, frame):
    raise DownloadTimeoutException("Download timed out after 40 seconds")


-def fetch_releases(version_date: str) -> list:
-    """Fetch GitHub releases for a given version date from adsblol."""
-    year = version_date.split('.')[0][1:]
-    if version_date == "v2024.12.31":
-        year = "2025"
+def _fetch_releases_from_repo(year: str, version_date: str) -> list:
+    """Fetch GitHub releases for a given version date from a specific year's adsblol repo."""
    BASE_URL = f"https://api.github.com/repos/adsblol/globe_history_{year}/releases"
-    PATTERN = f"{version_date}-planes-readsb-prod-0"
+    PATTERN = rf"^{re.escape(version_date)}-planes-readsb-prod-\d+(tmp)?$"
    releases = []
    page = 1
    
    while True:
        max_retries = 10
-        retry_delay = 60
+        retry_delay = 60*5
        
        for attempt in range(1, max_retries + 1):
            try:
@@ -100,7 +88,7 @@ def fetch_releases(version_date: str) -> list:
                    else:
                        print(f"Failed to fetch releases (attempt {attempt}/{max_retries}): {response.status} {response.reason}")
                        if attempt < max_retries:
-                            print(f"Waiting {retry_delay} seconds before retry...")
+                            print(f"Waiting {retry_delay} seconds before retry")
                            time.sleep(retry_delay)
                        else:
                            print(f"Giving up after {max_retries} attempts")
@@ -108,7 +96,7 @@ def fetch_releases(version_date: str) -> list:
            except Exception as e:
                print(f"Request exception (attempt {attempt}/{max_retries}): {e}")
                if attempt < max_retries:
-                    print(f"Waiting {retry_delay} seconds before retry...")
+                    print(f"Waiting {retry_delay} seconds before retry")
                    time.sleep(retry_delay)
                else:
                    print(f"Giving up after {max_retries} attempts")
@@ -122,41 +110,118 @@ def fetch_releases(version_date: str) -> list:
    return releases


-def download_asset(asset_url: str, file_path: str) -> bool:
-    """Download a single release asset."""
+def fetch_releases(version_date: str) -> list:
+    """Look up the adsblol GitHub releases published for ``version_date``.
+
+    The repo year is the tag's first dot-separated field minus its leading
+    character. When a tag ending in ".12.31" yields nothing in that repo,
+    the following year's repo is queried as well.
+    """
+    repo_year = version_date.split('.')[0][1:]
+    found = _fetch_releases_from_repo(repo_year, version_date)
+    if found or not version_date.endswith(".12.31"):
+        return found
+
+    # adsblol sometimes publishes Dec 31 data in the following year's
+    # repository, so fall back to that repo before giving up.
+    following_year = str(int(repo_year) + 1)
+    print(f"No releases found for {version_date} in {repo_year} repo, checking {following_year} repo")
+    return _fetch_releases_from_repo(following_year, version_date)
+
+
+def download_asset(asset_url: str, file_path: str, expected_size: int | None = None) -> bool:
+    """Download a single release asset with size verification.
+    
+    Args:
+        asset_url: URL to download from
+        file_path: Local path to save to
+        expected_size: Expected file size in bytes (for verification)
+    
+    Returns:
+        True if download succeeded and size matches (if provided), False otherwise
+    """
    os.makedirs(os.path.dirname(file_path) or OUTPUT_DIR, exist_ok=True)
    
+    # Check if file exists and has correct size
    if os.path.exists(file_path):
-        print(f"[SKIP] {file_path} already downloaded.")
-        return True
-    
-    print(f"Downloading {asset_url}...")
-    try:
-        signal.signal(signal.SIGALRM, timeout_handler)
-        signal.alarm(40)  # 40-second timeout
-        
-        req = urllib.request.Request(asset_url, headers=HEADERS)
-        with urllib.request.urlopen(req) as response:
-            signal.alarm(0)
-            
-            if response.status == 200:
-                with open(file_path, "wb") as file:
-                    while True:
-                        chunk = response.read(8192)
-                        if not chunk:
-                            break
-                        file.write(chunk)
-                print(f"Saved {file_path}")
+        if expected_size is not None:
+            actual_size = os.path.getsize(file_path)
+            if actual_size == expected_size:
+                print(f"[SKIP] {file_path} already downloaded and verified ({actual_size} bytes).")
                return True
            else:
-                print(f"Failed to download {asset_url}: {response.status} {response.msg}")
+                print(f"[WARN] {file_path} exists but size mismatch (expected {expected_size}, got {actual_size}). Re-downloading.")
+                os.remove(file_path)
+        else:
+            print(f"[SKIP] {file_path} already downloaded.")
+            return True
+    
+    max_retries = 2
+    retry_delay = 30
+    timeout_seconds = 140
+    
+    for attempt in range(1, max_retries + 1):
+        print(f"Downloading {asset_url} (attempt {attempt}/{max_retries})")
+        try:
+            req = urllib.request.Request(asset_url, headers=HEADERS)
+            with urllib.request.urlopen(req, timeout=timeout_seconds) as response:
+                if response.status == 200:
+                    with open(file_path, "wb") as file:
+                        while True:
+                            chunk = response.read(8192)
+                            if not chunk:
+                                break
+                            file.write(chunk)
+                    
+                    # Verify file size if expected_size was provided
+                    if expected_size is not None:
+                        actual_size = os.path.getsize(file_path)
+                        if actual_size != expected_size:
+                            print(f"[ERROR] Size mismatch for {file_path}: expected {expected_size} bytes, got {actual_size} bytes")
+                            os.remove(file_path)
+                            if attempt < max_retries:
+                                print(f"Waiting {retry_delay} seconds before retry")
+                                time.sleep(retry_delay)
+                                continue
+                            return False
+                        print(f"Saved {file_path} ({actual_size} bytes, verified)")
+                    else:
+                        print(f"Saved {file_path}")
+                    return True
+                else:
+                    print(f"Failed to download {asset_url}: {response.status} {response.msg}")
+                    if attempt < max_retries:
+                        print(f"Waiting {retry_delay} seconds before retry")
+                        time.sleep(retry_delay)
+                    else:
+                        return False
+        except urllib.error.HTTPError as e:
+            if e.code == 404:
+                print(f"404 Not Found: {asset_url}")
+                raise Exception(f"Asset not found (404): {asset_url}")
+            else:
+                print(f"HTTP error occurred (attempt {attempt}/{max_retries}): {e.code} {e.reason}")
+                if attempt < max_retries:
+                    print(f"Waiting {retry_delay} seconds before retry")
+                    time.sleep(retry_delay)
+                else:
+                    return False
+        except urllib.error.URLError as e:
+            print(f"URL/Timeout error (attempt {attempt}/{max_retries}): {e}")
+            if attempt < max_retries:
+                print(f"Waiting {retry_delay} seconds before retry")
+                time.sleep(retry_delay)
+            else:
                return False
-    except DownloadTimeoutException as e:
-        print(f"Download aborted for {asset_url}: {e}")
-        return False
-    except Exception as e:
-        print(f"An error occurred while downloading {asset_url}: {e}")
-        return False
+        except Exception as e:
+            print(f"An error occurred (attempt {attempt}/{max_retries}): {e}")
+            if attempt < max_retries:
+                print(f"Waiting {retry_delay} seconds before retry")
+                time.sleep(retry_delay)
+            else:
+                return False
+    
+    return False


 def extract_split_archive(file_paths: list, extract_dir: str) -> bool:
@@ -187,19 +252,40 @@ def extract_split_archive(file_paths: list, extract_dir: str) -> bool:
        cat_proc = subprocess.Popen(
            ["cat"] + file_paths,
            stdout=subprocess.PIPE,
-            stderr=subprocess.DEVNULL
+            stderr=subprocess.PIPE
        )
        tar_cmd = ["tar", "xf", "-", "-C", extract_dir, "--strip-components=1"]
-        subprocess.run(
+        result = subprocess.run(
            tar_cmd,
            stdin=cat_proc.stdout,
-            stdout=subprocess.DEVNULL,
-            stderr=subprocess.DEVNULL,
-            check=True
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
        )
        cat_proc.stdout.close()
+        cat_stderr = cat_proc.stderr.read().decode() if cat_proc.stderr else ""
        cat_proc.wait()
        
+        if cat_stderr:
+            print(f"cat stderr: {cat_stderr}")
+        
+        tar_stderr = result.stderr.decode() if result.stderr else ""
+        if result.returncode != 0:
+            # GNU tar exits non-zero for format issues that BSD tar silently
+            # tolerates (e.g. trailing junk after the last valid entry).
+            # Check whether files were actually extracted before giving up.
+            extracted_items = os.listdir(extract_dir)
+            if extracted_items:
+                print(f"[WARN] tar exited {result.returncode} but extracted "
+                      f"{len(extracted_items)} items — treating as success")
+                if tar_stderr:
+                    print(f"tar stderr: {tar_stderr}")
+            else:
+                print(f"Failed to extract split archive (tar exit {result.returncode})")
+                if tar_stderr:
+                    print(f"tar stderr: {tar_stderr}")
+                shutil.rmtree(extract_dir, ignore_errors=True)
+                return False
+        
        print(f"Successfully extracted archive to {extract_dir}")
        
        # Delete tar files immediately after extraction
@@ -216,8 +302,9 @@ def extract_split_archive(file_paths: list, extract_dir: str) -> bool:
        print(f"Disk space after tar deletion: {free_gb:.1f}GB free")
        
        return True
-    except subprocess.CalledProcessError as e:
+    except Exception as e:
        print(f"Failed to extract split archive: {e}")
+        shutil.rmtree(extract_dir, ignore_errors=True)
        return False


@@ -381,8 +468,6 @@ COLUMNS = [

 OS_CPU_COUNT = os.cpu_count() or 1
 MAX_WORKERS = OS_CPU_COUNT if OS_CPU_COUNT > 4 else 1
-CHUNK_SIZE = MAX_WORKERS * 500  # Reduced for lower RAM usage
-BATCH_SIZE = 250_000  # Fixed size for predictable memory usage (~500MB per batch)

 # PyArrow schema for efficient Parquet writing
 PARQUET_SCHEMA = pa.schema([
@@ -470,211 +555,6 @@ def collect_trace_files_with_find(root_dir):
    return trace_dict


-def generate_version_dates(start_date: str, end_date: str) -> list:
-    """Generate a list of dates from start_date to end_date inclusive."""
-    start = datetime.strptime(start_date, "%Y-%m-%d")
-    end = datetime.strptime(end_date, "%Y-%m-%d")
-    delta = end - start
-    return [start + timedelta(days=i) for i in range(delta.days + 1)]
-
-
-def safe_process(fp):
-    """Safely process a file, returning empty list on error."""
-    try:
-        return process_file(fp)
-    except Exception as e:
-        logging.error(f"Error processing {fp}: {e}")
-        return []
-
-
-def rows_to_arrow_table(rows: list) -> pa.Table:
-    """Convert list of rows to a PyArrow Table directly (no pandas)."""
-    # Transpose rows into columns
-    columns = list(zip(*rows))
-    
-    # Build arrays for each column according to schema
-    arrays = []
-    for i, field in enumerate(PARQUET_SCHEMA):
-        col_data = list(columns[i]) if i < len(columns) else [None] * len(rows)
-        arrays.append(pa.array(col_data, type=field.type))
-    
-    return pa.Table.from_arrays(arrays, schema=PARQUET_SCHEMA)
-
-
-def write_batch_to_parquet(rows: list, version_date: str, batch_idx: int):
-    """Write a batch of rows to a Parquet file."""
-    if not rows:
-        return
-    
-    table = rows_to_arrow_table(rows)
-    
-    parquet_path = os.path.join(PARQUET_DIR, f"{version_date}_batch_{batch_idx:04d}.parquet")
-    
-    pq.write_table(table, parquet_path, compression='snappy')
-    
-    print(f"Written parquet batch {batch_idx} ({len(rows)} rows) | {get_resource_usage()}")
-
-
-def merge_parquet_files(version_date: str, delete_batches: bool = True):
-    """Merge all batch parquet files for a version_date into a single file using streaming."""
-    pattern = os.path.join(PARQUET_DIR, f"{version_date}_batch_*.parquet")
-    batch_files = sorted(glob.glob(pattern))
-    
-    if not batch_files:
-        print(f"No batch files found for {version_date}")
-        return None
-    
-    print(f"Merging {len(batch_files)} batch files for {version_date} (streaming)...")
-    
-    merged_path = os.path.join(PARQUET_DIR, f"{version_date}.parquet")
-    total_rows = 0
-    
-    # Stream write: read one batch at a time to minimize RAM usage
-    writer = None
-    try:
-        for i, f in enumerate(batch_files):
-            table = pq.read_table(f)
-            total_rows += table.num_rows
-            
-            if writer is None:
-                writer = pq.ParquetWriter(merged_path, table.schema, compression='snappy')
-            
-            writer.write_table(table)
-            
-            # Delete batch file immediately after reading to free disk space
-            if delete_batches:
-                os.remove(f)
-            
-            # Free memory
-            del table
-            if (i + 1) % 10 == 0:
-                gc.collect()
-                print(f"  Merged {i + 1}/{len(batch_files)} batches... | {get_resource_usage()}")
-    finally:
-        if writer is not None:
-            writer.close()
-    
-    print(f"Merged parquet file written to {merged_path} ({total_rows} total rows) | {get_resource_usage()}")
-    
-    if delete_batches:
-        print(f"Deleted {len(batch_files)} batch files during merge")
-    
-    gc.collect()
-    return merged_path
-
-
-def process_version_date(version_date: str, keep_folders: bool = False):
-    """Download, extract, and process trace files for a single version date."""
-    print(f"\nProcessing version_date: {version_date}")
-    extract_dir = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0.tar_0")
-    
-    def collect_trace_files_for_version_date(vd):
-        releases = fetch_releases(vd)
-        if len(releases) == 0:
-            print(f"No releases found for {vd}.")
-            return None
-        
-        downloaded_files = []
-        for release in releases:
-            tag_name = release["tag_name"]
-            print(f"Processing release: {tag_name}")
-
-            # Only download prod-0 if available, else prod-0tmp
-            assets = release.get("assets", [])
-            normal_assets = [
-                a for a in assets
-                if "planes-readsb-prod-0." in a["name"] and "tmp" not in a["name"]
-            ]
-            tmp_assets = [
-                a for a in assets
-                if "planes-readsb-prod-0tmp" in a["name"]
-            ]
-            use_assets = normal_assets if normal_assets else tmp_assets
-
-            for asset in use_assets:
-                asset_name = asset["name"]
-                asset_url = asset["browser_download_url"]
-                file_path = os.path.join(OUTPUT_DIR, asset_name)
-                result = download_asset(asset_url, file_path)
-                if result:
-                    downloaded_files.append(file_path)
-
-        extract_split_archive(downloaded_files, extract_dir)
-        return collect_trace_files_with_find(extract_dir)
-
-    # Check if files already exist
-    pattern = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0*")
-    matches = [p for p in glob.glob(pattern) if os.path.isfile(p)]
-    
-    if matches:
-        print(f"Found existing files for {version_date}:")
-        # Prefer non-tmp slices when reusing existing files
-        normal_matches = [
-            p for p in matches
-            if "-planes-readsb-prod-0." in os.path.basename(p)
-            and "tmp" not in os.path.basename(p)
-        ]
-        downloaded_files = normal_matches if normal_matches else matches
-        
-        extract_split_archive(downloaded_files, extract_dir)
-        trace_files = collect_trace_files_with_find(extract_dir)
-    else:
-        trace_files = collect_trace_files_for_version_date(version_date)
-    
-    if trace_files is None or len(trace_files) == 0:
-        print(f"No trace files found for version_date: {version_date}")
-        return 0
-    
-    file_list = list(trace_files.values())
-    
-    start_time = time.perf_counter()
-    total_num_rows = 0
-    batch_rows = []
-    batch_idx = 0
-    
-    # Process files in chunks
-    for offset in range(0, len(file_list), CHUNK_SIZE):
-        chunk = file_list[offset:offset + CHUNK_SIZE]
-        with concurrent.futures.ProcessPoolExecutor(max_workers=MAX_WORKERS) as process_executor:
-            for rows in process_executor.map(safe_process, chunk):
-                if not rows:
-                    continue
-                batch_rows.extend(rows)
-                
-                if len(batch_rows) >= BATCH_SIZE:
-                    total_num_rows += len(batch_rows)
-                    write_batch_to_parquet(batch_rows, version_date, batch_idx)
-                    batch_idx += 1
-                    batch_rows = []
-                    
-                    elapsed = time.perf_counter() - start_time
-                    speed = total_num_rows / elapsed if elapsed > 0 else 0
-                    print(f"[{version_date}] processed {total_num_rows} rows in {elapsed:.2f}s ({speed:.2f} rows/s)")
-        
-        gc.collect()
-    
-    # Final batch
-    if batch_rows:
-        total_num_rows += len(batch_rows)
-        write_batch_to_parquet(batch_rows, version_date, batch_idx)
-        elapsed = time.perf_counter() - start_time
-        speed = total_num_rows / elapsed if elapsed > 0 else 0
-        print(f"[{version_date}] processed {total_num_rows} rows in {elapsed:.2f}s ({speed:.2f} rows/s)")
-    
-    print(f"Total rows processed for version_date {version_date}: {total_num_rows}")
-    
-    # Clean up extracted directory immediately after processing (before merging parquet files)
-    if not keep_folders and os.path.isdir(extract_dir):
-        print(f"Deleting extraction directory with 100,000+ files: {extract_dir}")
-        shutil.rmtree(extract_dir)
-        print(f"Successfully deleted extraction directory: {extract_dir} | {get_resource_usage()}")
-    
-    # Merge batch files into a single parquet file
-    merge_parquet_files(version_date, delete_batches=True)
-    
-    return total_num_rows
-
-
 def create_parquet_for_day(day, keep_folders: bool = False):
    """Create parquet file for a single day.
    
@@ -698,42 +578,10 @@ def create_parquet_for_day(day, keep_folders: bool = False):
        print(f"Parquet file already exists: {parquet_path}")
        return parquet_path
    
-    print(f"Creating parquet for {version_date}...")
+    print(f"Creating parquet for {version_date}")
    rows_processed = process_version_date(version_date, keep_folders)
    
    if rows_processed > 0 and parquet_path.exists():
        return parquet_path
    else:
        return None
-
-
-def main(start_date: str, end_date: str, keep_folders: bool = False):
-    """Main function to download and convert adsb.lol data to Parquet."""
-    version_dates = [f"v{date.strftime('%Y.%m.%d')}" for date in generate_version_dates(start_date, end_date)]
-    print(f"Processing dates: {version_dates}")
-    
-    total_rows_all = 0
-    for version_date in version_dates:
-        rows_processed = process_version_date(version_date, keep_folders)
-        total_rows_all += rows_processed
-    
-    print(f"\n=== Summary ===")
-    print(f"Total dates processed: {len(version_dates)}")
-    print(f"Total rows written to Parquet: {total_rows_all}")
-    print(f"Parquet files location: {PARQUET_DIR}")
-
-
-if __name__ == "__main__":
-    logging.basicConfig(level=logging.INFO, stream=sys.stdout, force=True)
-    
-    parser = argparse.ArgumentParser(
-        description="Download adsb.lol data and write to Parquet files"
-    )
-    parser.add_argument("start_date", type=str, help="Start date in YYYY-MM-DD format")
-    parser.add_argument("end_date", type=str, help="End date in YYYY-MM-DD format")
-    parser.add_argument("--keep-folders", action="store_true", 
-                        help="Keep extracted folders after processing")
-    
-    args = parser.parse_args()
-    
-    main(args.start_date, args.end_date, args.keep_folders)
@@ -1,9 +1,7 @@
 """
-Downloads and extracts adsb.lol tar files, then lists all ICAO folders.
+Downloads and extracts adsb.lol tar files for a single day, then lists all ICAO folders.
 This is the first step of the map-reduce pipeline.

-Supports both single-day (daily) and multi-day (historical) modes.
-
 Outputs:
 - Extracted trace files in data/output/{version_date}-planes-readsb-prod-0.tar_0/
 - ICAO manifest at data/output/icao_manifest_{date}.txt
@@ -25,11 +23,6 @@ from src.adsb.download_adsb_data_to_parquet import (
 )


-def get_target_day() -> datetime:
-    """Get yesterday's date (the day we're processing)."""
-    return datetime.utcnow() - timedelta(days=1)
-
-
 def download_and_extract(version_date: str) -> str | None:
    """Download and extract tar files, return extract directory path."""
    extract_dir = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0.tar_0")
@@ -59,6 +52,12 @@ def download_and_extract(version_date: str) -> str | None:
            print(f"No releases found for {version_date}")
            return None
        
+        # Prefer non-tmp releases; only use tmp if no normal releases exist
+        normal_releases = [r for r in releases if "tmp" not in r["tag_name"]]
+        tmp_releases = [r for r in releases if "tmp" in r["tag_name"]]
+        releases = normal_releases if normal_releases else tmp_releases
+        print(f"Using {'normal' if normal_releases else 'tmp'} releases ({len(releases)} found)")
+        
        downloaded_files = []
        for release in releases:
            tag_name = release["tag_name"]
@@ -78,8 +77,9 @@ def download_and_extract(version_date: str) -> str | None:
            for asset in use_assets:
                asset_name = asset["name"]
                asset_url = asset["browser_download_url"]
+                asset_size = asset.get("size")  # Get expected file size
                file_path = os.path.join(OUTPUT_DIR, asset_name)
-                if download_asset(asset_url, file_path):
+                if download_asset(asset_url, file_path, expected_size=asset_size):
                    downloaded_files.append(file_path)
    
    if not downloaded_files:
@@ -100,21 +100,6 @@ def list_icao_folders(extract_dir: str) -> list[str]:
    return icaos


-def write_manifest(icaos: list[str], manifest_id: str) -> str:
-    """Write ICAO list to manifest file.
-    
-    Args:
-        icaos: List of ICAO codes
-        manifest_id: Identifier for manifest file (date or date range)
-    """
-    manifest_path = os.path.join(OUTPUT_DIR, f"icao_manifest_{manifest_id}.txt")
-    with open(manifest_path, "w") as f:
-        for icao in sorted(icaos):
-            f.write(f"{icao}\n")
-    print(f"Wrote manifest with {len(icaos)} ICAOs to {manifest_path}")
-    return manifest_path
-
-
 def process_single_day(target_day: datetime) -> tuple[str | None, list[str]]:
    """Process a single day: download, extract, list ICAOs.
    
@@ -129,82 +114,50 @@ def process_single_day(target_day: datetime) -> tuple[str | None, list[str]]:
    extract_dir = download_and_extract(version_date)
    if not extract_dir:
        print(f"Failed to download/extract data for {date_str}")
-        return None, []
+        raise Exception(f"No data available for {date_str}")
    
    icaos = list_icao_folders(extract_dir)
    print(f"Found {len(icaos)} ICAOs for {date_str}")
    
    return extract_dir, icaos

-
-def process_date_range(start_date: datetime, end_date: datetime) -> set[str]:
-    """Process multiple days: download, extract, combine ICAO lists.
-    
-    Args:
-        start_date: Start date (inclusive)
-        end_date: End date (inclusive)
-    
-    Returns:
-        Combined set of all ICAOs across the date range
-    """
-    all_icaos: set[str] = set()
-    current = start_date
-    
-    # Both start and end are inclusive
-    while current <= end_date:
-        _, icaos = process_single_day(current)
-        all_icaos.update(icaos)
-        current += timedelta(days=1)
-    
-    return all_icaos
+from pathlib import Path
+import tarfile
+NUMBER_PARTS = 4
+def split_folders_into_gzip_archives(extract_dir: Path, tar_output_dir: Path, icaos: list[str], parts = NUMBER_PARTS) -> list[str]:
+    """Distribute the entries of ``extract_dir/traces`` round-robin over gzip archives.
+
+    Args:
+        extract_dir: Directory containing a ``traces`` sub-directory.
+        tar_output_dir: Existing directory that receives the archives; its name
+            is used as the archive file-name prefix.
+        icaos: Accepted for interface compatibility; not used by the split.
+        parts: Number of archives to create (part ids ``0 .. parts - 1``).
+
+    Returns:
+        Paths of the created archives, ordered by part id.
+    """
+    traces_dir = extract_dir / "traces"
+    buckets = sorted(traces_dir.iterdir())
+    tar_paths = [
+        tar_output_dir / f"{tar_output_dir.name}_part_{i}.tar.gz"
+        for i in range(parts)
+    ]
+    tars = []
+    try:
+        for tar_path in tar_paths:
+            tars.append(tarfile.open(tar_path, "w:gz"))
+        for idx, bucket_path in enumerate(buckets):
+            # Entry idx goes to archive idx % parts, stored under its own name.
+            tars[idx % parts].add(bucket_path, arcname=bucket_path.name)
+    finally:
+        # Close every archive that was opened, even if opening a later one or
+        # adding an entry failed, so no file handle is leaked.
+        for tar in tars:
+            tar.close()
+    return [str(tar_path) for tar_path in tar_paths]


 def main():
+    """Download one day of adsb.lol data and split its traces into part archives.
+
+    Uses ``--date`` when given, otherwise yesterday (UTC) as the help text
+    documents. Exits with status 1 when no ICAOs are found.
+    """
-    parser = argparse.ArgumentParser(description="Download and list ICAOs from adsb.lol data")
+    parser = argparse.ArgumentParser(description="Download and list ICAOs from adsb.lol data for a single day")
    parser.add_argument("--date", type=str, help="Single date in YYYY-MM-DD format (default: yesterday)")
-    parser.add_argument("--start-date", type=str, help="Start date for range (YYYY-MM-DD)")
-    parser.add_argument("--end-date", type=str, help="End date for range (YYYY-MM-DD)")
    args = parser.parse_args()
    
-    # Determine mode: single day or date range
-    if args.start_date and args.end_date:
-        # Historical mode: process date range
-        start_date = datetime.strptime(args.start_date, "%Y-%m-%d")
-        end_date = datetime.strptime(args.end_date, "%Y-%m-%d")
-        
-        print(f"Processing date range: {args.start_date} to {args.end_date}")
-        
-        all_icaos = process_date_range(start_date, end_date)
-        
-        if not all_icaos:
-            print("No ICAOs found in date range")
-            sys.exit(1)
-        
-        # Write combined manifest with range identifier
-        manifest_id = f"{args.start_date}_{args.end_date}"
-        write_manifest(list(all_icaos), manifest_id)
-        
-        print(f"\nDone! Total ICAOs: {len(all_icaos)}")
-        
-    else:
-        # Daily mode: single day
-        if args.date:
-            target_day = datetime.strptime(args.date, "%Y-%m-%d")
-        else:
-            target_day = get_target_day()
-        
-        date_str = target_day.strftime("%Y-%m-%d")
-        
-        extract_dir, icaos = process_single_day(target_day)
-        
-        if not icaos:
-            print("No ICAOs found")
-            sys.exit(1)
-        
-        write_manifest(icaos, date_str)
-        
-        print(f"\nDone! Extract dir: {extract_dir}")
-        print(f"Total ICAOs: {len(icaos)}")
+    if args.date:
+        target_day = datetime.strptime(args.date, "%Y-%m-%d")
+    else:
+        # --date is optional and documented as defaulting to yesterday (UTC);
+        # without this branch strptime(None, ...) raises TypeError.
+        target_day = datetime.utcnow() - timedelta(days=1)
+    date_str = target_day.strftime("%Y-%m-%d")
+    tar_output_dir = Path(f"./data/output/adsb_archives/{date_str}")
+    
+    extract_dir, icaos = process_single_day(target_day)
+    if not icaos:
+        # Bail out before creating archives for a day without any aircraft.
+        print("No ICAOs found")
+        sys.exit(1)
+    
+    extract_dir = Path(extract_dir)
+    print(extract_dir)
+    tar_output_dir.mkdir(parents=True, exist_ok=True)
+    split_folders_into_gzip_archives(extract_dir, tar_output_dir, icaos)
+    
+    print(f"\nDone! Extract dir: {extract_dir}")
+    print(f"Total ICAOs: {len(icaos)}")


 if __name__ == "__main__":
@@ -41,7 +41,7 @@ def main() -> None:
    """Main entry point for GitHub Actions."""
    start_date = os.environ.get("INPUT_START_DATE")
    end_date = os.environ.get("INPUT_END_DATE")
-    chunk_days = int(os.environ.get("INPUT_CHUNK_DAYS", "7"))
+    chunk_days = int(os.environ.get("INPUT_CHUNK_DAYS", "1"))
    
    if not start_date or not end_date:
        print("ERROR: INPUT_START_DATE and INPUT_END_DATE must be set", file=sys.stderr)
@@ -0,0 +1,78 @@
+"""
+Main pipeline for processing ADS-B data from adsb.lol.
+
+Usage:
+    python -m src.adsb.main --date 2026-01-01
+    python -m src.adsb.main --start_date 2026-01-01 --end_date 2026-01-03
+"""
+import argparse
+import subprocess
+import sys
+from datetime import datetime, timedelta
+
+import polars as pl
+
+from src.adsb.download_and_list_icaos import NUMBER_PARTS
+
+
+def main():
+    """Run the per-day ADS-B pipeline for one date or a date range.
+
+    For every day in ``[start, end)`` this launches, as subprocesses of the
+    current interpreter: ``download_and_list_icaos``, then
+    ``process_icao_chunk`` once per part id in ``range(NUMBER_PARTS)``, then
+    ``concat_parquet_to_final``. Any failing step aborts the run
+    (``check=True``). When more than one day was processed, the per-day CSVs
+    are concatenated into a single CSV covering the whole range.
+
+    Raises:
+        SystemExit: on conflicting, missing or empty date arguments.
+    """
+    parser = argparse.ArgumentParser(description="Process ADS-B data for a single day or date range")
+    parser.add_argument("--date", type=str, help="Single date in YYYY-MM-DD format")
+    parser.add_argument("--start_date", type=str, help="Start date (inclusive, YYYY-MM-DD)")
+    parser.add_argument("--end_date", type=str, help="End date (exclusive, YYYY-MM-DD)")
+    parser.add_argument("--concat_with_latest_csv", action="store_true", help="Also concatenate with latest CSV from GitHub releases")
+    args = parser.parse_args()
+
+    if args.date and (args.start_date or args.end_date):
+        raise SystemExit("Use --date or --start_date/--end_date, not both.")
+
+    if args.date:
+        start_date = datetime.strptime(args.date, "%Y-%m-%d")
+        end_date = start_date + timedelta(days=1)
+    else:
+        if not args.start_date or not args.end_date:
+            raise SystemExit("Provide --start_date and --end_date, or use --date.")
+        start_date = datetime.strptime(args.start_date, "%Y-%m-%d")
+        end_date = datetime.strptime(args.end_date, "%Y-%m-%d")
+
+    if end_date <= start_date:
+        # end_date is exclusive, so an equal or earlier end selects no days;
+        # fail loudly instead of printing "Done" without doing any work.
+        raise SystemExit("--end_date is exclusive and must be after --start_date.")
+
+    # Enumerate the days once; both dates are at midnight, so .days is exact.
+    dates = [
+        (start_date + timedelta(days=offset)).strftime("%Y-%m-%d")
+        for offset in range((end_date - start_date).days)
+    ]
+
+    for date_str in dates:
+        print(f"Processing day: {date_str}")
+
+        # Download and split
+        subprocess.run([sys.executable, "-m", "src.adsb.download_and_list_icaos", "--date", date_str], check=True)
+
+        # Process parts
+        for part_id in range(NUMBER_PARTS):
+            subprocess.run([sys.executable, "-m", "src.adsb.process_icao_chunk", "--part-id", str(part_id), "--date", date_str], check=True)
+
+        # Concatenate
+        concat_cmd = [sys.executable, "-m", "src.adsb.concat_parquet_to_final", "--date", date_str]
+        if args.concat_with_latest_csv:
+            concat_cmd.append("--concat_with_latest_csv")
+        subprocess.run(concat_cmd, check=True)
+
+    if len(dates) > 1:
+        # NOTE(review): assumes concat_parquet_to_final wrote one CSV per day
+        # at this path - confirm against that module.
+        csv_files = [
+            f"data/outputs/openairframes_adsb_{d}_{d}.csv"
+            for d in dates
+        ]
+        frames = [pl.read_csv(p) for p in csv_files]
+        df = pl.concat(frames, how="vertical", rechunk=True)
+        output_path = f"data/outputs/openairframes_adsb_{start_date.strftime('%Y-%m-%d')}_{end_date.strftime('%Y-%m-%d')}.csv"
+        df.write_csv(output_path)
+        print(f"Wrote combined CSV: {output_path}")
+
+    print("Done")
+
+
+if __name__ == "__main__":
+    main()
@@ -1,18 +1,9 @@
 """
-Processes a chunk of ICAOs from pre-extracted trace files.
+Processes trace files from a single archive part for a single day.
 This is the map phase of the map-reduce pipeline.

-Supports both single-day (daily) and multi-day (historical) modes.
-
-Expects extract_dir to already exist with trace files.
-Reads ICAO manifest to determine which ICAOs to process based on chunk-id.
-
 Usage:
-    # Daily mode (single day)
-    python -m src.adsb.process_icao_chunk --chunk-id 0 --total-chunks 4
-    
-    # Historical mode (date range)
-    python -m src.adsb.process_icao_chunk --chunk-id 0 --total-chunks 4 --start-date 2024-01-01 --end-date 2024-01-07
+    python -m src.adsb.process_icao_chunk --part-id 1 --date 2026-01-01
 """
 import gc
 import os
@@ -21,6 +12,9 @@ import argparse
 import time
 import concurrent.futures
 from datetime import datetime, timedelta
+import tarfile
+import tempfile
+import shutil

 import pyarrow as pa
 import pyarrow.parquet as pq
@@ -37,72 +31,21 @@ from src.adsb.download_adsb_data_to_parquet import (
 )


-CHUNK_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "adsb_chunks")
-os.makedirs(CHUNK_OUTPUT_DIR, exist_ok=True)
-
 # Smaller batch size for memory efficiency
 BATCH_SIZE = 100_000

-
-def get_target_day() -> datetime:
-    """Get yesterday's date (the day we're processing)."""
-    return datetime.utcnow() - timedelta(days=1)
-
-
-def read_manifest(manifest_id: str) -> list[str]:
-    """Read ICAO manifest file.
+def build_trace_file_map(archive_path: str) -> dict[str, str]:
+    """Build a map of ICAO -> trace file path by extracting tar.gz archive.
+
+    The archive is unpacked into a fresh temporary directory and the trace
+    files are located with ``collect_trace_files_with_find``. The directory is
+    removed again if extraction fails, and otherwise when the interpreter
+    exits, so the returned paths stay valid for the rest of the process.
+
+    Args:
+        archive_path: Path of the ``.tar.gz`` archive to extract.
+
+    Returns:
+        The mapping returned by ``collect_trace_files_with_find``.
+    """
+    import atexit
+
+    print(f"Extracting {archive_path}...")
    
-    Args:
-        manifest_id: Either a date string (YYYY-MM-DD) or range string (YYYY-MM-DD_YYYY-MM-DD)
-    """
-    manifest_path = os.path.join(OUTPUT_DIR, f"icao_manifest_{manifest_id}.txt")
-    if not os.path.exists(manifest_path):
-        raise FileNotFoundError(f"Manifest not found: {manifest_path}")
+    temp_dir = tempfile.mkdtemp(prefix="adsb_extract_")
+    # mkdtemp never cleans up after itself; drop the extracted copy at exit.
+    atexit.register(shutil.rmtree, temp_dir, ignore_errors=True)
    
-    with open(manifest_path, "r") as f:
-        icaos = [line.strip() for line in f if line.strip()]
-    return icaos
-
-
-def deterministic_hash(s: str) -> int:
-    """Return a deterministic hash for a string (unlike Python's hash() which is randomized)."""
-    # Use sum of byte values - simple but deterministic
-    return sum(ord(c) for c in s)
-
-
-def get_chunk_icaos(icaos: list[str], chunk_id: int, total_chunks: int) -> list[str]:
-    """Get the subset of ICAOs for this chunk based on deterministic hash partitioning."""
-    return [icao for icao in icaos if deterministic_hash(icao) % total_chunks == chunk_id]
-
-
-def build_trace_file_map(extract_dir: str) -> dict[str, str]:
-    """Build a map of ICAO -> trace file path using find command."""
-    print(f"Building trace file map from {extract_dir}...")
+    try:
+        with tarfile.open(archive_path, 'r:gz') as tar:
+            tar.extractall(path=temp_dir, filter='data')
+    except Exception:
+        # Do not leave a partially extracted tree behind on failure.
+        shutil.rmtree(temp_dir, ignore_errors=True)
+        raise
    
-    # Debug: check what's in extract_dir
-    if os.path.isdir(extract_dir):
-        items = os.listdir(extract_dir)[:10]
-        print(f"First 10 items in extract_dir: {items}")
-        # Check if there are subdirectories
-        for item in items[:3]:
-            subpath = os.path.join(extract_dir, item)
-            if os.path.isdir(subpath):
-                subitems = os.listdir(subpath)[:5]
-                print(f"  Contents of {item}/: {subitems}")
-    
-    trace_map = collect_trace_files_with_find(extract_dir)
+    trace_map = collect_trace_files_with_find(temp_dir)
    print(f"Found {len(trace_map)} trace files")
    
-    if len(trace_map) == 0:
-        # Debug: try manual find
-        import subprocess
-        result = subprocess.run(
-            ['find', extract_dir, '-type', 'f', '-name', 'trace_full_*'],
-            capture_output=True, text=True
-        )
-        print(f"Manual find output (first 500 chars): {result.stdout[:500]}")
-        print(f"Manual find stderr: {result.stderr[:200]}")
-    
    return trace_map


@@ -125,42 +68,13 @@ def rows_to_table(rows: list) -> pa.Table:


 def process_chunk(
-    chunk_id: int,
-    total_chunks: int,
-    trace_map: dict[str, str],
-    icaos: list[str],
-    output_id: str,
+    trace_files: list[str],
+    part_id: int,
+    date_str: str,
 ) -> str | None:
-    """Process a chunk of ICAOs and write to parquet.
+    """Process trace files and write to a single parquet file."""
    
-    Args:
-        chunk_id: This chunk's ID (0-indexed)
-        total_chunks: Total number of chunks
-        trace_map: Map of ICAO -> trace file path
-        icaos: Full list of ICAOs from manifest
-        output_id: Identifier for output file (date or date range)
-    """
-    chunk_icaos = get_chunk_icaos(icaos, chunk_id, total_chunks)
-    print(f"Chunk {chunk_id}/{total_chunks}: Processing {len(chunk_icaos)} ICAOs")
-    
-    if not chunk_icaos:
-        print(f"Chunk {chunk_id}: No ICAOs to process")
-        return None
-    
-    # Get trace file paths from the map
-    trace_files = []
-    for icao in chunk_icaos:
-        if icao in trace_map:
-            trace_files.append(trace_map[icao])
-    
-    print(f"Chunk {chunk_id}: Found {len(trace_files)} trace files")
-    
-    if not trace_files:
-        print(f"Chunk {chunk_id}: No trace files found")
-        return None
-    
-    # Process files and write parquet in batches
-    output_path = os.path.join(CHUNK_OUTPUT_DIR, f"chunk_{chunk_id}_{output_id}.parquet")
+    output_path = os.path.join(PARQUET_DIR, f"part_{part_id}_{date_str}.parquet")
    
    start_time = time.perf_counter()
    total_rows = 0
@@ -168,7 +82,8 @@ def process_chunk(
    writer = None
    
    try:
-        # Process in parallel batches
+        writer = pq.ParquetWriter(output_path, PARQUET_SCHEMA, compression='snappy')
+        
        files_per_batch = MAX_WORKERS * 100
        for offset in range(0, len(trace_files), files_per_batch):
            batch_files = trace_files[offset:offset + files_per_batch]
@@ -178,166 +93,72 @@ def process_chunk(
                    if rows:
                        batch_rows.extend(rows)
                        
-                        # Write when batch is full
                        if len(batch_rows) >= BATCH_SIZE:
-                            table = rows_to_table(batch_rows)
+                            writer.write_table(rows_to_table(batch_rows))
                            total_rows += len(batch_rows)
-                            
-                            if writer is None:
-                                writer = pq.ParquetWriter(output_path, PARQUET_SCHEMA, compression='snappy')
-                            writer.write_table(table)
-                            
                            batch_rows = []
-                            del table
                            gc.collect()
-                            
-                            elapsed = time.perf_counter() - start_time
-                            print(f"Chunk {chunk_id}: {total_rows} rows, {elapsed:.1f}s | {get_resource_usage()}")
-            
            gc.collect()
        
-        # Write remaining rows
        if batch_rows:
-            table = rows_to_table(batch_rows)
+            writer.write_table(rows_to_table(batch_rows))
            total_rows += len(batch_rows)
-            
-            if writer is None:
-                writer = pq.ParquetWriter(output_path, PARQUET_SCHEMA, compression='snappy')
-            writer.write_table(table)
-            del table
    
    finally:
        if writer:
            writer.close()
    
-    elapsed = time.perf_counter() - start_time
-    print(f"Chunk {chunk_id}: Done! {total_rows} rows in {elapsed:.1f}s | {get_resource_usage()}")
+    print(f"Part {part_id}: Done! {total_rows} rows in {time.perf_counter() - start_time:.1f}s | {get_resource_usage()}")
    
-    if total_rows > 0:
-        return output_path
-    return None
-
-
-def process_single_day(
-    chunk_id: int,
-    total_chunks: int,
-    target_day: datetime,
-) -> str | None:
-    """Process a single day for this chunk."""
-    date_str = target_day.strftime("%Y-%m-%d")
-    version_date = f"v{target_day.strftime('%Y.%m.%d')}"
-    
-    extract_dir = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0.tar_0")
-    
-    if not os.path.isdir(extract_dir):
-        print(f"Extract directory not found: {extract_dir}")
-        return None
-    
-    trace_map = build_trace_file_map(extract_dir)
-    if not trace_map:
-        print("No trace files found")
-        return None
-    
-    icaos = read_manifest(date_str)
-    print(f"Total ICAOs in manifest: {len(icaos)}")
-    
-    return process_chunk(chunk_id, total_chunks, trace_map, icaos, date_str)
-
-
-def process_date_range(
-    chunk_id: int,
-    total_chunks: int,
-    start_date: datetime,
-    end_date: datetime,
-) -> str | None:
-    """Process a date range for this chunk.
-    
-    Combines trace files from all days in the range.
-    
-    Args:
-        chunk_id: This chunk's ID (0-indexed)
-        total_chunks: Total number of chunks
-        start_date: Start date (inclusive)
-        end_date: End date (inclusive)
-    """
-    start_str = start_date.strftime("%Y-%m-%d")
-    end_str = end_date.strftime("%Y-%m-%d")
-    manifest_id = f"{start_str}_{end_str}"
-    
-    print(f"Processing date range: {start_str} to {end_str}")
-    
-    # Build combined trace map from all days
-    combined_trace_map: dict[str, str] = {}
-    current = start_date
-    
-    # Both start and end are inclusive
-    while current <= end_date:
-        version_date = f"v{current.strftime('%Y.%m.%d')}"
-        extract_dir = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0.tar_0")
-        
-        if os.path.isdir(extract_dir):
-            trace_map = build_trace_file_map(extract_dir)
-            # Later days override earlier days (use most recent trace file)
-            combined_trace_map.update(trace_map)
-            print(f"  {current.strftime('%Y-%m-%d')}: {len(trace_map)} trace files")
-        else:
-            print(f"  {current.strftime('%Y-%m-%d')}: no extract directory")
-        
-        current += timedelta(days=1)
-    
-    if not combined_trace_map:
-        print("No trace files found in date range")
-        return None
-    
-    print(f"Combined trace map: {len(combined_trace_map)} ICAOs")
-    
-    icaos = read_manifest(manifest_id)
-    print(f"Total ICAOs in manifest: {len(icaos)}")
-    
-    return process_chunk(chunk_id, total_chunks, combined_trace_map, icaos, manifest_id)
+    return output_path if total_rows > 0 else None

+from pathlib import Path

 def main():
+    """Process one archive part of one day and write raw and compressed outputs.
+
+    Locates ``{date}_part_{part_id}.tar.gz`` under ``OUTPUT_DIR/adsb_archives/{date}``,
+    exits with status 1 if it is missing, otherwise extracts it, writes the raw
+    parquet via ``process_chunk`` and then the compressed parquet and CSV under
+    ``OUTPUT_DIR/compressed/{date}``.
+    """
-    parser = argparse.ArgumentParser(description="Process a chunk of ICAOs")
-    parser.add_argument("--chunk-id", type=int, required=True, help="Chunk ID (0-indexed)")
-    parser.add_argument("--total-chunks", type=int, required=True, help="Total number of chunks")
-    parser.add_argument("--date", type=str, help="Single date in YYYY-MM-DD format (default: yesterday)")
-    parser.add_argument("--start-date", type=str, help="Start date for range (YYYY-MM-DD)")
-    parser.add_argument("--end-date", type=str, help="End date for range (YYYY-MM-DD)")
+    parser = argparse.ArgumentParser(description="Process a single archive part for a day")
+    parser.add_argument("--part-id", type=int, required=True, help="Part ID (0-indexed)")
+    parser.add_argument("--date", type=str, required=True, help="Date in YYYY-MM-DD format")
    args = parser.parse_args()
    
-    print(f"Processing chunk {args.chunk_id}/{args.total_chunks}")
-    print(f"OUTPUT_DIR: {OUTPUT_DIR}")
-    print(f"CHUNK_OUTPUT_DIR: {CHUNK_OUTPUT_DIR}")
-    print(f"Resource usage at start: {get_resource_usage()}")
+    print(f"Processing part {args.part_id} for {args.date}")
    
-    # Debug: List what's in OUTPUT_DIR
-    print(f"\nContents of {OUTPUT_DIR}:")
-    if os.path.isdir(OUTPUT_DIR):
-        for item in os.listdir(OUTPUT_DIR)[:20]:
-            print(f"  - {item}")
-    else:
-        print(f"  Directory does not exist!")
+    # Get specific archive file for this part
+    archive_dir = os.path.join(OUTPUT_DIR, "adsb_archives", args.date)
+    archive_path = os.path.join(archive_dir, f"{args.date}_part_{args.part_id}.tar.gz")
    
-    # Determine mode: single day or date range
-    if args.start_date and args.end_date:
-        # Historical mode
-        start_date = datetime.strptime(args.start_date, "%Y-%m-%d")
-        end_date = datetime.strptime(args.end_date, "%Y-%m-%d")
-        output_path = process_date_range(args.chunk_id, args.total_chunks, start_date, end_date)
-    else:
-        # Daily mode
-        if args.date:
-            target_day = datetime.strptime(args.date, "%Y-%m-%d")
+    if not os.path.isfile(archive_path):
+        print(f"ERROR: Archive not found: {archive_path}")
+        if os.path.isdir(archive_dir):
+            print(f"Files in {archive_dir}: {os.listdir(archive_dir)}")
        else:
-            target_day = get_target_day()
-        output_path = process_single_day(args.chunk_id, args.total_chunks, target_day)
+            print(f"Directory does not exist: {archive_dir}")
+        sys.exit(1)
    
-    if output_path:
-        print(f"Output: {output_path}")
-    else:
-        print("No output generated")
+    # Extract and collect trace files
+    trace_map = build_trace_file_map(archive_path)
+    all_trace_files = list(trace_map.values())
+    
+    print(f"Total trace files: {len(all_trace_files)}")
+    
+    # Process and write output
+    output_path = process_chunk(all_trace_files, args.part_id, args.date)
+    
+    from src.adsb.compress_adsb_to_aircraft_data import compress_parquet_part
+    df_compressed = compress_parquet_part(args.part_id, args.date)
+    
+    # OUTPUT_DIR is combined with os.path.join above, so it may be a plain
+    # string; normalise it to a Path before using the "/" operator.
+    compressed_dir = Path(OUTPUT_DIR) / "compressed" / args.date
+    compressed_dir.mkdir(parents=True, exist_ok=True)
+    
+    # Write parquet
+    df_compressed_output = compressed_dir / f"part_{args.part_id}_{args.date}.parquet"
+    df_compressed.write_parquet(df_compressed_output, compression='snappy')
+    
+    # Write CSV
+    csv_output = compressed_dir / f"part_{args.part_id}_{args.date}.csv"
+    df_compressed.write_csv(csv_output)
+    
+    print(f"Raw output: {output_path}" if output_path else "No raw output generated")
+    print(f"Compressed parquet: {df_compressed_output}")
+    print(f"Compressed CSV: {csv_output}")


 if __name__ == "__main__":
-    main()
+    main()
@@ -1,97 +0,0 @@
-"""
-Reduce step: downloads all chunk CSVs from S3, combines them,
-deduplicates across the full dataset, and uploads the final result.
-
-Environment variables:
-  S3_BUCKET         — bucket with intermediate results
-  RUN_ID            — run identifier matching the map workers
-  GLOBAL_START_DATE — overall start date for output filename
-  GLOBAL_END_DATE   — overall end date for output filename
-"""
-import gzip
-import os
-import shutil
-from pathlib import Path
-
-import boto3
-import polars as pl
-
-from compress_adsb_to_aircraft_data import COLUMNS, deduplicate_by_signature
-
-
-def main():
-    s3_bucket = os.environ["S3_BUCKET"]
-    run_id = os.environ.get("RUN_ID", "default")
-    global_start = os.environ["GLOBAL_START_DATE"]
-    global_end = os.environ["GLOBAL_END_DATE"]
-
-    s3 = boto3.client("s3")
-    prefix = f"intermediate/{run_id}/"
-
-    # List all chunk files for this run
-    paginator = s3.get_paginator("list_objects_v2")
-    chunk_keys = []
-    for page in paginator.paginate(Bucket=s3_bucket, Prefix=prefix):
-        for obj in page.get("Contents", []):
-            if obj["Key"].endswith(".csv.gz"):
-                chunk_keys.append(obj["Key"])
-
-    chunk_keys.sort()
-    print(f"Found {len(chunk_keys)} chunks to combine")
-
-    if not chunk_keys:
-        print("No chunks found — nothing to reduce.")
-        return
-
-    # Download and concatenate all chunks
-    download_dir = Path("/tmp/chunks")
-    download_dir.mkdir(parents=True, exist_ok=True)
-
-    dfs = []
-
-    for key in chunk_keys:
-        gz_path = download_dir / Path(key).name
-        csv_path = gz_path.with_suffix("")  # Remove .gz
-        print(f"Downloading {key}...")
-        s3.download_file(s3_bucket, key, str(gz_path))
-
-        # Decompress
-        with gzip.open(gz_path, 'rb') as f_in:
-            with open(csv_path, 'wb') as f_out:
-                shutil.copyfileobj(f_in, f_out)
-        gz_path.unlink()
-
-        df_chunk = pl.read_csv(csv_path)
-        print(f"  Loaded {df_chunk.height} rows from {csv_path.name}")
-        dfs.append(df_chunk)
-
-        # Free disk space after loading
-        csv_path.unlink()
-
-    df_accumulated = pl.concat(dfs) if dfs else pl.DataFrame()
-    print(f"Combined: {df_accumulated.height} rows before dedup")
-
-    # Final global deduplication
-    df_accumulated = deduplicate_by_signature(df_accumulated)
-    print(f"After dedup: {df_accumulated.height} rows")
-
-    # Write and upload final result
-    output_name = f"planequery_aircraft_adsb_{global_start}_{global_end}.csv.gz"
-    csv_output = Path(f"/tmp/planequery_aircraft_adsb_{global_start}_{global_end}.csv")
-    gz_output = Path(f"/tmp/{output_name}")
-    
-    df_accumulated.write_csv(csv_output)
-    with open(csv_output, 'rb') as f_in:
-        with gzip.open(gz_output, 'wb') as f_out:
-            shutil.copyfileobj(f_in, f_out)
-    csv_output.unlink()
-
-    final_key = f"final/{output_name}"
-    print(f"Uploading to s3://{s3_bucket}/{final_key}")
-    s3.upload_file(str(gz_output), s3_bucket, final_key)
-
-    print(f"Final output: {df_accumulated.height} records -> {final_key}")
-
-
-if __name__ == "__main__":
-    main()
@@ -1,2 +0,0 @@
-polars>=1.0
-boto3>=1.34
@@ -1,5 +0,0 @@
-polars>=1.0
-pyarrow>=14.0
-orjson>=3.9
-boto3>=1.34
-zstandard>=0.22
@@ -1,89 +0,0 @@
-"""
-Map worker: processes a date range chunk, uploads result to S3.
-
-Environment variables:
-  START_DATE  — inclusive, YYYY-MM-DD
-  END_DATE    — exclusive, YYYY-MM-DD
-  S3_BUCKET   — bucket for intermediate results
-  RUN_ID      — unique run identifier for namespacing S3 keys
-"""
-import os
-import sys
-from datetime import datetime, timedelta
-from pathlib import Path
-
-import boto3
-import polars as pl
-
-from compress_adsb_to_aircraft_data import (
-    load_historical_for_day,
-    deduplicate_by_signature,
-    COLUMNS,
-)
-
-
-def main():
-    start_date_str = os.environ["START_DATE"]
-    end_date_str = os.environ["END_DATE"]
-    s3_bucket = os.environ["S3_BUCKET"]
-    run_id = os.environ.get("RUN_ID", "default")
-
-    start_date = datetime.strptime(start_date_str, "%Y-%m-%d")
-    end_date = datetime.strptime(end_date_str, "%Y-%m-%d")
-
-    total_days = (end_date - start_date).days
-    print(f"Worker: processing {total_days} days [{start_date_str}, {end_date_str})")
-
-    dfs = []
-    current_date = start_date
-
-    while current_date < end_date:
-        day_str = current_date.strftime("%Y-%m-%d")
-        print(f"  Loading {day_str}...")
-
-        df_compressed = load_historical_for_day(current_date)
-        if df_compressed.height == 0:
-            raise RuntimeError(f"No data found for {day_str}")
-
-        dfs.append(df_compressed)
-        total_rows = sum(df.height for df in dfs)
-        print(f"  +{df_compressed.height} rows (total: {total_rows})")
-
-        # Delete local cache after each day to save disk in container
-        cache_dir = Path("data/adsb")
-        if cache_dir.exists():
-            import shutil
-            shutil.rmtree(cache_dir)
-
-        current_date += timedelta(days=1)
-
-    # Concatenate all days
-    df_accumulated = pl.concat(dfs) if dfs else pl.DataFrame()
-
-    # Deduplicate within this chunk
-    df_accumulated = deduplicate_by_signature(df_accumulated)
-    print(f"After dedup: {df_accumulated.height} rows")
-
-    # Write to local file then upload to S3
-    local_path = Path(f"/tmp/chunk_{start_date_str}_{end_date_str}.csv")
-    df_accumulated.write_csv(local_path)
-    
-    # Compress with gzip
-    import gzip
-    import shutil
-    gz_path = Path(f"/tmp/chunk_{start_date_str}_{end_date_str}.csv.gz")
-    with open(local_path, 'rb') as f_in:
-        with gzip.open(gz_path, 'wb') as f_out:
-            shutil.copyfileobj(f_in, f_out)
-    local_path.unlink()  # Remove uncompressed file
-
-    s3_key = f"intermediate/{run_id}/chunk_{start_date_str}_{end_date_str}.csv.gz"
-    print(f"Uploading to s3://{s3_bucket}/{s3_key}")
-
-    s3 = boto3.client("s3")
-    s3.upload_file(str(gz_path), s3_bucket, s3_key)
-    print("Done.")
-
-
-if __name__ == "__main__":
-    main()
@@ -246,6 +246,20 @@ def process_submission(
    if schema_updated:
        schema_note = f"\n**Schema Updated:** Added new tags: `{', '.join(new_tags)}`\n"
    
+    # Truncate JSON preview to stay under GitHub's 65536 char body limit
+    max_json_preview = 50000
+    if len(content_json) > max_json_preview:
+        # Show first few entries as a preview
+        preview_entries = submissions[:10]
+        preview_json = json.dumps(preview_entries, indent=2, sort_keys=True)
+        json_section = (
+            f"### Submissions (showing 10 of {len(submissions)})\n"
+            f"```json\n{preview_json}\n```\n\n"
+            f"*Full submission ({len(submissions)} entries, {len(content_json):,} chars) is in the committed file.*"
+        )
+    else:
+        json_section = f"### Submissions\n```json\n{content_json}\n```"
+
    pr_body = f"""## Community Submission

 Adds {len(submissions)} submission(s) from @{author_username}.
@@ -257,10 +271,7 @@ Closes #{issue_number}

 ---

-### Submissions
-```json
-{content_json}
-```"""
+{json_section}"""
    
    pr = create_pull_request(
        title=f"Community submission: {filename}",
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+"""
+Download ADS-B Exchange basic-ac-db.json.gz.
+
+Usage:
+    python -m src.contributions.create_daily_adsbexchange_release [--date YYYY-MM-DD]
+"""
+from __future__ import annotations
+
+import argparse
+import shutil
+from datetime import datetime, timezone
+from pathlib import Path
+from urllib.request import Request, urlopen
+
+URL = "https://downloads.adsbexchange.com/downloads/basic-ac-db.json.gz"
+OUT_ROOT = Path("data/openairframes")
+
+
+def main() -> None:
+    """Download the ADS-B Exchange basic aircraft database for one date.
+
+    Command-line arguments:
+        --date: YYYY-MM-DD label for the output file. Defaults to today's
+            date in UTC. The same ``URL`` is fetched whatever the date is;
+            the value only determines the output file name.
+
+    Side effects:
+        Creates ``OUT_ROOT`` if needed and writes
+        ``OUT_ROOT / "basic-ac-db_<date>.json.gz"``.
+
+    The payload is streamed into a ``.part`` sibling file and only renamed
+    to the final name once the transfer has finished, so an interrupted
+    download never leaves a truncated archive under the real file name.
+
+    Raises:
+        Whatever ``urlopen`` or the file operations raise; errors are not
+        caught here.
+    """
+    parser = argparse.ArgumentParser(description="Create daily ADS-B Exchange JSON release")
+    parser.add_argument("--date", type=str, help="Date to process (YYYY-MM-DD format, default: today UTC)")
+    args = parser.parse_args()
+
+    date_str = args.date or datetime.now(timezone.utc).strftime("%Y-%m-%d")
+
+    OUT_ROOT.mkdir(parents=True, exist_ok=True)
+
+    gz_path = OUT_ROOT / f"basic-ac-db_{date_str}.json.gz"
+    # Temporary target in the same directory so the final rename stays on one filesystem.
+    tmp_path = gz_path.with_name(gz_path.name + ".part")
+
+    print(f"Downloading {URL}...")
+    req = Request(URL, headers={"User-Agent": "openairframes-downloader/1.0"}, method="GET")
+    try:
+        with urlopen(req, timeout=300) as r, tmp_path.open("wb") as f:
+            shutil.copyfileobj(r, f)
+        # Publish the file only after the whole body has been written.
+        tmp_path.replace(gz_path)
+    finally:
+        # After a successful rename the temp file is gone; this only fires on failure.
+        if tmp_path.exists():
+            tmp_path.unlink()
+
+    print(f"Wrote: {gz_path}")
+
+
+if __name__ == "__main__":
+    main()
@@ -17,14 +17,14 @@ import pandas as pd


 COMMUNITY_DIR = Path(__file__).parent.parent.parent / "community"
-OUT_ROOT = Path("data/planequery_aircraft")
+OUT_ROOT = Path("data/openairframes")


 def read_all_submissions(community_dir: Path) -> list[dict]:
    """Read all JSON submissions from the community directory."""
    all_submissions = []
    
-    for json_file in sorted(community_dir.glob("*.json")):
+    for json_file in sorted(community_dir.glob("**/*.json")):
        try:
            with open(json_file) as f:
                data = json.load(f)
@@ -47,7 +47,7 @@ def submissions_to_dataframe(submissions: list[dict]) -> pd.DataFrame:
    - creation_timestamp (first)
    - transponder_code_hex
    - registration_number  
-    - planequery_airframe_id
+    - openairframes_id
    - contributor_name
    - [other columns alphabetically]
    - contributor_uuid (last)
@@ -62,7 +62,7 @@ def submissions_to_dataframe(submissions: list[dict]) -> pd.DataFrame:
        "creation_timestamp",
        "transponder_code_hex",
        "registration_number",
-        "planequery_airframe_id",
+        "openairframes_id",
        "contributor_name",
        "contributor_uuid",
    ]
@@ -78,7 +78,7 @@ def submissions_to_dataframe(submissions: list[dict]) -> pd.DataFrame:
        "creation_timestamp",
        "transponder_code_hex",
        "registration_number",
-        "planequery_airframe_id",
+        "openairframes_id",
        "contributor_name",
    ]
    last_cols = ["contributor_uuid"]
@@ -108,7 +108,7 @@ def main():
            "creation_timestamp",
            "transponder_code_hex",
            "registration_number",
-            "planequery_airframe_id",
+            "openairframes_id",
            "contributor_name",
            "tags",
            "contributor_uuid",
@@ -127,7 +127,7 @@ def main():
    
    # Output
    OUT_ROOT.mkdir(parents=True, exist_ok=True)
-    output_file = OUT_ROOT / f"planequery_aircraft_community_{start_date_str}_{date_str}.csv"
+    output_file = OUT_ROOT / f"openairframes_community_{start_date_str}_{date_str}.csv"
    
    df.to_csv(output_file, index=False)
    
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+"""
+Download Mictronics aircraft database zip.
+
+Usage:
+    python -m src.contributions.create_daily_microtonics_release [--date YYYY-MM-DD]
+"""
+from __future__ import annotations
+
+import argparse
+import shutil
+import sys
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+from urllib.error import URLError
+from urllib.request import Request, urlopen
+
+URL = "https://www.mictronics.de/aircraft-database/indexedDB_old.php"
+OUT_ROOT = Path("data/openairframes")
+MAX_RETRIES = 3
+RETRY_DELAY = 30  # seconds
+
+
+def main() -> None:
+    """Download the Mictronics aircraft database zip, retrying on network errors.
+
+    Command-line arguments:
+        --date: YYYY-MM-DD label for the output file. Defaults to today's
+            date in UTC. The same ``URL`` is fetched whatever the date is;
+            the value only determines the output file name.
+
+    Side effects:
+        Creates ``OUT_ROOT`` if needed and writes
+        ``OUT_ROOT / "mictronics-db_<date>.zip"``.
+
+    Up to ``MAX_RETRIES`` attempts are made, sleeping ``RETRY_DELAY`` seconds
+    between them. Each attempt streams into a ``.part`` sibling file that is
+    renamed to the final name only after the transfer has finished, so a
+    failed attempt never leaves a truncated zip under the real file name.
+
+    Exits the process with status 1 when every attempt fails with
+    ``URLError`` or ``TimeoutError``; other exceptions propagate.
+    """
+    parser = argparse.ArgumentParser(description="Create daily Mictronics database release")
+    parser.add_argument("--date", type=str, help="Date to process (YYYY-MM-DD format, default: today UTC)")
+    args = parser.parse_args()
+
+    date_str = args.date or datetime.now(timezone.utc).strftime("%Y-%m-%d")
+
+    OUT_ROOT.mkdir(parents=True, exist_ok=True)
+
+    zip_path = OUT_ROOT / f"mictronics-db_{date_str}.zip"
+    # Temporary target in the same directory so the final rename stays on one filesystem.
+    tmp_path = zip_path.with_name(zip_path.name + ".part")
+
+    for attempt in range(1, MAX_RETRIES + 1):
+        try:
+            print(f"Downloading {URL} (attempt {attempt}/{MAX_RETRIES})...")
+            req = Request(URL, headers={"User-Agent": "Mozilla/5.0 (compatible; openairframes-downloader/1.0)"}, method="GET")
+            with urlopen(req, timeout=120) as r, tmp_path.open("wb") as f:
+                shutil.copyfileobj(r, f)
+            # Publish the file only after the whole body has been written.
+            tmp_path.replace(zip_path)
+            print(f"Wrote: {zip_path}")
+            return
+        except (URLError, TimeoutError) as e:
+            print(f"Attempt {attempt} failed: {e}")
+            if attempt < MAX_RETRIES:
+                print(f"Retrying in {RETRY_DELAY} seconds...")
+                time.sleep(RETRY_DELAY)
+            else:
+                print("All retries exhausted. Mictronics download failed.")
+                sys.exit(1)
+        finally:
+            # Drop whatever a failed attempt wrote; after a successful rename
+            # the temp file no longer exists and this is a no-op.
+            if tmp_path.exists():
+                tmp_path.unlink()
+
+
+if __name__ == "__main__":
+    main()
@@ -112,8 +112,8 @@ def group_by_identifier(submissions: list[dict]) -> dict[str, list[dict]]:
            key = f"reg:{submission['registration_number']}"
        elif "transponder_code_hex" in submission:
            key = f"icao:{submission['transponder_code_hex']}"
-        elif "planequery_airframe_id" in submission:
-            key = f"id:{submission['planequery_airframe_id']}"
+        elif "openairframes_id" in submission:
+            key = f"id:{submission['openairframes_id']}"
        else:
            key = "_unknown"
        
@@ -36,6 +36,52 @@ def get_latest_schema_version() -> int:
    return max_version


+def _is_balanced_json(text: str) -> bool:
+    """
+    Report whether every ``[``/``{`` in ``text`` is closed in the right order.
+
+    A lightweight completeness check for captured JSON text; it does not
+    parse or validate the JSON itself. Brackets and braces that appear
+    inside double-quoted strings are not counted, and the character that
+    follows a backslash is always skipped.
+
+    Args:
+        text: JSON text to check
+
+    Returns:
+        True if all brackets/braces are matched and no string is left
+        open, False otherwise
+    """
+    closer_for = {'[': ']', '{': '}'}
+    open_stack = []
+    inside_string = False
+    skip_next = False
+
+    for ch in text:
+        if skip_next:
+            # Character following a backslash: consume it without inspection.
+            skip_next = False
+        elif ch == '\\':
+            skip_next = True
+        elif ch == '"':
+            inside_string = not inside_string
+        elif inside_string:
+            # Structural characters inside a string literal do not count.
+            pass
+        elif ch in closer_for:
+            open_stack.append(ch)
+        elif ch in closer_for.values():
+            # A closer must match the most recently opened bracket.
+            if not open_stack or closer_for[open_stack[-1]] != ch:
+                return False
+            open_stack.pop()
+
+    return not open_stack and not inside_string
+
+
 def get_schema_path(version: int | None = None) -> Path:
    """
    Get path to a specific schema version, or latest if version is None.
@@ -111,7 +157,7 @@ def download_github_attachment(url: str) -> str | None:
    import urllib.error
    
    try:
-        req = urllib.request.Request(url, headers={"User-Agent": "PlaneQuery-Bot"})
+        req = urllib.request.Request(url, headers={"User-Agent": "OpenAirframes-Bot"})
        with urllib.request.urlopen(req, timeout=30) as response:
            return response.read().decode("utf-8")
    except (urllib.error.URLError, urllib.error.HTTPError, UnicodeDecodeError) as e:
@@ -162,10 +208,14 @@ def extract_json_from_issue_body(body: str) -> str | None:
        return match.group(1).strip()
    
    # Try: Raw JSON after "### Submission JSON" until next section or end
-    pattern_raw = r"### Submission JSON\s*\n\s*([\[{][\s\S]*?[\]}])(?=\n###|\n\n###|$)"
+    # Use greedy matching since we have a clear boundary (next ### or end)
+    pattern_raw = r"### Submission JSON\s*\n\s*([\[{][\s\S]*[\]}])(?=\s*\n###|\s*$)"
    match = re.search(pattern_raw, body)
    if match:
-        return match.group(1).strip()
+        candidate = match.group(1).strip()
+        # Validate it's complete JSON by checking balanced brackets
+        if _is_balanced_json(candidate):
+            return candidate
    
    # Try: Any JSON object/array in the body (fallback)
    pattern_any = r"([\[{][\s\S]*?[\]}])"
@@ -219,7 +269,19 @@ def parse_and_validate(json_str: str, schema: dict | None = None) -> tuple[list
    try:
        data = json.loads(json_str)
    except json.JSONDecodeError as e:
-        return None, [f"Invalid JSON: {e}"]
+        # Provide detailed error context
+        error_msg = f"Invalid JSON: {e}"
+        
+        # Show context around the error position
+        if hasattr(e, 'pos') and e.pos is not None:
+            start = max(0, e.pos - 50)
+            end = min(len(json_str), e.pos + 50)
+            context = json_str[start:end]
+            # Escape for readability
+            context_escaped = repr(context)
+            error_msg += f"\n\nContext around position {e.pos}: {context_escaped}"
+        
+        return None, [error_msg]
    
    errors = validate_submission(data, schema)
    return data, errors
@@ -58,18 +58,9 @@ def generate_updated_schema(base_schema: dict, tag_registry: dict[str, str]) ->
    for tag_name, type_name in sorted(tag_registry.items()):
        tag_properties[tag_name] = type_name_to_json_schema(type_name)
    
-    # Update tags definition
-    schema["properties"]["tags"] = {
-        "type": "object",
-        "description": "Community-defined tags. New tags can be added, but must use consistent types.",
-        "propertyNames": {
-            "type": "string",
-            "pattern": "^[a-z][a-z0-9_]{0,63}$"
-        },
-        "properties": tag_properties,
-        # Still allow additional properties for new tags
-        "additionalProperties": {"$ref": "#/$defs/tagValue"}
-    }
+    # Only add/update the properties key within tags, preserve everything else
+    if "properties" in schema and "tags" in schema["properties"]:
+        schema["properties"]["tags"]["properties"] = tag_properties
    
    return schema

@@ -0,0 +1,49 @@
+"""
+Build the daily FAA release CSV.
+
+Steps, in order:
+  1. Resolve the release date from --date, or today's UTC date.
+  2. Download the FAA ReleasableAircraft.zip into data/faa_releasable/
+     unless a zip for that date is already on disk.
+  3. Convert the zip to a DataFrame with convert_faa_master_txt_to_df.
+  4. Try to load the latest published FAA CSV and concatenate the new data
+     onto it; on any failure fall back to the new data alone.
+  5. Write data/openairframes/openairframes_faa_<start>_<date>.csv.
+
+The date only labels the files and is passed to the converter; the download
+URL is fixed.
+"""
+from pathlib import Path
+# NOTE(review): timedelta is not used anywhere in this script.
+from datetime import datetime, timezone, timedelta
+import argparse
+
+# Argument parsing happens at module level, so importing this file runs it.
+parser = argparse.ArgumentParser(description="Create daily FAA release")
+parser.add_argument("--date", type=str, help="Date to process (YYYY-MM-DD format, default: today)")
+args = parser.parse_args()
+
+# Use the explicit date when given, otherwise today's date in UTC.
+if args.date:
+    date_str = args.date
+else:
+    date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
+
+# Raw FAA zips are stored per date under data/faa_releasable/.
+out_dir = Path("data/faa_releasable")
+out_dir.mkdir(parents=True, exist_ok=True)
+zip_name = f"ReleasableAircraft_{date_str}.zip"
+
+zip_path = out_dir / zip_name
+# Skip the download when a zip with this date's name already exists.
+if not zip_path.exists():
+    # Fixed FAA registry download URL (not date-specific).
+    url = "https://registry.faa.gov/database/ReleasableAircraft.zip"
+    from urllib.request import Request, urlopen
+
+    req = Request(
+        url,
+        headers={"User-Agent": "Mozilla/5.0"},
+        method="GET",
+    )
+
+    # The whole response is read into memory before being written to disk.
+    with urlopen(req, timeout=120) as r:
+        body = r.read()
+        zip_path.write_bytes(body)
+
+OUT_ROOT = Path("data/openairframes")
+OUT_ROOT.mkdir(parents=True, exist_ok=True)
+# Bare module names: presumably resolved from the script's own directory
+# when run as a script - TODO confirm how this file is invoked.
+from derive_from_faa_master_txt import convert_faa_master_txt_to_df, concat_faa_historical_df
+from get_latest_release import get_latest_aircraft_faa_csv_df
+df_new = convert_faa_master_txt_to_df(zip_path, date_str)
+
+# Extend the latest published release with the new data when one is available.
+# NOTE(review): the broad `except Exception` also catches failures raised by
+# concat_faa_historical_df and the AssertionError from the monotonic check
+# below; both are then reported as "No existing FAA release found" and the
+# historical base is dropped. Confirm this fallback is intended for those cases.
+try:
+    df_base, start_date_str = get_latest_aircraft_faa_csv_df()
+    df_base = concat_faa_historical_df(df_base, df_new)
+    assert df_base['download_date'].is_monotonic_increasing, "download_date is not monotonic increasing"
+except Exception as e:
+    print(f"No existing FAA release found, using only new data: {e}")
+    df_base = df_new
+    start_date_str = date_str
+
+# The output name carries the start date of the base data and this run's date.
+df_base.to_csv(OUT_ROOT / f"openairframes_faa_{start_date_str}_{date_str}.csv", index=False)
@@ -1,84 +0,0 @@
-from pathlib import Path
-from datetime import datetime, timezone, timedelta
-import sys
-
-import polars as pl
-
-# Add adsb directory to path
-sys.path.insert(0, str(Path(__file__).parent / "adsb")) # TODO: Fix this hacky path manipulation
-
-from adsb.compress_adsb_to_aircraft_data import (
-    load_historical_for_day,
-    concat_compressed_dfs,
-    get_latest_aircraft_adsb_csv_df,
-)
-
-if __name__ == '__main__':
-    # Get yesterday's date (data for the previous day)
-    day = datetime.now(timezone.utc) - timedelta(days=1)
-
-    # Find a day with complete data
-    max_attempts = 2  # Don't look back more than a week
-    for attempt in range(max_attempts):
-        date_str = day.strftime("%Y-%m-%d")
-        print(f"Processing ADS-B data for {date_str}")
-        
-        print("Loading new ADS-B data...")
-        df_new = load_historical_for_day(day)
-        if df_new.height == 0:
-            day = day - timedelta(days=1)
-            continue
-        max_time = df_new['time'].max()
-        if max_time is not None:
-            # Handle timezone
-            max_time_dt = max_time
-            if hasattr(max_time_dt, 'replace'):
-                max_time_dt = max_time_dt.replace(tzinfo=timezone.utc)
-            
-            end_of_day = day.replace(hour=23, minute=59, second=59, tzinfo=timezone.utc) - timedelta(minutes=5)
-            
-            # Convert polars datetime to python datetime if needed
-            if isinstance(max_time_dt, datetime):
-                if max_time_dt.replace(tzinfo=timezone.utc) >= end_of_day:
-                    break
-            else:
-                # Polars returns python datetime already
-                if max_time >= day.replace(hour=23, minute=54, second=59):
-                    break
-        
-        print(f"WARNING: Latest data time is {max_time}, which is more than 5 minutes before end of day.")
-        day = day - timedelta(days=1)
-    else:
-        raise RuntimeError(f"Could not find complete data in the last {max_attempts} days")
-
-    try:
-        # Get the latest release data
-        print("Downloading latest ADS-B release...")
-        df_base, start_date_str = get_latest_aircraft_adsb_csv_df()
-        # Combine with historical data
-        print("Combining with historical data...")
-        df_combined = concat_compressed_dfs(df_base, df_new)
-    except Exception as e:
-        print(f"Error downloading latest ADS-B release: {e}")
-        df_combined = df_new
-        start_date_str = date_str
-
-    # Sort by time for consistent ordering
-    df_combined = df_combined.sort('time')
-    
-    # Convert any list columns to strings for CSV compatibility
-    for col in df_combined.columns:
-        if df_combined[col].dtype == pl.List:
-            df_combined = df_combined.with_columns(
-                pl.col(col).list.join(",").alias(col)
-            )
-
-    # Save the result
-    OUT_ROOT = Path("data/planequery_aircraft")
-    OUT_ROOT.mkdir(parents=True, exist_ok=True)
-
-    output_file = OUT_ROOT / f"planequery_aircraft_adsb_{start_date_str}_{date_str}.csv"
-    df_combined.write_csv(output_file)
-
-    print(f"Saved: {output_file}")
-    print(f"Total aircraft: {df_combined.height}")
@@ -1,33 +0,0 @@
-from pathlib import Path
-from datetime import datetime, timezone
-date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
-
-out_dir = Path("data/faa_releasable")
-out_dir.mkdir(parents=True, exist_ok=True)
-zip_name = f"ReleasableAircraft_{date_str}.zip"
-
-zip_path = out_dir / zip_name
-if not zip_path.exists():
-    # URL and paths
-    url = "https://registry.faa.gov/database/ReleasableAircraft.zip"
-    from urllib.request import Request, urlopen
-
-    req = Request(
-        url,
-        headers={"User-Agent": "Mozilla/5.0"},
-        method="GET",
-    )
-
-    with urlopen(req, timeout=120) as r:
-        body = r.read()
-        zip_path.write_bytes(body)
-
-OUT_ROOT = Path("data/planequery_aircraft")
-OUT_ROOT.mkdir(parents=True, exist_ok=True)
-from derive_from_faa_master_txt import convert_faa_master_txt_to_df, concat_faa_historical_df
-from get_latest_planequery_aircraft_release import get_latest_aircraft_faa_csv_df
-df_new = convert_faa_master_txt_to_df(zip_path, date_str)
-df_base, start_date_str = get_latest_aircraft_faa_csv_df()
-df_base = concat_faa_historical_df(df_base, df_new)
-assert df_base['download_date'].is_monotonic_increasing, "download_date is not monotonic increasing"
-df_base.to_csv(OUT_ROOT / f"planequery_aircraft_faa_{start_date_str}_{date_str}.csv", index=False)
@@ -29,8 +29,8 @@ def convert_faa_master_txt_to_df(zip_path: Path, date: str):
    certification = pd.json_normalize(df["certification"].where(df["certification"].notna(), {})).add_prefix("certificate_")
    df = df.drop(columns="certification").join(certification)
    
-    # Create planequery_airframe_id
-    df["planequery_airframe_id"] = (
+    # Create openairframes_id
+    df["openairframes_id"] = (
        normalize(df["aircraft_manufacturer"])
        + "|"
        + normalize(df["aircraft_model"])
@@ -38,15 +38,18 @@ def convert_faa_master_txt_to_df(zip_path: Path, date: str):
        + normalize(df["serial_number"])
    )
    
-    # Move planequery_airframe_id to come after registration_number
+    # Move openairframes_id to come after registration_number
    cols = df.columns.tolist()
-    cols.remove("planequery_airframe_id")
+    cols.remove("openairframes_id")
    reg_idx = cols.index("registration_number")
-    cols.insert(reg_idx + 1, "planequery_airframe_id")
+    cols.insert(reg_idx + 1, "openairframes_id")
    df = df[cols]
    
    # Convert all NaN to empty strings
    df = df.fillna("")
+    # The FAA parser can produce the literal string "None" for missing values;
+    # replace those so they match the empty-string convention used everywhere else.
+    df = df.replace("None", "")
    
    return df

@@ -84,8 +87,8 @@ def concat_faa_historical_df(df_base, df_new):
            # Convert to string
            val_str = str(val).strip()
            
-            # Handle empty strings
-            if val_str == "" or val_str == "nan":
+            # Handle empty strings and null-like literals
+            if val_str == "" or val_str == "nan" or val_str == "None":
                return ""
            
            # Check if it looks like a list representation (starts with [ )
@@ -9,7 +9,7 @@ import urllib.error
 import json


-REPO = "PlaneQuery/planequery-aircraft"
+REPO = "PlaneQuery/openairframes"
 LATEST_RELEASE_URL = f"https://api.github.com/repos/{REPO}/releases/latest"


@@ -27,18 +27,23 @@ def _http_get_json(url: str, headers: dict[str, str]) -> dict:
    return json.loads(data.decode("utf-8"))


-def get_latest_release_assets(repo: str = REPO, github_token: Optional[str] = None) -> list[ReleaseAsset]:
-    url = f"https://api.github.com/repos/{repo}/releases/latest"
+def get_releases(repo: str = REPO, github_token: Optional[str] = None, per_page: int = 30) -> list[dict]:
+    """Get a list of releases from the repository."""
+    url = f"https://api.github.com/repos/{repo}/releases?per_page={per_page}"
    headers = {
        "Accept": "application/vnd.github+json",
-        "User-Agent": "planequery-aircraft-downloader/1.0",
+        "User-Agent": "openairframes-downloader/1.0",
    }
    if github_token:
        headers["Authorization"] = f"Bearer {github_token}"

-    payload = _http_get_json(url, headers=headers)
+    return _http_get_json(url, headers=headers)
+
+
+def get_release_assets_from_release_data(release_data: dict) -> list[ReleaseAsset]:
+    """Extract assets from a release data dictionary."""
    assets = []
-    for a in payload.get("assets", []):
+    for a in release_data.get("assets", []):
        assets.append(
            ReleaseAsset(
                name=a["name"],
@@ -49,6 +54,19 @@ def get_latest_release_assets(repo: str = REPO, github_token: Optional[str] = No
    return assets


+def get_latest_release_assets(repo: str = REPO, github_token: Optional[str] = None) -> list[ReleaseAsset]:
+    """Return the assets attached to the latest release of ``repo``.
+
+    Args:
+        repo: Repository in ``owner/name`` form.
+        github_token: Optional token, sent as a ``Bearer`` Authorization
+            header when provided.
+
+    Returns:
+        The assets extracted from the ``releases/latest`` API payload by
+        ``get_release_assets_from_release_data``.
+    """
+    request_headers = {
+        "Accept": "application/vnd.github+json",
+        "User-Agent": "openairframes-downloader/1.0",
+    }
+    if github_token:
+        request_headers["Authorization"] = f"Bearer {github_token}"
+
+    latest_release = _http_get_json(
+        f"https://api.github.com/repos/{repo}/releases/latest",
+        headers=request_headers,
+    )
+    return get_release_assets_from_release_data(latest_release)
+
+
 def pick_asset(
    assets: Iterable[ReleaseAsset],
    *,
@@ -80,7 +98,7 @@ def download_asset(asset: ReleaseAsset, out_path: Path, github_token: Optional[s
    out_path.parent.mkdir(parents=True, exist_ok=True)

    headers = {
-        "User-Agent": "planequery-aircraft-downloader/1.0",
+        "User-Agent": "openairframes-downloader/1.0",
        "Accept": "application/octet-stream",
    }
    if github_token:
@@ -109,7 +127,7 @@ def download_latest_aircraft_csv(
    repo: str = REPO,
 ) -> Path:
    """
-    Download the latest planequery_aircraft_faa_*.csv file from the latest GitHub release.
+    Download the latest openairframes_faa_*.csv file from the latest GitHub release.

    Args:
        output_dir: Directory to save the downloaded file (default: "downloads")
@@ -119,12 +137,13 @@ def download_latest_aircraft_csv(
    Returns:
        Path to the downloaded file
    """
+    output_dir = Path(output_dir)
    assets = get_latest_release_assets(repo, github_token=github_token)
    try:
-        asset = pick_asset(assets, name_regex=r"^planequery_aircraft_faa_.*\.csv$")
+        asset = pick_asset(assets, name_regex=r"^openairframes_faa_.*\.csv$")
    except FileNotFoundError:
        # Fallback to old naming pattern
-        asset = pick_asset(assets, name_regex=r"^planequery_aircraft_\d{4}-\d{2}-\d{2}_.*\.csv$")
+        asset = pick_asset(assets, name_regex=r"^openairframes_\d{4}-\d{2}-\d{2}_.*\.csv$")
    saved_to = download_asset(asset, output_dir / asset.name, github_token=github_token)
    print(f"Downloaded: {asset.name} ({asset.size} bytes) -> {saved_to}")
    return saved_to
@@ -136,11 +155,11 @@ def get_latest_aircraft_faa_csv_df():
           'unique_regulatory_id': str, 
           'registrant_county': str})
    df = df.fillna("")
-    # Extract start date from filename pattern: planequery_aircraft_faa_{start_date}_{end_date}.csv
-    match = re.search(r"planequery_aircraft_faa_(\d{4}-\d{2}-\d{2})_", str(csv_path))
+    # Extract start date from filename pattern: openairframes_faa_{start_date}_{end_date}.csv
+    match = re.search(r"openairframes_faa_(\d{4}-\d{2}-\d{2})_", str(csv_path))
    if not match:
-        # Fallback to old naming pattern: planequery_aircraft_{start_date}_{end_date}.csv
-        match = re.search(r"planequery_aircraft_(\d{4}-\d{2}-\d{2})_", str(csv_path))
+        # Fallback to old naming pattern: openairframes_{start_date}_{end_date}.csv
+        match = re.search(r"openairframes_(\d{4}-\d{2}-\d{2})_", str(csv_path))
    if not match:
        raise ValueError(f"Could not extract date from filename: {csv_path.name}")
    
@@ -154,7 +173,8 @@ def download_latest_aircraft_adsb_csv(
    repo: str = REPO,
 ) -> Path:
    """
-    Download the latest planequery_aircraft_adsb_*.csv file from the latest GitHub release.
+    Download the latest openairframes_adsb_*.csv file from GitHub releases.
+    If the latest release doesn't have the file, searches previous releases.

    Args:
        output_dir: Directory to save the downloaded file (default: "downloads")
@@ -164,26 +184,70 @@ def download_latest_aircraft_adsb_csv(
    Returns:
        Path to the downloaded file
    """
-    assets = get_latest_release_assets(repo, github_token=github_token)
-    asset = pick_asset(assets, name_regex=r"^planequery_aircraft_adsb_.*\.csv$")
-    saved_to = download_asset(asset, output_dir / asset.name, github_token=github_token)
-    print(f"Downloaded: {asset.name} ({asset.size} bytes) -> {saved_to}")
-    return saved_to
-
-
-def get_latest_aircraft_adsb_csv_df():
-    csv_path = download_latest_aircraft_adsb_csv()
-    import pandas as pd
-    df = pd.read_csv(csv_path)
-    df = df.fillna("")
-    # Extract start date from filename pattern: planequery_aircraft_adsb_{start_date}_{end_date}.csv
-    match = re.search(r"planequery_aircraft_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path))
-    if not match:
-        raise ValueError(f"Could not extract date from filename: {csv_path.name}")
+    output_dir = Path(output_dir)
    
-    date_str = match.group(1)
-    return df, date_str
+    # Get multiple releases
+    releases = get_releases(repo, github_token=github_token, per_page=30)
+    
+    # Try each release until we find one with the matching asset
+    for release in releases:
+        assets = get_release_assets_from_release_data(release)
+        try:
+            asset = pick_asset(assets, name_regex=r"^openairframes_adsb_.*\.csv(\.gz)?$")
+            saved_to = download_asset(asset, output_dir / asset.name, github_token=github_token)
+            print(f"Downloaded: {asset.name} ({asset.size} bytes) -> {saved_to}")
+            return saved_to
+        except FileNotFoundError:
+            # This release doesn't have the matching asset, try the next one
+            continue
+    
+    raise FileNotFoundError(
+        f"No release in the last 30 releases has an asset matching 'openairframes_adsb_.*\\.csv(\\.gz)?$'"
+    )
+
+import polars as pl
+def get_latest_aircraft_adsb_csv_df():
+    """Download the latest ADS-B CSV from GitHub releases and load it with polars.
+
+    The file is fetched with ``download_latest_aircraft_adsb_csv()``. Its name
+    is expected to follow ``openairframes_adsb_{start_date}_{end_date}.csv``
+    (optionally followed by ``.gz``); the two dates are taken from that name.
+
+    Returns:
+        tuple: (df, start_date, end_date) where dates are in YYYY-MM-DD format
+
+    Raises:
+        ValueError: if the start/end dates cannot be extracted from the
+            downloaded file's name.
+    """
+    import re
+
+    csv_path = download_latest_aircraft_adsb_csv()
+
+    # Check the filename first so a badly named asset fails before the
+    # CSV is parsed and transformed.
+    match = re.search(
+        r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})\.csv",
+        str(csv_path),
+    )
+    if not match:
+        raise ValueError(f"Could not extract dates from filename: {csv_path.name}")
+    start_date, end_date = match.group(1), match.group(2)
+
+    df = pl.read_csv(csv_path, null_values=[""])
+
+    # Parse time column: values like "2025-12-31T00:00:00.040" or "2025-05-11T15:15:50.540+0000"
+    # Try with timezone first (convert to naive), then without timezone.
+    # strict=False turns non-matching values into nulls, which lets the
+    # second format fill in the rows the first one could not parse.
+    df = df.with_columns(
+        pl.col("time").str.strptime(pl.Datetime("ms"), "%Y-%m-%dT%H:%M:%S%.f%z", strict=False)
+            .dt.replace_time_zone(None)  # Convert to naive datetime first
+            .fill_null(pl.col("time").str.strptime(pl.Datetime("ms"), "%Y-%m-%dT%H:%M:%S%.f", strict=False))
+    )
+
+    # Cast dbFlags and year to strings to match the schema used in compress functions.
+    # Only columns actually present in this file are cast.
+    cast_to_str = [name for name in ("dbFlags", "year") if name in df.columns]
+    if cast_to_str:
+        df = df.with_columns(pl.col(cast_to_str).cast(pl.Utf8))
+
+    # Fill nulls with empty strings for every string column in a single pass
+    # (dtype selection; the parsed "time" column is a Datetime and is untouched).
+    df = df.with_columns(pl.col(pl.Utf8).fill_null(""))
+
+    return df, start_date, end_date
+


 if __name__ == "__main__":
    download_latest_aircraft_csv()
+    download_latest_aircraft_adsb_csv()
@@ -1,90 +0,0 @@
-"""
-Generate Step Functions input and start the pipeline.
-
-Usage:
-  python trigger_pipeline.py 2024-01-01 2025-01-01
-  python trigger_pipeline.py 2024-01-01 2025-01-01 --chunk-days 30
-  python trigger_pipeline.py 2024-01-01 2025-01-01 --dry-run
-"""
-import argparse
-import json
-import os
-import uuid
-from datetime import datetime, timedelta
-
-import boto3
-
-
-def generate_chunks(start_date: str, end_date: str, chunk_days: int = 1):
-    """Split a date range into chunks of chunk_days."""
-    start = datetime.strptime(start_date, "%Y-%m-%d")
-    end = datetime.strptime(end_date, "%Y-%m-%d")
-
-    chunks = []
-    current = start
-    while current < end:
-        chunk_end = min(current + timedelta(days=chunk_days), end)
-        chunks.append({
-            "start_date": current.strftime("%Y-%m-%d"),
-            "end_date": chunk_end.strftime("%Y-%m-%d"),
-        })
-        current = chunk_end
-
-    return chunks
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Trigger ADS-B map-reduce pipeline")
-    parser.add_argument("start_date", help="Start date (YYYY-MM-DD, inclusive)")
-    parser.add_argument("end_date", help="End date (YYYY-MM-DD, exclusive)")
-    parser.add_argument("--chunk-days", type=int, default=1,
-                        help="Days per chunk (default: 1)")
-    parser.add_argument("--dry-run", action="store_true",
-                        help="Print input JSON without starting execution")
-    args = parser.parse_args()
-
-    run_id = f"run-{datetime.utcnow().strftime('%Y%m%dT%H%M%S')}-{uuid.uuid4().hex[:8]}"
-    chunks = generate_chunks(args.start_date, args.end_date, args.chunk_days)
-
-    # Inject run_id into each chunk
-    for chunk in chunks:
-        chunk["run_id"] = run_id
-
-    sfn_input = {
-        "run_id": run_id,
-        "global_start_date": args.start_date,
-        "global_end_date": args.end_date,
-        "chunks": chunks,
-    }
-
-    print(f"Run ID:    {run_id}")
-    print(f"Chunks:    {len(chunks)} (at {args.chunk_days} days each)")
-    print(f"Max concurrency: 3 (enforced by Step Functions Map state)")
-    print()
-    print(json.dumps(sfn_input, indent=2))
-
-    if args.dry_run:
-        print("\n--dry-run: not starting execution")
-        return
-
-    client = boto3.client("stepfunctions")
-
-    # Find the state machine ARN
-    machines = client.list_state_machines()["stateMachines"]
-    arn = next(
-        m["stateMachineArn"]
-        for m in machines
-        if m["name"] == "adsb-map-reduce"
-    )
-
-    response = client.start_execution(
-        stateMachineArn=arn,
-        name=run_id,
-        input=json.dumps(sfn_input),
-    )
-
-    print(f"\nStarted execution: {response['executionArn']}")
-
-
-if __name__ == "__main__":
-    main()
Author	SHA1	Message	Date
JG	2829e5fb6e	Merge pull request #35 from PlaneQuery/develop update readme.md	2026-03-18 14:31:29 -04:00
ggman12	9c744b0baf	update readme.md	2026-03-18 14:29:13 -04:00
JG	ebda04767f	Merge pull request #34 from PlaneQuery/develop Develop to main: theairtraffic google sheet	2026-03-10 05:12:11 -04:00
ggman12	3fdf443894	add russia_ukraine	2026-03-10 05:08:19 -04:00
ggman12	24313603c5	works	2026-03-10 05:08:19 -04:00
JG	2bb0a5eac3	Merge pull request #33 from PlaneQuery/develop Develop to Main: Handle ADSB when ADSB.lol has not released any data for day. Just rerelease latest adsb	2026-02-26 15:32:59 -05:00
ggman12	b54f33aa56	Handle ADSB when ADSB.lol has not released any data for day. Just rerelease latest adsb	2026-02-26 15:31:47 -05:00
JG	2dda3d341c	Merge pull request #32 from PlaneQuery/develop Develop to Main: Fix Community Submission export. Fix CSV concatenation logic to prevent duplicates when there is no new ADSB.lol data.	2026-02-24 15:37:54 -05:00
ggman12	b0526f0a95	Fix Community Submission export. Fix CSV concatenation logic to prevent duplicates when there is no new ADSB.lol data.	2026-02-24 15:36:10 -05:00
JG	4b6a043a9d	Merge pull request #31 from PlaneQuery/develop Develop to Main Fix adsb asset retrival to be more fault tolerant. Fix download issue	2026-02-24 02:17:08 -05:00
ggman12	55c464aad7	Fix adsb asset retrival to be more fault tolerant. Fix download issue for 2024-07-03	2026-02-24 02:12:55 -05:00
ggman12	aa509e8560	attempt to fix download issue for 2024-07-03	2026-02-19 17:51:49 -05:00
ggman12	82d11d8d24	try less strict tar extract for 2025-10-15 and other days that fail	2026-02-19 00:20:03 -05:00
ggman12	76a217ad14	src/contributions/approve_submission.py handle big json files	2026-02-18 23:18:19 -05:00
ggman12	ec2d1a1291	update download.sh	2026-02-18 23:18:19 -05:00
ggman12	97284c69a9	verify downlaod asssets	2026-02-18 23:18:19 -05:00
JG	892ffa78af	Merge pull request #28 from PlaneQuery/community-submission-27 Community submission: ggman12_2026-02-18_5ddbb8bd.json	2026-02-18 17:18:49 -05:00
github-actions[bot]	f77a91db2c	Update schema with new tags: manufacturer_icao, manufacturer_name, model, type_code, serial_number, icao_aircraft_type, operator, operator_callsign, operator_icao, citation_0	2026-02-18 22:18:12 +00:00
github-actions[bot]	b3bd654998	Add community submission from @ggman12 (closes #27 )	2026-02-18 22:18:12 +00:00
ggman12	302be8b8dc	update checker for arrays issue	2026-02-18 17:11:14 -05:00
ggman12	b61dc0f5e5	provide more error	2026-02-18 17:08:43 -05:00
ggman12	1ff17cc6a8	allow adsb to fail for when adsb.lol hasen't uploaded file yet.	2026-02-18 16:49:02 -05:00
ggman12	d216ea9329	Daily ADSB and Histoircal updates. Update readme.md	2026-02-18 16:34:06 -05:00
ggman12	4015a5fcf1	OpenAirframes 1.0	2026-02-13 11:37:31 -05:00