Compare commits

...

64 Commits

Author SHA1 Message Date
ggman12 8999a943a9 update histoircal 2026-02-13 00:12:18 -05:00
ggman12 74625b9bc9 split large file into chuncks 2026-02-12 20:22:36 -05:00
ggman12 f2728d6156 delete aws 2026-02-12 20:13:40 -05:00
ggman12 5ed10ec42e update 2026-02-12 19:32:34 -05:00
ggman12 3b8a14a4b9 add ability for custom run input date 2026-02-12 19:09:35 -05:00
ggman12 e5f124428f use github token for adsb.lol downlaods 2026-02-12 19:03:23 -05:00
ggman12 d5039fb766 update to fix files 2026-02-12 19:01:02 -05:00
ggman12 50267f3c57 make faa work with no new data 2026-02-12 17:26:48 -05:00
ggman12 dd323f6e55 delete old files 2026-02-12 17:25:50 -05:00
ggman12 0e8b21daf9 rename from planequery to openairframes 2026-02-12 17:24:08 -05:00
ggman12 3960e6936c use start_date_end_date for adsb naming 2026-02-12 17:13:06 -05:00
ggman12 48623ef79e delete existign release 2026-02-12 17:12:09 -05:00
ggman12 5affe8937c rename to openairframes 2026-02-12 17:09:07 -05:00
ggman12 d0254146f3 update release to fix not grabbing FAA file 2026-02-12 16:42:47 -05:00
ggman12 1699ad6d8a rename file 2026-02-12 16:12:03 -05:00
ggman12 2a6892c347 fix download 2026-02-12 16:08:08 -05:00
ggman12 47ccecb9ba set fail-fast to true 2026-02-12 16:07:42 -05:00
ggman12 2826dfd450 remove notebook 2026-02-12 16:07:28 -05:00
ggman12 fecf9ff0ea format properly 2026-02-12 16:01:14 -05:00
ggman12 7e0a396fc7 only modify key parts of schemas/community_submission.v1.schema.json schema. Lowest diffs 2026-02-12 15:55:44 -05:00
ggman12 b0503bb3b2 fix: should update schema now 2026-02-12 15:46:11 -05:00
ggman12 0b89138daf modify existing json schema instead of creating a new file every time 2026-02-12 15:40:01 -05:00
ggman12 4b756cdaef fix syntax error 2026-02-12 15:32:37 -05:00
ggman12 9acffe1e56 handle multiple PRs with schema changes 2026-02-12 15:31:53 -05:00
ggman12 1694fe0b46 allow fileupload in submission 2026-02-12 15:26:45 -05:00
ggman12 c6d9e59d01 update template 2026-02-12 13:29:45 -05:00
ggman12 dd6cd7b6fd update schema with optional start_date and end_date scope 2026-02-12 13:28:43 -05:00
ggman12 f543b671f8 updating schema 2026-02-12 13:22:56 -05:00
ggman12 efb4cbb953 update example 2026-02-12 13:22:43 -05:00
ggman12 5578133a99 update schema to be uppercase only 2026-02-12 12:36:50 -05:00
ggman12 eace7d5a63 update folder 2026-02-12 12:34:27 -05:00
ggman12 82f47b662c make blank username work 2026-02-12 12:32:41 -05:00
ggman12 787796c3ab update approve_submission 2026-02-12 12:26:54 -05:00
ggman12 61aae586ee fix approve 2026-02-12 12:18:28 -05:00
ggman12 5abfa6b226 update submission validation 2026-02-12 12:15:04 -05:00
ggman12 a743b74ae5 Merge branch 'develop' 2026-02-12 12:10:24 -05:00
ggman12 53a020ab73 add jsonschema to requirements.txt 2026-02-12 12:09:03 -05:00
ggman12 2de41c9883 update historical. To check tar and fail fast if any maps fail 2026-02-12 12:01:13 -05:00
ggman12 bccc634158 remove existing release 2026-02-12 11:50:45 -05:00
ggman12 43b07942b0 add needed permissions 2026-02-12 11:42:49 -05:00
ggman12 2c9e994a12 add debug for FAA 2026-02-12 11:06:38 -05:00
ggman12 99b680476a delete parquet chunck after load to not use so much space for big historical run 2026-02-12 10:52:42 -05:00
JG f9e04337ae Merge pull request #5 from PlaneQuery/develop
FIX: trigger for planequery-aircraft daily release workflow. Update contributions issue template.
2026-02-12 10:42:47 -05:00
ggman12 1348e1f3a0 Merge branch 'main' into develop 2026-02-12 10:41:26 -05:00
ggman12 b349c01d31 FIX: trigger for planequery-aircraft daily release workflow. Update contributions issue template. 2026-02-12 10:26:05 -05:00
JG a98175bc6c Merge pull request #3 from PlaneQuery/develop
Develop to main new historical adsb workflow. Community Submission updates.
2026-02-11 23:42:40 -05:00
ggman12 953a3647df remove process historical-faa github workflow 2026-02-11 23:41:42 -05:00
ggman12 e5c99b611c make a histoircla runner for adsb 2026-02-11 23:41:42 -05:00
ggman12 4e803dbb45 remove confirmations 2026-02-11 23:41:42 -05:00
JG 59c2aab5c7 Merge pull request #2 from PlaneQuery/develop
develop to main FEATURE: Add contributions framework. Fix and improve daily adsb release
2026-02-11 23:24:01 -05:00
ggman12 722bcdf791 FEATURE: Add contributions framework. Fix and improve daily adsb release using Github actions for map reduce. 2026-02-11 23:22:46 -05:00
ggman12 27da93801e FEATURE: add historical adsb aircraft data and update daily adsb aircraft data derivation.
add clickhouse_connect

use 32GB

update to no longer do df.copy()

Add planequery_adsb_read.ipynb

INCREASE: update Fargate task definition to 16 vCPU and 64 GB memory for improved performance on large datasets

update notebook

remove print(df)

Ensure empty strings are preserved in DataFrame columns

check if day has data for adsb

update notebook
2026-02-11 13:58:56 -05:00
JG b94bfdc575 Merge pull request #1 from PlaneQuery/import/af-klm-fleet
af-klm-fleet from iclems
2026-02-04 17:51:46 -05:00
ggman12 c90bdada76 delete air-france folder 2026-02-04 17:49:25 -05:00
ggman12 921cbefb6e Add 'af-klm-fleet/' from commit 'b1dd01c27eccc8ba620994b6ae0df78a37075f3a'
git-subtree-dir: af-klm-fleet
git-subtree-mainline: 85a3db4dd0
git-subtree-split: b1dd01c27e
2026-02-04 17:47:47 -05:00
Clément Wehrung b1dd01c27e Fix API rate limit risk 2026-02-04 23:27:15 +01:00
Clément Wehrung 2282e1197f Auto-update fleet data - 2026-02-04 2026-02-04 23:25:27 +01:00
Clément Wehrung ea9c095f91 Fix generate README / License 2026-02-04 23:25:18 +01:00
Clément Wehrung 4eb2b9ce0b Auto-update fleet data - 2026-02-04 2026-02-04 23:21:27 +01:00
Clément Wehrung 23ef72100f Merge branch 'main' of github.com:iclems/af-klm-fleet 2026-02-04 23:20:04 +01:00
Clément Wehrung bfb22670ba Added cron update 2026-02-04 23:19:02 +01:00
Clem c7a3d9e056 Update README.md 2026-02-04 23:07:47 +01:00
Clément Wehrung 0d683d3510 Initial fleet data: Air France (220) + KLM (117) aircraft 2026-02-04 23:03:47 +01:00
Clem 8f11a1d05a Initial commit 2026-02-04 23:00:48 +01:00
54 changed files with 22441 additions and 322 deletions
@@ -0,0 +1,82 @@
name: Community submission (JSON)
description: Submit one or more community records (JSON) to be reviewed and approved.
title: "Community submission: "
labels:
- community
- submission
body:
- type: markdown
attributes:
value: |
Submit **one object** or an **array of objects** that matches the community submission schema.
**Rules (enforced on review/automation):**
- Each object must include **at least one** of:
- `registration_number`
- `transponder_code_hex` (6 uppercase hex chars, e.g., `ABC123`)
- `openairframes_id`
- Your contributor name (entered below) will be applied to all objects.
- `contributor_uuid` is derived from your GitHub account automatically.
- `creation_timestamp` is created by the system (you may omit it).
**Optional date scoping:**
- `start_date` - When the tags become valid (ISO 8601: `YYYY-MM-DD`)
- `end_date` - When the tags stop being valid (ISO 8601: `YYYY-MM-DD`)
**Example: single object**
```json
{
"registration_number": "N12345",
"tags": {"owner": "John Doe"},
"start_date": "2025-01-01"
}
```
**Example: multiple objects (array)**
```json
[
{
"registration_number": "N12345",
"tags": {"internet": "starlink"},
"start_date": "2025-05-01"
},
{
"registration_number": "N12345",
"tags": {"owner": "John Doe"},
"start_date": "2025-01-01",
"end_date": "2025-07-20"
},
{
"transponder_code_hex": "ABC123",
"tags": {"internet": "viasat", "owner": "John Doe"}
}
]
```
- type: input
id: contributor_name
attributes:
label: Contributor Name
description: Your display name for attribution. Leave blank for no attribution. Max 150 characters.
placeholder: "e.g., JamesBerry.com or leave blank"
validations:
required: false
- type: textarea
id: submission_json
attributes:
label: Submission JSON
description: |
Paste JSON directly, OR drag-and-drop a .json file here.
Must be valid JSON. Do not include contributor_name or contributor_uuid.
placeholder: |
Paste JSON here, or drag-and-drop a .json file...
validations:
required: true
- type: textarea
id: notes
attributes:
label: Notes (optional)
description: Any context, sources, or links that help validate your submission.
validations:
required: false
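The rules spelled out in the form body above (at least one identifier per object, optional `start_date`/`end_date` scoping) are enforced automatically: the validation workflow later in this diff installs `jsonschema` and runs `src.contributions.validate_submission` against the issue body. That module is not shown here, so the following is only a minimal, hypothetical Python sketch of that kind of check; the schema path matches the repository's `schemas/community_submission.v1.schema.json`, but the function name and the per-record handling are assumptions.

```python
# Hypothetical sketch, not the repository's validate_submission module.
import json
from jsonschema import Draft202012Validator  # installed in the workflows via `pip install jsonschema`

IDENTIFIERS = {"registration_number", "transponder_code_hex", "openairframes_id"}

def check_submission(raw_text, schema_path="schemas/community_submission.v1.schema.json"):
    """Return a list of problems for a pasted submission (single object or array)."""
    with open(schema_path) as f:
        validator = Draft202012Validator(json.load(f))
    data = json.loads(raw_text)
    records = data if isinstance(data, list) else [data]
    problems = []
    for i, record in enumerate(records):
        if not isinstance(record, dict) or not IDENTIFIERS & record.keys():
            problems.append(f"record {i}: needs at least one of {sorted(IDENTIFIERS)}")
            continue
        problems.extend(f"record {i}: {err.message}" for err in validator.iter_errors(record))
    return problems

if __name__ == "__main__":
    print(check_submission('{"registration_number": "N12345", "tags": {"owner": "John Doe"}}'))
```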
@@ -0,0 +1,47 @@
name: Approve Community Submission
on:
issues:
types: [labeled]
permissions:
contents: write
pull-requests: write
issues: write
jobs:
approve:
if: github.event.label.name == 'approved' && contains(github.event.issue.labels.*.name, 'validated')
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: "3.12"
- name: Install dependencies
run: pip install jsonschema
- name: Get issue author ID
id: author
uses: actions/github-script@v7
with:
script: |
const issue = context.payload.issue;
core.setOutput('username', issue.user.login);
core.setOutput('user_id', issue.user.id);
- name: Process and create PR
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_REPOSITORY: ${{ github.repository }}
ISSUE_BODY: ${{ github.event.issue.body }}
run: |
python -m src.contributions.approve_submission \
--issue-number ${{ github.event.issue.number }} \
--issue-body "$ISSUE_BODY" \
--author "${{ steps.author.outputs.username }}" \
--author-id ${{ steps.author.outputs.user_id }}
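The step above defines the full CLI contract of the approval script: issue number, raw issue body, author login (possibly blank), and numeric author ID, with `GITHUB_TOKEN` and `GITHUB_REPOSITORY` available in the environment. The actual `src/contributions/approve_submission.py` is not part of this diff; the skeleton below is only a hypothetical illustration of that interface.

```python
# Hypothetical skeleton of the interface used by the "Process and create PR" step;
# the real approve_submission module is not shown in this diff.
import argparse
import os

def main():
    parser = argparse.ArgumentParser(description="Turn an approved submission issue into a pull request")
    parser.add_argument("--issue-number", type=int, required=True)
    parser.add_argument("--issue-body", required=True)
    parser.add_argument("--author", default="")        # display name may be blank (no attribution)
    parser.add_argument("--author-id", type=int, required=True)
    args = parser.parse_args()

    token = os.environ["GITHUB_TOKEN"]                 # provided by the workflow
    repo = os.environ["GITHUB_REPOSITORY"]
    # ... parse the submission JSON out of the issue body, write it under community/,
    # and open a pull request labeled `community` ...
    print(f"would open a PR in {repo} for issue #{args.issue_number} (author: {args.author or 'anonymous'})")

if __name__ == "__main__":
    main()
```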
+268
@@ -0,0 +1,268 @@
name: Historical ADS-B Processing
on:
workflow_dispatch:
inputs:
start_date:
description: 'Start date (YYYY-MM-DD, inclusive)'
required: true
type: string
end_date:
description: 'End date (YYYY-MM-DD, exclusive)'
required: true
type: string
chunk_days:
description: 'Days per job chunk (default: 7)'
required: false
type: number
default: 7
jobs:
generate-matrix:
runs-on: ubuntu-latest
outputs:
chunks: ${{ steps.generate.outputs.chunks }}
global_start: ${{ inputs.start_date }}
global_end: ${{ inputs.end_date }}
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.12'
- name: Generate date chunks
id: generate
env:
INPUT_START_DATE: ${{ inputs.start_date }}
INPUT_END_DATE: ${{ inputs.end_date }}
INPUT_CHUNK_DAYS: ${{ inputs.chunk_days }}
run: python src/adsb/historical_generate_matrix.py
adsb-extract:
needs: generate-matrix
runs-on: ubuntu-24.04-arm
strategy:
matrix:
chunk: ${{ fromJson(needs.generate-matrix.outputs.chunks) }}
max-parallel: 3
fail-fast: true
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.12'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Free disk space
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf /usr/local/share/boost
df -h
- name: Download and extract ADS-B data
env:
START_DATE: ${{ matrix.chunk.start_date }}
END_DATE: ${{ matrix.chunk.end_date }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
python -m src.adsb.download_and_list_icaos --start-date "$START_DATE" --end-date "$END_DATE"
ls -lah data/output/
- name: Create tar of extracted data and split into chunks
run: |
cd data/output
echo "=== Disk space before tar ==="
df -h .
echo "=== Files to tar ==="
ls -lah *-planes-readsb-prod-0.tar_0 icao_manifest_*.txt 2>/dev/null || echo "No files found"
# Create tar with explicit error checking
if ls *-planes-readsb-prod-0.tar_0 1>/dev/null 2>&1; then
tar -cvf extracted_data.tar *-planes-readsb-prod-0.tar_0 icao_manifest_*.txt
echo "=== Tar file created ==="
ls -lah extracted_data.tar
# Verify tar integrity
tar -tf extracted_data.tar > /dev/null && echo "Tar integrity check passed" || { echo "Tar integrity check FAILED"; exit 1; }
# Create checksum of the FULL tar before splitting (for verification after reassembly)
echo "=== Creating checksum of full tar ==="
sha256sum extracted_data.tar > full_tar.sha256
cat full_tar.sha256
# Split into 500MB chunks to avoid artifact upload issues
echo "=== Splitting tar into 500MB chunks ==="
mkdir -p tar_chunks
split -b 500M extracted_data.tar tar_chunks/extracted_data.tar.part_
rm extracted_data.tar
mv full_tar.sha256 tar_chunks/
echo "=== Chunks created ==="
ls -lah tar_chunks/
else
echo "ERROR: No extracted directories found, cannot create tar"
exit 1
fi
- name: Upload extracted data chunks
uses: actions/upload-artifact@v4
with:
name: adsb-extracted-${{ matrix.chunk.start_date }}-${{ matrix.chunk.end_date }}
path: data/output/tar_chunks/
retention-days: 1
compression-level: 0
if-no-files-found: warn
adsb-map:
needs: [generate-matrix, adsb-extract]
runs-on: ubuntu-24.04-arm
strategy:
fail-fast: true
matrix:
chunk: ${{ fromJson(needs.generate-matrix.outputs.chunks) }}
icao_chunk: [0, 1, 2, 3]
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.12'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Free disk space
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf /usr/local/share/boost
df -h
- name: Download extracted data
uses: actions/download-artifact@v4
with:
name: adsb-extracted-${{ matrix.chunk.start_date }}-${{ matrix.chunk.end_date }}
path: data/output/tar_chunks/
- name: Reassemble and extract tar
id: extract
run: |
cd data/output
if [ -d tar_chunks ] && ls tar_chunks/extracted_data.tar.part_* 1>/dev/null 2>&1; then
echo "=== Chunk files info ==="
ls -lah tar_chunks/
cd tar_chunks
# Reassemble tar with explicit sorting
echo "=== Reassembling tar file ==="
ls -1 extracted_data.tar.part_?? | sort | while read part; do
echo "Appending $part..."
cat "$part" >> ../extracted_data.tar
done
cd ..
echo "=== Reassembled tar file info ==="
ls -lah extracted_data.tar
# Verify checksum of reassembled tar matches original
echo "=== Verifying reassembled tar checksum ==="
echo "Original checksum:"
cat tar_chunks/full_tar.sha256
echo "Reassembled checksum:"
sha256sum extracted_data.tar
sha256sum -c tar_chunks/full_tar.sha256 || { echo "ERROR: Reassembled tar checksum mismatch - data corrupted during transfer"; exit 1; }
echo "Checksum verified - data integrity confirmed"
rm -rf tar_chunks
echo "=== Extracting ==="
tar -xvf extracted_data.tar
rm extracted_data.tar
echo "has_data=true" >> "$GITHUB_OUTPUT"
echo "=== Contents of data/output ==="
ls -lah
else
echo "No tar chunks found"
echo "has_data=false" >> "$GITHUB_OUTPUT"
fi
- name: Process ICAO chunk
if: steps.extract.outputs.has_data == 'true'
env:
START_DATE: ${{ matrix.chunk.start_date }}
END_DATE: ${{ matrix.chunk.end_date }}
run: |
python -m src.adsb.process_icao_chunk --chunk-id ${{ matrix.icao_chunk }} --total-chunks 4 --start-date "$START_DATE" --end-date "$END_DATE"
ls -lah data/output/adsb_chunks/ || echo "No chunks created"
- name: Upload chunk artifacts
if: steps.extract.outputs.has_data == 'true'
uses: actions/upload-artifact@v4
with:
name: adsb-map-${{ matrix.chunk.start_date }}-${{ matrix.chunk.end_date }}-chunk-${{ matrix.icao_chunk }}
path: data/output/adsb_chunks/
retention-days: 1
if-no-files-found: ignore
adsb-reduce:
needs: [generate-matrix, adsb-map]
runs-on: ubuntu-24.04-arm
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.12'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Download all chunk artifacts
uses: actions/download-artifact@v4
with:
pattern: adsb-map-*
path: data/output/adsb_chunks/
merge-multiple: true
- name: Debug downloaded files
run: |
echo "=== Disk space before processing ==="
df -h
echo "=== Listing data/output/adsb_chunks/ ==="
find data/output/adsb_chunks/ -type f 2>/dev/null | wc -l
echo "=== Total parquet size ==="
du -sh data/output/adsb_chunks/ || echo "No chunks dir"
- name: Combine chunks to CSV
env:
START_DATE: ${{ needs.generate-matrix.outputs.global_start }}
END_DATE: ${{ needs.generate-matrix.outputs.global_end }}
run: |
python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks --start-date "$START_DATE" --end-date "$END_DATE" --skip-base --stream
ls -lah data/openairframes/
- name: Upload final artifact
uses: actions/upload-artifact@v4
with:
name: openairframes_adsb-${{ needs.generate-matrix.outputs.global_start }}-${{ needs.generate-matrix.outputs.global_end }}
path: data/openairframes/*.csv
retention-days: 30
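For reference, the `generate-matrix` job above turns the inclusive start date and exclusive end date into a JSON list of `{start_date, end_date}` chunks that drives the extract and map matrices. The actual `src/adsb/historical_generate_matrix.py` is not included in this diff; the sketch below only illustrates what such a chunk generator might look like, using the same `INPUT_*` variables and the `GITHUB_OUTPUT` step-output file.

```python
# Illustrative sketch only; the repository's historical_generate_matrix.py is not shown here.
import json
import os
from datetime import date, timedelta

def make_chunks(start, end, chunk_days):
    """Split [start, end) into consecutive chunks of at most chunk_days days."""
    chunks, cursor = [], start
    while cursor < end:
        chunk_end = min(cursor + timedelta(days=chunk_days), end)
        chunks.append({"start_date": cursor.isoformat(), "end_date": chunk_end.isoformat()})
        cursor = chunk_end
    return chunks

if __name__ == "__main__":
    start = date.fromisoformat(os.environ["INPUT_START_DATE"])
    end = date.fromisoformat(os.environ["INPUT_END_DATE"])
    days = int(os.environ.get("INPUT_CHUNK_DAYS") or 7)
    # Expose the matrix as the `chunks` output consumed by the downstream jobs.
    with open(os.environ["GITHUB_OUTPUT"], "a") as out:
        out.write(f"chunks={json.dumps(make_chunks(start, end, days))}\n")
```

With the default of 7 days per chunk, a long backfill such as the 2023-08-16 to 2026-01-01 period mentioned later in this diff fans out into roughly week-long chunks, processed at most three at a time per the `max-parallel: 3` setting.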
@@ -0,0 +1,395 @@
name: OpenAirframes Daily Release
on:
schedule:
# 6:00am UTC every day - runs on the default branch and triggers releases for both branches
- cron: "0 06 * * *"
workflow_dispatch:
inputs:
date:
description: 'Date to process (YYYY-MM-DD format, default: yesterday)'
required: false
type: string
permissions:
contents: write
actions: write
jobs:
trigger-releases:
runs-on: ubuntu-latest
if: github.event_name == 'schedule'
steps:
- name: Trigger main branch release
uses: actions/github-script@v7
with:
script: |
await github.rest.actions.createWorkflowDispatch({
owner: context.repo.owner,
repo: context.repo.repo,
workflow_id: 'openairframes-daily-release.yaml',
ref: 'main'
});
- name: Trigger develop branch release
uses: actions/github-script@v7
with:
script: |
await github.rest.actions.createWorkflowDispatch({
owner: context.repo.owner,
repo: context.repo.repo,
workflow_id: 'openairframes-daily-release.yaml',
ref: 'develop'
});
build-faa:
runs-on: ubuntu-24.04-arm
if: github.event_name != 'schedule'
steps:
- name: Checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Setup Python
uses: actions/setup-python@v6
with:
python-version: "3.14"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Run FAA release script
run: |
python src/create_daily_faa_release.py ${{ inputs.date && format('--date {0}', inputs.date) || '' }}
ls -lah data/faa_releasable
ls -lah data/openairframes
- name: Upload FAA artifacts
uses: actions/upload-artifact@v4
with:
name: faa-release
path: |
data/openairframes/openairframes_faa_*.csv
data/faa_releasable/ReleasableAircraft_*.zip
retention-days: 1
adsb-extract:
runs-on: ubuntu-24.04-arm
if: github.event_name != 'schedule'
outputs:
manifest-exists: ${{ steps.check.outputs.exists }}
steps:
- name: Checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Setup Python
uses: actions/setup-python@v6
with:
python-version: "3.14"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Download and extract ADS-B data
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
python -m src.adsb.download_and_list_icaos ${{ inputs.date && format('--date {0}', inputs.date) || '' }}
ls -lah data/output/
- name: Check manifest exists
id: check
run: |
if ls data/output/icao_manifest_*.txt 1>/dev/null 2>&1; then
echo "exists=true" >> "$GITHUB_OUTPUT"
else
echo "exists=false" >> "$GITHUB_OUTPUT"
fi
- name: Create tar of extracted data
run: |
cd data/output
tar -cf extracted_data.tar *-planes-readsb-prod-0.tar_0 icao_manifest_*.txt
ls -lah extracted_data.tar
- name: Upload extracted data
uses: actions/upload-artifact@v4
with:
name: adsb-extracted
path: data/output/extracted_data.tar
retention-days: 1
compression-level: 0 # Already compressed trace files
adsb-map:
runs-on: ubuntu-24.04-arm
needs: adsb-extract
if: github.event_name != 'schedule' && needs.adsb-extract.outputs.manifest-exists == 'true'
strategy:
fail-fast: false
matrix:
chunk: [0, 1, 2, 3]
steps:
- name: Checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Setup Python
uses: actions/setup-python@v6
with:
python-version: "3.14"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Download extracted data
uses: actions/download-artifact@v4
with:
name: adsb-extracted
path: data/output/
- name: Extract tar
run: |
cd data/output
tar -xf extracted_data.tar
rm extracted_data.tar
echo "=== Contents of data/output ==="
ls -lah
echo "=== Looking for manifest ==="
cat icao_manifest_*.txt | head -20 || echo "No manifest found"
echo "=== Looking for extracted dirs ==="
ls -d *-planes-readsb-prod-0* 2>/dev/null || echo "No extracted dirs"
- name: Process chunk ${{ matrix.chunk }}
run: |
python -m src.adsb.process_icao_chunk --chunk-id ${{ matrix.chunk }} --total-chunks 4 ${{ inputs.date && format('--date {0}', inputs.date) || '' }}
mkdir -p data/output/adsb_chunks
ls -lah data/output/adsb_chunks/ || echo "No chunks created"
- name: Upload chunk artifacts
uses: actions/upload-artifact@v4
with:
name: adsb-chunk-${{ matrix.chunk }}
path: data/output/adsb_chunks/
retention-days: 1
adsb-reduce:
runs-on: ubuntu-24.04-arm
needs: adsb-map
if: github.event_name != 'schedule'
steps:
- name: Checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Setup Python
uses: actions/setup-python@v6
with:
python-version: "3.14"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Download all chunk artifacts
uses: actions/download-artifact@v4
with:
pattern: adsb-chunk-*
path: data/output/adsb_chunks/
merge-multiple: true
- name: Debug downloaded files
run: |
echo "=== Listing data/ ==="
find data/ -type f 2>/dev/null | head -50 || echo "No files in data/"
echo "=== Looking for parquet files ==="
find . -name "*.parquet" 2>/dev/null | head -20 || echo "No parquet files found"
- name: Combine chunks to CSV
run: |
mkdir -p data/output/adsb_chunks
ls -lah data/output/adsb_chunks/ || echo "Directory empty or does not exist"
python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks ${{ inputs.date && format('--date {0}', inputs.date) || '' }}
ls -lah data/openairframes/
- name: Upload ADS-B artifacts
uses: actions/upload-artifact@v4
with:
name: adsb-release
path: data/openairframes/openairframes_adsb_*.csv
retention-days: 1
build-community:
runs-on: ubuntu-latest
if: github.event_name != 'schedule'
steps:
- name: Checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Setup Python
uses: actions/setup-python@v6
with:
python-version: "3.14"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install pandas
- name: Run Community release script
run: |
python -m src.contributions.create_daily_community_release
ls -lah data/openairframes
- name: Upload Community artifacts
uses: actions/upload-artifact@v4
with:
name: community-release
path: data/openairframes/openairframes_community_*.csv
retention-days: 1
create-release:
runs-on: ubuntu-latest
needs: [build-faa, adsb-reduce, build-community]
if: github.event_name != 'schedule'
steps:
- name: Checkout for gh CLI
uses: actions/checkout@v4
with:
sparse-checkout: |
.github
sparse-checkout-cone-mode: false
- name: Download FAA artifacts
uses: actions/download-artifact@v4
with:
name: faa-release
path: artifacts/faa
- name: Download ADS-B artifacts
uses: actions/download-artifact@v4
with:
name: adsb-release
path: artifacts/adsb
- name: Download Community artifacts
uses: actions/download-artifact@v4
with:
name: community-release
path: artifacts/community
- name: Debug artifact structure
run: |
echo "=== Full artifacts tree ==="
find artifacts -type f 2>/dev/null || echo "No files found in artifacts"
echo "=== FAA artifacts ==="
find artifacts/faa -type f 2>/dev/null || echo "No files found in artifacts/faa"
echo "=== ADS-B artifacts ==="
find artifacts/adsb -type f 2>/dev/null || echo "No files found in artifacts/adsb"
echo "=== Community artifacts ==="
find artifacts/community -type f 2>/dev/null || echo "No files found in artifacts/community"
- name: Prepare release metadata
id: meta
run: |
DATE=$(date -u +"%Y-%m-%d")
BRANCH_NAME="${GITHUB_REF#refs/heads/}"
BRANCH_SUFFIX=""
if [ "$BRANCH_NAME" = "main" ]; then
BRANCH_SUFFIX="-main"
elif [ "$BRANCH_NAME" = "develop" ]; then
BRANCH_SUFFIX="-develop"
fi
TAG="openairframes-${DATE}${BRANCH_SUFFIX}"
# Find files from artifacts using find (handles nested structures)
CSV_FILE_FAA=$(find artifacts/faa -name "openairframes_faa_*.csv" -type f 2>/dev/null | head -1)
CSV_FILE_ADSB=$(find artifacts/adsb -name "openairframes_adsb_*.csv" -type f 2>/dev/null | head -1)
CSV_FILE_COMMUNITY=$(find artifacts/community -name "openairframes_community_*.csv" -type f 2>/dev/null | head -1)
ZIP_FILE=$(find artifacts/faa -name "ReleasableAircraft_*.zip" -type f 2>/dev/null | head -1)
# Validate required files exist
MISSING_FILES=""
if [ -z "$CSV_FILE_FAA" ] || [ ! -f "$CSV_FILE_FAA" ]; then
MISSING_FILES="$MISSING_FILES FAA_CSV"
fi
if [ -z "$CSV_FILE_ADSB" ] || [ ! -f "$CSV_FILE_ADSB" ]; then
MISSING_FILES="$MISSING_FILES ADSB_CSV"
fi
if [ -z "$ZIP_FILE" ] || [ ! -f "$ZIP_FILE" ]; then
MISSING_FILES="$MISSING_FILES FAA_ZIP"
fi
if [ -n "$MISSING_FILES" ]; then
echo "ERROR: Missing required release files:$MISSING_FILES"
echo "FAA CSV: $CSV_FILE_FAA"
echo "ADSB CSV: $CSV_FILE_ADSB"
echo "ZIP: $ZIP_FILE"
exit 1
fi
# Get basenames for display
CSV_BASENAME_FAA=$(basename "$CSV_FILE_FAA")
CSV_BASENAME_ADSB=$(basename "$CSV_FILE_ADSB")
CSV_BASENAME_COMMUNITY=$(basename "$CSV_FILE_COMMUNITY" 2>/dev/null || echo "")
ZIP_BASENAME=$(basename "$ZIP_FILE")
echo "date=$DATE" >> "$GITHUB_OUTPUT"
echo "tag=$TAG" >> "$GITHUB_OUTPUT"
echo "csv_file_faa=$CSV_FILE_FAA" >> "$GITHUB_OUTPUT"
echo "csv_basename_faa=$CSV_BASENAME_FAA" >> "$GITHUB_OUTPUT"
echo "csv_file_adsb=$CSV_FILE_ADSB" >> "$GITHUB_OUTPUT"
echo "csv_basename_adsb=$CSV_BASENAME_ADSB" >> "$GITHUB_OUTPUT"
echo "csv_file_community=$CSV_FILE_COMMUNITY" >> "$GITHUB_OUTPUT"
echo "csv_basename_community=$CSV_BASENAME_COMMUNITY" >> "$GITHUB_OUTPUT"
echo "zip_file=$ZIP_FILE" >> "$GITHUB_OUTPUT"
echo "zip_basename=$ZIP_BASENAME" >> "$GITHUB_OUTPUT"
echo "name=OpenAirframes snapshot ($DATE)${BRANCH_SUFFIX}" >> "$GITHUB_OUTPUT"
echo "Found files:"
echo " FAA CSV: $CSV_FILE_FAA"
echo " ADSB CSV: $CSV_FILE_ADSB"
echo " Community CSV: $CSV_FILE_COMMUNITY"
echo " ZIP: $ZIP_FILE"
- name: Delete existing release if exists
run: |
echo "Attempting to delete release: ${{ steps.meta.outputs.tag }}"
gh release delete "${{ steps.meta.outputs.tag }}" --yes --cleanup-tag || echo "No existing release to delete"
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Create GitHub Release and upload assets
uses: softprops/action-gh-release@v2
with:
tag_name: ${{ steps.meta.outputs.tag }}
name: ${{ steps.meta.outputs.name }}
fail_on_unmatched_files: true
body: |
Automated daily snapshot generated at 06:00 UTC for ${{ steps.meta.outputs.date }}.
Assets:
- ${{ steps.meta.outputs.csv_basename_faa }}
- ${{ steps.meta.outputs.csv_basename_adsb }}
- ${{ steps.meta.outputs.csv_basename_community }}
- ${{ steps.meta.outputs.zip_basename }}
files: |
${{ steps.meta.outputs.csv_file_faa }}
${{ steps.meta.outputs.csv_file_adsb }}
${{ steps.meta.outputs.csv_file_community }}
${{ steps.meta.outputs.zip_file }}
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -1,67 +0,0 @@
name: planequery-aircraft Daily Release
on:
schedule:
# 6:00pm UTC every day
- cron: "0 06 * * *"
workflow_dispatch: {}
permissions:
contents: write
jobs:
build-and-release:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: "3.12"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Run daily release script
run: |
python src/create_daily_planequery_aircraft_release.py
ls -lah data/faa_releasable
ls -lah data/planequery_aircraft
- name: Prepare release metadata
id: meta
run: |
DATE=$(date -u +"%Y-%m-%d")
TAG="planequery-aircraft-${DATE}"
# Find the CSV file in data/planequery_aircraft matching the pattern
CSV_FILE=$(ls data/planequery_aircraft/planequery_aircraft_*_${DATE}.csv | head -1)
CSV_BASENAME=$(basename "$CSV_FILE")
echo "date=$DATE" >> "$GITHUB_OUTPUT"
echo "tag=$TAG" >> "$GITHUB_OUTPUT"
echo "csv_file=$CSV_FILE" >> "$GITHUB_OUTPUT"
echo "csv_basename=$CSV_BASENAME" >> "$GITHUB_OUTPUT"
echo "name=planequery-aircraft snapshot ($DATE)" >> "$GITHUB_OUTPUT"
- name: Create GitHub Release and upload assets
uses: softprops/action-gh-release@v2
with:
tag_name: ${{ steps.meta.outputs.tag }}
name: ${{ steps.meta.outputs.name }}
body: |
Automated daily snapshot generated at 06:00 UTC for ${{ steps.meta.outputs.date }}.
Assets:
- ${{ steps.meta.outputs.csv_basename }}
- ReleasableAircraft_${{ steps.meta.outputs.date }}.zip
files: |
${{ steps.meta.outputs.csv_file }}
data/faa_releasable/ReleasableAircraft_${{ steps.meta.outputs.date }}.zip
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -166,6 +166,6 @@ jobs:
Combined historical FAA aircraft data (all chunks concatenated)
Processing period: 2023-08-16 to 2026-01-01
Generated: ${{ github.event.repository.updated_at }}
files: data/planequery_aircraft/*.csv
files: data/openairframes/*.csv
draft: false
prerelease: false
@@ -0,0 +1,77 @@
name: Update Community PRs After Merge
on:
push:
branches: [main]
paths:
- 'community/**'
- 'schemas/community_submission.v1.schema.json'
permissions:
contents: write
pull-requests: write
jobs:
update-open-prs:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
token: ${{ secrets.GITHUB_TOKEN }}
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: "3.12"
- name: Install dependencies
run: pip install jsonschema
- name: Find and update open community PRs
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
# Get list of open community PRs
prs=$(gh pr list --label community --state open --json number,headRefName --jq '.[] | "\(.number) \(.headRefName)"')
if [ -z "$prs" ]; then
echo "No open community PRs found"
exit 0
fi
echo "$prs" | while read pr_number branch_name; do
echo "Processing PR #$pr_number (branch: $branch_name)"
# Checkout PR branch
git fetch origin "$branch_name"
git checkout "$branch_name"
# Merge main into PR branch
git config user.name "github-actions[bot]"
git config user.email "github-actions[bot]@users.noreply.github.com"
if git merge origin/main -m "Merge main to update schema"; then
# Regenerate schema for this PR's submission (adds any new tags)
python -m src.contributions.regenerate_pr_schema || true
# If there are changes, commit and push
if [ -n "$(git status --porcelain schemas/)" ]; then
git add schemas/
git commit -m "Update schema with new tags"
git push origin "$branch_name"
echo " Updated PR #$pr_number with schema changes"
else
git push origin "$branch_name"
echo " Merged main into PR #$pr_number"
fi
else
echo " Merge conflict in PR #$pr_number, adding comment"
gh pr comment "$pr_number" --body $'⚠️ **Merge Conflict**\n\nAnother community submission was merged and this PR has conflicts.\n\nA maintainer may need to:\n1. Close this PR\n2. Remove the `approved` label from the original issue\n3. Re-add the `approved` label to regenerate the PR'
git merge --abort
fi
git checkout main
done
@@ -0,0 +1,46 @@
name: Validate Community Submission
on:
issues:
types: [opened, edited]
permissions:
issues: write
jobs:
validate:
if: contains(github.event.issue.labels.*.name, 'submission')
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: "3.12"
- name: Install dependencies
run: pip install jsonschema
- name: Debug issue body
run: |
echo "=== Issue Body ==="
cat << 'ISSUE_BODY_EOF'
${{ github.event.issue.body }}
ISSUE_BODY_EOF
- name: Save issue body to file
run: |
cat << 'ISSUE_BODY_EOF' > /tmp/issue_body.txt
${{ github.event.issue.body }}
ISSUE_BODY_EOF
- name: Validate submission
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_REPOSITORY: ${{ github.repository }}
run: |
python -m src.contributions.validate_submission \
--issue-body-file /tmp/issue_body.txt \
--issue-number ${{ github.event.issue.number }}
+64 -1
@@ -218,4 +218,67 @@ __marimo__/
# Custom
data/
.DS_Store
notebooks/
# --- CDK ---
# VSCode extension
# Store launch config in repo but not settings
.vscode/settings.json
/.favorites.json
# TypeScript incremental build states
*.tsbuildinfo
# Local state files & OS specifics
.DS_Store
node_modules/
lerna-debug.log
dist/
pack/
.BUILD_COMPLETED
.local-npm/
.tools/
coverage/
.nyc_output
.nycrc
.LAST_BUILD
*.sw[a-z]
*~
.idea
*.iml
junit.xml
# We don't want tsconfig at the root
/tsconfig.json
# CDK Context & Staging files
cdk.context.json
.cdk.staging/
cdk.out/
*.tabl.json
cdk-integ.out.*/
# Yarn error log
yarn-error.log
# VSCode history plugin
.vscode/.history/
# Cloud9
.c9
.nzm-*
/.versionrc.json
RELEASE_NOTES.md
# Produced by integ tests
read*lock
# VSCode jest plugin
.test-output
# Nx cache
.nx/
# jsii-rosetta files
type-fingerprints.txt
+1 -1
@@ -1,6 +1,6 @@
MIT License
Copyright (c) 2026 PlaneQuery
Copyright (c) 2026 OpenAirframes
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
+1
@@ -0,0 +1 @@
ecosystem.config.cjs
+21
@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2026 Clem
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
+229
@@ -0,0 +1,229 @@
# ✈️ AF-KLM Fleet Catalog
Open source, community-maintained catalog of **Air France** and **KLM** fleets with real-time tracking of aircraft properties, WiFi connectivity, and historical changes.
---
## 📊 Fleet Overview
| Airline | Total | 📶 WiFi | 🛜 High-Speed | % Starlink |
|---------|-------|---------|---------------|------------|
| 🇫🇷 Air France | 220 | 220 (100%) | 41 | **19%** |
| 🇳🇱 KLM | 117 | 94 (80%) | 0 | **0%** |
| **Combined** | **337** | **314 (93%)** | **41** | **12%** |
> 🛜 **High-Speed** = Starlink satellite internet (50+ Mbps)
> 📶 **WiFi** = Any WiFi connectivity (low-speed or high-speed)
*Last updated: 2026-02-04*
---
## 🛫 Fleet Breakdown
### 🇫🇷 Air France (AF)
| Aircraft Type | Count |
|---------------|-------|
| A220-300 PASSENGER | 46 |
| 777-300ER | 43 |
| A350-900 | 41 |
| A320 | 29 |
| 777-200-200ER | 18 |
| A321 | 12 |
| 787-9 | 10 |
| A330-200 | 8 |
| A320 (SHARKLETS) | 6 |
| A318 | 4 |
| A319 | 3 |
| **Total** | **220** |
### 🇳🇱 KLM (KL)
| Aircraft Type | Count |
|---------------|-------|
| 737-800 | 29 |
| 777-300ER | 15 |
| 787-10 | 15 |
| 777-200-200ER | 14 |
| A321NEO | 12 |
| 787-9 | 12 |
| A330-200 | 5 |
| A330-300 | 5 |
| 737-900 | 5 |
| 737-700 | 5 |
| **Total** | **117** |
---
## 📋 Detailed Configuration
### 🇫🇷 Air France — Detailed Configuration
| Aircraft | Config | Seats | Count | 🛜 Starlink |
|----------|--------|-------|-------|-------------|
| 777-200-200ER | `J028W032Y268` | 328 | 18 | - |
| 777-300ER | `J014W028Y430` | 472 | 12 | - |
| 777-300ER | `J048W048Y273` | 369 | 8 | - |
| 777-300ER | `P004J058W028Y206` | 296 | 14 | 1/14 (7%) |
| 777-300ER | `P004J060W044Y204` | 312 | 9 | 1/9 (11%) |
| 787-9 | `J030W021Y228` | 279 | 10 | - |
| A220-300 PASSENGER | `Y148` | 148 | 46 | 12/46 (26%) |
| A318 | `Y131` | 131 | 4 | - |
| A319 | `C072Y071` | 143 | 2 | - |
| A319 | `Y142` | 142 | 1 | - |
| A320 | `C108Y066` | 174 | 22 | 2/22 (9%) |
| A320 | `Y178` | 178 | 7 | - |
| A320 (SHARKLETS) | `C108Y066` | 174 | 6 | - |
| A321 | `C082Y130` | 212 | 8 | - |
| A321 | `Y212` | 212 | 4 | - |
| A330-200 | `J036W021Y167` | 224 | 8 | 1/8 (13%) |
| A350-900 | `J034W024Y266` | 324 | 20 | 10/20 (50%) |
| A350-900 | `J048W032Y210` | 290 | 1 | 1/1 (100%) |
| A350-900 | `J048W032Y212` | 292 | 20 | 13/20 (65%) |
### 🇳🇱 KLM — Detailed Configuration
| Aircraft | Config | Seats | Count | 🛜 Starlink |
|----------|--------|-------|-------|-------------|
| 737-700 | `C036M106` | 142 | 5 | - |
| 737-800 | `C036M150` | 186 | 29 | - |
| 737-900 | `C056M132` | 188 | 5 | - |
| 777-200-200ER | `C035W024M229` | 288 | 12 | - |
| 777-200-200ER | `C035W032M219` | 286 | 2 | - |
| 777-300ER | `C035W024M322` | 381 | 15 | - |
| 787-10 | `C038W028M252` | 318 | 15 | - |
| 787-9 | `C030W021M224` | 275 | 12 | - |
| A321NEO | `C030M197` | 227 | 12 | - |
| A330-200 | `C018M246` | 264 | 5 | - |
| A330-300 | `C030M262` | 292 | 5 | - |
---
## 🚀 Quick Start
### Update the Catalog
```bash
# Set your API key
export AFKLM_API_KEY=your_api_key_here
# Update Air France
node fleet-update.js --airline AF
# Update KLM
node fleet-update.js --airline KL
# Preview changes without saving
node fleet-update.js --airline KL --dry-run
# Regenerate this README with latest stats
node generate-readme.js
```
### Using the Data
```javascript
// Load Air France fleet
const response = await fetch('https://raw.githubusercontent.com/.../airlines/AF.json');
const fleet = await response.json();
// Find all Starlink aircraft
const starlink = fleet.aircraft.filter(a => a.connectivity.wifi === 'high-speed');
console.log(`${starlink.length} aircraft with Starlink`);
// Get aircraft by type
const a350s = fleet.aircraft.filter(a => a.aircraft_type.full_name?.includes('A350'));
```
---
## 📁 Data Structure
```
af-klm/
├── airlines/
│ ├── AF.json # Air France fleet
│ └── KL.json # KLM fleet
├── schema/
│ └── aircraft.schema.json
├── fleet-update.js # Update script
└── generate-readme.js # Regenerates this README's stats
```
### Aircraft Schema
```json
{
"registration": "F-HTYA",
"aircraft_type": {
"iata_code": "359",
"manufacturer": "Airbus",
"model": "A350",
"full_name": "AIRBUS A350-900"
},
"cabin": {
"physical_configuration": "J034W024Y266",
"total_seats": 324,
"classes": { "business": 34, "premium_economy": 24, "economy": 266 }
},
"connectivity": {
"wifi": "high-speed",
"wifi_provider": "Starlink",
"satellite": true
},
"tracking": {
"first_seen": "2025-01-15",
"last_seen": "2026-02-04",
"total_flights": 1250
},
"history": [
{
"timestamp": "2026-01-20",
"property": "connectivity.wifi",
"old_value": "low-speed",
"new_value": "high-speed",
"source": "airline_api"
}
]
}
```
---
## 🤝 Contributing
### Daily Updates
Community members are encouraged to run the update script daily:
1. Fork this repo
2. Set your `AFKLM_API_KEY`
3. Run `node fleet-update.js --airline AF` and `--airline KL`
4. Run `node generate-readme.js` to update stats
5. Submit a PR
### API Key
Get a free API key at [developer.airfranceklm.com](https://developer.airfranceklm.com)
---
## 📋 Schema Version
Current: **1.0.0**
---
## 📄 License
Released under the MIT License.
---
Made with ✈️ by the aviation community
+9913
File diff suppressed because it is too large
+5322
File diff suppressed because it is too large
+116
@@ -0,0 +1,116 @@
#!/usr/bin/env node
/**
* Weekly Fleet Update Cron Job
*
* Updates AF and KL fleet data, regenerates README, and pushes to GitHub.
*
* Usage:
* node cron-update.js # Run once
* pm2 start cron-update.js --cron "0 6 * * 0" --no-autorestart # Every Sunday 6am
*
* Environment:
* AFKLM_API_KEY - API key for Air France/KLM API
*/
import { execSync, spawn } from 'child_process';
import path from 'path';
import { fileURLToPath } from 'url';
const __dirname = path.dirname(fileURLToPath(import.meta.url));
function log(msg) {
console.log(`[${new Date().toISOString()}] ${msg}`);
}
function exec(cmd) {
log(`> ${cmd}`);
try {
const result = execSync(cmd, { cwd: __dirname, encoding: 'utf-8' });
if (result.trim()) console.log(result.trim());
return true;
} catch (error) {
console.error(`Error: ${error.stderr || error.message}`);
return false;
}
}
async function runUpdate(airline) {
return new Promise((resolve) => {
log(`Updating ${airline} fleet...`);
const child = spawn('node', ['fleet-update.js', '--airline', airline], {
cwd: __dirname,
env: process.env,
stdio: 'inherit',
});
child.on('close', (code) => {
if (code === 0) {
log(`${airline} complete`);
resolve(true);
} else {
log(`${airline} failed (code ${code})`);
resolve(false);
}
});
child.on('error', (err) => {
log(`${airline} error: ${err.message}`);
resolve(false);
});
});
}
async function main() {
log('🚀 Weekly fleet update starting...\n');
// Check API key
if (!process.env.AFKLM_API_KEY && !process.env.AFKLM_API_KEYS) {
log('❌ No API key found. Set AFKLM_API_KEY environment variable.');
process.exit(1);
}
// Update each airline
for (const airline of ['AF', 'KL']) {
await runUpdate(airline);
}
// Regenerate README
log('\n📊 Regenerating README...');
exec('node generate-readme.js');
// Check for changes
log('\n📝 Checking for changes...');
try {
const status = execSync('git status --porcelain', { cwd: __dirname, encoding: 'utf-8' });
if (!status.trim()) {
log('✅ No changes to commit');
return;
}
log(`Changes:\n${status}`);
// Git add, commit, push
log('\n📤 Pushing to GitHub...');
exec('git add -A');
const date = new Date().toISOString().split('T')[0];
exec(`git commit -m "Auto-update fleet data - ${date}"`);
exec('git push origin main');
log('\n✅ Successfully pushed to GitHub!');
} catch (error) {
log(`Git error: ${error.message}`);
}
log('\n🏁 Done!');
}
main().catch(error => {
log(`❌ Fatal error: ${error.message}`);
process.exit(1);
});
+488
@@ -0,0 +1,488 @@
# Open Source Airline Fleet Catalog - Schema Proposal
> **Author:** Clément Wehrung
> **Date:** February 4, 2026
> **Status:** Draft for Discussion
> **Implementation:** See `fleet-catalog/` directory
## Overview
This document proposes a standardized JSON schema for an open source catalog of airline fleets. The goal is to track aircraft properties (WiFi, cabin configuration, IFE, etc.) across multiple airlines with a consistent format and change history.
## Design Principles
1. **One JSON file per airline** - Easy to maintain, review PRs, and avoid merge conflicts
2. **Standardized enums** - Consistent values across all airlines (e.g., WiFi types)
3. **History tracking** - Record property changes over time with timestamps
4. **Extensible** - Room for airline-specific fields without breaking the schema
5. **Machine-readable** - JSON Schema validation for data quality
## Current Implementation
The schema has been implemented with Air France data exported from the fleet database:
- **220 aircraft** with full property data
- **History tracking** for WiFi upgrades, seat config changes, etc.
- **ICAO24 hex codes** for ADS-B tracking correlation
---
## Proposed Directory Structure
```
fleet-catalog/
├── schema/
│ └── aircraft.schema.json # JSON Schema for validation
├── airlines/
│ ├── AF.json # Air France
│ ├── BA.json # British Airways
│ ├── DL.json # Delta
│ ├── LH.json # Lufthansa
│ └── ...
├── reference/
│ ├── aircraft-types.json # ICAO/IATA aircraft type codes
│ ├── wifi-providers.json # Known WiFi providers & capabilities
│ └── cabin-class-codes.json # Cabin class code mappings
└── README.md
```
---
## Schema Definition
### Root Object (Airline File)
```json
{
"schema_version": "1.0.0",
"airline": {
"iata_code": "AF",
"icao_code": "AFR",
"name": "Air France",
"country": "FR"
},
"generated_at": "2026-02-04T18:32:20.803Z",
"aircraft": [...]
}
```
### Aircraft Object
```json
{
"registration": "FHPND",
"icao24": "39bda3",
"aircraft_type": {
"iata_code": "223",
"icao_code": "A223",
"manufacturer": "Airbus",
"model": "A220",
"variant": "300",
"full_name": "AIRBUS A220-300 PASSENGER"
},
"operator": {
"sub_fleet_code": "CA",
"cabin_crew_employer": "AF",
"cockpit_crew_employer": "AF"
},
"cabin": {
"physical_configuration": "Y148",
"operational_configuration": "C008Y135",
"saleable_configuration": null,
"total_seats": 148,
"classes": {
"first": 0,
"business": 0,
"premium_economy": 0,
"economy": 148
},
"freight_configuration": "PP000LL000"
},
"connectivity": {
"wifi": "high-speed",
"wifi_provider": "Starlink",
"satellite": true,
"live_tv": false,
"power_outlets": true,
"usb_ports": true
},
"ife": {
"type": "streaming",
"personal_screens": false
},
"status": "active",
"tracking": {
"first_seen": "2025-12-20",
"last_seen": "2026-02-05",
"total_flights": 3214
},
"metadata": {
"delivery_date": null,
"msn": null,
"line_number": null,
"production_site": null,
"engine_type": null,
"aircraft_name": null,
"livery": null,
"comments": null
},
"history": [...]
}
```
---
## Standardized Enums
### `connectivity.wifi`
| Value | Description | Examples |
|-------|-------------|----------|
| `"none"` | No WiFi available | — |
| `"low-speed"` | Basic WiFi, typically < 10 Mbps | Gogo ATG, old Ku-band systems |
| `"high-speed"` | Fast WiFi, typically > 50 Mbps | Starlink, Viasat Ka-band, Gogo 2Ku |
### `connectivity.wifi_provider`
Suggested standardized provider names:
| Provider | Notes |
|----------|-------|
| `"Starlink"` | SpaceX LEO constellation |
| `"Viasat"` | Ka-band GEO satellites |
| `"Gogo 2Ku"` | Dual Ku-band antennas |
| `"Gogo ATG"` | Air-to-ground (US only) |
| `"Panasonic Ku"` | Ku-band system |
| `"Inmarsat GX"` | Global Xpress Ka-band |
| `"Anuvu"` | Formerly Global Eagle |
### `ife.type`
| Value | Description |
|-------|-------------|
| `"none"` | No IFE system |
| `"overhead"` | Shared overhead screens only |
| `"seatback"` | Personal seatback screens |
| `"streaming"` | BYOD streaming to personal devices |
| `"hybrid"` | Both seatback screens and streaming |
### `status`
| Value | Description |
|-------|-------------|
| `"active"` | Currently in service |
| `"stored"` | Temporarily stored/parked |
| `"maintenance"` | In heavy maintenance |
| `"retired"` | Permanently removed from fleet |
### Cabin Class Codes
Standard codes used in `configuration_raw`:
| Code | Class | Notes |
|------|-------|-------|
| `F` | First Class | Traditional first |
| `P` | First Class | Premium first (e.g., La Première) |
| `J` | Business Class | Standard code |
| `C` | Business Class | Alternative code |
| `W` | Premium Economy | |
| `Y` | Economy | |
---
## History Tracking
Each time a property changes, append an entry to the `history` array:
```json
{
"history": [
{
"timestamp": "2026-01-15T14:30:00.000Z",
"property": "connectivity.wifi",
"old_value": "low-speed",
"new_value": "high-speed",
"source": "flight_api"
},
{
"timestamp": "2026-01-15T14:30:00.000Z",
"property": "connectivity.wifi_provider",
"old_value": "Gogo",
"new_value": "Starlink",
"source": "flight_api"
},
{
"timestamp": "2025-06-01T00:00:00.000Z",
"property": "cabin.configuration_raw",
"old_value": "Y146",
"new_value": "Y148",
"source": "manual"
}
]
}
```
### History Fields
| Field | Type | Description |
|-------|------|-------------|
| `timestamp` | ISO 8601 | When the change was detected |
| `property` | string | Dot-notation path to the changed field |
| `old_value` | any | Previous value (or `null` if new) |
| `new_value` | any | New value |
| `source` | string | How the change was detected |
### Source Values
| Value | Description |
|-------|-------------|
| `"flight_api"` | Detected via flight data API |
| `"airline_api"` | From airline's official API |
| `"manual"` | Manual update/correction |
| `"seatguru"` | SeatGuru or similar source |
| `"community"` | Community contribution |
---
## Example: Air France A220-300
```json
{
"registration": "FHPND",
"aircraft_type": {
"icao_code": "A223",
"iata_code": "223",
"manufacturer": "Airbus",
"model": "A220-300",
"variant": null
},
"cabin": {
"configuration_raw": "Y148",
"total_seats": 148,
"classes": {
"first": 0,
"business": 0,
"premium_economy": 0,
"economy": 148
}
},
"connectivity": {
"wifi": "high-speed",
"wifi_provider": "Starlink",
"live_tv": false,
"power_outlets": true,
"usb_ports": true
},
"ife": {
"type": "streaming",
"personal_screens": false
},
"status": "active",
"tracking": {
"first_seen": "2025-12-20",
"last_seen": "2026-02-05",
"total_flights": 3214
},
"history": [
{
"timestamp": "2026-01-15T14:30:00.000Z",
"property": "connectivity.wifi",
"old_value": "low-speed",
"new_value": "high-speed",
"source": "flight_api"
}
]
}
```
---
## Example: Air France 777-300ER (Multi-Class)
```json
{
"registration": "FGSQA",
"aircraft_type": {
"icao_code": "B77W",
"iata_code": "77W",
"manufacturer": "Boeing",
"model": "777-300ER",
"variant": null
},
"cabin": {
"configuration_raw": "P004J058W028Y206",
"total_seats": 296,
"classes": {
"first": 4,
"business": 58,
"premium_economy": 28,
"economy": 206
}
},
"connectivity": {
"wifi": "high-speed",
"wifi_provider": "Starlink",
"live_tv": true,
"power_outlets": true,
"usb_ports": true
},
"ife": {
"type": "seatback",
"personal_screens": true
},
"status": "active",
"tracking": {
"first_seen": "2025-12-20",
"last_seen": "2026-02-05",
"total_flights": 1137
},
"history": []
}
```
---
## Migration from Current Format
For existing data (e.g., Air France tracking), here's the field mapping:
| Current Field | New Path | Transformation |
|--------------|----------|----------------|
| `registration` | `registration` | Keep as-is (no dash) |
| `type_code` | `aircraft_type.iata_code` | Direct mapping |
| `type_name` | `aircraft_type.*` | Parse into manufacturer/model |
| `owner_airline_code` | Top-level `airline.iata_code` | Move to file level |
| `owner_airline_name` | Top-level `airline.name` | Move to file level |
| `wifi_enabled` | `connectivity.wifi` | Combine with `high_speed_wifi` |
| `high_speed_wifi` | `connectivity.wifi` | `Y` → `"high-speed"`, else `"low-speed"` |
| `physical_pax_configuration` | `cabin.configuration_raw` | Direct mapping |
| — | `cabin.classes` | Parse from configuration |
| `first_seen_date` | `tracking.first_seen` | Direct mapping |
| `last_seen_date` | `tracking.last_seen` | Direct mapping |
| `total_flights_tracked` | `tracking.total_flights` | Direct mapping |
### WiFi Conversion Logic
```javascript
function convertWifi(wifi_enabled, high_speed_wifi) {
if (wifi_enabled !== 'Y') return 'none';
if (high_speed_wifi === 'Y') return 'high-speed';
return 'low-speed';
}
```
### Cabin Configuration Parser
```javascript
function parseCabinConfig(config) {
// "P004J058W028Y206" → { first: 4, business: 58, premium_economy: 28, economy: 206 }
const mapping = { P: 'first', F: 'first', J: 'business', C: 'business', W: 'premium_economy', Y: 'economy' };
const classes = { first: 0, business: 0, premium_economy: 0, economy: 0 };
const regex = /([PFJCWY])(\d{3})/g;
let match;
while ((match = regex.exec(config)) !== null) {
const classKey = mapping[match[1]];
classes[classKey] += parseInt(match[2], 10);
}
return classes;
}
```
---
## Metadata Fields (for PlaneSpotters-style data)
These fields capture additional data often found on PlaneSpotters.net:
```json
{
"metadata": {
"delivery_date": "2022-03-15",
"msn": "55012",
"line_number": "1234",
"production_site": "Mirabel",
"engine_type": "PW1500G",
"aircraft_name": "Fort-de-France",
"livery": "standard",
"comments": "Olympic Games 2024 special livery"
}
}
```
### Metadata Field Descriptions
| Field | Description | Example |
|-------|-------------|---------|
| `delivery_date` | Date aircraft was delivered to airline | `2022-03-15` |
| `msn` | Manufacturer Serial Number | `55012` |
| `line_number` | Production line number | `1234` |
| `production_site` | Factory location | `Toulouse`, `Hamburg`, `Mirabel`, `Charleston` |
| `engine_type` | Engine model | `Trent XWB-84`, `GE90-115B`, `PW1500G` |
| `aircraft_name` | Given name (if any) | `Fort-de-France`, `Château de Versailles` |
| `livery` | Special paint scheme | `standard`, `SkyTeam`, `Olympic 2024` |
| `comments` | Additional notes | Free text |
### Production Sites Reference
| Manufacturer | Sites |
|--------------|-------|
| Airbus | Toulouse (France), Hamburg (Germany), Tianjin (China), Mobile (USA) |
| Boeing | Everett (USA), Renton (USA), Charleston (USA) |
| Airbus Canada | Mirabel (Canada) |
| Embraer | São José dos Campos (Brazil) |
---
## Validation
A JSON Schema file should be maintained at `schema/aircraft.schema.json` for:
- CI validation on PRs
- Editor autocomplete
- Documentation generation
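As a sketch of what that CI check could look like (the repository's own tooling is Node, so this Python version is purely illustrative, and it assumes `schema/aircraft.schema.json` describes a single aircraft object):

```python
# Illustrative CI validation sketch, assuming the schema describes one aircraft object.
import json
import sys
from pathlib import Path
from jsonschema import Draft202012Validator

def main() -> int:
    schema = json.loads(Path("schema/aircraft.schema.json").read_text())
    validator = Draft202012Validator(schema)
    failures = 0
    for airline_file in sorted(Path("airlines").glob("*.json")):
        fleet = json.loads(airline_file.read_text())
        for aircraft in fleet.get("aircraft", []):
            for error in validator.iter_errors(aircraft):
                failures += 1
                print(f"{airline_file.name}: {aircraft.get('registration', '?')}: {error.message}")
    return 1 if failures else 0

if __name__ == "__main__":
    sys.exit(main())
```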
---
## Open Questions
1. **Registration format:** ✅ Decided: Strip dashes (`FHPND` not `F-HPND`)
2. **ICAO24 hex codes:** ✅ Decided: Yes, include for ADS-B correlation
3. **Frequency of updates:** Real-time vs. daily snapshots?
4. **Historical snapshots:** Keep full point-in-time snapshots or just deltas?
5. **API access:** Should we provide a read-only API for querying?
6. **PlaneSpotters integration:** How to merge MSN, delivery dates, aircraft names?
---
## Implementation Status
- [x] Finalize schema based on feedback
- [x] Create JSON Schema for validation (`schema/aircraft.schema.json`)
- [x] Migrate Air France data to new format (`airlines/AF.json`)
- [x] Set up repo structure
- [x] Document contribution guidelines (`README.md`)
- [ ] Add CI for schema validation
- [ ] Add more airlines (KLM, Delta, etc.)
- [ ] Integrate PlaneSpotters metadata (MSN, delivery dates, names)
+669
@@ -0,0 +1,669 @@
#!/usr/bin/env node
/**
* Air France / KLM Fleet Catalog Updater
*
* Standalone script to update AF.json or KL.json without a database.
* Fetches flights from the Air France/KLM API and updates the catalog.
*
* Usage:
* node fleet-update.js --airline AF # Update Air France
* node fleet-update.js --airline KL # Update KLM
* node fleet-update.js --airline KL --bootstrap # Build from scratch (7 days)
* node fleet-update.js --airline KL --dry-run # Preview changes
*
* Environment:
* AFKLM_API_KEY - Single API key for Air France/KLM API
* AFKLM_API_KEYS - Comma-separated API keys (for rotation)
*/
import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Airline metadata
const AIRLINES = {
AF: {
code: 'AF',
name: 'Air France',
country: 'France',
registrationPrefix: 'F-',
},
KL: {
code: 'KL',
name: 'KLM Royal Dutch Airlines',
country: 'Netherlands',
registrationPrefix: 'PH-',
},
};
// Configuration (loaded dynamically)
let CONFIG = {
apiKeys: [],
baseUrl: 'https://api.airfranceklm.com/opendata',
pageSize: 100,
requestDelay: 5000,
catalogPath: null,
airlineCode: null,
};
// Track API usage
let currentKeyIndex = 0;
let lastRequestTime = 0;
let totalRequests = 0;
// ============================================================================
// API Functions
// ============================================================================
function getApiKey() {
return CONFIG.apiKeys[currentKeyIndex];
}
function rotateKey() {
currentKeyIndex = (currentKeyIndex + 1) % CONFIG.apiKeys.length;
return getApiKey();
}
async function throttle() {
const now = Date.now();
const timeSince = now - lastRequestTime;
if (timeSince < CONFIG.requestDelay) {
await new Promise(r => setTimeout(r, CONFIG.requestDelay - timeSince));
}
lastRequestTime = Date.now();
}
async function apiRequest(endpoint, params = {}, retryCount = 0) {
await throttle();
totalRequests++;
const url = new URL(`${CONFIG.baseUrl}${endpoint}`);
Object.entries(params).forEach(([key, value]) => {
if (value !== undefined && value !== null) {
url.searchParams.append(key, value);
}
});
// Rotate key before each request
if (CONFIG.apiKeys.length > 1 && retryCount === 0) {
rotateKey();
}
const response = await fetch(url.toString(), {
method: 'GET',
headers: {
'API-Key': getApiKey(),
'Accept': 'application/hal+json',
'Accept-Language': 'en-GB',
},
});
if (!response.ok) {
// Retry on rate limit (silently rotate key)
if ((response.status === 429 || response.status === 403) && retryCount < CONFIG.apiKeys.length - 1) {
rotateKey();
await new Promise(r => setTimeout(r, 1000));
return apiRequest(endpoint, params, retryCount + 1);
}
throw new Error(`API Error: ${response.status} ${response.statusText}`);
}
return response.json();
}
// ============================================================================
// Data Extraction
// ============================================================================
function extractAircraftFromFlight(flight, airlineCode) {
const leg = flight.flightLegs?.[0];
if (!leg?.aircraft?.registration) return null;
const aircraft = leg.aircraft;
// Filter by owner airline
if (aircraft.ownerAirlineCode !== airlineCode) return null;
return {
registration: aircraft.registration,
typeCode: aircraft.typeCode || null,
typeName: aircraft.typeName || null,
subFleetCode: aircraft.subFleetCodeId || null,
ownerAirlineCode: aircraft.ownerAirlineCode || null,
ownerAirlineName: aircraft.ownerAirlineName || null,
cabinCrewEmployer: aircraft.cabinCrewEmployer || null,
cockpitCrewEmployer: aircraft.cockpitCrewEmployer || null,
wifiEnabled: aircraft.wifiEnabled || null,
highSpeedWifi: aircraft.highSpeedWifi || null,
satelliteConnectivity: aircraft.satelliteConnectivityOnBoard || null,
physicalPaxConfiguration: aircraft.physicalPaxConfiguration || null,
};
}
function parseCabinConfig(config) {
if (!config) return { first: 0, business: 0, premium_economy: 0, economy: 0 };
// P/F = First, J/C = Business, W/S = Premium Economy, Y/M = Economy
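  // e.g. parseCabinConfig('J034W024Y266') → { first: 0, business: 34, premium_economy: 24, economy: 266 }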
const mapping = {
P: 'first', F: 'first',
J: 'business', C: 'business',
W: 'premium_economy', S: 'premium_economy',
Y: 'economy', M: 'economy'
};
const classes = { first: 0, business: 0, premium_economy: 0, economy: 0 };
const regex = /([PFJCWSYM])(\d{2,3})/g;
let match;
while ((match = regex.exec(config)) !== null) {
const classKey = mapping[match[1]];
if (classKey) classes[classKey] += parseInt(match[2], 10);
}
return classes;
}
function convertWifi(wifiEnabled, highSpeedWifi) {
if (wifiEnabled !== 'Y') return 'none';
if (highSpeedWifi === 'Y') return 'high-speed';
return 'low-speed';
}
function transformToSchema(raw, firstSeenDate) {
const cabinClasses = parseCabinConfig(raw.physicalPaxConfiguration);
return {
registration: raw.registration,
icao24: null,
aircraft_type: {
iata_code: raw.typeCode,
icao_code: null,
manufacturer: guessManufacturer(raw.typeName),
model: guessModel(raw.typeName),
variant: guessVariant(raw.typeName),
full_name: raw.typeName,
},
operator: {
sub_fleet_code: raw.subFleetCode,
cabin_crew_employer: raw.cabinCrewEmployer,
cockpit_crew_employer: raw.cockpitCrewEmployer,
},
cabin: {
physical_configuration: raw.physicalPaxConfiguration,
saleable_configuration: null,
total_seats: Object.values(cabinClasses).reduce((a, b) => a + b, 0) || null,
classes: cabinClasses,
freight_configuration: null,
},
connectivity: {
wifi: convertWifi(raw.wifiEnabled, raw.highSpeedWifi),
wifi_provider: raw.highSpeedWifi === 'Y' ? 'Starlink' : null,
satellite: raw.satelliteConnectivity === 'Y',
},
status: 'active',
tracking: {
first_seen: firstSeenDate,
last_seen: firstSeenDate,
total_flights: 1,
},
metadata: {
created_at: new Date().toISOString(),
updated_at: new Date().toISOString(),
},
history: [],
};
}
function guessManufacturer(typeName) {
if (!typeName) return null;
if (typeName.toUpperCase().includes('AIRBUS')) return 'Airbus';
if (typeName.toUpperCase().includes('BOEING')) return 'Boeing';
if (typeName.toUpperCase().includes('EMBRAER')) return 'Embraer';
return null;
}
function guessModel(typeName) {
if (!typeName) return null;
const match = typeName.match(/A(\d{3})|(\d{3})/);
if (match) return match[1] ? `A${match[1]}` : match[2];
return null;
}
function guessVariant(typeName) {
if (!typeName) return null;
const match = typeName.match(/-(\d+)/);
return match ? match[1] : null;
}
function formatDate(date) {
return date.toISOString().split('T')[0];
}
// ============================================================================
// Fetch Flights
// ============================================================================
async function fetchFlightsForDate(dateStr, airlineCode) {
const dayStart = `${dateStr}T00:00:00Z`;
const dayEnd = `${dateStr}T23:59:59Z`;
const allFlights = [];
let pageNumber = 0;
let hasMore = true;
while (hasMore) {
try {
const response = await apiRequest('/flightstatus', {
startRange: dayStart,
endRange: dayEnd,
movementType: 'D',
timeOriginType: 'S',
timeType: 'U',
pageSize: CONFIG.pageSize,
pageNumber,
operatingAirlineCode: airlineCode,
});
const flights = response.operationalFlights || [];
allFlights.push(...flights);
const page = response.page || {};
const totalPages = page.totalPages || 1;
process.stdout.write(`\r ${dateStr}: Page ${pageNumber + 1}/${totalPages} (${allFlights.length} flights)`);
hasMore = pageNumber < (totalPages - 1);
pageNumber++;
if (pageNumber > 100) break;
} catch (error) {
if (error.message.includes('403') || error.message.includes('429')) {
console.log(`\n ⚠️ API rate limit reached after ${pageNumber} pages`);
break;
}
throw error;
}
}
process.stdout.write('\n');
return allFlights;
}
// ============================================================================
// Update Logic
// ============================================================================
function detectChanges(existing, newData, dateStr) {
const changes = [];
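  // Each entry follows the history_entry shape, e.g.
  // { timestamp: '2026-01-20', property: 'connectivity.wifi', old_value: 'low-speed', new_value: 'high-speed', source: 'airline_api' }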
if (existing.connectivity?.wifi !== newData.connectivity?.wifi) {
changes.push({
timestamp: dateStr,
property: 'connectivity.wifi',
old_value: existing.connectivity?.wifi,
new_value: newData.connectivity?.wifi,
source: 'airline_api',
});
}
if (existing.connectivity?.wifi_provider !== newData.connectivity?.wifi_provider) {
changes.push({
timestamp: dateStr,
property: 'connectivity.wifi_provider',
old_value: existing.connectivity?.wifi_provider,
new_value: newData.connectivity?.wifi_provider,
source: 'airline_api',
});
}
if (existing.cabin?.physical_configuration !== newData.cabin?.physical_configuration) {
changes.push({
timestamp: dateStr,
property: 'cabin.physical_configuration',
old_value: existing.cabin?.physical_configuration,
new_value: newData.cabin?.physical_configuration,
source: 'airline_api',
});
}
if (existing.operator?.sub_fleet_code !== newData.operator?.sub_fleet_code) {
changes.push({
timestamp: dateStr,
property: 'operator.sub_fleet_code',
old_value: existing.operator?.sub_fleet_code,
new_value: newData.operator?.sub_fleet_code,
source: 'airline_api',
});
}
return changes;
}
function mergeAircraft(existing, newData, changes, dateStr) {
existing.connectivity = newData.connectivity;
existing.cabin.physical_configuration = newData.cabin.physical_configuration;
existing.cabin.total_seats = newData.cabin.total_seats;
existing.cabin.classes = newData.cabin.classes;
existing.operator = newData.operator;
existing.aircraft_type = newData.aircraft_type;
existing.tracking.last_seen = dateStr;
existing.tracking.total_flights = (existing.tracking.total_flights || 0) + 1;
existing.metadata.updated_at = new Date().toISOString();
if (changes.length > 0) {
const existingKeys = new Set(
existing.history.map(h => `${h.timestamp}|${h.property}|${h.old_value}|${h.new_value}`)
);
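    // Dedup keys look like "2026-01-20|connectivity.wifi|low-speed|high-speed"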
for (const change of changes) {
const key = `${change.timestamp}|${change.property}|${change.old_value}|${change.new_value}`;
if (!existingKeys.has(key)) {
existing.history.push(change);
}
}
}
return existing;
}
// ============================================================================
// Main
// ============================================================================
function printHelp() {
console.log(`
✈️ Air France / KLM Fleet Catalog Updater
Usage:
node fleet-update.js --airline <CODE> [options]
Required:
--airline <CODE> Airline code: AF (Air France) or KL (KLM)
Options:
--dry-run Preview changes without saving
--date <YYYY-MM-DD> Use specific date instead of today
--bootstrap Build catalog from scratch (crawl last 7 days)
--days <N> Number of days for bootstrap (default: 7)
--verbose Show detailed output
--output-changes Export changes to changes.json
--stale-days <N> Days threshold for stale aircraft (default: 30)
--help Show this help message
Environment:
AFKLM_API_KEY Single API key
AFKLM_API_KEYS Comma-separated API keys (for rotation)
Examples:
node fleet-update.js --airline AF # Update Air France
node fleet-update.js --airline KL --bootstrap # Build KLM catalog
node fleet-update.js --airline KL --dry-run # Preview KLM changes
`);
}
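// e.g. getDateRange(new Date('2026-02-12'), 3) → ['2026-02-10', '2026-02-11', '2026-02-12'] (illustrative dates)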
function getDateRange(startDate, days) {
const dates = [];
for (let i = days - 1; i >= 0; i--) {
const d = new Date(startDate);
d.setDate(d.getDate() - i);
dates.push(formatDate(d));
}
return dates;
}
async function main() {
const args = process.argv.slice(2);
if (args.includes('--help') || args.includes('-h')) {
printHelp();
process.exit(0);
}
// Parse arguments
const airlineArg = args.find((_, i) => args[i - 1] === '--airline');
const dryRun = args.includes('--dry-run');
const verbose = args.includes('--verbose') || args.includes('-v');
const outputChanges = args.includes('--output-changes');
const bootstrap = args.includes('--bootstrap');
const dateArg = args.find((_, i) => args[i - 1] === '--date');
const daysArg = args.find((_, i) => args[i - 1] === '--days');
const staleDaysArg = args.find((_, i) => args[i - 1] === '--stale-days');
const staleDays = parseInt(staleDaysArg || '30', 10);
const bootstrapDays = parseInt(daysArg || '7', 10);
  // Validate airline (accept upper- or lower-case input)
  const airlineCode = (airlineArg || '').toUpperCase();
  if (!AIRLINES[airlineCode]) {
    console.error('❌ Error: --airline is required (AF or KL)');
    printHelp();
    process.exit(1);
  }
  const airline = AIRLINES[airlineCode];
// Load API keys from environment
const apiKeys = (process.env.AFKLM_API_KEYS || process.env.AFKLM_API_KEY || '').split(',').filter(k => k);
if (apiKeys.length === 0) {
console.error('❌ Error: No API key found. Set AFKLM_API_KEY or AFKLM_API_KEYS environment variable.');
process.exit(1);
}
// Configure
CONFIG.apiKeys = apiKeys;
CONFIG.airlineCode = airlineCode;
CONFIG.catalogPath = path.join(__dirname, 'airlines', `${airlineCode}.json`);
console.log(`\n✈️ ${airline.name} Fleet Catalog Updater\n`);
console.log(` 🔑 API keys loaded: ${apiKeys.length}`);
if (dryRun) {
console.log(' 🔍 DRY RUN - no changes will be saved\n');
}
// Load or create catalog
let catalog;
const catalogExists = fs.existsSync(CONFIG.catalogPath);
if (catalogExists && !bootstrap) {
console.log(`📂 Loading ${CONFIG.catalogPath}...`);
const content = fs.readFileSync(CONFIG.catalogPath, 'utf-8');
catalog = JSON.parse(content);
console.log(` Found ${catalog.aircraft_count} aircraft\n`);
} else {
if (bootstrap) {
console.log(`🚀 Bootstrap mode: Creating new catalog for ${airline.name}\n`);
} else {
console.log(`📂 No existing catalog found, creating new one\n`);
}
catalog = {
schema_version: '1.0.0',
airline: {
iata_code: airlineCode,
name: airline.name,
country: airline.country,
},
generated_at: new Date().toISOString(),
aircraft_count: 0,
aircraft: [],
};
}
// Build lookup
const aircraftByReg = new Map();
catalog.aircraft.forEach(a => aircraftByReg.set(a.registration, a));
// Determine dates to process
let datesToProcess;
if (bootstrap) {
datesToProcess = getDateRange(new Date(), bootstrapDays);
    console.log(`📅 Crawling ${bootstrapDays} days: ${datesToProcess[0]} → ${datesToProcess[datesToProcess.length - 1]}\n`);
} else {
const targetDate = dateArg || formatDate(new Date());
datesToProcess = [targetDate];
console.log(`📅 Processing: ${targetDate}\n`);
}
// Process each date
let totalNew = 0;
let totalUpdated = 0;
let totalSeen = 0;
const allChanges = [];
const seenAircraftAll = new Map();
for (const dateStr of datesToProcess) {
console.log(`📡 Fetching ${airlineCode} flights for ${dateStr}...`);
const flights = await fetchFlightsForDate(dateStr, airlineCode);
// Extract aircraft
const seenToday = new Map();
for (const flight of flights) {
const extracted = extractAircraftFromFlight(flight, airlineCode);
if (extracted && extracted.registration) {
seenToday.set(extracted.registration, extracted);
seenAircraftAll.set(extracted.registration, { data: extracted, date: dateStr });
}
}
console.log(` ✈️ ${seenToday.size} unique ${airlineCode} aircraft\n`);
// Process
for (const [reg, rawData] of seenToday) {
const newData = transformToSchema(rawData, dateStr);
const existing = aircraftByReg.get(reg);
if (!existing) {
totalNew++;
if (verbose || bootstrap) {
console.log(` NEW: ${reg} (${rawData.typeName || 'Unknown'})`);
}
if (!dryRun) {
catalog.aircraft.push(newData);
aircraftByReg.set(reg, newData);
}
} else {
const changes = detectChanges(existing, newData, dateStr);
if (changes.length > 0) {
totalUpdated++;
if (verbose) {
console.log(` 🔄 UPDATED: ${reg}`);
          changes.forEach(c => console.log(`      ${c.property}: ${c.old_value} → ${c.new_value}`));
}
allChanges.push(...changes.map(c => ({ registration: reg, ...c })));
if (!dryRun) {
mergeAircraft(existing, newData, changes, dateStr);
}
} else {
totalSeen++;
if (!dryRun) {
existing.tracking.last_seen = dateStr;
existing.tracking.total_flights = (existing.tracking.total_flights || 0) + 1;
}
}
}
}
}
// Summary
console.log('\n' + '═'.repeat(50));
console.log('📊 Summary');
console.log('═'.repeat(50));
console.log(` New aircraft: ${totalNew}`);
console.log(` Updated aircraft: ${totalUpdated}`);
console.log(` Seen (no change): ${totalSeen}`);
console.log(` Total in catalog: ${catalog.aircraft.length}`);
console.log(` Total changes: ${allChanges.length}`);
console.log(` API requests: ${totalRequests}`);
// Stale aircraft
if (!bootstrap) {
const notSeen = catalog.aircraft.filter(a => !seenAircraftAll.has(a.registration));
const todayDate = new Date();
const staleThreshold = new Date(todayDate.getTime() - staleDays * 24 * 60 * 60 * 1000);
const staleAircraft = notSeen.filter(a => {
if (!a.tracking?.last_seen) return true;
return new Date(a.tracking.last_seen) < staleThreshold;
});
if (staleAircraft.length > 0) {
console.log(`\n⚠️ Stale aircraft (not seen in ${staleDays}+ days): ${staleAircraft.length}`);
staleAircraft.slice(0, 5).forEach(a => {
console.log(` - ${a.registration} (last: ${a.tracking?.last_seen || 'never'})`);
});
if (staleAircraft.length > 5) console.log(` ... and ${staleAircraft.length - 5} more`);
}
}
// WiFi stats
const wifiStats = { none: 0, 'low-speed': 0, 'high-speed': 0 };
catalog.aircraft.forEach(a => {
const wifi = a.connectivity?.wifi || 'none';
wifiStats[wifi] = (wifiStats[wifi] || 0) + 1;
});
const total = catalog.aircraft.length;
console.log('\n📶 Fleet WiFi Status:');
console.log(` High-speed (Starlink): ${wifiStats['high-speed']} (${total ? Math.round(wifiStats['high-speed'] / total * 100) : 0}%)`);
console.log(` Low-speed: ${wifiStats['low-speed']} (${total ? Math.round(wifiStats['low-speed'] / total * 100) : 0}%)`);
console.log(` None: ${wifiStats['none']} (${total ? Math.round(wifiStats['none'] / total * 100) : 0}%)`);
// Export changes
if (outputChanges && allChanges.length > 0) {
const changesPath = path.join(__dirname, `${airlineCode.toLowerCase()}-changes.json`);
fs.writeFileSync(changesPath, JSON.stringify({
generated_at: new Date().toISOString(),
airline: airlineCode,
changes: allChanges,
}, null, 2));
console.log(`\n📝 Changes exported to ${changesPath}`);
}
// Save
if (!dryRun && (totalNew > 0 || totalUpdated > 0 || totalSeen > 0)) {
catalog.generated_at = new Date().toISOString();
catalog.aircraft_count = catalog.aircraft.length;
catalog.aircraft.sort((a, b) => {
const typeCompare = (a.aircraft_type?.iata_code || '').localeCompare(b.aircraft_type?.iata_code || '');
if (typeCompare !== 0) return typeCompare;
return a.registration.localeCompare(b.registration);
});
// Ensure directory exists
const dir = path.dirname(CONFIG.catalogPath);
if (!fs.existsSync(dir)) {
fs.mkdirSync(dir, { recursive: true });
}
console.log(`\n💾 Saving to ${CONFIG.catalogPath}...`);
fs.writeFileSync(CONFIG.catalogPath, JSON.stringify(catalog, null, 2));
console.log('✅ Done!');
} else if (dryRun) {
console.log('\n🔍 Dry run complete - no changes saved');
} else {
console.log('\n✅ No changes to save');
}
console.log();
}
main().catch(error => {
console.error(`\n❌ Error: ${error.message}`);
if (process.env.DEBUG) console.error(error.stack);
process.exit(1);
});
+393
@@ -0,0 +1,393 @@
#!/usr/bin/env node
/**
* Generate README with fleet statistics
*
* Automatically updates README.md with current fleet data from JSON files.
* Run this after updating fleet data to keep stats in sync.
*
* Usage:
* node generate-readme.js
*/
import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Airline display info
const AIRLINE_INFO = {
AF: { name: 'Air France', flag: '🇫🇷', country: 'France' },
KL: { name: 'KLM', flag: '🇳🇱', country: 'Netherlands' },
};
// Load all airline data
function loadAirlines() {
const airlinesDir = path.join(__dirname, 'airlines');
const files = fs.readdirSync(airlinesDir).filter(f => f.endsWith('.json'));
const airlines = {};
for (const file of files) {
const code = file.replace('.json', '');
const content = fs.readFileSync(path.join(airlinesDir, file), 'utf-8');
airlines[code] = JSON.parse(content);
}
return airlines;
}
// Get fleet breakdown by type
function getFleetBreakdown(aircraft) {
const breakdown = {};
for (const a of aircraft) {
const typeName = a.aircraft_type?.full_name || 'Unknown';
// Simplify type name
let simpleType = typeName
.replace('AIRBUS ', '')
.replace('BOEING ', '')
.replace(' (WINGLETS) PASSENGER/BBJ1', '')
.replace(' (WINGLETS) PASSENGER/BBJ2', '')
.replace(' (WINGLETS) PASSENGER/BBJ3', '')
      .replace('-200/200 ER', '-200ER')
      .replace('/200 ER', '-200ER')
.trim();
breakdown[simpleType] = (breakdown[simpleType] || 0) + 1;
}
// Sort by count descending
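  // e.g. [['A350-900', 38], ['777-300ER', 22], ...] (illustrative counts)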
return Object.entries(breakdown)
.sort((a, b) => b[1] - a[1]);
}
// Get WiFi stats
function getWifiStats(aircraft) {
const stats = { none: 0, 'low-speed': 0, 'high-speed': 0 };
for (const a of aircraft) {
const wifi = a.connectivity?.wifi || 'none';
stats[wifi] = (stats[wifi] || 0) + 1;
}
const total = aircraft.length;
return {
total,
none: stats.none,
lowSpeed: stats['low-speed'],
highSpeed: stats['high-speed'],
nonePercent: total ? Math.round(stats.none / total * 100) : 0,
lowSpeedPercent: total ? Math.round(stats['low-speed'] / total * 100) : 0,
highSpeedPercent: total ? Math.round(stats['high-speed'] / total * 100) : 0,
};
}
// Generate markdown table for fleet breakdown
function generateFleetTable(airlines) {
let md = '';
for (const [code, data] of Object.entries(airlines)) {
const info = AIRLINE_INFO[code] || { name: code, flag: '✈️' };
const breakdown = getFleetBreakdown(data.aircraft);
const wifi = getWifiStats(data.aircraft);
md += `### ${info.flag} ${info.name} (${code})\n\n`;
md += `| Aircraft Type | Count |\n`;
md += `|---------------|-------|\n`;
for (const [type, count] of breakdown) {
md += `| ${type} | ${count} |\n`;
}
md += `| **Total** | **${wifi.total}** |\n\n`;
}
return md;
}
// Get detailed breakdown by type and config
function getDetailedBreakdown(aircraft) {
const breakdown = {};
for (const a of aircraft) {
const typeName = a.aircraft_type?.full_name || 'Unknown';
// Simplify type name
let simpleType = typeName
.replace('AIRBUS ', '')
.replace('BOEING ', '')
.replace(' (WINGLETS) PASSENGER/BBJ1', '')
.replace(' (WINGLETS) PASSENGER/BBJ2', '')
.replace(' (WINGLETS) PASSENGER/BBJ3', '')
      .replace('-200/200 ER', '-200ER')
      .replace('/200 ER', '-200ER')
.trim();
const config = a.cabin?.physical_configuration || '-';
const wifi = a.connectivity?.wifi || 'none';
const seats = a.cabin?.total_seats || 0;
const key = `${simpleType}|||${config}`;
if (!breakdown[key]) {
breakdown[key] = {
type: simpleType,
config,
seats,
wifi,
count: 0,
highSpeed: 0,
};
}
breakdown[key].count++;
if (wifi === 'high-speed') {
breakdown[key].highSpeed++;
}
}
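  // Each entry looks like { type: 'A350-900', config: 'J034W024Y266', seats: 324, wifi: 'high-speed', count: 1, highSpeed: 1 } (illustrative values)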
// Sort by type name, then by config (to group similar aircraft together)
return Object.values(breakdown)
.sort((a, b) => {
const typeCompare = a.type.localeCompare(b.type);
if (typeCompare !== 0) return typeCompare;
return a.config.localeCompare(b.config);
});
}
// Generate detailed fleet table per airline
function generateDetailedFleetTable(airlines) {
let md = '';
for (const [code, data] of Object.entries(airlines)) {
const info = AIRLINE_INFO[code] || { name: code, flag: '✈️' };
const breakdown = getDetailedBreakdown(data.aircraft);
md += `### ${info.flag} ${info.name} — Detailed Configuration\n\n`;
md += `| Aircraft | Config | Seats | Count | 🛜 Starlink |\n`;
md += `|----------|--------|-------|-------|-------------|\n`;
for (const item of breakdown) {
const starlinkInfo = item.highSpeed > 0
? `${item.highSpeed}/${item.count} (${Math.round(item.highSpeed / item.count * 100)}%)`
: '-';
md += `| ${item.type} | \`${item.config}\` | ${item.seats || '-'} | ${item.count} | ${starlinkInfo} |\n`;
}
md += `\n`;
}
return md;
}
// Generate WiFi summary table
function generateWifiSummary(airlines) {
let md = '| Airline | Total | 📶 WiFi | 🛜 High-Speed | % Starlink |\n';
md += '|---------|-------|---------|---------------|------------|\n';
let grandTotal = 0;
let grandWifi = 0;
let grandHighSpeed = 0;
for (const [code, data] of Object.entries(airlines)) {
const info = AIRLINE_INFO[code] || { name: code, flag: '✈️' };
const wifi = getWifiStats(data.aircraft);
const wifiTotal = wifi.lowSpeed + wifi.highSpeed;
const wifiPercent = wifi.total ? Math.round(wifiTotal / wifi.total * 100) : 0;
md += `| ${info.flag} ${info.name} | ${wifi.total} | ${wifiTotal} (${wifiPercent}%) | ${wifi.highSpeed} | **${wifi.highSpeedPercent}%** |\n`;
grandTotal += wifi.total;
grandWifi += wifiTotal;
grandHighSpeed += wifi.highSpeed;
}
const grandWifiPercent = grandTotal ? Math.round(grandWifi / grandTotal * 100) : 0;
const grandHighSpeedPercent = grandTotal ? Math.round(grandHighSpeed / grandTotal * 100) : 0;
md += `| **Combined** | **${grandTotal}** | **${grandWifi} (${grandWifiPercent}%)** | **${grandHighSpeed}** | **${grandHighSpeedPercent}%** |\n`;
return md;
}
// Generate the full README
function generateReadme(airlines) {
const now = new Date().toISOString().split('T')[0];
return `# ✈️ AF-KLM Fleet Catalog
Open source, community-maintained catalog of **Air France** and **KLM** fleets with real-time tracking of aircraft properties, WiFi connectivity, and historical changes.
---
## 📊 Fleet Overview
${generateWifiSummary(airlines)}
> 🛜 **High-Speed** = Starlink satellite internet (50+ Mbps)
> 📶 **WiFi** = Any WiFi connectivity (low-speed or high-speed)
*Last updated: ${now}*
---
## 🛫 Fleet Breakdown
${generateFleetTable(airlines)}
---
## 📋 Detailed Configuration
${generateDetailedFleetTable(airlines)}
---
## 🚀 Quick Start
### Update the Catalog
\`\`\`bash
# Set your API key
export AFKLM_API_KEY=your_api_key_here
# Update Air France
node fleet-update.js --airline AF
# Update KLM
node fleet-update.js --airline KL
# Preview changes without saving
node fleet-update.js --airline KL --dry-run
# Regenerate this README with latest stats
node generate-readme.js
\`\`\`
### Using the Data
\`\`\`javascript
// Load Air France fleet
const response = await fetch('https://raw.githubusercontent.com/.../airlines/AF.json');
const fleet = await response.json();
// Find all Starlink aircraft
const starlink = fleet.aircraft.filter(a => a.connectivity.wifi === 'high-speed');
console.log(\`\${starlink.length} aircraft with Starlink\`);
// Get aircraft by type
const a350s = fleet.aircraft.filter(a => a.aircraft_type.full_name?.includes('A350'));
\`\`\`
---
## 📁 Data Structure
\`\`\`
af-klm/
├── airlines/
│ ├── AF.json # Air France fleet
│ └── KL.json # KLM fleet
├── schema/
│ └── aircraft.schema.json
├── fleet-update.js # Update script
└── generate-readme.js # This stats generator
\`\`\`
### Aircraft Schema
\`\`\`json
{
"registration": "F-HTYA",
"aircraft_type": {
"iata_code": "359",
"manufacturer": "Airbus",
"model": "A350",
"full_name": "AIRBUS A350-900"
},
"cabin": {
"physical_configuration": "J034W024Y266",
"total_seats": 324,
"classes": { "business": 34, "premium_economy": 24, "economy": 266 }
},
"connectivity": {
"wifi": "high-speed",
"wifi_provider": "Starlink",
"satellite": true
},
"tracking": {
"first_seen": "2025-01-15",
"last_seen": "2026-02-04",
"total_flights": 1250
},
"history": [
{
"timestamp": "2026-01-20",
"property": "connectivity.wifi",
"old_value": "low-speed",
"new_value": "high-speed",
"source": "airline_api"
}
]
}
\`\`\`
---
## 🤝 Contributing
### Daily Updates
Community members are encouraged to run the update script daily:
1. Fork this repo
2. Set your \`AFKLM_API_KEY\`
3. Run \`node fleet-update.js --airline AF\` and \`--airline KL\`
4. Run \`node generate-readme.js\` to update stats
5. Submit a PR
### API Key
Get a free API key at [developer.airfranceklm.com](https://developer.airfranceklm.com)
---
## 📋 Schema Version
Current: **1.0.0**
---
## 📄 License
Released under the MIT License.
---
Made with ✈️ by the aviation community
`;
}
// Main
function main() {
console.log('📊 Generating README with fleet statistics...\n');
const airlines = loadAirlines();
// Show summary
for (const [code, data] of Object.entries(airlines)) {
const info = AIRLINE_INFO[code] || { name: code };
const wifi = getWifiStats(data.aircraft);
console.log(`${info.name}: ${wifi.total} aircraft, ${wifi.highSpeed} Starlink (${wifi.highSpeedPercent}%)`);
}
// Generate and save README
const readme = generateReadme(airlines);
const readmePath = path.join(__dirname, 'README.md');
fs.writeFileSync(readmePath, readme);
console.log(`\n✅ README.md updated!`);
}
main();
+38
@@ -0,0 +1,38 @@
{
"name": "fleet-catalog",
"version": "1.0.0",
"description": "Open-source catalog of airline fleets with historical tracking",
"type": "module",
"scripts": {
"update:af": "node fleet-update.js --airline AF",
"update:kl": "node fleet-update.js --airline KL",
"update:all": "node fleet-update.js --airline AF && node fleet-update.js --airline KL && node generate-readme.js",
"update:af:dry": "node fleet-update.js --airline AF --dry-run",
"update:kl:dry": "node fleet-update.js --airline KL --dry-run",
"bootstrap:af": "node fleet-update.js --airline AF --bootstrap",
"bootstrap:kl": "node fleet-update.js --airline KL --bootstrap",
"readme": "node generate-readme.js",
"validate": "node scripts/validate.js"
},
"keywords": [
"aviation",
"airlines",
"fleet",
"aircraft",
"tracking"
],
"license": "MIT",
"repository": {
"type": "git",
"url": "https://github.com/fleet-catalog/fleet-catalog"
},
"engines": {
"node": ">=18.0.0"
},
"files": [
"airlines/",
"schema/",
"reference/"
]
}
+75
@@ -0,0 +1,75 @@
{
"$schema": "../schema/reference.schema.json",
"description": "Cabin class codes used in seat configuration strings",
"codes": [
{
"code": "P",
"class": "first",
"name": "First Class / La Première",
"notes": "Premium first class, used by Air France for La Première"
},
{
"code": "F",
"class": "first",
"name": "First Class",
"notes": "Traditional first class"
},
{
"code": "J",
"class": "business",
"name": "Business Class",
"notes": "Standard business class code"
},
{
"code": "C",
"class": "business",
"name": "Business Class",
"notes": "Alternative business class code, sometimes used for intra-European business"
},
{
"code": "W",
"class": "premium_economy",
"name": "Premium Economy",
"notes": "Premium economy class"
},
{
"code": "Y",
"class": "economy",
"name": "Economy Class",
"notes": "Standard economy class"
}
],
"parsing_notes": [
"Configuration strings follow format: [CLASS_CODE][SEAT_COUNT]",
"Seat count is typically 2-3 digits (e.g., J034, Y266, or J34, Y266)",
"Multiple classes are concatenated: P004J058W028Y206",
"Parse left-to-right, extracting each class code followed by its count"
],
"examples": [
{
"configuration": "Y148",
"parsed": { "economy": 148 },
"total": 148,
"description": "Single-class economy (e.g., A220)"
},
{
"configuration": "J034W024Y266",
"parsed": { "business": 34, "premium_economy": 24, "economy": 266 },
"total": 324,
"description": "Three-class long-haul (e.g., A350-900)"
},
{
"configuration": "P004J058W028Y206",
"parsed": { "first": 4, "business": 58, "premium_economy": 28, "economy": 206 },
"total": 296,
"description": "Four-class with La Première (e.g., 777-300ER)"
},
{
"configuration": "C108Y066",
"parsed": { "business": 108, "economy": 66 },
"total": 174,
"description": "Two-class short-haul with business (e.g., A320)"
}
]
}
+87
@@ -0,0 +1,87 @@
{
"$schema": "../schema/reference.schema.json",
"description": "Known WiFi providers and their characteristics",
"providers": [
{
"id": "starlink",
"name": "Starlink",
"company": "SpaceX",
"technology": "LEO satellite",
"speed_tier": "high-speed",
"typical_speed_mbps": "50-200",
"coverage": "global",
"notes": "Low-earth orbit constellation, low latency"
},
{
"id": "viasat",
"name": "Viasat",
"company": "Viasat Inc.",
"technology": "Ka-band GEO satellite",
"speed_tier": "high-speed",
"typical_speed_mbps": "12-100",
"coverage": "regional",
"notes": "ViaSat-1, ViaSat-2, ViaSat-3 satellites"
},
{
"id": "gogo_2ku",
"name": "Gogo 2Ku",
"company": "Gogo",
"technology": "Dual Ku-band satellite",
"speed_tier": "high-speed",
"typical_speed_mbps": "15-70",
"coverage": "global",
"notes": "Dual antenna system for better coverage"
},
{
"id": "gogo_atg",
"name": "Gogo ATG",
"company": "Gogo",
"technology": "Air-to-ground",
"speed_tier": "low-speed",
"typical_speed_mbps": "3-10",
"coverage": "continental_us",
"notes": "Ground-based towers, US domestic only"
},
{
"id": "panasonic_ku",
"name": "Panasonic Ku-band",
"company": "Panasonic Avionics",
"technology": "Ku-band satellite",
"speed_tier": "low-speed",
"typical_speed_mbps": "5-20",
"coverage": "global",
"notes": "eXConnect service"
},
{
"id": "inmarsat_gx",
"name": "Inmarsat GX Aviation",
"company": "Inmarsat",
"technology": "Ka-band GEO satellite",
"speed_tier": "high-speed",
"typical_speed_mbps": "15-50",
"coverage": "global",
"notes": "Global Xpress network"
},
{
"id": "anuvu",
"name": "Anuvu",
"company": "Anuvu (formerly Global Eagle)",
"technology": "Ku-band satellite",
"speed_tier": "low-speed",
"typical_speed_mbps": "5-15",
"coverage": "regional",
"notes": "Formerly Global Eagle Entertainment"
},
{
"id": "thales_flexvue",
"name": "Thales FlexVue",
"company": "Thales",
"technology": "Ku/Ka-band satellite",
"speed_tier": "high-speed",
"typical_speed_mbps": "20-50",
"coverage": "global",
"notes": "Part of Thales InFlyt Experience"
}
]
}
+333
@@ -0,0 +1,333 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://github.com/fleet-catalog/schema/aircraft.schema.json",
"title": "Airline Fleet Catalog",
"description": "Standardized schema for tracking airline fleet properties across multiple carriers",
"type": "object",
"required": ["schema_version", "airline", "generated_at", "aircraft"],
"properties": {
"schema_version": {
"type": "string",
"description": "Semantic version of the schema",
"pattern": "^\\d+\\.\\d+\\.\\d+$"
},
"airline": {
"type": "object",
"required": ["iata_code", "name"],
"properties": {
"iata_code": {
"type": "string",
"description": "2-letter IATA airline code",
"pattern": "^[A-Z0-9]{2}$"
},
"icao_code": {
"type": ["string", "null"],
"description": "3-letter ICAO airline code",
"pattern": "^[A-Z]{3}$"
},
"name": {
"type": "string",
"description": "Full airline name"
},
"country": {
"type": ["string", "null"],
"description": "ISO 3166-1 alpha-2 country code"
}
}
},
"generated_at": {
"type": "string",
"format": "date-time",
"description": "ISO 8601 timestamp when this file was generated"
},
"aircraft_count": {
"type": "integer",
"description": "Total number of aircraft in this file"
},
"aircraft": {
"type": "array",
"items": {
"$ref": "#/$defs/aircraft"
}
}
},
"$defs": {
"aircraft": {
"type": "object",
"required": ["registration"],
"properties": {
"registration": {
"type": "string",
"description": "Aircraft registration (tail number) without dashes"
},
"icao24": {
"type": ["string", "null"],
"description": "24-bit ICAO Mode-S transponder address in hexadecimal"
},
"aircraft_type": {
"$ref": "#/$defs/aircraft_type"
},
"operator": {
"$ref": "#/$defs/operator"
},
"cabin": {
"$ref": "#/$defs/cabin"
},
"connectivity": {
"$ref": "#/$defs/connectivity"
},
"ife": {
"$ref": "#/$defs/ife"
},
"status": {
"type": "string",
"enum": ["active", "stored", "maintenance", "retired"],
"description": "Current operational status"
},
"tracking": {
"$ref": "#/$defs/tracking"
},
"metadata": {
"$ref": "#/$defs/metadata"
},
"history": {
"type": "array",
"items": {
"$ref": "#/$defs/history_entry"
}
}
}
},
"aircraft_type": {
"type": "object",
"properties": {
"iata_code": {
"type": ["string", "null"],
"description": "IATA aircraft type code (e.g., 77W, 359)"
},
"icao_code": {
"type": ["string", "null"],
"description": "ICAO aircraft type designator (e.g., B77W, A359)"
},
"manufacturer": {
"type": ["string", "null"],
"description": "Aircraft manufacturer (Airbus, Boeing, Embraer, etc.)"
},
"model": {
"type": ["string", "null"],
"description": "Aircraft model (A350, 777, etc.)"
},
"variant": {
"type": ["string", "null"],
"description": "Aircraft variant (900, 300ER, etc.)"
},
"full_name": {
"type": ["string", "null"],
"description": "Full aircraft type name"
}
}
},
"operator": {
"type": "object",
"description": "Operational details specific to this aircraft",
"properties": {
"sub_fleet_code": {
"type": ["string", "null"],
"description": "Internal sub-fleet code (e.g., AB, CA, AR)"
},
"cabin_crew_employer": {
"type": ["string", "null"],
"description": "Airline code of cabin crew employer"
},
"cockpit_crew_employer": {
"type": ["string", "null"],
"description": "Airline code of cockpit crew employer"
}
}
},
"cabin": {
"type": "object",
"properties": {
"physical_configuration": {
"type": ["string", "null"],
"description": "Physical seat configuration code (e.g., J034W024Y266)"
},
"operational_configuration": {
"type": ["string", "null"],
"description": "Operational/saleable seat configuration"
},
"saleable_configuration": {
"type": ["string", "null"],
"description": "Saleable seat configuration"
},
"total_seats": {
"type": ["integer", "null"],
"description": "Total number of passenger seats"
},
"classes": {
"type": "object",
"properties": {
"first": {
"type": "integer",
"description": "Number of first class seats"
},
"business": {
"type": "integer",
"description": "Number of business class seats"
},
"premium_economy": {
"type": "integer",
"description": "Number of premium economy seats"
},
"economy": {
"type": "integer",
"description": "Number of economy seats"
}
}
},
"freight_configuration": {
"type": ["string", "null"],
"description": "Cargo hold configuration (e.g., PP008LL012)"
}
}
},
"connectivity": {
"type": "object",
"properties": {
"wifi": {
"type": "string",
"enum": ["none", "low-speed", "high-speed"],
"description": "WiFi availability and speed tier"
},
"wifi_provider": {
"type": ["string", "null"],
"description": "WiFi service provider (Starlink, Viasat, Gogo 2Ku, etc.)"
},
"satellite": {
"type": ["boolean", "null"],
"description": "Whether satellite connectivity is available"
},
"live_tv": {
"type": ["boolean", "null"],
"description": "Whether live TV is available"
},
"power_outlets": {
"type": ["boolean", "null"],
"description": "Whether AC power outlets are available"
},
"usb_ports": {
"type": ["boolean", "null"],
"description": "Whether USB charging ports are available"
}
}
},
"ife": {
"type": "object",
"description": "In-flight entertainment system",
"properties": {
"type": {
"type": ["string", "null"],
"enum": ["none", "overhead", "seatback", "streaming", "hybrid", null],
"description": "Type of IFE system"
},
"personal_screens": {
"type": ["boolean", "null"],
"description": "Whether personal seatback screens are available"
}
}
},
"tracking": {
"type": "object",
"description": "Flight tracking statistics",
"properties": {
"first_seen": {
"type": ["string", "null"],
"format": "date",
"description": "Date when aircraft was first tracked"
},
"last_seen": {
"type": ["string", "null"],
"format": "date",
"description": "Date when aircraft was last tracked"
},
"total_flights": {
"type": ["integer", "null"],
"description": "Total number of flights tracked"
}
}
},
"metadata": {
"type": "object",
"description": "Additional metadata about the aircraft",
"properties": {
"delivery_date": {
"type": ["string", "null"],
"format": "date",
"description": "Date aircraft was delivered to airline"
},
"msn": {
"type": ["string", "null"],
"description": "Manufacturer Serial Number"
},
"line_number": {
"type": ["string", "null"],
"description": "Production line number"
},
"production_site": {
"type": ["string", "null"],
"description": "Factory/production site (e.g., Toulouse, Hamburg, Mirabel)"
},
"engine_type": {
"type": ["string", "null"],
"description": "Engine model (e.g., Trent XWB-84, GE90-115B)"
},
"aircraft_name": {
"type": ["string", "null"],
"description": "Aircraft given name (e.g., 'Fort-de-France')"
},
"livery": {
"type": ["string", "null"],
"description": "Special livery or paint scheme"
},
"comments": {
"type": ["string", "null"],
"description": "Additional notes or comments"
},
"created_at": {
"type": ["string", "null"],
"format": "date-time"
},
"updated_at": {
"type": ["string", "null"],
"format": "date-time"
}
}
},
"history_entry": {
"type": "object",
"required": ["timestamp", "property"],
"properties": {
"timestamp": {
"type": "string",
"description": "Date or datetime when change was detected"
},
"property": {
"type": "string",
"description": "Dot-notation path to the changed property"
},
"old_value": {
"description": "Previous value"
},
"new_value": {
"description": "New value"
},
"source": {
"type": ["string", "null"],
"enum": ["flight_api", "airline_api", "manual", "planespotters", "community", null],
"description": "Source of the change detection"
}
}
}
}
}
+4 -1
@@ -1,3 +1,6 @@
faa-aircraft-registry==0.1.0
pandas==3.0.0
pyarrow==23.0.0
orjson==3.11.7
polars==1.38.1
jsonschema==4.26.0
+113
@@ -0,0 +1,113 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"title": "OpenAirframes Community Submission (v1)",
"type": "object",
"additionalProperties": false,
"properties": {
"registration_number": {
"type": "string",
"minLength": 1
},
"transponder_code_hex": {
"type": "string",
"pattern": "^[0-9A-F]{6}$"
},
"openairframes_id": {
"type": "string",
"minLength": 1
},
"contributor_uuid": {
"type": "string",
"format": "uuid"
},
"contributor_name": {
"type": "string",
"minLength": 0,
"maxLength": 150,
"description": "Display name (may be blank)"
},
"creation_timestamp": {
"type": "string",
"format": "date-time",
"description": "Set by the system when the submission is persisted/approved.",
"readOnly": true
},
"start_date": {
"type": "string",
"format": "date",
"pattern": "^\\d{4}-\\d{2}-\\d{2}$",
"description": "Optional start date for when this submission's tags are valid (ISO 8601, e.g., 2025-05-01)."
},
"end_date": {
"type": "string",
"format": "date",
"pattern": "^\\d{4}-\\d{2}-\\d{2}$",
"description": "Optional end date for when this submission's tags are valid (ISO 8601, e.g., 2025-07-03)."
},
"tags": {
"type": "object",
"description": "Additional community-defined tags as key/value pairs (values may be scalar, array, or object).",
"propertyNames": {
"type": "string",
"pattern": "^[a-z][a-z0-9_]{0,63}$"
},
"additionalProperties": {
"$ref": "#/$defs/tagValue"
},
"properties": {}
}
},
"allOf": [
{
"anyOf": [
{
"required": [
"registration_number"
]
},
{
"required": [
"transponder_code_hex"
]
},
{
"required": [
"openairframes_id"
]
}
]
}
],
"$defs": {
"tagScalar": {
"type": [
"string",
"number",
"integer",
"boolean",
"null"
]
},
"tagValue": {
"anyOf": [
{
"$ref": "#/$defs/tagScalar"
},
{
"type": "array",
"maxItems": 50,
"items": {
"$ref": "#/$defs/tagScalar"
}
},
{
"type": "object",
"maxProperties": 50,
"additionalProperties": {
"$ref": "#/$defs/tagScalar"
}
}
]
}
}
}
+11
@@ -0,0 +1,11 @@
FROM --platform=linux/arm64 python:3.12-slim
WORKDIR /app
COPY requirements.reducer.txt requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
COPY compress_adsb_to_aircraft_data.py .
COPY reducer.py .
CMD ["python", "-u", "reducer.py"]
+12
@@ -0,0 +1,12 @@
FROM --platform=linux/arm64 python:3.12-slim
WORKDIR /app
COPY requirements.worker.txt requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
COPY compress_adsb_to_aircraft_data.py .
COPY download_adsb_data_to_parquet.py .
COPY worker.py .
CMD ["python", "-u", "worker.py"]
+250
@@ -0,0 +1,250 @@
"""
Combines chunk parquet files and compresses to final aircraft CSV.
This is the reduce phase of the map-reduce pipeline.
Supports both single-day (daily) and multi-day (historical) modes.
Memory-efficient: processes each chunk separately, compresses, then combines.
Usage:
# Daily mode
python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks
# Historical mode
python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks --start-date 2024-01-01 --end-date 2024-01-07 --skip-base
"""
import gc
import os
import sys
import glob
import argparse
from datetime import datetime, timedelta
import polars as pl
from src.adsb.download_adsb_data_to_parquet import OUTPUT_DIR, get_resource_usage
from src.adsb.compress_adsb_to_aircraft_data import compress_multi_icao_df, COLUMNS
DEFAULT_CHUNK_DIR = os.path.join(OUTPUT_DIR, "adsb_chunks")
FINAL_OUTPUT_DIR = "./data/openairframes"
os.makedirs(FINAL_OUTPUT_DIR, exist_ok=True)
def get_target_day() -> datetime:
"""Get yesterday's date (the day we're processing)."""
return datetime.utcnow() - timedelta(days=1)
def process_single_chunk(chunk_path: str, delete_after_load: bool = False) -> pl.DataFrame:
"""Load and compress a single chunk parquet file.
Args:
chunk_path: Path to parquet file
delete_after_load: If True, delete the parquet file after loading to free disk space
"""
print(f"Processing {os.path.basename(chunk_path)}... | {get_resource_usage()}")
# Load chunk - only columns we need
needed_columns = ['time', 'icao'] + COLUMNS
df = pl.read_parquet(chunk_path, columns=needed_columns)
print(f" Loaded {len(df)} rows")
# Delete file immediately after loading to free disk space
if delete_after_load:
try:
os.remove(chunk_path)
print(f" Deleted {chunk_path} to free disk space")
except Exception as e:
print(f" Warning: Failed to delete {chunk_path}: {e}")
# Compress to aircraft records (one per ICAO) using shared function
compressed = compress_multi_icao_df(df, verbose=True)
print(f" Compressed to {len(compressed)} aircraft records")
del df
gc.collect()
return compressed
def combine_compressed_chunks(compressed_dfs: list[pl.DataFrame]) -> pl.DataFrame:
"""Combine multiple compressed DataFrames.
Since chunks are partitioned by ICAO hash, each ICAO only appears in one chunk.
No deduplication needed here - just concatenate.
"""
print(f"Combining {len(compressed_dfs)} compressed chunks... | {get_resource_usage()}")
# Concat all
combined = pl.concat(compressed_dfs)
print(f"Combined: {len(combined)} records")
return combined
def download_and_merge_base_release(compressed_df: pl.DataFrame) -> pl.DataFrame:
"""Download base release and merge with new data."""
from src.get_latest_release import download_latest_aircraft_adsb_csv
print("Downloading base ADS-B release...")
try:
base_path = download_latest_aircraft_adsb_csv(
output_dir="./data/openairframes_base"
)
print(f"Download returned: {base_path}")
if base_path and os.path.exists(str(base_path)):
print(f"Loading base release from {base_path}")
base_df = pl.read_csv(base_path)
print(f"Base release has {len(base_df)} records")
# Ensure columns match
base_cols = set(base_df.columns)
new_cols = set(compressed_df.columns)
print(f"Base columns: {sorted(base_cols)}")
print(f"New columns: {sorted(new_cols)}")
# Add missing columns
for col in new_cols - base_cols:
base_df = base_df.with_columns(pl.lit(None).alias(col))
for col in base_cols - new_cols:
compressed_df = compressed_df.with_columns(pl.lit(None).alias(col))
# Reorder columns to match
compressed_df = compressed_df.select(base_df.columns)
# Concat and deduplicate by icao (keep new data - it comes last)
combined = pl.concat([base_df, compressed_df])
print(f"After concat: {len(combined)} records")
deduplicated = combined.unique(subset=["icao"], keep="last")
print(f"Combined with base: {len(combined)} -> {len(deduplicated)} after dedup")
del base_df, combined
gc.collect()
return deduplicated
else:
print(f"No base release found at {base_path}, using only new data")
return compressed_df
except Exception as e:
import traceback
print(f"Failed to download base release: {e}")
traceback.print_exc()
return compressed_df
def cleanup_chunks(output_id: str, chunks_dir: str):
"""Delete chunk parquet files after successful merge."""
pattern = os.path.join(chunks_dir, f"chunk_*_{output_id}.parquet")
chunk_files = glob.glob(pattern)
for f in chunk_files:
try:
os.remove(f)
print(f"Deleted {f}")
except Exception as e:
print(f"Failed to delete {f}: {e}")
def find_chunk_files(chunks_dir: str, output_id: str) -> list[str]:
"""Find chunk parquet files matching the output ID."""
pattern = os.path.join(chunks_dir, f"chunk_*_{output_id}.parquet")
chunk_files = sorted(glob.glob(pattern))
if not chunk_files:
# Try recursive search for historical mode with merged artifacts
pattern = os.path.join(chunks_dir, "**", "*.parquet")
chunk_files = sorted(glob.glob(pattern, recursive=True))
return chunk_files
def main():
parser = argparse.ArgumentParser(description="Combine chunk parquets to final CSV")
parser.add_argument("--date", type=str, help="Single date in YYYY-MM-DD format (default: yesterday)")
parser.add_argument("--start-date", type=str, help="Start date for range (YYYY-MM-DD)")
parser.add_argument("--end-date", type=str, help="End date for range (YYYY-MM-DD)")
parser.add_argument("--chunks-dir", type=str, default=DEFAULT_CHUNK_DIR, help="Directory containing chunk parquet files")
parser.add_argument("--skip-base", action="store_true", help="Skip downloading and merging base release")
parser.add_argument("--keep-chunks", action="store_true", help="Keep chunk files after merging")
parser.add_argument("--stream", action="store_true", help="Delete parquet files immediately after loading to save disk space")
args = parser.parse_args()
# Determine output ID and filename based on mode
if args.start_date and args.end_date:
# Historical mode
output_id = f"{args.start_date}_{args.end_date}"
output_filename = f"openairframes_adsb_{args.start_date}_{args.end_date}.csv"
print(f"Combining chunks for date range: {args.start_date} to {args.end_date}")
else:
# Daily mode - use same date for start and end
if args.date:
target_day = datetime.strptime(args.date, "%Y-%m-%d")
else:
target_day = get_target_day()
date_str = target_day.strftime("%Y-%m-%d")
output_id = date_str
output_filename = f"openairframes_adsb_{date_str}_{date_str}.csv"
print(f"Combining chunks for {date_str}")
chunks_dir = args.chunks_dir
print(f"Chunks directory: {chunks_dir}")
print(f"Resource usage at start: {get_resource_usage()}")
# Find chunk files
chunk_files = find_chunk_files(chunks_dir, output_id)
if not chunk_files:
print(f"No chunk files found in: {chunks_dir}")
sys.exit(1)
print(f"Found {len(chunk_files)} chunk files")
# Process each chunk separately to save memory
# With --stream, delete parquet files immediately after loading to save disk space
compressed_chunks = []
for chunk_path in chunk_files:
compressed = process_single_chunk(chunk_path, delete_after_load=args.stream)
compressed_chunks.append(compressed)
gc.collect()
# Combine all compressed chunks
combined = combine_compressed_chunks(compressed_chunks)
# Free memory from individual chunks
del compressed_chunks
gc.collect()
print(f"After combining: {get_resource_usage()}")
# Merge with base release (unless skipped)
if not args.skip_base:
combined = download_and_merge_base_release(combined)
# Convert list columns to strings for CSV compatibility
for col in combined.columns:
if combined[col].dtype == pl.List:
combined = combined.with_columns(
pl.col(col).list.join(",").alias(col)
)
# Sort by time for consistent output
if 'time' in combined.columns:
combined = combined.sort('time')
# Write final CSV
output_path = os.path.join(FINAL_OUTPUT_DIR, output_filename)
combined.write_csv(output_path)
print(f"Wrote {len(combined)} records to {output_path}")
# Cleanup
if not args.keep_chunks:
cleanup_chunks(output_id, chunks_dir)
print(f"Done! | {get_resource_usage()}")
if __name__ == "__main__":
main()
+274
@@ -0,0 +1,274 @@
# Shared compression logic for ADS-B aircraft data
import os
import polars as pl
COLUMNS = ['dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category', 'r', 't']
def deduplicate_by_signature(df: pl.DataFrame) -> pl.DataFrame:
"""For each icao, keep only the earliest row with each unique signature.
This is used for deduplicating across multiple compressed chunks.
"""
# Create signature column
df = df.with_columns(
pl.concat_str([pl.col(c).cast(pl.Utf8).fill_null("") for c in COLUMNS], separator="|").alias("_signature")
)
# Group by icao and signature, take first row (earliest due to time sort)
df = df.sort("time")
df_deduped = df.group_by(["icao", "_signature"]).first()
df_deduped = df_deduped.drop("_signature")
df_deduped = df_deduped.sort("time")
return df_deduped
def compress_df_polars(df: pl.DataFrame, icao: str) -> pl.DataFrame:
"""Compress a single ICAO group to its most informative row using Polars."""
# Create signature string
df = df.with_columns(
pl.concat_str([pl.col(c).cast(pl.Utf8) for c in COLUMNS], separator="|").alias("_signature")
)
# Compute signature counts
signature_counts = df.group_by("_signature").len().rename({"len": "_sig_count"})
# Group by signature and take first row
df = df.group_by("_signature").first()
if df.height == 1:
# Only one unique signature, return it
result = df.drop("_signature").with_columns(pl.lit(icao).alias("icao"))
return result
# For each row, create dict of non-empty column values and check subsets
# Convert to list of dicts for subset checking (same logic as pandas version)
rows_data = []
for row in df.iter_rows(named=True):
non_empty = {col: row[col] for col in COLUMNS if row[col] != '' and row[col] is not None}
rows_data.append({
'signature': row['_signature'],
'non_empty_dict': non_empty,
'non_empty_count': len(non_empty),
'row_data': row
})
# Check if row i's non-empty values are a subset of row j's non-empty values
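    # e.g. a row with only {'ownOp': 'KLM'} is redundant next to a row with
    # {'ownOp': 'KLM', 'year': '2015'} and is dropped in favour of the fuller row (illustrative values)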
def is_subset_of_any(idx):
row_dict = rows_data[idx]['non_empty_dict']
row_count = rows_data[idx]['non_empty_count']
for other_idx, other_data in enumerate(rows_data):
if idx == other_idx:
continue
other_dict = other_data['non_empty_dict']
other_count = other_data['non_empty_count']
# Check if all non-empty values in current row match those in other row
if all(row_dict.get(k) == other_dict.get(k) for k in row_dict.keys()):
# If they match and other has more defined columns, current row is redundant
if other_count > row_count:
return True
return False
# Keep rows that are not subsets of any other row
keep_indices = [i for i in range(len(rows_data)) if not is_subset_of_any(i)]
if len(keep_indices) == 0:
keep_indices = [0] # Fallback: keep first row
remaining_signatures = [rows_data[i]['signature'] for i in keep_indices]
df = df.filter(pl.col("_signature").is_in(remaining_signatures))
if df.height > 1:
# Use signature counts to pick the most frequent one
df = df.join(signature_counts, on="_signature", how="left")
max_count = df["_sig_count"].max()
df = df.filter(pl.col("_sig_count") == max_count).head(1)
df = df.drop("_sig_count")
result = df.drop("_signature").with_columns(pl.lit(icao).alias("icao"))
# Ensure empty strings are preserved
for col in COLUMNS:
if col in result.columns:
result = result.with_columns(pl.col(col).fill_null(""))
return result
def compress_multi_icao_df(df: pl.DataFrame, verbose: bool = True) -> pl.DataFrame:
"""Compress a DataFrame with multiple ICAOs to one row per ICAO.
This is the main entry point for compressing ADS-B data.
Used by both daily GitHub Actions runs and historical AWS runs.
Args:
df: DataFrame with columns ['time', 'icao'] + COLUMNS
verbose: Whether to print progress
Returns:
Compressed DataFrame with one row per ICAO
"""
if df.height == 0:
return df
# Sort by icao and time
df = df.sort(['icao', 'time'])
# Fill null values with empty strings for COLUMNS
for col in COLUMNS:
if col in df.columns:
df = df.with_columns(pl.col(col).cast(pl.Utf8).fill_null(""))
# First pass: quick deduplication of exact duplicates
df = df.unique(subset=['icao'] + COLUMNS, keep='first')
if verbose:
print(f"After quick dedup: {df.height} records")
# Second pass: sophisticated compression per ICAO
if verbose:
print("Compressing per ICAO...")
# Process each ICAO group
icao_groups = df.partition_by('icao', as_dict=True, maintain_order=True)
compressed_dfs = []
for icao_key, group_df in icao_groups.items():
# partition_by with as_dict=True returns tuple keys, extract first element
icao = icao_key[0] if isinstance(icao_key, tuple) else icao_key
compressed = compress_df_polars(group_df, str(icao))
compressed_dfs.append(compressed)
if compressed_dfs:
df_compressed = pl.concat(compressed_dfs)
else:
df_compressed = df.head(0) # Empty with same schema
if verbose:
print(f"After compress: {df_compressed.height} records")
# Reorder columns: time first, then icao
cols = df_compressed.columns
ordered_cols = ['time', 'icao'] + [c for c in cols if c not in ['time', 'icao']]
df_compressed = df_compressed.select(ordered_cols)
return df_compressed
def load_raw_adsb_for_day(day):
"""Load raw ADS-B data for a day from parquet file."""
from datetime import timedelta
from pathlib import Path
start_time = day.replace(hour=0, minute=0, second=0, microsecond=0)
# Check for parquet file first
version_date = f"v{start_time.strftime('%Y.%m.%d')}"
parquet_file = Path(f"data/output/parquet_output/{version_date}.parquet")
if not parquet_file.exists():
# Try to generate parquet file by calling the download function
print(f" Parquet file not found: {parquet_file}")
print(f" Attempting to download and generate parquet for {start_time.strftime('%Y-%m-%d')}...")
from download_adsb_data_to_parquet import create_parquet_for_day
result_path = create_parquet_for_day(start_time, keep_folders=False)
if result_path:
print(f" Successfully generated parquet file: {result_path}")
else:
raise Exception("Failed to generate parquet file")
if parquet_file.exists():
print(f" Loading from parquet: {parquet_file}")
df = pl.read_parquet(
parquet_file,
columns=['time', 'icao', 'r', 't', 'dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category']
)
# Convert to timezone-naive datetime
if df["time"].dtype == pl.Datetime:
df = df.with_columns(pl.col("time").dt.replace_time_zone(None))
return df
else:
# Return empty DataFrame if parquet file doesn't exist
print(f" No data available for {start_time.strftime('%Y-%m-%d')}")
return pl.DataFrame(schema={
'time': pl.Datetime,
'icao': pl.Utf8,
'r': pl.Utf8,
't': pl.Utf8,
'dbFlags': pl.Int64,
'ownOp': pl.Utf8,
'year': pl.Int64,
'desc': pl.Utf8,
'aircraft_category': pl.Utf8
})
def load_historical_for_day(day):
"""Load and compress historical ADS-B data for a day."""
df = load_raw_adsb_for_day(day)
if df.height == 0:
return df
print(f"Loaded {df.height} raw records for {day.strftime('%Y-%m-%d')}")
# Use shared compression function
return compress_multi_icao_df(df, verbose=True)
def concat_compressed_dfs(df_base, df_new):
"""Concatenate base and new compressed dataframes, keeping the most informative row per ICAO."""
# Combine both dataframes
df_combined = pl.concat([df_base, df_new])
# Sort by ICAO and time
df_combined = df_combined.sort(['icao', 'time'])
    # Fill null values (cast to string first, as in compress_multi_icao_df)
    for col in COLUMNS:
        if col in df_combined.columns:
            df_combined = df_combined.with_columns(pl.col(col).cast(pl.Utf8).fill_null(""))
# Apply compression logic per ICAO to get the best row
icao_groups = df_combined.partition_by('icao', as_dict=True, maintain_order=True)
compressed_dfs = []
    for icao_key, group_df in icao_groups.items():
        # partition_by with as_dict=True returns tuple keys, extract the ICAO string
        icao = icao_key[0] if isinstance(icao_key, tuple) else icao_key
        compressed = compress_df_polars(group_df, str(icao))
        compressed_dfs.append(compressed)
if compressed_dfs:
df_compressed = pl.concat(compressed_dfs)
else:
df_compressed = df_combined.head(0)
# Sort by time
df_compressed = df_compressed.sort('time')
return df_compressed
def get_latest_aircraft_adsb_csv_df():
"""Download and load the latest ADS-B CSV from GitHub releases."""
from get_latest_release import download_latest_aircraft_adsb_csv
import re
csv_path = download_latest_aircraft_adsb_csv()
df = pl.read_csv(csv_path, null_values=[""])
# Fill nulls with empty strings
for col in df.columns:
if df[col].dtype == pl.Utf8:
df = df.with_columns(pl.col(col).fill_null(""))
# Extract start date from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv
match = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path))
if not match:
raise ValueError(f"Could not extract date from filename: {csv_path.name}")
date_str = match.group(1)
return df, date_str
+747
@@ -0,0 +1,747 @@
"""
Downloads adsb.lol data and writes to Parquet files.
Usage:
python -m src.process_historical_adsb_data.download_to_parquet 2025-01-01 2025-01-02
This will download trace data for the specified date range and output Parquet files.
This file is self-contained and does not import from other project modules.
"""
import gc
import glob
import gzip
import resource
import shutil
import sys
import logging
import time
import re
import signal
import concurrent.futures
import subprocess
import os
import argparse
import datetime as dt
from datetime import datetime, timedelta, timezone
import urllib.request
import urllib.error
import orjson
import pyarrow as pa
import pyarrow.parquet as pq
# ============================================================================
# Configuration
# ============================================================================
OUTPUT_DIR = "./data/output"
os.makedirs(OUTPUT_DIR, exist_ok=True)
PARQUET_DIR = os.path.join(OUTPUT_DIR, "parquet_output")
os.makedirs(PARQUET_DIR, exist_ok=True)
TOKEN = os.environ.get('GITHUB_TOKEN') # Optional: for higher GitHub API rate limits
HEADERS = {"Authorization": f"token {TOKEN}"} if TOKEN else {}
def get_resource_usage() -> str:
"""Get current RAM and disk usage as a formatted string."""
# RAM usage (RSS = Resident Set Size)
ram_bytes = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
# On macOS, ru_maxrss is in bytes; on Linux, it's in KB
if sys.platform == 'darwin':
ram_gb = ram_bytes / (1024**3)
else:
ram_gb = ram_bytes / (1024**2) # Convert KB to GB
# Disk usage
disk = shutil.disk_usage('.')
disk_free_gb = disk.free / (1024**3)
disk_total_gb = disk.total / (1024**3)
return f"RAM: {ram_gb:.2f}GB | Disk: {disk_free_gb:.1f}GB free / {disk_total_gb:.1f}GB total"
# ============================================================================
# GitHub Release Fetching and Downloading
# ============================================================================
class DownloadTimeoutException(Exception):
pass
def timeout_handler(signum, frame):
raise DownloadTimeoutException("Download timed out after 40 seconds")
def fetch_releases(version_date: str) -> list:
"""Fetch GitHub releases for a given version date from adsblol."""
year = version_date.split('.')[0][1:]
if version_date == "v2024.12.31":
year = "2025"
BASE_URL = f"https://api.github.com/repos/adsblol/globe_history_{year}/releases"
# Match exact release name, exclude tmp releases
PATTERN = rf"^{re.escape(version_date)}-planes-readsb-prod-\d+$"
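# e.g. for "v2024.12.31" this matches the tag "v2024.12.31-planes-readsb-prod-0" but not "v2024.12.31-planes-readsb-prod-0tmp"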
releases = []
page = 1
while True:
max_retries = 10
retry_delay = 60
for attempt in range(1, max_retries + 1):
try:
req = urllib.request.Request(f"{BASE_URL}?page={page}", headers=HEADERS)
with urllib.request.urlopen(req) as response:
if response.status == 200:
data = orjson.loads(response.read())
break
else:
print(f"Failed to fetch releases (attempt {attempt}/{max_retries}): {response.status} {response.reason}")
if attempt < max_retries:
print(f"Waiting {retry_delay} seconds before retry...")
time.sleep(retry_delay)
else:
print(f"Giving up after {max_retries} attempts")
return releases
except Exception as e:
print(f"Request exception (attempt {attempt}/{max_retries}): {e}")
if attempt < max_retries:
print(f"Waiting {retry_delay} seconds before retry...")
time.sleep(retry_delay)
else:
print(f"Giving up after {max_retries} attempts")
return releases
if not data:
break
for release in data:
if re.match(PATTERN, release["tag_name"]):
releases.append(release)
page += 1
return releases
def download_asset(asset_url: str, file_path: str) -> bool:
"""Download a single release asset."""
os.makedirs(os.path.dirname(file_path) or OUTPUT_DIR, exist_ok=True)
if os.path.exists(file_path):
print(f"[SKIP] {file_path} already downloaded.")
return True
print(f"Downloading {asset_url}...")
try:
signal.signal(signal.SIGALRM, timeout_handler)
signal.alarm(40) # 40-second timeout
req = urllib.request.Request(asset_url, headers=HEADERS)
with urllib.request.urlopen(req) as response:
signal.alarm(0)
if response.status == 200:
with open(file_path, "wb") as file:
while True:
chunk = response.read(8192)
if not chunk:
break
file.write(chunk)
print(f"Saved {file_path}")
return True
else:
print(f"Failed to download {asset_url}: {response.status} {response.msg}")
return False
except DownloadTimeoutException as e:
print(f"Download aborted for {asset_url}: {e}")
return False
except Exception as e:
print(f"An error occurred while downloading {asset_url}: {e}")
return False
def extract_split_archive(file_paths: list, extract_dir: str) -> bool:
"""
Extracts a split archive by concatenating the parts using 'cat'
and then extracting with 'tar' in one pipeline.
Deletes the tar files immediately after extraction to save disk space.
"""
if os.path.isdir(extract_dir):
print(f"[SKIP] Extraction directory already exists: {extract_dir}")
return True
def sort_key(path: str):
base = os.path.basename(path)
parts = base.rsplit('.', maxsplit=1)
if len(parts) == 2:
suffix = parts[1]
if suffix.isdigit():
return (0, int(suffix))
if re.fullmatch(r'[a-zA-Z]+', suffix):
return (1, suffix)
return (2, base)
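# Ordering example (illustrative): numeric part suffixes sort first ("....tar.0", "....tar.1"),
# then alphabetic suffixes ("....tar.aa", "....tar.ab"), so split parts are concatenated in order.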
file_paths = sorted(file_paths, key=sort_key)
os.makedirs(extract_dir, exist_ok=True)
try:
cat_proc = subprocess.Popen(
["cat"] + file_paths,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE
)
tar_cmd = ["tar", "xf", "-", "-C", extract_dir, "--strip-components=1"]
result = subprocess.run(
tar_cmd,
stdin=cat_proc.stdout,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
check=True
)
cat_proc.stdout.close()
cat_stderr = cat_proc.stderr.read().decode() if cat_proc.stderr else ""
cat_proc.wait()
if cat_stderr:
print(f"cat stderr: {cat_stderr}")
print(f"Successfully extracted archive to {extract_dir}")
# Delete tar files immediately after extraction
for tar_file in file_paths:
try:
os.remove(tar_file)
print(f"Deleted tar file: {tar_file}")
except Exception as e:
print(f"Failed to delete {tar_file}: {e}")
# Check disk usage after deletion
disk = shutil.disk_usage('.')
free_gb = disk.free / (1024**3)
print(f"Disk space after tar deletion: {free_gb:.1f}GB free")
return True
except subprocess.CalledProcessError as e:
stderr_output = e.stderr.decode() if e.stderr else ""
print(f"Failed to extract split archive: {e}")
if stderr_output:
print(f"tar stderr: {stderr_output}")
return False
# ============================================================================
# Trace File Processing (with alt_baro/on_ground handling)
# ============================================================================
ALLOWED_DATA_SOURCE = {'', 'adsb.lol', 'adsbexchange', 'airplanes.live'}
def process_file(filepath: str) -> list:
"""
Process a single trace file and return a list of rows.
Handles alt_baro/on_ground: if altitude == "ground", on_ground=True and alt_baro=None.
"""
insert_rows = []
with gzip.open(filepath, 'rb') as f:
data = orjson.loads(f.read())
icao = data.get('icao', None)
if icao is None:
print(f"Skipping file {filepath} as it does not contain 'icao'")
return []
r = data.get('r', "")
t = data.get('t', "")
dbFlags = data.get('dbFlags', 0)
noRegData = data.get('noRegData', False)
ownOp = data.get('ownOp', "")
year = int(data.get('year', 0))
timestamp = data.get('timestamp', None)
desc = data.get('desc', "")
trace_data = data.get('trace', None)
if timestamp is None or trace_data is None:
print(f"Skipping file {filepath} as it does not contain 'timestamp' or 'trace'")
return []
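# Each trace entry is a positional array; the fields below are unpacked in the order they
# appear in the adsb.lol trace_full JSON (time offset, lat, lon, altitude, speed, track, ...).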
for row in trace_data:
time_offset = row[0]
lat = row[1]
lon = row[2]
altitude = row[3]
# Handle alt_baro/on_ground
alt_baro = None
on_ground = False
if type(altitude) is str and altitude == "ground":
on_ground = True
elif type(altitude) is int:
alt_baro = altitude
elif type(altitude) is float:
alt_baro = int(altitude)
ground_speed = row[4]
track_degrees = row[5]
flags = row[6]
vertical_rate = row[7]
aircraft = row[8]
source = row[9]
data_source_value = "adsb.lol" if "adsb.lol" in ALLOWED_DATA_SOURCE else ""
geometric_altitude = row[10]
geometric_vertical_rate = row[11]
indicated_airspeed = row[12]
roll_angle = row[13]
time_val = timestamp + time_offset
dt64 = dt.datetime.fromtimestamp(time_val, tz=dt.timezone.utc)
# Prepare base fields
inserted_row = [
dt64, icao, r, t, dbFlags, noRegData, ownOp, year, desc,
lat, lon, alt_baro, on_ground, ground_speed, track_degrees,
flags, vertical_rate
]
next_part = [
source, geometric_altitude, geometric_vertical_rate,
indicated_airspeed, roll_angle
]
inserted_row.extend(next_part)
if aircraft is None or type(aircraft) is not dict:
aircraft = dict()
aircraft_data = {
'alert': aircraft.get('alert', None),
'alt_geom': aircraft.get('alt_geom', None),
'gva': aircraft.get('gva', None),
'nac_p': aircraft.get('nac_p', None),
'nac_v': aircraft.get('nac_v', None),
'nic': aircraft.get('nic', None),
'nic_baro': aircraft.get('nic_baro', None),
'rc': aircraft.get('rc', None),
'sda': aircraft.get('sda', None),
'sil': aircraft.get('sil', None),
'sil_type': aircraft.get('sil_type', ""),
'spi': aircraft.get('spi', None),
'track': aircraft.get('track', None),
'type': aircraft.get('type', ""),
'version': aircraft.get('version', None),
'category': aircraft.get('category', ''),
'emergency': aircraft.get('emergency', ''),
'flight': aircraft.get('flight', ""),
'squawk': aircraft.get('squawk', ""),
'baro_rate': aircraft.get('baro_rate', None),
'nav_altitude_fms': aircraft.get('nav_altitude_fms', None),
'nav_altitude_mcp': aircraft.get('nav_altitude_mcp', None),
'nav_modes': aircraft.get('nav_modes', []),
'nav_qnh': aircraft.get('nav_qnh', None),
'geom_rate': aircraft.get('geom_rate', None),
'ias': aircraft.get('ias', None),
'mach': aircraft.get('mach', None),
'mag_heading': aircraft.get('mag_heading', None),
'oat': aircraft.get('oat', None),
'roll': aircraft.get('roll', None),
'tas': aircraft.get('tas', None),
'tat': aircraft.get('tat', None),
'true_heading': aircraft.get('true_heading', None),
'wd': aircraft.get('wd', None),
'ws': aircraft.get('ws', None),
'track_rate': aircraft.get('track_rate', None),
'nav_heading': aircraft.get('nav_heading', None)
}
aircraft_list = list(aircraft_data.values())
inserted_row.extend(aircraft_list)
inserted_row.append(data_source_value)
insert_rows.append(inserted_row)
if insert_rows:
# print(f"Got {len(insert_rows)} rows from {filepath}")
return insert_rows
else:
return []
# ============================================================================
# Parquet Writing
# ============================================================================
# Column names matching the order of data in inserted_row
COLUMNS = [
"time", "icao",
"r", "t", "dbFlags", "noRegData", "ownOp", "year", "desc",
"lat", "lon", "alt_baro", "on_ground", "ground_speed", "track_degrees",
"flags", "vertical_rate", "source", "geometric_altitude",
"geometric_vertical_rate", "indicated_airspeed", "roll_angle",
"aircraft_alert", "aircraft_alt_geom", "aircraft_gva", "aircraft_nac_p",
"aircraft_nac_v", "aircraft_nic", "aircraft_nic_baro", "aircraft_rc",
"aircraft_sda", "aircraft_sil", "aircraft_sil_type", "aircraft_spi",
"aircraft_track", "aircraft_type", "aircraft_version", "aircraft_category",
"aircraft_emergency", "aircraft_flight", "aircraft_squawk",
"aircraft_baro_rate", "aircraft_nav_altitude_fms", "aircraft_nav_altitude_mcp",
"aircraft_nav_modes", "aircraft_nav_qnh", "aircraft_geom_rate",
"aircraft_ias", "aircraft_mach", "aircraft_mag_heading", "aircraft_oat",
"aircraft_roll", "aircraft_tas", "aircraft_tat", "aircraft_true_heading",
"aircraft_wd", "aircraft_ws", "aircraft_track_rate", "aircraft_nav_heading",
"data_source",
]
OS_CPU_COUNT = os.cpu_count() or 1
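# Use all available cores on larger machines; fall back to a single worker on small (4 cores or fewer) machines.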
MAX_WORKERS = OS_CPU_COUNT if OS_CPU_COUNT > 4 else 1
CHUNK_SIZE = MAX_WORKERS * 500 # Reduced for lower RAM usage
BATCH_SIZE = 250_000 # Fixed size for predictable memory usage (~500MB per batch)
# PyArrow schema for efficient Parquet writing
PARQUET_SCHEMA = pa.schema([
("time", pa.timestamp("ms", tz="UTC")),
("icao", pa.string()),
("r", pa.string()),
("t", pa.string()),
("dbFlags", pa.int32()),
("noRegData", pa.bool_()),
("ownOp", pa.string()),
("year", pa.uint16()),
("desc", pa.string()),
("lat", pa.float64()),
("lon", pa.float64()),
("alt_baro", pa.int32()),
("on_ground", pa.bool_()),
("ground_speed", pa.float32()),
("track_degrees", pa.float32()),
("flags", pa.uint32()),
("vertical_rate", pa.int32()),
("source", pa.string()),
("geometric_altitude", pa.int32()),
("geometric_vertical_rate", pa.int32()),
("indicated_airspeed", pa.int32()),
("roll_angle", pa.float32()),
("aircraft_alert", pa.int64()),
("aircraft_alt_geom", pa.int64()),
("aircraft_gva", pa.int64()),
("aircraft_nac_p", pa.int64()),
("aircraft_nac_v", pa.int64()),
("aircraft_nic", pa.int64()),
("aircraft_nic_baro", pa.int64()),
("aircraft_rc", pa.int64()),
("aircraft_sda", pa.int64()),
("aircraft_sil", pa.int64()),
("aircraft_sil_type", pa.string()),
("aircraft_spi", pa.int64()),
("aircraft_track", pa.float64()),
("aircraft_type", pa.string()),
("aircraft_version", pa.int64()),
("aircraft_category", pa.string()),
("aircraft_emergency", pa.string()),
("aircraft_flight", pa.string()),
("aircraft_squawk", pa.string()),
("aircraft_baro_rate", pa.int64()),
("aircraft_nav_altitude_fms", pa.int64()),
("aircraft_nav_altitude_mcp", pa.int64()),
("aircraft_nav_modes", pa.list_(pa.string())),
("aircraft_nav_qnh", pa.float64()),
("aircraft_geom_rate", pa.int64()),
("aircraft_ias", pa.int64()),
("aircraft_mach", pa.float64()),
("aircraft_mag_heading", pa.float64()),
("aircraft_oat", pa.int64()),
("aircraft_roll", pa.float64()),
("aircraft_tas", pa.int64()),
("aircraft_tat", pa.int64()),
("aircraft_true_heading", pa.float64()),
("aircraft_wd", pa.int64()),
("aircraft_ws", pa.int64()),
("aircraft_track_rate", pa.float64()),
("aircraft_nav_heading", pa.float64()),
("data_source", pa.string()),
])
def collect_trace_files_with_find(root_dir):
"""Find all trace_full_*.json files in the extracted directory."""
trace_dict: dict[str, str] = {}
cmd = ['find', root_dir, '-type', 'f', '-name', 'trace_full_*.json']
result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
if result.returncode != 0:
print(f"Error executing find: {result.stderr}")
return trace_dict
for file_path in result.stdout.strip().split('\n'):
if file_path:
filename = os.path.basename(file_path)
if filename.startswith("trace_full_") and filename.endswith(".json"):
icao = filename[len("trace_full_"):-len(".json")]
trace_dict[icao] = file_path
return trace_dict
def generate_version_dates(start_date: str, end_date: str) -> list:
"""Generate a list of dates from start_date to end_date inclusive."""
start = datetime.strptime(start_date, "%Y-%m-%d")
end = datetime.strptime(end_date, "%Y-%m-%d")
delta = end - start
return [start + timedelta(days=i) for i in range(delta.days + 1)]
def safe_process(fp):
"""Safely process a file, returning empty list on error."""
try:
return process_file(fp)
except Exception as e:
logging.error(f"Error processing {fp}: {e}")
return []
def rows_to_arrow_table(rows: list) -> pa.Table:
"""Convert list of rows to a PyArrow Table directly (no pandas)."""
# Transpose rows into columns
columns = list(zip(*rows))
# Build arrays for each column according to schema
arrays = []
for i, field in enumerate(PARQUET_SCHEMA):
col_data = list(columns[i]) if i < len(columns) else [None] * len(rows)
arrays.append(pa.array(col_data, type=field.type))
return pa.Table.from_arrays(arrays, schema=PARQUET_SCHEMA)
def write_batch_to_parquet(rows: list, version_date: str, batch_idx: int):
"""Write a batch of rows to a Parquet file."""
if not rows:
return
table = rows_to_arrow_table(rows)
parquet_path = os.path.join(PARQUET_DIR, f"{version_date}_batch_{batch_idx:04d}.parquet")
pq.write_table(table, parquet_path, compression='snappy')
print(f"Written parquet batch {batch_idx} ({len(rows)} rows) | {get_resource_usage()}")
def merge_parquet_files(version_date: str, delete_batches: bool = True):
"""Merge all batch parquet files for a version_date into a single file using streaming."""
pattern = os.path.join(PARQUET_DIR, f"{version_date}_batch_*.parquet")
batch_files = sorted(glob.glob(pattern))
if not batch_files:
print(f"No batch files found for {version_date}")
return None
print(f"Merging {len(batch_files)} batch files for {version_date} (streaming)...")
merged_path = os.path.join(PARQUET_DIR, f"{version_date}.parquet")
total_rows = 0
# Stream write: read one batch at a time to minimize RAM usage
writer = None
try:
for i, f in enumerate(batch_files):
table = pq.read_table(f)
total_rows += table.num_rows
if writer is None:
writer = pq.ParquetWriter(merged_path, table.schema, compression='snappy')
writer.write_table(table)
# Delete batch file immediately after reading to free disk space
if delete_batches:
os.remove(f)
# Free memory
del table
if (i + 1) % 10 == 0:
gc.collect()
print(f" Merged {i + 1}/{len(batch_files)} batches... | {get_resource_usage()}")
finally:
if writer is not None:
writer.close()
print(f"Merged parquet file written to {merged_path} ({total_rows} total rows) | {get_resource_usage()}")
if delete_batches:
print(f"Deleted {len(batch_files)} batch files during merge")
gc.collect()
return merged_path
def process_version_date(version_date: str, keep_folders: bool = False):
"""Download, extract, and process trace files for a single version date."""
print(f"\nProcessing version_date: {version_date}")
extract_dir = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0.tar_0")
def collect_trace_files_for_version_date(vd):
releases = fetch_releases(vd)
if len(releases) == 0:
print(f"No releases found for {vd}.")
return None
downloaded_files = []
for release in releases:
tag_name = release["tag_name"]
print(f"Processing release: {tag_name}")
# Only download prod-0 if available, else prod-0tmp
assets = release.get("assets", [])
normal_assets = [
a for a in assets
if "planes-readsb-prod-0." in a["name"] and "tmp" not in a["name"]
]
tmp_assets = [
a for a in assets
if "planes-readsb-prod-0tmp" in a["name"]
]
use_assets = normal_assets if normal_assets else tmp_assets
for asset in use_assets:
asset_name = asset["name"]
asset_url = asset["browser_download_url"]
file_path = os.path.join(OUTPUT_DIR, asset_name)
result = download_asset(asset_url, file_path)
if result:
downloaded_files.append(file_path)
extract_split_archive(downloaded_files, extract_dir)
return collect_trace_files_with_find(extract_dir)
# Check if files already exist
pattern = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0*")
matches = [p for p in glob.glob(pattern) if os.path.isfile(p)]
if matches:
print(f"Found existing files for {version_date}:")
# Prefer non-tmp slices when reusing existing files
normal_matches = [
p for p in matches
if "-planes-readsb-prod-0." in os.path.basename(p)
and "tmp" not in os.path.basename(p)
]
downloaded_files = normal_matches if normal_matches else matches
extract_split_archive(downloaded_files, extract_dir)
trace_files = collect_trace_files_with_find(extract_dir)
else:
trace_files = collect_trace_files_for_version_date(version_date)
if trace_files is None or len(trace_files) == 0:
print(f"No trace files found for version_date: {version_date}")
return 0
file_list = list(trace_files.values())
start_time = time.perf_counter()
total_num_rows = 0
batch_rows = []
batch_idx = 0
# Process files in chunks
for offset in range(0, len(file_list), CHUNK_SIZE):
chunk = file_list[offset:offset + CHUNK_SIZE]
with concurrent.futures.ProcessPoolExecutor(max_workers=MAX_WORKERS) as process_executor:
for rows in process_executor.map(safe_process, chunk):
if not rows:
continue
batch_rows.extend(rows)
if len(batch_rows) >= BATCH_SIZE:
total_num_rows += len(batch_rows)
write_batch_to_parquet(batch_rows, version_date, batch_idx)
batch_idx += 1
batch_rows = []
elapsed = time.perf_counter() - start_time
speed = total_num_rows / elapsed if elapsed > 0 else 0
print(f"[{version_date}] processed {total_num_rows} rows in {elapsed:.2f}s ({speed:.2f} rows/s)")
gc.collect()
# Final batch
if batch_rows:
total_num_rows += len(batch_rows)
write_batch_to_parquet(batch_rows, version_date, batch_idx)
elapsed = time.perf_counter() - start_time
speed = total_num_rows / elapsed if elapsed > 0 else 0
print(f"[{version_date}] processed {total_num_rows} rows in {elapsed:.2f}s ({speed:.2f} rows/s)")
print(f"Total rows processed for version_date {version_date}: {total_num_rows}")
# Clean up extracted directory immediately after processing (before merging parquet files)
if not keep_folders and os.path.isdir(extract_dir):
print(f"Deleting extraction directory with 100,000+ files: {extract_dir}")
shutil.rmtree(extract_dir)
print(f"Successfully deleted extraction directory: {extract_dir} | {get_resource_usage()}")
# Merge batch files into a single parquet file
merge_parquet_files(version_date, delete_batches=True)
return total_num_rows
def create_parquet_for_day(day, keep_folders: bool = False):
"""Create parquet file for a single day.
Args:
day: datetime object or string in 'YYYY-MM-DD' format
keep_folders: Whether to keep extracted folders after processing
Returns:
Path to the created parquet file, or None if failed
"""
from pathlib import Path
if isinstance(day, str):
day = datetime.strptime(day, "%Y-%m-%d")
version_date = f"v{day.strftime('%Y.%m.%d')}"
# Check if parquet already exists
parquet_path = Path(PARQUET_DIR) / f"{version_date}.parquet"
if parquet_path.exists():
print(f"Parquet file already exists: {parquet_path}")
return parquet_path
print(f"Creating parquet for {version_date}...")
rows_processed = process_version_date(version_date, keep_folders)
if rows_processed > 0 and parquet_path.exists():
return parquet_path
else:
return None
def main(start_date: str, end_date: str, keep_folders: bool = False):
"""Main function to download and convert adsb.lol data to Parquet."""
version_dates = [f"v{date.strftime('%Y.%m.%d')}" for date in generate_version_dates(start_date, end_date)]
print(f"Processing dates: {version_dates}")
total_rows_all = 0
for version_date in version_dates:
rows_processed = process_version_date(version_date, keep_folders)
total_rows_all += rows_processed
print(f"\n=== Summary ===")
print(f"Total dates processed: {len(version_dates)}")
print(f"Total rows written to Parquet: {total_rows_all}")
print(f"Parquet files location: {PARQUET_DIR}")
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO, stream=sys.stdout, force=True)
parser = argparse.ArgumentParser(
description="Download adsb.lol data and write to Parquet files"
)
parser.add_argument("start_date", type=str, help="Start date in YYYY-MM-DD format")
parser.add_argument("end_date", type=str, help="End date in YYYY-MM-DD format")
parser.add_argument("--keep-folders", action="store_true",
help="Keep extracted folders after processing")
args = parser.parse_args()
main(args.start_date, args.end_date, args.keep_folders)
@@ -0,0 +1,211 @@
"""
Downloads and extracts adsb.lol tar files, then lists all ICAO folders.
This is the first step of the map-reduce pipeline.
Supports both single-day (daily) and multi-day (historical) modes.
Outputs:
- Extracted trace files in data/output/{version_date}-planes-readsb-prod-0.tar_0/
- ICAO manifest at data/output/icao_manifest_{date}.txt
"""
import os
import sys
import argparse
import glob
import subprocess
from datetime import datetime, timedelta
# Re-use download/extract functions from download_adsb_data_to_parquet
from src.adsb.download_adsb_data_to_parquet import (
OUTPUT_DIR,
fetch_releases,
download_asset,
extract_split_archive,
collect_trace_files_with_find,
)
def get_target_day() -> datetime:
"""Get yesterday's date (the day we're processing)."""
return datetime.utcnow() - timedelta(days=1)
def download_and_extract(version_date: str) -> str | None:
"""Download and extract tar files, return extract directory path."""
extract_dir = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0.tar_0")
# Check if already extracted
if os.path.isdir(extract_dir):
print(f"[SKIP] Already extracted: {extract_dir}")
return extract_dir
# Check for existing tar files
pattern = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0*")
matches = [p for p in glob.glob(pattern) if os.path.isfile(p)]
if matches:
print(f"Found existing tar files for {version_date}")
normal_matches = [
p for p in matches
if "-planes-readsb-prod-0." in os.path.basename(p)
and "tmp" not in os.path.basename(p)
]
downloaded_files = normal_matches if normal_matches else matches
else:
# Download from GitHub
print(f"Downloading releases for {version_date}...")
releases = fetch_releases(version_date)
if not releases:
print(f"No releases found for {version_date}")
return None
downloaded_files = []
for release in releases:
tag_name = release["tag_name"]
print(f"Processing release: {tag_name}")
assets = release.get("assets", [])
normal_assets = [
a for a in assets
if "planes-readsb-prod-0." in a["name"] and "tmp" not in a["name"]
]
tmp_assets = [
a for a in assets
if "planes-readsb-prod-0tmp" in a["name"]
]
use_assets = normal_assets if normal_assets else tmp_assets
for asset in use_assets:
asset_name = asset["name"]
asset_url = asset["browser_download_url"]
file_path = os.path.join(OUTPUT_DIR, asset_name)
if download_asset(asset_url, file_path):
downloaded_files.append(file_path)
if not downloaded_files:
print(f"No files downloaded for {version_date}")
return None
# Extract
if extract_split_archive(downloaded_files, extract_dir):
return extract_dir
return None
def list_icao_folders(extract_dir: str) -> list[str]:
"""List all ICAO folder names from extracted directory."""
trace_files = collect_trace_files_with_find(extract_dir)
icaos = sorted(trace_files.keys())
print(f"Found {len(icaos)} unique ICAOs")
return icaos
def write_manifest(icaos: list[str], manifest_id: str) -> str:
"""Write ICAO list to manifest file.
Args:
icaos: List of ICAO codes
manifest_id: Identifier for manifest file (date or date range)
"""
manifest_path = os.path.join(OUTPUT_DIR, f"icao_manifest_{manifest_id}.txt")
with open(manifest_path, "w") as f:
for icao in sorted(icaos):
f.write(f"{icao}\n")
print(f"Wrote manifest with {len(icaos)} ICAOs to {manifest_path}")
return manifest_path
def process_single_day(target_day: datetime) -> tuple[str | None, list[str]]:
"""Process a single day: download, extract, list ICAOs.
Returns:
Tuple of (extract_dir, icaos)
"""
date_str = target_day.strftime("%Y-%m-%d")
version_date = f"v{target_day.strftime('%Y.%m.%d')}"
print(f"Processing date: {date_str} (version: {version_date})")
extract_dir = download_and_extract(version_date)
if not extract_dir:
print(f"Failed to download/extract data for {date_str}")
return None, []
icaos = list_icao_folders(extract_dir)
print(f"Found {len(icaos)} ICAOs for {date_str}")
return extract_dir, icaos
def process_date_range(start_date: datetime, end_date: datetime) -> set[str]:
"""Process multiple days: download, extract, combine ICAO lists.
Args:
start_date: Start date (inclusive)
end_date: End date (inclusive)
Returns:
Combined set of all ICAOs across the date range
"""
all_icaos: set[str] = set()
current = start_date
# Both start and end are inclusive
while current <= end_date:
_, icaos = process_single_day(current)
all_icaos.update(icaos)
current += timedelta(days=1)
return all_icaos
def main():
parser = argparse.ArgumentParser(description="Download and list ICAOs from adsb.lol data")
parser.add_argument("--date", type=str, help="Single date in YYYY-MM-DD format (default: yesterday)")
parser.add_argument("--start-date", type=str, help="Start date for range (YYYY-MM-DD)")
parser.add_argument("--end-date", type=str, help="End date for range (YYYY-MM-DD)")
args = parser.parse_args()
# Determine mode: single day or date range
if args.start_date and args.end_date:
# Historical mode: process date range
start_date = datetime.strptime(args.start_date, "%Y-%m-%d")
end_date = datetime.strptime(args.end_date, "%Y-%m-%d")
print(f"Processing date range: {args.start_date} to {args.end_date}")
all_icaos = process_date_range(start_date, end_date)
if not all_icaos:
print("No ICAOs found in date range")
sys.exit(1)
# Write combined manifest with range identifier
manifest_id = f"{args.start_date}_{args.end_date}"
write_manifest(list(all_icaos), manifest_id)
print(f"\nDone! Total ICAOs: {len(all_icaos)}")
else:
# Daily mode: single day
if args.date:
target_day = datetime.strptime(args.date, "%Y-%m-%d")
else:
target_day = get_target_day()
date_str = target_day.strftime("%Y-%m-%d")
extract_dir, icaos = process_single_day(target_day)
if not icaos:
print("No ICAOs found")
sys.exit(1)
write_manifest(icaos, date_str)
print(f"\nDone! Extract dir: {extract_dir}")
print(f"Total ICAOs: {len(icaos)}")
if __name__ == "__main__":
main()
@@ -0,0 +1,64 @@
#!/usr/bin/env python3
"""Generate date chunk matrix for historical ADS-B processing."""
import json
import os
import sys
from datetime import datetime, timedelta
def generate_chunks(start_date: str, end_date: str, chunk_days: int) -> list[dict]:
"""Generate date chunks for parallel processing.
Args:
start_date: Start date in YYYY-MM-DD format (inclusive)
end_date: End date in YYYY-MM-DD format (exclusive)
chunk_days: Number of days per chunk
Returns:
List of chunk dictionaries with start_date and end_date (both inclusive within chunk)
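Example (illustrative): generate_chunks("2025-01-01", "2025-01-08", 3) yields three chunks:
2025-01-01..2025-01-03, 2025-01-04..2025-01-06, and 2025-01-07..2025-01-07.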
"""
start = datetime.strptime(start_date, "%Y-%m-%d")
end = datetime.strptime(end_date, "%Y-%m-%d")
chunks = []
current = start
# end_date is exclusive, so we process up to but not including it
while current < end:
# chunk_end is inclusive, so subtract 1 from the next chunk start
chunk_end = min(current + timedelta(days=chunk_days - 1), end - timedelta(days=1))
chunks.append({
"start_date": current.strftime("%Y-%m-%d"),
"end_date": chunk_end.strftime("%Y-%m-%d"),
})
current = chunk_end + timedelta(days=1)
return chunks
def main() -> None:
"""Main entry point for GitHub Actions."""
start_date = os.environ.get("INPUT_START_DATE")
end_date = os.environ.get("INPUT_END_DATE")
chunk_days = int(os.environ.get("INPUT_CHUNK_DAYS", "7"))
if not start_date or not end_date:
print("ERROR: INPUT_START_DATE and INPUT_END_DATE must be set", file=sys.stderr)
sys.exit(1)
chunks = generate_chunks(start_date, end_date, chunk_days)
print(f"Generated {len(chunks)} chunks for {start_date} to {end_date}")
# Write to GitHub Actions output
github_output = os.environ.get("GITHUB_OUTPUT")
if github_output:
with open(github_output, "a") as f:
f.write(f"chunks={json.dumps(chunks)}\n")
else:
# For local testing, just print
print(json.dumps(chunks, indent=2))
if __name__ == "__main__":
main()
@@ -0,0 +1,343 @@
"""
Processes a chunk of ICAOs from pre-extracted trace files.
This is the map phase of the map-reduce pipeline.
Supports both single-day (daily) and multi-day (historical) modes.
Expects extract_dir to already exist with trace files.
Reads ICAO manifest to determine which ICAOs to process based on chunk-id.
Usage:
# Daily mode (single day)
python -m src.adsb.process_icao_chunk --chunk-id 0 --total-chunks 4
# Historical mode (date range)
python -m src.adsb.process_icao_chunk --chunk-id 0 --total-chunks 4 --start-date 2024-01-01 --end-date 2024-01-07
"""
import gc
import os
import sys
import argparse
import time
import concurrent.futures
from datetime import datetime, timedelta
import pyarrow as pa
import pyarrow.parquet as pq
from src.adsb.download_adsb_data_to_parquet import (
OUTPUT_DIR,
PARQUET_DIR,
PARQUET_SCHEMA,
COLUMNS,
MAX_WORKERS,
process_file,
get_resource_usage,
collect_trace_files_with_find,
)
CHUNK_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "adsb_chunks")
os.makedirs(CHUNK_OUTPUT_DIR, exist_ok=True)
# Smaller batch size for memory efficiency
BATCH_SIZE = 100_000
def get_target_day() -> datetime:
"""Get yesterday's date (the day we're processing)."""
return datetime.utcnow() - timedelta(days=1)
def read_manifest(manifest_id: str) -> list[str]:
"""Read ICAO manifest file.
Args:
manifest_id: Either a date string (YYYY-MM-DD) or range string (YYYY-MM-DD_YYYY-MM-DD)
"""
manifest_path = os.path.join(OUTPUT_DIR, f"icao_manifest_{manifest_id}.txt")
if not os.path.exists(manifest_path):
raise FileNotFoundError(f"Manifest not found: {manifest_path}")
with open(manifest_path, "r") as f:
icaos = [line.strip() for line in f if line.strip()]
return icaos
def deterministic_hash(s: str) -> int:
"""Return a deterministic hash for a string (unlike Python's hash() which is randomized)."""
# Use the sum of character code points - simple but deterministic
return sum(ord(c) for c in s)
def get_chunk_icaos(icaos: list[str], chunk_id: int, total_chunks: int) -> list[str]:
"""Get the subset of ICAOs for this chunk based on deterministic hash partitioning."""
return [icao for icao in icaos if deterministic_hash(icao) % total_chunks == chunk_id]
def build_trace_file_map(extract_dir: str) -> dict[str, str]:
"""Build a map of ICAO -> trace file path using find command."""
print(f"Building trace file map from {extract_dir}...")
# Debug: check what's in extract_dir
if os.path.isdir(extract_dir):
items = os.listdir(extract_dir)[:10]
print(f"First 10 items in extract_dir: {items}")
# Check if there are subdirectories
for item in items[:3]:
subpath = os.path.join(extract_dir, item)
if os.path.isdir(subpath):
subitems = os.listdir(subpath)[:5]
print(f" Contents of {item}/: {subitems}")
trace_map = collect_trace_files_with_find(extract_dir)
print(f"Found {len(trace_map)} trace files")
if len(trace_map) == 0:
# Debug: try manual find
import subprocess
result = subprocess.run(
['find', extract_dir, '-type', 'f', '-name', 'trace_full_*'],
capture_output=True, text=True
)
print(f"Manual find output (first 500 chars): {result.stdout[:500]}")
print(f"Manual find stderr: {result.stderr[:200]}")
return trace_map
def safe_process(filepath: str) -> list:
"""Safely process a file, returning empty list on error."""
try:
return process_file(filepath)
except Exception as e:
print(f"Error processing {filepath}: {e}")
return []
def rows_to_table(rows: list) -> pa.Table:
"""Convert list of rows to PyArrow table."""
import pandas as pd
df = pd.DataFrame(rows, columns=COLUMNS)
if not df['time'].dt.tz:
df['time'] = df['time'].dt.tz_localize('UTC')
return pa.Table.from_pandas(df, schema=PARQUET_SCHEMA, preserve_index=False)
def process_chunk(
chunk_id: int,
total_chunks: int,
trace_map: dict[str, str],
icaos: list[str],
output_id: str,
) -> str | None:
"""Process a chunk of ICAOs and write to parquet.
Args:
chunk_id: This chunk's ID (0-indexed)
total_chunks: Total number of chunks
trace_map: Map of ICAO -> trace file path
icaos: Full list of ICAOs from manifest
output_id: Identifier for output file (date or date range)
"""
chunk_icaos = get_chunk_icaos(icaos, chunk_id, total_chunks)
print(f"Chunk {chunk_id}/{total_chunks}: Processing {len(chunk_icaos)} ICAOs")
if not chunk_icaos:
print(f"Chunk {chunk_id}: No ICAOs to process")
return None
# Get trace file paths from the map
trace_files = []
for icao in chunk_icaos:
if icao in trace_map:
trace_files.append(trace_map[icao])
print(f"Chunk {chunk_id}: Found {len(trace_files)} trace files")
if not trace_files:
print(f"Chunk {chunk_id}: No trace files found")
return None
# Process files and write parquet in batches
output_path = os.path.join(CHUNK_OUTPUT_DIR, f"chunk_{chunk_id}_{output_id}.parquet")
start_time = time.perf_counter()
total_rows = 0
batch_rows = []
writer = None
try:
# Process in parallel batches
files_per_batch = MAX_WORKERS * 100
for offset in range(0, len(trace_files), files_per_batch):
batch_files = trace_files[offset:offset + files_per_batch]
with concurrent.futures.ProcessPoolExecutor(max_workers=MAX_WORKERS) as executor:
for rows in executor.map(safe_process, batch_files):
if rows:
batch_rows.extend(rows)
# Write when batch is full
if len(batch_rows) >= BATCH_SIZE:
table = rows_to_table(batch_rows)
total_rows += len(batch_rows)
if writer is None:
writer = pq.ParquetWriter(output_path, PARQUET_SCHEMA, compression='snappy')
writer.write_table(table)
batch_rows = []
del table
gc.collect()
elapsed = time.perf_counter() - start_time
print(f"Chunk {chunk_id}: {total_rows} rows, {elapsed:.1f}s | {get_resource_usage()}")
gc.collect()
# Write remaining rows
if batch_rows:
table = rows_to_table(batch_rows)
total_rows += len(batch_rows)
if writer is None:
writer = pq.ParquetWriter(output_path, PARQUET_SCHEMA, compression='snappy')
writer.write_table(table)
del table
finally:
if writer:
writer.close()
elapsed = time.perf_counter() - start_time
print(f"Chunk {chunk_id}: Done! {total_rows} rows in {elapsed:.1f}s | {get_resource_usage()}")
if total_rows > 0:
return output_path
return None
def process_single_day(
chunk_id: int,
total_chunks: int,
target_day: datetime,
) -> str | None:
"""Process a single day for this chunk."""
date_str = target_day.strftime("%Y-%m-%d")
version_date = f"v{target_day.strftime('%Y.%m.%d')}"
extract_dir = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0.tar_0")
if not os.path.isdir(extract_dir):
print(f"Extract directory not found: {extract_dir}")
return None
trace_map = build_trace_file_map(extract_dir)
if not trace_map:
print("No trace files found")
return None
icaos = read_manifest(date_str)
print(f"Total ICAOs in manifest: {len(icaos)}")
return process_chunk(chunk_id, total_chunks, trace_map, icaos, date_str)
def process_date_range(
chunk_id: int,
total_chunks: int,
start_date: datetime,
end_date: datetime,
) -> str | None:
"""Process a date range for this chunk.
Combines trace files from all days in the range.
Args:
chunk_id: This chunk's ID (0-indexed)
total_chunks: Total number of chunks
start_date: Start date (inclusive)
end_date: End date (inclusive)
"""
start_str = start_date.strftime("%Y-%m-%d")
end_str = end_date.strftime("%Y-%m-%d")
manifest_id = f"{start_str}_{end_str}"
print(f"Processing date range: {start_str} to {end_str}")
# Build combined trace map from all days
combined_trace_map: dict[str, str] = {}
current = start_date
# Both start and end are inclusive
while current <= end_date:
version_date = f"v{current.strftime('%Y.%m.%d')}"
extract_dir = os.path.join(OUTPUT_DIR, f"{version_date}-planes-readsb-prod-0.tar_0")
if os.path.isdir(extract_dir):
trace_map = build_trace_file_map(extract_dir)
# Later days override earlier days (use most recent trace file)
combined_trace_map.update(trace_map)
print(f" {current.strftime('%Y-%m-%d')}: {len(trace_map)} trace files")
else:
print(f" {current.strftime('%Y-%m-%d')}: no extract directory")
current += timedelta(days=1)
if not combined_trace_map:
print("No trace files found in date range")
return None
print(f"Combined trace map: {len(combined_trace_map)} ICAOs")
icaos = read_manifest(manifest_id)
print(f"Total ICAOs in manifest: {len(icaos)}")
return process_chunk(chunk_id, total_chunks, combined_trace_map, icaos, manifest_id)
def main():
parser = argparse.ArgumentParser(description="Process a chunk of ICAOs")
parser.add_argument("--chunk-id", type=int, required=True, help="Chunk ID (0-indexed)")
parser.add_argument("--total-chunks", type=int, required=True, help="Total number of chunks")
parser.add_argument("--date", type=str, help="Single date in YYYY-MM-DD format (default: yesterday)")
parser.add_argument("--start-date", type=str, help="Start date for range (YYYY-MM-DD)")
parser.add_argument("--end-date", type=str, help="End date for range (YYYY-MM-DD)")
args = parser.parse_args()
print(f"Processing chunk {args.chunk_id}/{args.total_chunks}")
print(f"OUTPUT_DIR: {OUTPUT_DIR}")
print(f"CHUNK_OUTPUT_DIR: {CHUNK_OUTPUT_DIR}")
print(f"Resource usage at start: {get_resource_usage()}")
# Debug: List what's in OUTPUT_DIR
print(f"\nContents of {OUTPUT_DIR}:")
if os.path.isdir(OUTPUT_DIR):
for item in os.listdir(OUTPUT_DIR)[:20]:
print(f" - {item}")
else:
print(f" Directory does not exist!")
# Determine mode: single day or date range
if args.start_date and args.end_date:
# Historical mode
start_date = datetime.strptime(args.start_date, "%Y-%m-%d")
end_date = datetime.strptime(args.end_date, "%Y-%m-%d")
output_path = process_date_range(args.chunk_id, args.total_chunks, start_date, end_date)
else:
# Daily mode
if args.date:
target_day = datetime.strptime(args.date, "%Y-%m-%d")
else:
target_day = get_target_day()
output_path = process_single_day(args.chunk_id, args.total_chunks, target_day)
if output_path:
print(f"Output: {output_path}")
else:
print("No output generated")
if __name__ == "__main__":
main()
@@ -0,0 +1,97 @@
"""
Reduce step: downloads all chunk CSVs from S3, combines them,
deduplicates across the full dataset, and uploads the final result.
Environment variables:
S3_BUCKET — bucket with intermediate results
RUN_ID — run identifier matching the map workers
GLOBAL_START_DATE — overall start date for output filename
GLOBAL_END_DATE — overall end date for output filename
"""
import gzip
import os
import shutil
from pathlib import Path
import boto3
import polars as pl
from compress_adsb_to_aircraft_data import COLUMNS, deduplicate_by_signature
def main():
s3_bucket = os.environ["S3_BUCKET"]
run_id = os.environ.get("RUN_ID", "default")
global_start = os.environ["GLOBAL_START_DATE"]
global_end = os.environ["GLOBAL_END_DATE"]
s3 = boto3.client("s3")
prefix = f"intermediate/{run_id}/"
# List all chunk files for this run
paginator = s3.get_paginator("list_objects_v2")
chunk_keys = []
for page in paginator.paginate(Bucket=s3_bucket, Prefix=prefix):
for obj in page.get("Contents", []):
if obj["Key"].endswith(".csv.gz"):
chunk_keys.append(obj["Key"])
chunk_keys.sort()
print(f"Found {len(chunk_keys)} chunks to combine")
if not chunk_keys:
print("No chunks found — nothing to reduce.")
return
# Download and concatenate all chunks
download_dir = Path("/tmp/chunks")
download_dir.mkdir(parents=True, exist_ok=True)
dfs = []
for key in chunk_keys:
gz_path = download_dir / Path(key).name
csv_path = gz_path.with_suffix("") # Remove .gz
print(f"Downloading {key}...")
s3.download_file(s3_bucket, key, str(gz_path))
# Decompress
with gzip.open(gz_path, 'rb') as f_in:
with open(csv_path, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
gz_path.unlink()
df_chunk = pl.read_csv(csv_path)
print(f" Loaded {df_chunk.height} rows from {csv_path.name}")
dfs.append(df_chunk)
# Free disk space after loading
csv_path.unlink()
df_accumulated = pl.concat(dfs) if dfs else pl.DataFrame()
print(f"Combined: {df_accumulated.height} rows before dedup")
# Final global deduplication
df_accumulated = deduplicate_by_signature(df_accumulated)
print(f"After dedup: {df_accumulated.height} rows")
# Write and upload final result
output_name = f"openairframes_adsb_{global_start}_{global_end}.csv.gz"
csv_output = Path(f"/tmp/openairframes_adsb_{global_start}_{global_end}.csv")
gz_output = Path(f"/tmp/{output_name}")
df_accumulated.write_csv(csv_output)
with open(csv_output, 'rb') as f_in:
with gzip.open(gz_output, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
csv_output.unlink()
final_key = f"final/{output_name}"
print(f"Uploading to s3://{s3_bucket}/{final_key}")
s3.upload_file(str(gz_output), s3_bucket, final_key)
print(f"Final output: {df_accumulated.height} records -> {final_key}")
if __name__ == "__main__":
main()
@@ -0,0 +1,2 @@
polars>=1.0
boto3>=1.34
@@ -0,0 +1,5 @@
polars>=1.0
pyarrow>=14.0
orjson>=3.9
boto3>=1.34
zstandard>=0.22
@@ -0,0 +1,89 @@
"""
Map worker: processes a date range chunk, uploads result to S3.
Environment variables:
START_DATE — inclusive, YYYY-MM-DD
END_DATE — exclusive, YYYY-MM-DD
S3_BUCKET — bucket for intermediate results
RUN_ID — unique run identifier for namespacing S3 keys
"""
import os
import sys
from datetime import datetime, timedelta
from pathlib import Path
import boto3
import polars as pl
from compress_adsb_to_aircraft_data import (
load_historical_for_day,
deduplicate_by_signature,
COLUMNS,
)
def main():
start_date_str = os.environ["START_DATE"]
end_date_str = os.environ["END_DATE"]
s3_bucket = os.environ["S3_BUCKET"]
run_id = os.environ.get("RUN_ID", "default")
start_date = datetime.strptime(start_date_str, "%Y-%m-%d")
end_date = datetime.strptime(end_date_str, "%Y-%m-%d")
total_days = (end_date - start_date).days
print(f"Worker: processing {total_days} days [{start_date_str}, {end_date_str})")
dfs = []
current_date = start_date
while current_date < end_date:
day_str = current_date.strftime("%Y-%m-%d")
print(f" Loading {day_str}...")
df_compressed = load_historical_for_day(current_date)
if df_compressed.height == 0:
raise RuntimeError(f"No data found for {day_str}")
dfs.append(df_compressed)
total_rows = sum(df.height for df in dfs)
print(f" +{df_compressed.height} rows (total: {total_rows})")
# Delete the local cache after each day to save disk space in the container
cache_dir = Path("data/adsb")
if cache_dir.exists():
import shutil
shutil.rmtree(cache_dir)
current_date += timedelta(days=1)
# Concatenate all days
df_accumulated = pl.concat(dfs) if dfs else pl.DataFrame()
# Deduplicate within this chunk
df_accumulated = deduplicate_by_signature(df_accumulated)
print(f"After dedup: {df_accumulated.height} rows")
# Write to local file then upload to S3
local_path = Path(f"/tmp/chunk_{start_date_str}_{end_date_str}.csv")
df_accumulated.write_csv(local_path)
# Compress with gzip
import gzip
import shutil
gz_path = Path(f"/tmp/chunk_{start_date_str}_{end_date_str}.csv.gz")
with open(local_path, 'rb') as f_in:
with gzip.open(gz_path, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
local_path.unlink() # Remove uncompressed file
s3_key = f"intermediate/{run_id}/chunk_{start_date_str}_{end_date_str}.csv.gz"
print(f"Uploading to s3://{s3_bucket}/{s3_key}")
s3 = boto3.client("s3")
s3.upload_file(str(gz_path), s3_bucket, s3_key)
print("Done.")
if __name__ == "__main__":
main()
@@ -1,89 +0,0 @@
from pathlib import Path
import pandas as pd
import re
from derive_from_faa_master_txt import concat_faa_historical_df
def concatenate_aircraft_csvs(
input_dir: Path = Path("data/concat"),
output_dir: Path = Path("data/planequery_aircraft"),
filename_pattern: str = r"planequery_aircraft_(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})\.csv"
):
"""
Read all CSVs matching the pattern from input_dir in order,
concatenate them using concat_faa_historical_df, and output a single CSV.
Args:
input_dir: Directory containing the CSV files to concatenate
output_dir: Directory where the output CSV will be saved
filename_pattern: Regex pattern to match CSV filenames
"""
input_dir = Path(input_dir)
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
# Find all matching CSV files
pattern = re.compile(filename_pattern)
csv_files = []
for csv_path in sorted(input_dir.glob("*.csv")):
match = pattern.search(csv_path.name)
if match:
start_date = match.group(1)
end_date = match.group(2)
csv_files.append((start_date, end_date, csv_path))
# Sort by start date, then end date
csv_files.sort(key=lambda x: (x[0], x[1]))
if not csv_files:
raise FileNotFoundError(f"No CSV files matching pattern found in {input_dir}")
print(f"Found {len(csv_files)} CSV files to concatenate")
# Read first CSV as base
first_start_date, first_end_date, first_path = csv_files[0]
print(f"Reading base file: {first_path.name}")
df_base = pd.read_csv(
first_path,
dtype={
'transponder_code': str,
'unique_regulatory_id': str,
'registrant_county': str
}
)
# Concatenate remaining CSVs
for start_date, end_date, csv_path in csv_files[1:]:
print(f"Concatenating: {csv_path.name}")
df_new = pd.read_csv(
csv_path,
dtype={
'transponder_code': str,
'unique_regulatory_id': str,
'registrant_county': str
}
)
df_base = concat_faa_historical_df(df_base, df_new)
# Verify monotonic increasing download_date
assert df_base['download_date'].is_monotonic_increasing, "download_date is not monotonic increasing"
# Output filename uses first start date and last end date
last_start_date, last_end_date, _ = csv_files[-1]
output_filename = f"planequery_aircraft_{first_start_date}_{last_end_date}.csv"
output_path = output_dir / output_filename
print(f"Writing output to: {output_path}")
df_base.to_csv(output_path, index=False)
print(f"Successfully concatenated {len(csv_files)} files into {output_filename}")
print(f"Total rows: {len(df_base)}")
return output_path
if __name__ == "__main__":
# Example usage - modify these paths as needed
concatenate_aircraft_csvs(
input_dir=Path("data/concat"),
output_dir=Path("data/planequery_aircraft")
)
@@ -0,0 +1 @@
"""Community contributions processing module."""
@@ -0,0 +1,309 @@
#!/usr/bin/env python3
"""
Approve a community submission and create a PR.
This script is called by the GitHub Actions workflow when the 'approved'
label is added to a validated submission issue.
Usage:
python -m src.contributions.approve_submission --issue-number 123 --issue-body "..." --author "username" --author-id 12345
Environment variables:
GITHUB_TOKEN: GitHub API token with repo write permissions
GITHUB_REPOSITORY: owner/repo
"""
import argparse
import base64
import json
import os
import sys
import urllib.request
import urllib.error
from datetime import datetime, timezone
from .schema import extract_json_from_issue_body, extract_contributor_name_from_issue_body, parse_and_validate, load_schema, SCHEMAS_DIR
from .contributor import (
generate_contributor_uuid,
generate_submission_filename,
compute_content_hash,
)
from .update_schema import generate_updated_schema, check_for_new_tags, get_existing_tag_definitions
from .read_community_data import build_tag_type_registry
def github_api_request(
method: str,
endpoint: str,
data: dict | None = None,
accept: str = "application/vnd.github.v3+json"
) -> dict:
"""Make a GitHub API request."""
token = os.environ.get("GITHUB_TOKEN")
repo = os.environ.get("GITHUB_REPOSITORY")
if not token or not repo:
raise EnvironmentError("GITHUB_TOKEN and GITHUB_REPOSITORY must be set")
url = f"https://api.github.com/repos/{repo}{endpoint}"
headers = {
"Authorization": f"token {token}",
"Accept": accept,
"Content-Type": "application/json",
}
body = json.dumps(data).encode() if data else None
req = urllib.request.Request(url, data=body, headers=headers, method=method)
try:
with urllib.request.urlopen(req) as response:
response_body = response.read()
# DELETE requests return empty body (204 No Content)
if not response_body:
return {}
return json.loads(response_body)
except urllib.error.HTTPError as e:
error_body = e.read().decode() if e.fp else ""
print(f"GitHub API error: {e.code} {e.reason}: {error_body}", file=sys.stderr)
raise
def add_issue_comment(issue_number: int, body: str) -> None:
"""Add a comment to a GitHub issue."""
github_api_request("POST", f"/issues/{issue_number}/comments", {"body": body})
def get_default_branch_sha() -> str:
"""Get the SHA of the default branch (main)."""
ref = github_api_request("GET", "/git/ref/heads/main")
return ref["object"]["sha"]
def create_branch(branch_name: str, sha: str) -> None:
"""Create a new branch from a SHA."""
try:
github_api_request("POST", "/git/refs", {
"ref": f"refs/heads/{branch_name}",
"sha": sha,
})
except urllib.error.HTTPError as e:
if e.code == 422: # Branch exists
# Delete and recreate
try:
github_api_request("DELETE", f"/git/refs/heads/{branch_name}")
except urllib.error.HTTPError:
pass
github_api_request("POST", "/git/refs", {
"ref": f"refs/heads/{branch_name}",
"sha": sha,
})
else:
raise
def get_file_sha(path: str, branch: str) -> str | None:
"""Get the SHA of an existing file, or None if it doesn't exist."""
try:
response = github_api_request("GET", f"/contents/{path}?ref={branch}")
return response.get("sha")
except Exception:
return None
def create_or_update_file(path: str, content: str, message: str, branch: str) -> None:
"""Create or update a file in the repository."""
content_b64 = base64.b64encode(content.encode()).decode()
payload = {
"message": message,
"content": content_b64,
"branch": branch,
}
# If file exists, we need to include its SHA to update it
sha = get_file_sha(path, branch)
if sha:
payload["sha"] = sha
github_api_request("PUT", f"/contents/{path}", payload)
def create_pull_request(title: str, head: str, base: str, body: str) -> dict:
"""Create a pull request."""
return github_api_request("POST", "/pulls", {
"title": title,
"head": head,
"base": base,
"body": body,
})
def add_labels_to_issue(issue_number: int, labels: list[str]) -> None:
"""Add labels to an issue or PR."""
github_api_request("POST", f"/issues/{issue_number}/labels", {"labels": labels})
def process_submission(
issue_number: int,
issue_body: str,
author_username: str,
author_id: int,
) -> bool:
"""
Process an approved submission and create a PR.
Args:
issue_number: The GitHub issue number
issue_body: The issue body text
author_username: The GitHub username of the issue author
author_id: The numeric GitHub user ID
Returns:
True if successful, False otherwise
"""
# Extract and validate JSON
json_str = extract_json_from_issue_body(issue_body)
if not json_str:
add_issue_comment(issue_number, "❌ Could not extract JSON from submission.")
return False
data, errors = parse_and_validate(json_str)
if errors or data is None:
error_list = "\n".join(f"- {e}" for e in errors) if errors else "Unknown error"
add_issue_comment(issue_number, f"❌ **Validation Failed**\n\n{error_list}")
return False
# Normalize to list
submissions: list[dict] = data if isinstance(data, list) else [data]
# Generate contributor UUID from GitHub ID
contributor_uuid = generate_contributor_uuid(author_id)
# Extract contributor name from issue form (None means user opted out of attribution)
contributor_name = extract_contributor_name_from_issue_body(issue_body)
# Add metadata to each submission
now = datetime.now(timezone.utc)
date_str = now.strftime("%Y-%m-%d")
timestamp_str = now.isoformat()
for submission in submissions:
submission["contributor_uuid"] = contributor_uuid
if contributor_name:
submission["contributor_name"] = contributor_name
submission["creation_timestamp"] = timestamp_str
# Generate unique filename
content_json = json.dumps(submissions, indent=2, sort_keys=True)
content_hash = compute_content_hash(content_json)
filename = generate_submission_filename(author_username, date_str, content_hash)
file_path = f"community/{date_str}/{filename}"
# Create branch
branch_name = f"community-submission-{issue_number}"
default_sha = get_default_branch_sha()
create_branch(branch_name, default_sha)
# Create file
commit_message = f"Add community submission from @{author_username} (closes #{issue_number})"
create_or_update_file(file_path, content_json, commit_message, branch_name)
# Update schema with any new tags (modifies v1 in place)
schema_updated = False
new_tags = []
try:
# Build tag registry from new submissions
tag_registry = build_tag_type_registry(submissions)
# Get current schema and merge existing tags
current_schema = load_schema()
existing_tags = get_existing_tag_definitions(current_schema)
# Merge existing tags into registry
for tag_name, tag_def in existing_tags.items():
if tag_name not in tag_registry:
tag_type = tag_def.get("type", "string")
tag_registry[tag_name] = tag_type
# Check for new tags
new_tags = check_for_new_tags(tag_registry, current_schema)
if new_tags:
# Generate updated schema
updated_schema = generate_updated_schema(current_schema, tag_registry)
schema_json = json.dumps(updated_schema, indent=2) + "\n"
create_or_update_file(
"schemas/community_submission.v1.schema.json",
schema_json,
f"Update schema with new tags: {', '.join(new_tags)}",
branch_name
)
schema_updated = True
except Exception as e:
print(f"Warning: Could not update schema: {e}", file=sys.stderr)
# Create PR
schema_note = ""
if schema_updated:
schema_note = f"\n**Schema Updated:** Added new tags: `{', '.join(new_tags)}`\n"
pr_body = f"""## Community Submission
Adds {len(submissions)} submission(s) from @{author_username}.
**File:** `{file_path}`
**Contributor UUID:** `{contributor_uuid}`
{schema_note}
Closes #{issue_number}
---
### Submissions
```json
{content_json}
```"""
pr = create_pull_request(
title=f"Community submission: {filename}",
head=branch_name,
base="main",
body=pr_body,
)
# Add labels to PR
add_labels_to_issue(pr["number"], ["community", "auto-generated"])
# Comment on original issue
add_issue_comment(
issue_number,
f"✅ **Submission Approved**\n\n"
f"PR #{pr['number']} has been created to add your submission.\n\n"
f"**File:** `{file_path}`\n"
f"**Your Contributor UUID:** `{contributor_uuid}`\n\n"
f"The PR will be merged by a maintainer."
)
print(f"Created PR #{pr['number']} for submission")
return True
def main():
parser = argparse.ArgumentParser(description="Approve community submission and create PR")
parser.add_argument("--issue-number", type=int, required=True, help="GitHub issue number")
parser.add_argument("--issue-body", required=True, help="Issue body text")
parser.add_argument("--author", required=True, help="Issue author username")
parser.add_argument("--author-id", type=int, required=True, help="Issue author numeric ID")
args = parser.parse_args()
success = process_submission(
issue_number=args.issue_number,
issue_body=args.issue_body,
author_username=args.author,
author_id=args.author_id,
)
sys.exit(0 if success else 1)
if __name__ == "__main__":
main()
@@ -0,0 +1,86 @@
"""Contributor identification utilities."""
import hashlib
import uuid
# DNS namespace UUID for generating UUIDv5
DNS_NAMESPACE = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')
def generate_contributor_uuid(github_user_id: int) -> str:
"""
Generate a deterministic UUID v5 from a GitHub user ID.
This ensures the same GitHub account always gets the same contributor UUID.
Args:
github_user_id: The numeric GitHub user ID
Returns:
UUID string in standard format
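Example (illustrative): generate_contributor_uuid(12345) returns the same UUID string on every call.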
"""
name = f"github:{github_user_id}"
return str(uuid.uuid5(DNS_NAMESPACE, name))
def sanitize_username(username: str, max_length: int = 20) -> str:
"""
Sanitize a GitHub username for use in filenames.
Args:
username: GitHub username
max_length: Maximum length of sanitized name
Returns:
Lowercase alphanumeric string with underscores
"""
sanitized = ""
for char in username.lower():
if char.isalnum():
sanitized += char
else:
sanitized += "_"
# Collapse multiple underscores
while "__" in sanitized:
sanitized = sanitized.replace("__", "_")
return sanitized.strip("_")[:max_length]
def generate_submission_filename(
username: str,
date_str: str,
content_hash: str,
extension: str = ".json"
) -> str:
"""
Generate a unique filename for a community submission.
Format: {sanitized_username}_{date}_{short_hash}.json
Args:
username: GitHub username
date_str: Date in YYYY-MM-DD format
content_hash: Hash of the submission content (will be truncated to 8 chars)
extension: File extension (default: .json)
Returns:
Unique filename string
"""
sanitized_name = sanitize_username(username)
short_hash = content_hash[:8]
return f"{sanitized_name}_{date_str}_{short_hash}{extension}"
def compute_content_hash(content: str) -> str:
"""
Compute SHA256 hash of content.
Args:
content: String content to hash
Returns:
Hex digest of SHA256 hash
"""
return hashlib.sha256(content.encode()).hexdigest()
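# Illustrative usage sketch (the GitHub user ID, username, and content below
# are made-up values, not taken from this repository):
def _example_contributor_identity() -> None:
    # The same GitHub user ID always maps to the same UUID, so re-approving a
    # submission never creates a second identity for the same account.
    contributor_uuid = generate_contributor_uuid(583231)
    assert contributor_uuid == generate_contributor_uuid(583231)
    content = '[{"registration_number": "N12345"}]'
    filename = generate_submission_filename(
        username="Octo-Cat!",
        date_str="2026-02-13",
        content_hash=compute_content_hash(content),
    )
    # e.g. "octo_cat_2026-02-13_<first 8 hex chars of the SHA256>.json"
    print(contributor_uuid, filename)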
@@ -0,0 +1,141 @@
#!/usr/bin/env python3
"""
Generate a daily CSV of all community contributions.
Reads all JSON files from the community/ directory and outputs a sorted CSV
with creation_timestamp as the first column and contributor_uuid as the last column.
Usage:
python -m src.contributions.create_daily_community_release
"""
from datetime import datetime, timezone
from pathlib import Path
import json
import sys
import pandas as pd
COMMUNITY_DIR = Path(__file__).parent.parent.parent / "community"
OUT_ROOT = Path("data/openairframes")
def read_all_submissions(community_dir: Path) -> list[dict]:
"""Read all JSON submissions from the community directory."""
all_submissions = []
for json_file in sorted(community_dir.glob("*.json")):
try:
with open(json_file) as f:
data = json.load(f)
# Normalize to list
submissions = data if isinstance(data, list) else [data]
all_submissions.extend(submissions)
except (json.JSONDecodeError, OSError) as e:
print(f"Warning: Failed to read {json_file}: {e}", file=sys.stderr)
return all_submissions
def submissions_to_dataframe(submissions: list[dict]) -> pd.DataFrame:
"""
Convert submissions to a DataFrame with proper column ordering.
Column order:
- creation_timestamp (first)
- transponder_code_hex
- registration_number
- openairframes_id
- contributor_name
- [other columns alphabetically]
- contributor_uuid (last)
"""
if not submissions:
return pd.DataFrame()
df = pd.DataFrame(submissions)
# Ensure required columns exist
required_cols = [
"creation_timestamp",
"transponder_code_hex",
"registration_number",
"openairframes_id",
"contributor_name",
"contributor_uuid",
]
for col in required_cols:
if col not in df.columns:
df[col] = None
# Sort by creation_timestamp ascending
df = df.sort_values("creation_timestamp", ascending=True, na_position="last")
# Reorder columns: specific order first, contributor_uuid last
first_cols = [
"creation_timestamp",
"transponder_code_hex",
"registration_number",
"openairframes_id",
"contributor_name",
]
last_cols = ["contributor_uuid"]
middle_cols = sorted([
col for col in df.columns
if col not in first_cols and col not in last_cols
])
ordered_cols = first_cols + middle_cols + last_cols
df = df[ordered_cols]
return df.reset_index(drop=True)
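# Illustrative sketch (made-up rows): whatever key order the submissions use,
# the output keeps creation_timestamp first, contributor_uuid last, and sorts
# any extra columns alphabetically in between.
def _example_column_ordering() -> None:
    demo = [
        {"contributor_uuid": "u-2", "creation_timestamp": "2026-02-13T00:00:00Z",
         "registration_number": "N12345", "notes": "b"},
        {"contributor_uuid": "u-1", "creation_timestamp": "2026-02-12T00:00:00Z",
         "registration_number": "N54321", "notes": "a"},
    ]
    df = submissions_to_dataframe(demo)
    # ['creation_timestamp', 'transponder_code_hex', 'registration_number',
    #  'openairframes_id', 'contributor_name', 'notes', 'contributor_uuid']
    print(list(df.columns))
    # Rows come back sorted by creation_timestamp, earliest first.
    print(df["creation_timestamp"].tolist())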
def main():
"""Generate the daily community contributions CSV."""
date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
print(f"Reading community submissions from {COMMUNITY_DIR}")
submissions = read_all_submissions(COMMUNITY_DIR)
if not submissions:
print("No community submissions found.")
# Still create an empty CSV with headers
df = pd.DataFrame(columns=[
"creation_timestamp",
"transponder_code_hex",
"registration_number",
"openairframes_id",
"contributor_name",
"tags",
"contributor_uuid",
])
else:
print(f"Found {len(submissions)} total submissions")
df = submissions_to_dataframe(submissions)
# Determine date range for filename
if not df.empty and df["creation_timestamp"].notna().any():
# Get earliest timestamp for start date
earliest = pd.to_datetime(df["creation_timestamp"]).min()
start_date_str = earliest.strftime("%Y-%m-%d")
else:
start_date_str = date_str
# Output
OUT_ROOT.mkdir(parents=True, exist_ok=True)
output_file = OUT_ROOT / f"openairframes_community_{start_date_str}_{date_str}.csv"
df.to_csv(output_file, index=False)
print(f"Saved: {output_file}")
print(f"Total contributions: {len(df)}")
return output_file
if __name__ == "__main__":
main()
+162
@@ -0,0 +1,162 @@
#!/usr/bin/env python3
"""
Read and aggregate all community submission data.
Usage:
python -m src.contributions.read_community_data
python -m src.contributions.read_community_data --output merged.json
"""
import argparse
import json
import sys
from pathlib import Path
COMMUNITY_DIR = Path(__file__).parent.parent.parent / "community"
def read_all_submissions(community_dir: Path | None = None) -> list[dict]:
"""
Read all JSON submissions from the community directory.
Args:
community_dir: Path to community directory. Uses default if None.
Returns:
List of all submission dictionaries
"""
if community_dir is None:
community_dir = COMMUNITY_DIR
all_submissions = []
# Search both root directory and date subdirectories (e.g., 2026-02-12/)
for json_file in sorted(community_dir.glob("**/*.json")):
try:
with open(json_file) as f:
data = json.load(f)
# Normalize to list
submissions = data if isinstance(data, list) else [data]
# Add source file metadata
for submission in submissions:
submission["_source_file"] = json_file.name
all_submissions.extend(submissions)
except (json.JSONDecodeError, OSError) as e:
print(f"Warning: Failed to read {json_file}: {e}", file=sys.stderr)
return all_submissions
def get_python_type_name(value) -> str:
"""Get a normalized type name for a value."""
if value is None:
return "null"
if isinstance(value, bool):
return "boolean"
if isinstance(value, int):
return "integer"
if isinstance(value, float):
return "number"
if isinstance(value, str):
return "string"
if isinstance(value, list):
return "array"
if isinstance(value, dict):
return "object"
return type(value).__name__
def build_tag_type_registry(submissions: list[dict]) -> dict[str, str]:
"""
Build a registry of tag names to their expected types from existing submissions.
Args:
submissions: List of existing submission dictionaries
Returns:
Dict mapping tag name to expected type (e.g., {"internet": "string", "year_built": "integer"})
"""
tag_types = {}
for submission in submissions:
tags = submission.get("tags", {})
if not isinstance(tags, dict):
continue
for key, value in tags.items():
inferred_type = get_python_type_name(value)
if key not in tag_types:
tag_types[key] = inferred_type
# If there's a conflict, keep the first type (it's already in use)
return tag_types
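# Illustrative sketch (made-up tag values): the first occurrence of a tag
# fixes its type, so a later conflicting value does not change the registry.
def _example_tag_registry() -> None:
    demo = [
        {"tags": {"year_built": 1998, "special_livery": True}},
        {"tags": {"year_built": "1999", "operator": "Acme Air"}},
    ]
    # -> {"year_built": "integer", "special_livery": "boolean", "operator": "string"}
    print(build_tag_type_registry(demo))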
def group_by_identifier(submissions: list[dict]) -> dict[str, list[dict]]:
"""
Group submissions by their identifier (registration, transponder, or airframe ID).
Returns:
Dict mapping identifier to list of submissions for that identifier
"""
grouped = {}
for submission in submissions:
# Determine identifier
if "registration_number" in submission:
key = f"reg:{submission['registration_number']}"
elif "transponder_code_hex" in submission:
key = f"icao:{submission['transponder_code_hex']}"
elif "openairframes_id" in submission:
key = f"id:{submission['openairframes_id']}"
else:
key = "_unknown"
if key not in grouped:
grouped[key] = []
grouped[key].append(submission)
return grouped
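# Illustrative sketch (made-up submissions): the grouping key prefers
# registration_number, then transponder_code_hex, then openairframes_id.
def _example_grouping() -> None:
    demo = [
        {"registration_number": "N12345", "transponder_code_hex": "A1B2C3"},
        {"transponder_code_hex": "A1B2C3"},
        {"openairframes_id": "cessna|172|0001"},
    ]
    # -> ['icao:A1B2C3', 'id:cessna|172|0001', 'reg:N12345']
    print(sorted(group_by_identifier(demo)))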
def main():
parser = argparse.ArgumentParser(description="Read community submission data")
parser.add_argument("--output", "-o", help="Output file (default: stdout)")
parser.add_argument("--group", action="store_true", help="Group by identifier")
parser.add_argument("--stats", action="store_true", help="Print statistics only")
args = parser.parse_args()
submissions = read_all_submissions()
if args.stats:
grouped = group_by_identifier(submissions)
contributors = set(s.get("contributor_uuid", "unknown") for s in submissions)
print(f"Total submissions: {len(submissions)}")
print(f"Unique identifiers: {len(grouped)}")
print(f"Unique contributors: {len(contributors)}")
return
if args.group:
result = group_by_identifier(submissions)
else:
result = submissions
output = json.dumps(result, indent=2)
if args.output:
with open(args.output, "w") as f:
f.write(output)
print(f"Wrote {len(submissions)} submissions to {args.output}")
else:
print(output)
if __name__ == "__main__":
main()
+66
@@ -0,0 +1,66 @@
#!/usr/bin/env python3
"""
Regenerate schema for a PR branch after main has been merged in.
This script looks at the submission files in this branch and updates
the schema if new tags were introduced.
Usage: python -m src.contributions.regenerate_pr_schema
"""
import json
import sys
from pathlib import Path
# Add parent to path for imports when running as script
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from src.contributions.read_community_data import read_all_submissions, build_tag_type_registry
from src.contributions.update_schema import (
get_existing_tag_definitions,
check_for_new_tags,
generate_updated_schema,
)
from src.contributions.schema import load_schema, SCHEMAS_DIR
def main():
"""Main entry point."""
# Load current schema
current_schema = load_schema()
# Get existing tag definitions from schema
existing_tags = get_existing_tag_definitions(current_schema)
# Read all submissions (including ones from this PR branch)
submissions = read_all_submissions()
if not submissions:
print("No submissions found")
return
# Build tag registry from all submissions
tag_registry = build_tag_type_registry(submissions)
# Check for new tags not in the current schema
new_tags = check_for_new_tags(tag_registry, current_schema)
if new_tags:
print(f"Found new tags: {new_tags}")
print("Updating schema...")
# Generate updated schema
updated_schema = generate_updated_schema(current_schema, tag_registry)
# Write updated schema (in place)
schema_path = SCHEMAS_DIR / "community_submission.v1.schema.json"
with open(schema_path, 'w') as f:
json.dump(updated_schema, f, indent=2)
f.write("\n")
print(f"Updated {schema_path}")
else:
print("No new tags found, schema is up to date")
if __name__ == "__main__":
main()
+225
@@ -0,0 +1,225 @@
"""Schema validation for community submissions."""
import json
import re
from pathlib import Path
from typing import Any
try:
from jsonschema import Draft202012Validator
except ImportError:
Draft202012Validator = None
SCHEMAS_DIR = Path(__file__).parent.parent.parent / "schemas"
# For backwards compatibility
SCHEMA_PATH = SCHEMAS_DIR / "community_submission.v1.schema.json"
def get_latest_schema_version() -> int:
"""
Find the latest schema version number.
Returns:
Latest version number (e.g., 1, 2, 3)
"""
pattern = re.compile(r"community_submission\.v(\d+)\.schema\.json$")
max_version = 0
for path in SCHEMAS_DIR.glob("community_submission.v*.schema.json"):
match = pattern.search(path.name)
if match:
version = int(match.group(1))
max_version = max(max_version, version)
return max_version
def get_schema_path(version: int | None = None) -> Path:
"""
Get path to a specific schema version, or latest if version is None.
Args:
version: Schema version number, or None for latest
Returns:
Path to schema file
"""
if version is None:
version = get_latest_schema_version()
return SCHEMAS_DIR / f"community_submission.v{version}.schema.json"
def load_schema(version: int | None = None) -> dict:
"""
Load the community submission schema.
Args:
version: Schema version to load. If None, loads the latest version.
Returns:
Schema dict
"""
schema_path = get_schema_path(version)
with open(schema_path) as f:
return json.load(f)
def validate_submission(data: dict | list, schema: dict | None = None) -> list[str]:
"""
Validate submission(s) against schema.
Args:
data: Single submission dict or list of submissions
schema: Optional schema dict. If None, loads from default path.
Returns:
List of error messages. Empty list means validation passed.
"""
if Draft202012Validator is None:
raise ImportError("jsonschema is required: pip install jsonschema")
if schema is None:
schema = load_schema()
submissions = data if isinstance(data, list) else [data]
errors = []
validator = Draft202012Validator(schema)
for i, submission in enumerate(submissions):
prefix = f"[{i}] " if len(submissions) > 1 else ""
for error in validator.iter_errors(submission):
path = ".".join(str(p) for p in error.path) if error.path else "(root)"
errors.append(f"{prefix}{path}: {error.message}")
return errors
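# Illustrative sketch: the schema below is a made-up minimal schema (not the
# repository's community_submission.v1 schema) and requires jsonschema to be
# installed; it only demonstrates how errors are reported per submission.
def _example_validation() -> None:
    demo_schema = {
        "type": "object",
        "required": ["registration_number"],
        "properties": {"registration_number": {"type": "string"}},
    }
    errors = validate_submission(
        [{"registration_number": "N12345"}, {"registration_number": 123}],
        schema=demo_schema,
    )
    # -> ["[1] registration_number: 123 is not of type 'string'"]
    print(errors)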
def download_github_attachment(url: str) -> str | None:
"""
Download content from a GitHub attachment URL.
Args:
url: GitHub attachment URL (e.g., https://github.com/user-attachments/files/...)
Returns:
File content as string, or None if download failed
"""
import urllib.request
import urllib.error
try:
req = urllib.request.Request(url, headers={"User-Agent": "OpenAirframes-Bot"})
with urllib.request.urlopen(req, timeout=30) as response:
return response.read().decode("utf-8")
except (urllib.error.URLError, urllib.error.HTTPError, UnicodeDecodeError) as e:
print(f"Failed to download attachment from {url}: {e}")
return None
def extract_json_from_issue_body(body: str) -> str | None:
"""
Extract JSON from GitHub issue body.
Looks for JSON in the 'Submission JSON' section, either:
- A GitHub file attachment URL (drag-and-drop .json file)
- Wrapped in code blocks (```json ... ``` or ``` ... ```)
- Or raw JSON after the header
Args:
body: The issue body text
Returns:
Extracted JSON string or None if not found
"""
# Try: GitHub attachment URL in the Submission JSON section
# Format: [filename.json](https://github.com/user-attachments/files/...)
# Or just the raw URL
pattern_attachment = r"### Submission JSON\s*\n[\s\S]*?(https://github\.com/(?:user-attachments/files|.*?/files)/[^\s\)\]]+\.json)"
match = re.search(pattern_attachment, body)
if match:
url = match.group(1)
content = download_github_attachment(url)
if content:
return content.strip()
# Also check for GitHub user-attachments URL anywhere in submission section
pattern_attachment_alt = r"\[.*?\.json\]\((https://github\.com/[^\)]+)\)"
match = re.search(pattern_attachment_alt, body)
if match:
url = match.group(1)
if ".json" in url or "user-attachments" in url:
content = download_github_attachment(url)
if content:
return content.strip()
# Try: JSON in code blocks after "### Submission JSON"
pattern_codeblock = r"### Submission JSON\s*\n\s*```(?:json)?\s*\n([\s\S]*?)\n\s*```"
match = re.search(pattern_codeblock, body)
if match:
return match.group(1).strip()
# Try: Raw JSON after "### Submission JSON" until next section or end
pattern_raw = r"### Submission JSON\s*\n\s*([\[{][\s\S]*?[\]}])(?=\n###|\n\n###|$)"
match = re.search(pattern_raw, body)
if match:
return match.group(1).strip()
# Try: Any JSON object/array in the body (fallback)
pattern_any = r"([\[{][\s\S]*?[\]}])"
for match in re.finditer(pattern_any, body):
candidate = match.group(1).strip()
# Validate it looks like JSON
if candidate.startswith('{') and candidate.endswith('}'):
return candidate
if candidate.startswith('[') and candidate.endswith(']'):
return candidate
return None
def extract_contributor_name_from_issue_body(body: str) -> str | None:
"""
Extract contributor name from GitHub issue body.
Looks for the 'Contributor Name' field in the issue form.
Args:
body: The issue body text
Returns:
Contributor name string or None if not found/empty
"""
# Match "### Contributor Name" section
pattern = r"### Contributor Name\s*\n\s*(.+?)(?=\n###|\n\n|$)"
match = re.search(pattern, body)
if match:
name = match.group(1).strip()
# GitHub issue forms show "_No response_" for empty optional fields
if name and name != "_No response_":
return name
return None
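# Illustrative sketch of the issue-form layout this parser expects
# (hypothetical body text, not a real submission):
def _example_issue_body_parsing() -> None:
    body = (
        "### Contributor Name\n\nJane Doe\n\n"
        "### Submission JSON\n\n"
        "```json\n"
        '{"registration_number": "N12345"}\n'
        "```\n"
    )
    print(extract_contributor_name_from_issue_body(body))  # Jane Doe
    print(extract_json_from_issue_body(body))  # {"registration_number": "N12345"}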
def parse_and_validate(json_str: str, schema: dict | None = None) -> tuple[list | dict | None, list[str]]:
"""
Parse JSON string and validate against schema.
Args:
json_str: JSON string to parse
schema: Optional schema dict
Returns:
Tuple of (parsed data or None, list of errors)
"""
try:
data = json.loads(json_str)
except json.JSONDecodeError as e:
return None, [f"Invalid JSON: {e}"]
errors = validate_submission(data, schema)
return data, errors
+154
@@ -0,0 +1,154 @@
#!/usr/bin/env python3
"""
Update the schema with tag type definitions from existing submissions.
This script reads all community submissions and updates the existing
schemas/community_submission.v1.schema.json in place with explicit type
definitions for all known tags; only the tag property definitions are
rewritten and the rest of the schema is preserved.
Usage:
python -m src.contributions.update_schema
python -m src.contributions.update_schema --check # Check if update needed
"""
import argparse
import json
import sys
from pathlib import Path
from .read_community_data import read_all_submissions, build_tag_type_registry
from .schema import SCHEMAS_DIR, get_latest_schema_version, get_schema_path, load_schema
def get_existing_tag_definitions(schema: dict) -> dict[str, dict]:
"""Extract existing tag property definitions from schema."""
tags_props = schema.get("properties", {}).get("tags", {}).get("properties", {})
return tags_props
def type_name_to_json_schema(type_name: str) -> dict:
"""Convert a type name to a JSON Schema type definition."""
type_map = {
"string": {"type": "string"},
"integer": {"type": "integer"},
"number": {"type": "number"},
"boolean": {"type": "boolean"},
"null": {"type": "null"},
"array": {"type": "array", "items": {"$ref": "#/$defs/tagScalar"}},
"object": {"type": "object", "additionalProperties": {"$ref": "#/$defs/tagScalar"}},
}
return type_map.get(type_name, {"$ref": "#/$defs/tagValue"})
def generate_updated_schema(base_schema: dict, tag_registry: dict[str, str]) -> dict:
"""
Generate an updated schema with explicit tag definitions.
Args:
base_schema: The current schema to update
tag_registry: Dict mapping tag name to type name
Returns:
Updated schema dict
"""
schema = json.loads(json.dumps(base_schema)) # Deep copy
# Build tag properties with explicit types
tag_properties = {}
for tag_name, type_name in sorted(tag_registry.items()):
tag_properties[tag_name] = type_name_to_json_schema(type_name)
# Only add/update the properties key within tags, preserve everything else
if "properties" in schema and "tags" in schema["properties"]:
schema["properties"]["tags"]["properties"] = tag_properties
return schema
def check_for_new_tags(tag_registry: dict[str, str], current_schema: dict) -> list[str]:
"""
Check which tags in the registry are not yet defined in the schema.
Returns:
List of new tag names
"""
existing_tags = get_existing_tag_definitions(current_schema)
return [tag for tag in tag_registry if tag not in existing_tags]
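# Illustrative sketch (made-up schema fragment): only properties.tags.properties
# is rewritten; everything else in the schema is preserved as-is.
def _example_schema_update() -> None:
    base = {
        "$schema": "https://json-schema.org/draft/2020-12/schema",
        "properties": {"tags": {"type": "object", "properties": {}}},
    }
    registry = {"year_built": "integer", "operator": "string"}
    print(check_for_new_tags(registry, base))  # ['year_built', 'operator']
    updated = generate_updated_schema(base, registry)
    # -> {'operator': {'type': 'string'}, 'year_built': {'type': 'integer'}}
    print(updated["properties"]["tags"]["properties"])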
def update_schema_file(
tag_registry: dict[str, str],
check_only: bool = False
) -> tuple[bool, list[str]]:
"""
Update the v1 schema file with new tag definitions.
Args:
tag_registry: Dict mapping tag name to type name
check_only: If True, only check if update is needed without writing
Returns:
Tuple of (was_updated, list_of_new_tags)
"""
current_schema = load_schema()
# Find new tags
new_tags = check_for_new_tags(tag_registry, current_schema)
if not new_tags:
return False, []
if check_only:
return True, new_tags
# Generate and write updated schema (in place)
updated_schema = generate_updated_schema(current_schema, tag_registry)
schema_path = get_schema_path()
with open(schema_path, "w") as f:
json.dump(updated_schema, f, indent=2)
f.write("\n")
return True, new_tags
def update_schema_from_submissions(check_only: bool = False) -> tuple[bool, list[str]]:
"""
Read all submissions and update the schema if needed.
Args:
check_only: If True, only check if update is needed without writing
Returns:
Tuple of (was_updated, list_of_new_tags)
"""
submissions = read_all_submissions()
tag_registry = build_tag_type_registry(submissions)
return update_schema_file(tag_registry, check_only)
def main():
parser = argparse.ArgumentParser(description="Update schema with tag definitions")
parser.add_argument("--check", action="store_true", help="Check if update needed without writing")
args = parser.parse_args()
was_updated, new_tags = update_schema_from_submissions(check_only=args.check)
if args.check:
if was_updated:
print(f"Schema update needed. New tags: {', '.join(new_tags)}")
sys.exit(1)
else:
print("Schema is up to date")
sys.exit(0)
else:
if was_updated:
print(f"Updated {get_schema_path()}")
print(f"Added tags: {', '.join(new_tags)}")
else:
print("No update needed")
if __name__ == "__main__":
main()
+218
@@ -0,0 +1,218 @@
#!/usr/bin/env python3
"""
Validate a community submission from a GitHub issue.
This script is called by the GitHub Actions workflow to validate
submissions when issues are opened or edited.
Usage:
python -m src.contributions.validate_submission --issue-body "..."
python -m src.contributions.validate_submission --issue-body-file /path/to/body.txt
python -m src.contributions.validate_submission --file submission.json
echo '{"registration_number": "N12345"}' | python -m src.contributions.validate_submission --stdin
Environment variables (for GitHub Actions):
GITHUB_TOKEN: GitHub API token
GITHUB_REPOSITORY: owner/repo
ISSUE_NUMBER: Issue number to comment on
"""
import argparse
import json
import os
import sys
import urllib.request
import urllib.error
from .schema import extract_json_from_issue_body, parse_and_validate, load_schema
from .read_community_data import read_all_submissions, build_tag_type_registry, get_python_type_name
def github_api_request(method: str, endpoint: str, data: dict | None = None) -> dict:
"""Make a GitHub API request."""
token = os.environ.get("GITHUB_TOKEN")
repo = os.environ.get("GITHUB_REPOSITORY")
if not token or not repo:
raise EnvironmentError("GITHUB_TOKEN and GITHUB_REPOSITORY must be set")
url = f"https://api.github.com/repos/{repo}{endpoint}"
headers = {
"Authorization": f"token {token}",
"Accept": "application/vnd.github.v3+json",
"Content-Type": "application/json",
}
body = json.dumps(data).encode() if data else None
req = urllib.request.Request(url, data=body, headers=headers, method=method)
with urllib.request.urlopen(req) as response:
return json.loads(response.read())
def add_issue_comment(issue_number: int, body: str) -> None:
"""Add a comment to a GitHub issue."""
github_api_request("POST", f"/issues/{issue_number}/comments", {"body": body})
def add_issue_label(issue_number: int, label: str) -> None:
"""Add a label to a GitHub issue."""
github_api_request("POST", f"/issues/{issue_number}/labels", {"labels": [label]})
def remove_issue_label(issue_number: int, label: str) -> None:
"""Remove a label from a GitHub issue."""
try:
github_api_request("DELETE", f"/issues/{issue_number}/labels/{label}")
except urllib.error.HTTPError:
pass # Label might not exist
def validate_tag_consistency(data: dict | list, tag_registry: dict[str, str]) -> list[str]:
"""
Check that tag types in new submissions match existing tag types.
Args:
data: Single submission dict or list of submissions
tag_registry: Dict mapping tag name to expected type
Returns:
List of error messages. Empty list means validation passed.
"""
errors = []
submissions = data if isinstance(data, list) else [data]
for i, submission in enumerate(submissions):
prefix = f"[{i}] " if len(submissions) > 1 else ""
tags = submission.get("tags", {})
if not isinstance(tags, dict):
continue
for key, value in tags.items():
actual_type = get_python_type_name(value)
if key in tag_registry:
expected_type = tag_registry[key]
if actual_type != expected_type:
errors.append(
f"{prefix}tags.{key}: expected type '{expected_type}', got '{actual_type}'"
)
return errors
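# Illustrative sketch (made-up values): reusing an existing tag with a
# different JSON type is reported as a per-tag error.
def _example_tag_consistency() -> None:
    registry = {"year_built": "integer"}
    # -> ["tags.year_built: expected type 'integer', got 'string'"]
    print(validate_tag_consistency({"tags": {"year_built": "1998"}}, registry))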
def validate_and_report(json_str: str, issue_number: int | None = None) -> bool:
"""
Validate JSON and optionally report to GitHub issue.
Args:
json_str: JSON string to validate
issue_number: Optional issue number to comment on
Returns:
True if validation passed, False otherwise
"""
data, errors = parse_and_validate(json_str)
if errors:
error_list = "\n".join(f"- {e}" for e in errors)
message = f"❌ **Validation Failed**\n\n{error_list}\n\nPlease fix the errors and edit your submission."
print(message, file=sys.stderr)
if issue_number:
add_issue_comment(issue_number, message)
remove_issue_label(issue_number, "validated")
return False
# Check tag type consistency against existing submissions
if data is not None:
try:
existing_submissions = read_all_submissions()
tag_registry = build_tag_type_registry(existing_submissions)
tag_errors = validate_tag_consistency(data, tag_registry)
if tag_errors:
error_list = "\n".join(f"- {e}" for e in tag_errors)
message = (
f"❌ **Tag Type Mismatch**\n\n"
f"Your submission uses tags with types that don't match existing submissions:\n\n"
f"{error_list}\n\n"
f"Please use the same type as existing tags, or use a different tag name."
)
print(message, file=sys.stderr)
if issue_number:
add_issue_comment(issue_number, message)
remove_issue_label(issue_number, "validated")
return False
except Exception as e:
# Don't fail validation if we can't read existing submissions
print(f"Warning: Could not check tag consistency: {e}", file=sys.stderr)
count = len(data) if isinstance(data, list) else 1
message = f"✅ **Validation Passed**\n\n{count} submission(s) validated successfully against the schema.\n\nA maintainer can approve this submission by adding the `approved` label."
print(message)
if issue_number:
add_issue_comment(issue_number, message)
add_issue_label(issue_number, "validated")
return True
def main():
parser = argparse.ArgumentParser(description="Validate community submission JSON")
source_group = parser.add_mutually_exclusive_group(required=True)
source_group.add_argument("--issue-body", help="Issue body text containing JSON")
source_group.add_argument("--issue-body-file", help="File containing issue body text")
source_group.add_argument("--file", help="JSON file to validate")
source_group.add_argument("--stdin", action="store_true", help="Read JSON from stdin")
parser.add_argument("--issue-number", type=int, help="GitHub issue number to comment on")
args = parser.parse_args()
# Get JSON string
if args.issue_body:
json_str = extract_json_from_issue_body(args.issue_body)
if not json_str:
print("❌ Could not extract JSON from issue body", file=sys.stderr)
if args.issue_number:
add_issue_comment(
args.issue_number,
"❌ **Validation Failed**\n\nCould not extract JSON from submission. "
"Please ensure your JSON is in the 'Submission JSON' field wrapped in code blocks."
)
sys.exit(1)
elif args.issue_body_file:
with open(args.issue_body_file) as f:
issue_body = f.read()
json_str = extract_json_from_issue_body(issue_body)
if not json_str:
print("❌ Could not extract JSON from issue body", file=sys.stderr)
print(f"Issue body:\n{issue_body}", file=sys.stderr)
if args.issue_number:
add_issue_comment(
args.issue_number,
"❌ **Validation Failed**\n\nCould not extract JSON from submission. "
"Please ensure your JSON is in the 'Submission JSON' field."
)
sys.exit(1)
elif args.file:
with open(args.file) as f:
json_str = f.read()
else: # stdin
json_str = sys.stdin.read()
# Validate
success = validate_and_report(json_str, args.issue_number)
sys.exit(0 if success else 1)
if __name__ == "__main__":
main()
+84
@@ -0,0 +1,84 @@
from pathlib import Path
from datetime import datetime, timezone, timedelta
import sys
import polars as pl
# Add adsb directory to path
sys.path.insert(0, str(Path(__file__).parent / "adsb")) # TODO: Fix this hacky path manipulation
from adsb.compress_adsb_to_aircraft_data import (
load_historical_for_day,
concat_compressed_dfs,
get_latest_aircraft_adsb_csv_df,
)
if __name__ == '__main__':
# Get yesterday's date (data for the previous day)
day = datetime.now(timezone.utc) - timedelta(days=1)
# Find a day with complete data
max_attempts = 2 # Don't look back more than two days
for attempt in range(max_attempts):
date_str = day.strftime("%Y-%m-%d")
print(f"Processing ADS-B data for {date_str}")
print("Loading new ADS-B data...")
df_new = load_historical_for_day(day)
if df_new.height == 0:
day = day - timedelta(days=1)
continue
max_time = df_new['time'].max()
if max_time is not None:
# Handle timezone
max_time_dt = max_time
if hasattr(max_time_dt, 'replace'):
max_time_dt = max_time_dt.replace(tzinfo=timezone.utc)
end_of_day = day.replace(hour=23, minute=59, second=59, tzinfo=timezone.utc) - timedelta(minutes=5)
# Convert polars datetime to python datetime if needed
if isinstance(max_time_dt, datetime):
if max_time_dt.replace(tzinfo=timezone.utc) >= end_of_day:
break
else:
# max_time is not a Python datetime here (e.g. a date); apply the same 23:54:59 cutoff
if max_time >= day.replace(hour=23, minute=54, second=59):
break
print(f"WARNING: Latest data time is {max_time}, which is more than 5 minutes before end of day.")
day = day - timedelta(days=1)
else:
raise RuntimeError(f"Could not find complete data in the last {max_attempts} days")
try:
# Get the latest release data
print("Downloading latest ADS-B release...")
df_base, start_date_str = get_latest_aircraft_adsb_csv_df()
# Combine with historical data
print("Combining with historical data...")
df_combined = concat_compressed_dfs(df_base, df_new)
except Exception as e:
print(f"Error downloading latest ADS-B release: {e}")
df_combined = df_new
start_date_str = date_str
# Sort by time for consistent ordering
df_combined = df_combined.sort('time')
# Convert any list columns to strings for CSV compatibility
for col in df_combined.columns:
if df_combined[col].dtype == pl.List:
df_combined = df_combined.with_columns(
pl.col(col).list.join(",").alias(col)
)
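# For example (illustrative), a pl.List(pl.String) column holding
# ["ADSB", "MLAT"] is flattened to the plain string "ADSB,MLAT", which
# survives the CSV round trip.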
# Save the result
OUT_ROOT = Path("data/openairframes")
OUT_ROOT.mkdir(parents=True, exist_ok=True)
output_file = OUT_ROOT / f"openairframes_adsb_{start_date_str}_{date_str}.csv"
df_combined.write_csv(output_file)
print(f"Saved: {output_file}")
print(f"Total aircraft: {df_combined.height}")
+49
@@ -0,0 +1,49 @@
from pathlib import Path
from datetime import datetime, timezone, timedelta
import argparse
parser = argparse.ArgumentParser(description="Create daily FAA release")
parser.add_argument("--date", type=str, help="Date to process (YYYY-MM-DD format, default: today)")
args = parser.parse_args()
if args.date:
date_str = args.date
else:
date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
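# For example (hypothetical invocation), passing --date 2026-02-12 rebuilds
# that day's release instead of today's.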
out_dir = Path("data/faa_releasable")
out_dir.mkdir(parents=True, exist_ok=True)
zip_name = f"ReleasableAircraft_{date_str}.zip"
zip_path = out_dir / zip_name
if not zip_path.exists():
# URL and paths
url = "https://registry.faa.gov/database/ReleasableAircraft.zip"
from urllib.request import Request, urlopen
req = Request(
url,
headers={"User-Agent": "Mozilla/5.0"},
method="GET",
)
with urlopen(req, timeout=120) as r:
body = r.read()
zip_path.write_bytes(body)
OUT_ROOT = Path("data/openairframes")
OUT_ROOT.mkdir(parents=True, exist_ok=True)
from derive_from_faa_master_txt import convert_faa_master_txt_to_df, concat_faa_historical_df
from get_latest_release import get_latest_aircraft_faa_csv_df
df_new = convert_faa_master_txt_to_df(zip_path, date_str)
try:
df_base, start_date_str = get_latest_aircraft_faa_csv_df()
df_base = concat_faa_historical_df(df_base, df_new)
assert df_base['download_date'].is_monotonic_increasing, "download_date is not monotonically increasing"
except Exception as e:
print(f"No existing FAA release found, using only new data: {e}")
df_base = df_new
start_date_str = date_str
df_base.to_csv(OUT_ROOT / f"openairframes_faa_{start_date_str}_{date_str}.csv", index=False)
@@ -1,33 +0,0 @@
from pathlib import Path
from datetime import datetime, timezone
date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
out_dir = Path("data/faa_releasable")
out_dir.mkdir(parents=True, exist_ok=True)
zip_name = f"ReleasableAircraft_{date_str}.zip"
zip_path = out_dir / zip_name
if not zip_path.exists():
# URL and paths
url = "https://registry.faa.gov/database/ReleasableAircraft.zip"
from urllib.request import Request, urlopen
req = Request(
url,
headers={"User-Agent": "Mozilla/5.0"},
method="GET",
)
with urlopen(req, timeout=120) as r:
body = r.read()
zip_path.write_bytes(body)
OUT_ROOT = Path("data/planequery_aircraft")
OUT_ROOT.mkdir(parents=True, exist_ok=True)
from derive_from_faa_master_txt import convert_faa_master_txt_to_df, concat_faa_historical_df
from get_latest_planequery_aircraft_release import get_latest_aircraft_csv_df
df_new = convert_faa_master_txt_to_df(zip_path, date_str)
df_base, start_date_str = get_latest_aircraft_csv_df()
df_base = concat_faa_historical_df(df_base, df_new)
assert df_base['download_date'].is_monotonic_increasing, "download_date is not monotonic increasing"
df_base.to_csv(OUT_ROOT / f"planequery_aircraft_{start_date_str}_{date_str}.csv", index=False)
+5 -5
@@ -29,8 +29,8 @@ def convert_faa_master_txt_to_df(zip_path: Path, date: str):
certification = pd.json_normalize(df["certification"].where(df["certification"].notna(), {})).add_prefix("certificate_")
df = df.drop(columns="certification").join(certification)
# Create planequery_airframe_id
df["planequery_airframe_id"] = (
# Create openairframes_id
df["openairframes_id"] = (
normalize(df["aircraft_manufacturer"])
+ "|"
+ normalize(df["aircraft_model"])
@@ -38,11 +38,11 @@ def convert_faa_master_txt_to_df(zip_path: Path, date: str):
+ normalize(df["serial_number"])
)
# Move planequery_airframe_id to come after registration_number
# Move openairframes_id to come after registration_number
cols = df.columns.tolist()
cols.remove("planequery_airframe_id")
cols.remove("openairframes_id")
reg_idx = cols.index("registration_number")
cols.insert(reg_idx + 1, "planequery_airframe_id")
cols.insert(reg_idx + 1, "openairframes_id")
df = df[cols]
# Convert all NaN to empty strings
-116
@@ -1,116 +0,0 @@
"""
For each commit-day in Feb 2024 (last commit per day):
- Write ALL FAA text files from that commit into: data/faa_releasable_historical/YYYY-MM-DD/
ACFTREF.txt, DEALER.txt, DOCINDEX.txt, ENGINE.txt, RESERVED.txt
- Recombine MASTER-*.txt into Master.txt
- Produce Master.csv via convert_faa_master_txt_to_csv
Assumes the non-master files are present in every commit.
"""
import subprocess, re
from pathlib import Path
import shutil
from collections import OrderedDict
from derive_from_faa_master_txt import convert_faa_master_txt_to_df, concat_faa_historical_df
import zipfile
import pandas as pd
import argparse
from datetime import datetime, timedelta
# Parse command line arguments
parser = argparse.ArgumentParser(description="Process historical FAA data from git commits")
parser.add_argument("since", help="Start date (YYYY-MM-DD)")
parser.add_argument("until", help="End date (YYYY-MM-DD)")
args = parser.parse_args()
# Clone repository if it doesn't exist
REPO = Path("data/scrape-faa-releasable-aircraft")
OUT_ROOT = Path("data/faa_releasable_historical")
OUT_ROOT.mkdir(parents=True, exist_ok=True)
def run_git_text(*args: str) -> str:
return subprocess.check_output(["git", "-C", str(REPO), *args], text=True).strip()
def run_git_bytes(*args: str) -> bytes:
return subprocess.check_output(["git", "-C", str(REPO), *args])
# Parse dates and adjust --since to the day before
since_date = datetime.strptime(args.since, "%Y-%m-%d")
adjusted_since = (since_date - timedelta(days=1)).strftime("%Y-%m-%d")
# All commits in specified date range (oldest -> newest)
log = run_git_text(
"log",
"--reverse",
"--format=%H %cs",
f"--since={adjusted_since}",
f"--until={args.until}",
)
lines = [ln for ln in log.splitlines() if ln.strip()]
if not lines:
raise SystemExit(f"No commits found between {args.since} and {args.until}.")
# date -> last SHA that day
date_to_sha = OrderedDict()
for ln in lines:
sha, date = ln.split()
date_to_sha[date] = sha
OTHER_FILES = ["ACFTREF.txt", "DEALER.txt", "DOCINDEX.txt", "ENGINE.txt", "RESERVED.txt"]
master_re = re.compile(r"^MASTER-(\d+)\.txt$")
df_base = pd.DataFrame()
start_date = None
end_date = None
for date, sha in date_to_sha.items():
if start_date is None:
start_date = date
end_date = date
day_dir = OUT_ROOT / date
day_dir.mkdir(parents=True, exist_ok=True)
# Write auxiliary files (assumed present)
for fname in OTHER_FILES:
(day_dir / fname).write_bytes(run_git_bytes("show", f"{sha}:{fname}"))
# Recombine MASTER parts
names = run_git_text("ls-tree", "--name-only", sha).splitlines()
parts = []
for n in names:
m = master_re.match(n)
if m:
parts.append((int(m.group(1)), n))
parts.sort()
if not parts:
raise RuntimeError(f"{date} {sha[:7]}: no MASTER-*.txt parts found")
master_path = day_dir / "MASTER.txt"
with master_path.open("wb") as w:
for _, fname in parts:
data = run_git_bytes("show", f"{sha}:{fname}")
w.write(data)
if data and not data.endswith(b"\n"):
w.write(b"\n")
# 3) Zip the day's files
zip_path = day_dir / f"ReleasableAircraft.zip"
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as z:
for p in day_dir.iterdir():
z.write(p, arcname=p.name)
print(f"{date} {sha[:7]} -> {day_dir} (master parts: {len(parts)})")
# 4) Convert ZIP -> CSV
df_new = convert_faa_master_txt_to_df(zip_path, date)
if df_base.empty:
df_base = df_new
print(len(df_base), "total entries so far")
# Delete all files in the day directory
shutil.rmtree(day_dir)
continue
df_base = concat_faa_historical_df(df_base, df_new)
shutil.rmtree(day_dir)
print(len(df_base), "total entries so far")
assert df_base['download_date'].is_monotonic_increasing, "download_date is not monotonic increasing"
df_base.to_csv(OUT_ROOT / f"planequery_aircraft_{start_date}_{end_date}.csv", index=False)
# TODO: get average number of new rows per day.
@@ -9,7 +9,7 @@ import urllib.error
import json
REPO = "PlaneQuery/planequery-aircraft"
REPO = "PlaneQuery/openairframes"
LATEST_RELEASE_URL = f"https://api.github.com/repos/{REPO}/releases/latest"
@@ -31,7 +31,7 @@ def get_latest_release_assets(repo: str = REPO, github_token: Optional[str] = No
url = f"https://api.github.com/repos/{repo}/releases/latest"
headers = {
"Accept": "application/vnd.github+json",
"User-Agent": "planequery-aircraft-downloader/1.0",
"User-Agent": "openairframes-downloader/1.0",
}
if github_token:
headers["Authorization"] = f"Bearer {github_token}"
@@ -80,7 +80,7 @@ def download_asset(asset: ReleaseAsset, out_path: Path, github_token: Optional[s
out_path.parent.mkdir(parents=True, exist_ok=True)
headers = {
"User-Agent": "planequery-aircraft-downloader/1.0",
"User-Agent": "openairframes-downloader/1.0",
"Accept": "application/octet-stream",
}
if github_token:
@@ -109,7 +109,7 @@ def download_latest_aircraft_csv(
repo: str = REPO,
) -> Path:
"""
Download the latest planequery_aircraft_*.csv file from the latest GitHub release.
Download the latest openairframes_faa_*.csv file from the latest GitHub release.
Args:
output_dir: Directory to save the downloaded file (default: "downloads")
@@ -120,25 +120,70 @@ def download_latest_aircraft_csv(
Path to the downloaded file
"""
assets = get_latest_release_assets(repo, github_token=github_token)
asset = pick_asset(assets, name_regex=r"^planequery_aircraft_.*\.csv$")
try:
asset = pick_asset(assets, name_regex=r"^openairframes_faa_.*\.csv$")
except FileNotFoundError:
# Fallback to old naming pattern
asset = pick_asset(assets, name_regex=r"^openairframes_\d{4}-\d{2}-\d{2}_.*\.csv$")
saved_to = download_asset(asset, output_dir / asset.name, github_token=github_token)
print(f"Downloaded: {asset.name} ({asset.size} bytes) -> {saved_to}")
return saved_to
def get_latest_aircraft_csv_df():
def get_latest_aircraft_faa_csv_df():
csv_path = download_latest_aircraft_csv()
import pandas as pd
df = pd.read_csv(csv_path, dtype={'transponder_code': str,
'unique_regulatory_id': str,
'registrant_county': str})
df = df.fillna("")
# Extract date from filename pattern: planequery_aircraft_{date}_{date}.csv
match = re.search(r"planequery_aircraft_(\d{4}-\d{2}-\d{2})_", str(csv_path))
# Extract start date from filename pattern: openairframes_faa_{start_date}_{end_date}.csv
match = re.search(r"openairframes_faa_(\d{4}-\d{2}-\d{2})_", str(csv_path))
if not match:
# Fallback to old naming pattern: openairframes_{start_date}_{end_date}.csv
match = re.search(r"openairframes_(\d{4}-\d{2}-\d{2})_", str(csv_path))
if not match:
raise ValueError(f"Could not extract date from filename: {csv_path.name}")
date_str = match.group(1)
return df, date_str
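# For example, "openairframes_faa_2015-03-01_2026-02-13.csv" (a made-up name)
# yields the start date "2015-03-01"; the second pattern only matches the
# legacy "openairframes_<start>_<end>.csv" naming.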
def download_latest_aircraft_adsb_csv(
output_dir: Path = Path("downloads"),
github_token: Optional[str] = None,
repo: str = REPO,
) -> Path:
"""
Download the latest openairframes_adsb_*.csv file from the latest GitHub release.
Args:
output_dir: Directory to save the downloaded file (default: "downloads")
github_token: Optional GitHub token for authentication
repo: GitHub repository in format "owner/repo" (default: REPO)
Returns:
Path to the downloaded file
"""
assets = get_latest_release_assets(repo, github_token=github_token)
asset = pick_asset(assets, name_regex=r"^openairframes_adsb_.*\.csv$")
saved_to = download_asset(asset, output_dir / asset.name, github_token=github_token)
print(f"Downloaded: {asset.name} ({asset.size} bytes) -> {saved_to}")
return saved_to
def get_latest_aircraft_adsb_csv_df():
csv_path = download_latest_aircraft_adsb_csv()
import pandas as pd
df = pd.read_csv(csv_path)
df = df.fillna("")
# Extract start date from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv
match = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path))
if not match:
raise ValueError(f"Could not extract date from filename: {csv_path.name}")
date_str = match.group(1)
return df, date_str
if __name__ == "__main__":
download_latest_aircraft_csv()