Compare commits


1 commit

Author: github-actions[bot]
SHA1: a2880904e5
Message: Community submission (rebased on main)
Date: 2026-02-18 22:19:18 +00:00
17 changed files with 105 additions and 614 deletions
@@ -49,38 +49,11 @@ jobs:
python -m src.adsb.download_and_list_icaos --date "$DATE"
ls -lah data/output/adsb_archives/"$DATE" || true
- name: Upload archive part 0
- name: Upload archives
uses: actions/upload-artifact@v4
with:
name: adsb-archive-${{ inputs.date }}-part-0
path: data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_0.tar.gz
retention-days: 1
compression-level: 0
if-no-files-found: error
- name: Upload archive part 1
uses: actions/upload-artifact@v4
with:
name: adsb-archive-${{ inputs.date }}-part-1
path: data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_1.tar.gz
retention-days: 1
compression-level: 0
if-no-files-found: error
- name: Upload archive part 2
uses: actions/upload-artifact@v4
with:
name: adsb-archive-${{ inputs.date }}-part-2
path: data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_2.tar.gz
retention-days: 1
compression-level: 0
if-no-files-found: error
- name: Upload archive part 3
uses: actions/upload-artifact@v4
with:
name: adsb-archive-${{ inputs.date }}-part-3
path: data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_3.tar.gz
name: adsb-archives-${{ inputs.date }}
path: data/output/adsb_archives/${{ inputs.date }}
retention-days: 1
compression-level: 0
if-no-files-found: error
@@ -106,22 +79,12 @@ jobs:
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Download archive part
- name: Download archives
uses: actions/download-artifact@v4
with:
name: adsb-archive-${{ inputs.date }}-part-${{ matrix.part_id }}
name: adsb-archives-${{ inputs.date }}
path: data/output/adsb_archives/${{ inputs.date }}
- name: Verify archive
run: |
FILE="data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_${{ matrix.part_id }}.tar.gz"
ls -lah data/output/adsb_archives/${{ inputs.date }}/
if [ ! -f "$FILE" ]; then
echo "::error::Archive not found: $FILE"
exit 1
fi
echo "Verified: $(du -h "$FILE")"
- name: Process part
env:
DATE: ${{ inputs.date }}
@@ -177,6 +140,6 @@ jobs:
uses: actions/upload-artifact@v4
with:
name: openairframes_adsb-${{ inputs.date }}
path: data/output/openairframes_adsb_*
path: data/output/openairframes_adsb_${{ inputs.date }}*
retention-days: 30
if-no-files-found: error
@@ -101,51 +101,6 @@ jobs:
date: ${{ needs.resolve-dates.outputs.adsb_date }}
concat_with_latest_csv: true
adsb-reduce:
needs: [resolve-dates, adsb-to-aircraft]
if: always() && github.event_name != 'schedule' && needs.adsb-to-aircraft.result == 'failure'
runs-on: ubuntu-24.04-arm
steps:
- name: Checkout
uses: actions/checkout@v6
- name: Setup Python
uses: actions/setup-python@v6
with:
python-version: '3.12'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
- name: Download compressed outputs
uses: actions/download-artifact@v4
with:
pattern: adsb-compressed-${{ needs.resolve-dates.outputs.adsb_date }}-part-*
path: data/output/compressed/${{ needs.resolve-dates.outputs.adsb_date }}
merge-multiple: true
- name: Concatenate final outputs
env:
DATE: ${{ needs.resolve-dates.outputs.adsb_date }}
CONCAT_WITH_LATEST_CSV: true
run: |
EXTRA=""
if [ "$CONCAT_WITH_LATEST_CSV" = "true" ]; then
EXTRA="--concat_with_latest_csv"
fi
python -m src.adsb.concat_parquet_to_final --date "$DATE" $EXTRA
ls -lah data/output/ || true
- name: Upload final artifacts
uses: actions/upload-artifact@v4
with:
name: openairframes_adsb-${{ needs.resolve-dates.outputs.adsb_date }}
path: data/output/openairframes_adsb_*
retention-days: 30
if-no-files-found: error
build-community:
runs-on: ubuntu-latest
if: github.event_name != 'schedule'
@@ -233,13 +188,13 @@ jobs:
create-release:
runs-on: ubuntu-latest
needs: [resolve-dates, build-faa, adsb-to-aircraft, adsb-reduce, build-community, build-adsbexchange-json, build-mictronics-db]
needs: [resolve-dates, build-faa, adsb-to-aircraft, build-community, build-adsbexchange-json, build-mictronics-db]
if: github.event_name != 'schedule' && !cancelled()
steps:
- name: Check ADS-B workflow status
if: needs.adsb-to-aircraft.result != 'success' && needs.adsb-reduce.result != 'success'
- name: Check adsb-to-aircraft status
if: needs.adsb-to-aircraft.result != 'success'
run: |
echo "WARNING: ADS-B workflow failed (adsb-to-aircraft='${{ needs.adsb-to-aircraft.result }}', adsb-reduce='${{ needs.adsb-reduce.result }}'), will continue without ADS-B artifacts"
echo "WARNING: adsb-to-aircraft result was '${{ needs.adsb-to-aircraft.result }}', will continue without ADS-B artifacts"
- name: Checkout for gh CLI
uses: actions/checkout@v4
@@ -256,7 +211,7 @@ jobs:
- name: Download ADS-B artifacts
uses: actions/download-artifact@v5
if: needs.adsb-to-aircraft.result == 'success' || needs.adsb-reduce.result == 'success'
if: needs.adsb-to-aircraft.result == 'success'
continue-on-error: true
with:
name: openairframes_adsb-${{ needs.resolve-dates.outputs.adsb_date }}
@@ -311,11 +266,7 @@ jobs:
# Find files from artifacts using find (handles nested structures)
CSV_FILE_FAA=$(find artifacts/faa -name "openairframes_faa_*.csv" -type f 2>/dev/null | head -1)
# Prefer concatenated file (with date range) over single-day file
CSV_FILE_ADSB=$(find artifacts/adsb -name "openairframes_adsb_*_*.csv.gz" -type f 2>/dev/null | head -1)
if [ -z "$CSV_FILE_ADSB" ]; then
CSV_FILE_ADSB=$(find artifacts/adsb -name "openairframes_adsb_*.csv.gz" -type f 2>/dev/null | head -1)
fi
CSV_FILE_ADSB=$(find artifacts/adsb -name "openairframes_adsb_*.csv.gz" -type f 2>/dev/null | head -1)
CSV_FILE_COMMUNITY=$(find artifacts/community -name "openairframes_community_*.csv" -type f 2>/dev/null | head -1)
ZIP_FILE=$(find artifacts/faa -name "ReleasableAircraft_*.zip" -type f 2>/dev/null | head -1)
JSON_FILE_ADSBX=$(find artifacts/adsbexchange -name "basic-ac-db_*.json.gz" -type f 2>/dev/null | head -1)
+2 -10
@@ -16,19 +16,11 @@ A daily release is created at **06:00 UTC** and includes:
- **openairframes_community.csv**
All community submissions
- **openairframes_adsb.csv**
Airframes dataset derived from ADSB.lol network data. For each UTC day, a row is created for every ICAO address observed in that day's ADS-B messages, using registration data from [tar1090-db](https://github.com/wiedehopf/tar1090-db) (ADSBExchange & Mictronics).
Example Usage:
```python
import pandas as pd
url = "https://github.com/PlaneQuery/OpenAirframes/releases/download/openairframes-2026-03-18-main/openairframes_adsb_2024-01-01_2026-03-17.csv.gz" # 1GB
df = pd.read_csv(url)
df
```
![](docs/images/df_adsb_example_0.png)
- **openairframes_faa.csv**
All [FAA registration data](https://www.faa.gov/licenses_certificates/aircraft_certification/aircraft_registry/releasable_aircraft_download) from 2023-08-16 to present (~260 MB); it can be loaded the same way as the ADS-B example (see the sketch after this list)
- **openairframes_adsb.csv**
Airframe information derived from ADS-B messages on the [ADSB.lol](https://www.adsb.lol/) network, from 2026-02-12 to present (will be from 2024-01-01 soon). The airframe information originates from [mictronics aircraft database](https://www.mictronics.de/aircraft-database/) (~5 MB).
- **ReleasableAircraft_{date}.zip**
A daily snapshot of the FAA database, which updates at **05:30 UTC**
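The FAA CSV can be loaded the same way as the ADS-B example above. A minimal pandas sketch follows; the release tag and asset filename in the URL are illustrative placeholders, not guaranteed asset names:

```python
import pandas as pd

# Illustrative URL: substitute the tag and asset name from the release you want.
url = (
    "https://github.com/PlaneQuery/OpenAirframes/releases/download/"
    "openairframes-2026-03-18-main/openairframes_faa_2026-03-18.csv"
)

# ~260 MB; pandas reads it straight from the URL.
df_faa = pd.read_csv(url, low_memory=False)
print(df_faa.shape)
print(df_faa.columns.tolist())
```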
@@ -0,0 +1,21 @@
[
{
"contributor_name": "applesauce123",
"contributor_uuid": "2981c3ee-8712-5f96-84bf-732eda515a3f",
"creation_timestamp": "2026-02-13T16:58:21.863525+00:00",
"registration_number": "N12345",
"tags": {
"internet": "starlink"
}
},
{
"contributor_name": "applesauce123",
"contributor_uuid": "2981c3ee-8712-5f96-84bf-732eda515a3f",
"creation_timestamp": "2026-02-13T16:58:21.863525+00:00",
"tags": {
"internet": "viasat",
"owner": "John Doe"
},
"transponder_code_hex": "ABC123"
}
]
Binary file not shown (image; before: 99 KiB).
@@ -61,6 +61,9 @@
"icao_aircraft_type": {
"type": "string"
},
"internet": {
"type": "string"
},
"manufacturer_icao": {
"type": "string"
},
@@ -79,6 +82,9 @@
"operator_icao": {
"type": "string"
},
"owner": {
"type": "string"
},
"serial_number": {
"type": "string"
},
-6
@@ -23,12 +23,6 @@ gh run list \
"repos/$REPO/actions/runs/$run_id/artifacts" \
--jq '.artifacts[] | select(.name | test("^openairframes_adsb-[0-9]{4}-[0-9]{2}-[0-9]{2}-[0-9]{4}-[0-9]{2}-[0-9]{2}$")) | .name' | while read -r artifact_name; do
# Check if artifact directory already exists and has files
if [ -d "downloads/adsb_artifacts/$artifact_name" ] && [ -n "$(ls -A "downloads/adsb_artifacts/$artifact_name" 2>/dev/null)" ]; then
echo " Skipping (already exists): $artifact_name"
continue
fi
echo " Downloading: $artifact_name"
gh run download "$run_id" \
--repo "$REPO" \
+1 -1
@@ -194,7 +194,7 @@ def main():
if triggered_runs and not args.dry_run:
import json
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
runs_file = f"./output/triggered_runs_{timestamp}.json"
runs_file = f"./triggered_runs_{timestamp}.json"
with open(runs_file, 'w') as f:
json.dump({
'start_date': args.start_date,
-242
@@ -1,242 +0,0 @@
#!/usr/bin/env python3
"""
Parse TheAirTraffic Database CSV and produce community_submission.v1 JSON.
Source: "TheAirTraffic Database - Aircraft 2.csv"
Output: community/YYYY-MM-DD/theairtraffic_<date>_<hash>.json
Categories in the spreadsheet columns (paired: name, registrations, separator):
Col 1-3: Business
Col 4-6: Government
Col 7-9: People
Col 10-12: Sports
Col 13-15: Celebrity
Col 16-18: State Govt./Law
Col 19-21: Other
Col 22-24: Test Aircraft
Col 25-27: YouTubers
Col 28-30: Formula 1 VIP's
Col 31-33: Active GII's and GIII's (test/demo aircraft)
Col 34-37: Russia & Ukraine (extra col for old/new)
Col 38-40: Helicopters & Blimps
Col 41-43: Unique Reg's
Col 44-46: Saudi & UAE
Col 47-49: Schools
Col 50-52: Special Charter
Col 53-55: Unknown Owners
Col 56-59: Frequent Flyers (extra cols: name, aircraft, logged, hours)
"""
import csv
import json
import hashlib
import re
import sys
import uuid
from datetime import datetime, timezone
from pathlib import Path
# ── Category mapping ────────────────────────────────────────────────────────
# Each entry: (name_col, reg_col, owner_category_tags)
# owner_category_tags is a dict of tag keys to add beyond "owner"
CATEGORY_COLUMNS = [
# (name_col, reg_col, {tag_key: tag_value, ...})
(1, 2, {"owner_category_0": "business"}),
(4, 5, {"owner_category_0": "government"}),
(7, 8, {"owner_category_0": "celebrity"}),
(10, 11, {"owner_category_0": "sports"}),
(13, 14, {"owner_category_0": "celebrity"}),
(16, 17, {"owner_category_0": "government", "owner_category_1": "law_enforcement"}),
(19, 20, {"owner_category_0": "other"}),
(22, 23, {"owner_category_0": "test_aircraft"}),
(25, 26, {"owner_category_0": "youtuber", "owner_category_1": "celebrity"}),
(28, 29, {"owner_category_0": "celebrity", "owner_category_1": "motorsport"}),
(31, 32, {"owner_category_0": "test_aircraft"}),
# Russia & Ukraine: col 34=name, col 35 or 36 may have reg
(34, 35, {"owner_category_0": "russia_ukraine"}),
(38, 39, {"owner_category_0": "celebrity", "category": "helicopter_or_blimp"}),
(41, 42, {"owner_category_0": "other"}),
(44, 45, {"owner_category_0": "government", "owner_category_1": "royal_family"}),
(47, 48, {"owner_category_0": "education"}),
(50, 51, {"owner_category_0": "charter"}),
(53, 54, {"owner_category_0": "unknown"}),
(56, 57, {"owner_category_0": "celebrity"}), # Frequent Flyers name col, aircraft col
]
# First data row index (0-based) in the CSV
DATA_START_ROW = 4
# ── Contributor info ────────────────────────────────────────────────────────
CONTRIBUTOR_NAME = "TheAirTraffic"
# Deterministic UUID v5 from contributor name
CONTRIBUTOR_UUID = str(uuid.uuid5(uuid.NAMESPACE_URL, "https://theairtraffic.com"))
# Citation
CITATION = "https://docs.google.com/spreadsheets/d/1JHhfJBnJPNBA6TgiSHjkXFkHBdVTTz_nXxaUDRWcHpk"
def looks_like_military_serial(reg: str) -> bool:
"""
Detect military-style serials like 92-9000, 82-8000, 98-0001
or pure numeric IDs like 929000, 828000, 980001.
These aren't standard civil registrations; use openairframes_id.
"""
# Pattern: NN-NNNN
if re.match(r'^\d{2}-\d{4}$', reg):
return True
# Pure 6-digit numbers (likely ICAO hex or military mode-S)
if re.match(r'^\d{6}$', reg):
return True
# Short numeric-only (1-5 digits) like "01", "02", "676"
if re.match(r'^\d{1,5}$', reg):
return True
return False
def normalize_reg(raw: str) -> str:
"""Clean up a registration string."""
reg = raw.strip().rstrip(',').strip()
# Remove carriage returns and other whitespace
reg = reg.replace('\r', '').replace('\n', '').strip()
return reg
def parse_regs(cell_value: str) -> list[str]:
"""
Parse a cell that may contain one or many registrations,
separated by commas, possibly wrapped in quotes.
"""
if not cell_value or not cell_value.strip():
return []
# Some cells have ADS-B Exchange URLs; skip those
if 'globe.adsbexchange.com' in cell_value:
return []
if cell_value.strip() in ('.', ',', ''):
return []
results = []
# Split on comma
parts = cell_value.split(',')
for part in parts:
reg = normalize_reg(part)
if not reg:
continue
# Skip URLs, section labels, etc.
if reg.startswith('http') or reg.startswith('Link') or reg == 'Section 1':
continue
# Skip if it's just whitespace or dots
if reg in ('.', '..', '...'):
continue
results.append(reg)
return results
def make_submission(
reg: str,
owner: str,
category_tags: dict[str, str],
) -> dict:
"""Build a single community_submission.v1 object."""
entry: dict = {}
# Decide identifier field
if looks_like_military_serial(reg):
entry["openairframes_id"] = reg
else:
entry["registration_number"] = reg
# Tags
tags: dict = {
"citation_0": CITATION,
}
if owner:
tags["owner"] = owner.strip()
tags.update(category_tags)
entry["tags"] = tags
return entry
def main():
csv_path = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(
"/Users/jonahgoode/Downloads/TheAirTraffic Database - Aircraft 2.csv"
)
if not csv_path.exists():
print(f"ERROR: CSV not found at {csv_path}", file=sys.stderr)
sys.exit(1)
# Read CSV
with open(csv_path, 'r', encoding='utf-8-sig') as f:
reader = csv.reader(f)
rows = list(reader)
print(f"Read {len(rows)} rows from {csv_path.name}")
date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
submissions: list[dict] = []
seen: set[tuple] = set() # (reg, owner) dedup
for row_idx in range(DATA_START_ROW, len(rows)):
row = rows[row_idx]
if len(row) < 3:
continue
for name_col, reg_col, cat_tags in CATEGORY_COLUMNS:
if reg_col >= len(row) or name_col >= len(row):
continue
owner_raw = row[name_col].strip().rstrip(',').strip()
reg_raw = row[reg_col]
# Clean owner name
owner = owner_raw.replace('\r', '').replace('\n', '').strip()
if not owner or owner in ('.', ',', 'Section 1'):
continue
# Skip header-like values
if owner.startswith('http') or owner.startswith('Link '):
continue
regs = parse_regs(reg_raw)
if not regs:
# For Russia & Ukraine, try the next column too (col 35 might have old reg, col 36 new)
if name_col == 34 and reg_col + 1 < len(row):
regs = parse_regs(row[reg_col + 1])
for reg in regs:
key = (reg, owner)
if key in seen:
continue
seen.add(key)
submissions.append(make_submission(reg, owner, cat_tags))
print(f"Generated {len(submissions)} submissions")
# Write output
proj_root = Path(__file__).resolve().parent.parent
out_dir = proj_root / "community" / date_str
out_dir.mkdir(parents=True, exist_ok=True)
out_file = out_dir / f"theairtraffic_{date_str}.json"
with open(out_file, 'w', encoding='utf-8') as f:
json.dump(submissions, f, indent=2, ensure_ascii=False)
print(f"Written to {out_file}")
print(f"Sample entry:\n{json.dumps(submissions[0], indent=2)}")
# Quick stats
cats = {}
for s in submissions:
c = s['tags'].get('owner_category_0', 'NONE')
cats[c] = cats.get(c, 0) + 1
print("\nCategory breakdown:")
for c, n in sorted(cats.items(), key=lambda x: -x[1]):
print(f" {c}: {n}")
if __name__ == "__main__":
main()
-69
@@ -1,69 +0,0 @@
#!/usr/bin/env python3
"""Validate the generated theairtraffic JSON output."""
import json
import glob
import sys
# Find the latest output
files = sorted(glob.glob("community/2026-02-*/theairtraffic_*.json"))
if not files:
print("No output files found!")
sys.exit(1)
path = files[-1]
print(f"Validating: {path}")
with open(path) as f:
data = json.load(f)
print(f"Total entries: {len(data)}")
# Check military serial handling
mil = [d for d in data if "openairframes_id" in d]
print(f"\nEntries using openairframes_id: {len(mil)}")
for m in mil[:10]:
print(f" {m['openairframes_id']} -> owner: {m['tags'].get('owner','?')}")
# Check youtuber entries
yt = [d for d in data if d["tags"].get("owner_category_0") == "youtuber"]
print(f"\nYouTuber entries: {len(yt)}")
for y in yt[:5]:
reg = y.get("registration_number", y.get("openairframes_id"))
c0 = y["tags"].get("owner_category_0")
c1 = y["tags"].get("owner_category_1")
print(f" {reg} -> owner: {y['tags']['owner']}, cat0: {c0}, cat1: {c1}")
# Check US Govt / military
gov = [d for d in data if d["tags"].get("owner") == "United States of America 747/757"]
print(f"\nUSA 747/757 entries: {len(gov)}")
for g in gov:
oid = g.get("openairframes_id", g.get("registration_number"))
print(f" {oid}")
# Schema validation
issues = 0
for i, d in enumerate(data):
has_id = any(k in d for k in ["registration_number", "transponder_code_hex", "openairframes_id"])
if not has_id:
print(f" Entry {i}: no identifier!")
issues += 1
if "tags" not in d:
print(f" Entry {i}: no tags!")
issues += 1
# Check tag key format
for k in d.get("tags", {}):
import re
if not re.match(r"^[a-z][a-z0-9_]{0,63}$", k):
print(f" Entry {i}: invalid tag key '{k}'")
issues += 1
print(f"\nSchema issues: {issues}")
# Category breakdown
cats = {}
for s in data:
c = s["tags"].get("owner_category_0", "NONE")
cats[c] = cats.get(c, 0) + 1
print("\nCategory breakdown:")
for c, n in sorted(cats.items(), key=lambda x: -x[1]):
print(f" {c}: {n}")
+24 -41
@@ -1,7 +1,7 @@
from pathlib import Path
import polars as pl
import argparse
import os
OUTPUT_DIR = Path("./data/output")
CORRECT_ORDER_OF_COLUMNS = ["time", "icao", "r", "t", "dbFlags", "ownOp", "year", "desc", "aircraft_category"]
@@ -13,55 +13,38 @@ def main():
compressed_dir = OUTPUT_DIR / "compressed"
date_dir = compressed_dir / args.date
if not date_dir.is_dir():
raise FileNotFoundError(f"No date folder found: {date_dir}")
parquet_files = sorted(date_dir.glob("*.parquet"))
df = None
if parquet_files: # TODO: This logic could be updated slightly.
print(f"No parquet files found in {date_dir}")
if not parquet_files:
raise FileNotFoundError(f"No parquet files found in {date_dir}")
frames = [pl.read_parquet(p) for p in parquet_files]
df = pl.concat(frames, how="vertical", rechunk=True)
frames = [pl.read_parquet(p) for p in parquet_files]
df = pl.concat(frames, how="vertical", rechunk=True)
df = df.sort(["time", "icao"])
df = df.select(CORRECT_ORDER_OF_COLUMNS)
output_path = OUTPUT_DIR / f"openairframes_adsb_{args.date}.parquet"
print(f"Writing combined parquet to {output_path} with {df.height} rows")
df.write_parquet(output_path)
df = df.sort(["time", "icao"])
df = df.select(CORRECT_ORDER_OF_COLUMNS)
output_path = OUTPUT_DIR / f"openairframes_adsb_{args.date}.parquet"
print(f"Writing combined parquet to {output_path} with {df.height} rows")
df.write_parquet(output_path)
csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{args.date}.csv.gz"
print(f"Writing combined csv.gz to {csv_output_path} with {df.height} rows")
df.write_csv(csv_output_path, compression="gzip")
csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{args.date}.csv.gz"
print(f"Writing combined csv.gz to {csv_output_path} with {df.height} rows")
df.write_csv(csv_output_path, compression="gzip")
if args.concat_with_latest_csv:
print("Loading latest CSV from GitHub releases to concatenate with...")
from src.get_latest_release import get_latest_aircraft_adsb_csv_df
from datetime import datetime
df_latest_csv, csv_start_date, csv_end_date = get_latest_aircraft_adsb_csv_df()
# Compare dates: end_date is exclusive, so if csv_end_date > args.date,
# the latest CSV already includes this day's data
csv_end_dt = datetime.strptime(csv_end_date, "%Y-%m-%d")
args_dt = datetime.strptime(args.date, "%Y-%m-%d")
if df is None or csv_end_dt >= args_dt:
print(f"Latest CSV already includes data through {args.date} (end_date={csv_end_date} is exclusive)")
print("Writing latest CSV directly without concatenation to avoid duplicates")
os.makedirs(OUTPUT_DIR, exist_ok=True)
final_csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{csv_start_date}_{csv_end_date}.csv.gz"
df_latest_csv = df_latest_csv.select(CORRECT_ORDER_OF_COLUMNS)
df_latest_csv.write_csv(final_csv_output_path, compression="gzip")
else:
print(f"Concatenating latest CSV (through {csv_end_date}) with new data ({args.date})")
# Ensure column order matches before concatenating
df_latest_csv = df_latest_csv.select(CORRECT_ORDER_OF_COLUMNS)
from src.adsb.compress_adsb_to_aircraft_data import concat_compressed_dfs
df_final = concat_compressed_dfs(df_latest_csv, df)
df_final = df_final.select(CORRECT_ORDER_OF_COLUMNS)
final_csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{csv_start_date}_{args.date}.csv.gz"
df_final.write_csv(final_csv_output_path, compression="gzip")
print(f"Final CSV written to {final_csv_output_path}")
df_latest_csv, csv_date = get_latest_aircraft_adsb_csv_df()
# Ensure column order matches before concatenating
df_latest_csv = df_latest_csv.select(CORRECT_ORDER_OF_COLUMNS)
from src.adsb.compress_adsb_to_aircraft_data import concat_compressed_dfs
df_final = concat_compressed_dfs(df_latest_csv, df)
df_final = df_final.select(CORRECT_ORDER_OF_COLUMNS)
final_csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{csv_date}_{args.date}.csv.gz"
df_final.write_csv(final_csv_output_path, compression="gzip")
if __name__ == "__main__":
main()
+10 -58
@@ -129,32 +129,13 @@ def fetch_releases(version_date: str) -> list:
return releases
def download_asset(asset_url: str, file_path: str, expected_size: int | None = None) -> bool:
"""Download a single release asset with size verification.
Args:
asset_url: URL to download from
file_path: Local path to save to
expected_size: Expected file size in bytes (for verification)
Returns:
True if download succeeded and size matches (if provided), False otherwise
"""
def download_asset(asset_url: str, file_path: str) -> bool:
"""Download a single release asset."""
os.makedirs(os.path.dirname(file_path) or OUTPUT_DIR, exist_ok=True)
# Check if file exists and has correct size
if os.path.exists(file_path):
if expected_size is not None:
actual_size = os.path.getsize(file_path)
if actual_size == expected_size:
print(f"[SKIP] {file_path} already downloaded and verified ({actual_size} bytes).")
return True
else:
print(f"[WARN] {file_path} exists but size mismatch (expected {expected_size}, got {actual_size}). Re-downloading.")
os.remove(file_path)
else:
print(f"[SKIP] {file_path} already downloaded.")
return True
print(f"[SKIP] {file_path} already downloaded.")
return True
max_retries = 2
retry_delay = 30
@@ -172,21 +153,7 @@ def download_asset(asset_url: str, file_path: str, expected_size: int | None = N
if not chunk:
break
file.write(chunk)
# Verify file size if expected_size was provided
if expected_size is not None:
actual_size = os.path.getsize(file_path)
if actual_size != expected_size:
print(f"[ERROR] Size mismatch for {file_path}: expected {expected_size} bytes, got {actual_size} bytes")
os.remove(file_path)
if attempt < max_retries:
print(f"Waiting {retry_delay} seconds before retry")
time.sleep(retry_delay)
continue
return False
print(f"Saved {file_path} ({actual_size} bytes, verified)")
else:
print(f"Saved {file_path}")
print(f"Saved {file_path}")
return True
else:
print(f"Failed to download {asset_url}: {response.status} {response.msg}")
@@ -260,6 +227,7 @@ def extract_split_archive(file_paths: list, extract_dir: str) -> bool:
stdin=cat_proc.stdout,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
check=True
)
cat_proc.stdout.close()
cat_stderr = cat_proc.stderr.read().decode() if cat_proc.stderr else ""
@@ -268,24 +236,6 @@ def extract_split_archive(file_paths: list, extract_dir: str) -> bool:
if cat_stderr:
print(f"cat stderr: {cat_stderr}")
tar_stderr = result.stderr.decode() if result.stderr else ""
if result.returncode != 0:
# GNU tar exits non-zero for format issues that BSD tar silently
# tolerates (e.g. trailing junk after the last valid entry).
# Check whether files were actually extracted before giving up.
extracted_items = os.listdir(extract_dir)
if extracted_items:
print(f"[WARN] tar exited {result.returncode} but extracted "
f"{len(extracted_items)} items — treating as success")
if tar_stderr:
print(f"tar stderr: {tar_stderr}")
else:
print(f"Failed to extract split archive (tar exit {result.returncode})")
if tar_stderr:
print(f"tar stderr: {tar_stderr}")
shutil.rmtree(extract_dir, ignore_errors=True)
return False
print(f"Successfully extracted archive to {extract_dir}")
# Delete tar files immediately after extraction
@@ -302,9 +252,11 @@ def extract_split_archive(file_paths: list, extract_dir: str) -> bool:
print(f"Disk space after tar deletion: {free_gb:.1f}GB free")
return True
except Exception as e:
except subprocess.CalledProcessError as e:
stderr_output = e.stderr.decode() if e.stderr else ""
print(f"Failed to extract split archive: {e}")
shutil.rmtree(extract_dir, ignore_errors=True)
if stderr_output:
print(f"tar stderr: {stderr_output}")
return False
+1 -2
@@ -77,9 +77,8 @@ def download_and_extract(version_date: str) -> str | None:
for asset in use_assets:
asset_name = asset["name"]
asset_url = asset["browser_download_url"]
asset_size = asset.get("size") # Get expected file size
file_path = os.path.join(OUTPUT_DIR, asset_name)
if download_asset(asset_url, file_path, expected_size=asset_size):
if download_asset(asset_url, file_path):
downloaded_files.append(file_path)
if not downloaded_files:
+1 -10
@@ -123,16 +123,7 @@ def main():
print(f"Processing part {args.part_id} for {args.date}")
# Get specific archive file for this part
archive_dir = os.path.join(OUTPUT_DIR, "adsb_archives", args.date)
archive_path = os.path.join(archive_dir, f"{args.date}_part_{args.part_id}.tar.gz")
if not os.path.isfile(archive_path):
print(f"ERROR: Archive not found: {archive_path}")
if os.path.isdir(archive_dir):
print(f"Files in {archive_dir}: {os.listdir(archive_dir)}")
else:
print(f"Directory does not exist: {archive_dir}")
sys.exit(1)
archive_path = os.path.join(OUTPUT_DIR, "adsb_archives", args.date, f"{args.date}_part_{args.part_id}.tar.gz")
# Extract and collect trace files
trace_map = build_trace_file_map(archive_path)
+4 -15
@@ -246,20 +246,6 @@ def process_submission(
if schema_updated:
schema_note = f"\n**Schema Updated:** Added new tags: `{', '.join(new_tags)}`\n"
# Truncate JSON preview to stay under GitHub's 65536 char body limit
max_json_preview = 50000
if len(content_json) > max_json_preview:
# Show first few entries as a preview
preview_entries = submissions[:10]
preview_json = json.dumps(preview_entries, indent=2, sort_keys=True)
json_section = (
f"### Submissions (showing 10 of {len(submissions)})\n"
f"```json\n{preview_json}\n```\n\n"
f"*Full submission ({len(submissions)} entries, {len(content_json):,} chars) is in the committed file.*"
)
else:
json_section = f"### Submissions\n```json\n{content_json}\n```"
pr_body = f"""## Community Submission
Adds {len(submissions)} submission(s) from @{author_username}.
@@ -271,7 +257,10 @@ Closes #{issue_number}
---
{json_section}"""
### Submissions
```json
{content_json}
```"""
pr = create_pull_request(
title=f"Community submission: {filename}",
@@ -24,7 +24,7 @@ def read_all_submissions(community_dir: Path) -> list[dict]:
"""Read all JSON submissions from the community directory."""
all_submissions = []
for json_file in sorted(community_dir.glob("**/*.json")):
for json_file in sorted(community_dir.glob("*.json")):
try:
with open(json_file) as f:
data = json.load(f)
+22 -61
@@ -27,33 +27,6 @@ def _http_get_json(url: str, headers: dict[str, str]) -> dict:
return json.loads(data.decode("utf-8"))
def get_releases(repo: str = REPO, github_token: Optional[str] = None, per_page: int = 30) -> list[dict]:
"""Get a list of releases from the repository."""
url = f"https://api.github.com/repos/{repo}/releases?per_page={per_page}"
headers = {
"Accept": "application/vnd.github+json",
"User-Agent": "openairframes-downloader/1.0",
}
if github_token:
headers["Authorization"] = f"Bearer {github_token}"
return _http_get_json(url, headers=headers)
def get_release_assets_from_release_data(release_data: dict) -> list[ReleaseAsset]:
"""Extract assets from a release data dictionary."""
assets = []
for a in release_data.get("assets", []):
assets.append(
ReleaseAsset(
name=a["name"],
download_url=a["browser_download_url"],
size=int(a.get("size", 0)),
)
)
return assets
def get_latest_release_assets(repo: str = REPO, github_token: Optional[str] = None) -> list[ReleaseAsset]:
url = f"https://api.github.com/repos/{repo}/releases/latest"
headers = {
@@ -64,7 +37,16 @@ def get_latest_release_assets(repo: str = REPO, github_token: Optional[str] = No
headers["Authorization"] = f"Bearer {github_token}"
payload = _http_get_json(url, headers=headers)
return get_release_assets_from_release_data(payload)
assets = []
for a in payload.get("assets", []):
assets.append(
ReleaseAsset(
name=a["name"],
download_url=a["browser_download_url"],
size=int(a.get("size", 0)),
)
)
return assets
def pick_asset(
@@ -173,8 +155,7 @@ def download_latest_aircraft_adsb_csv(
repo: str = REPO,
) -> Path:
"""
Download the latest openairframes_adsb_*.csv file from GitHub releases.
If the latest release doesn't have the file, searches previous releases.
Download the latest openairframes_adsb_*.csv file from the latest GitHub release.
Args:
output_dir: Directory to save the downloaded file (default: "downloads")
@@ -185,33 +166,15 @@ def download_latest_aircraft_adsb_csv(
Path to the downloaded file
"""
output_dir = Path(output_dir)
# Get multiple releases
releases = get_releases(repo, github_token=github_token, per_page=30)
# Try each release until we find one with the matching asset
for release in releases:
assets = get_release_assets_from_release_data(release)
try:
asset = pick_asset(assets, name_regex=r"^openairframes_adsb_.*\.csv(\.gz)?$")
saved_to = download_asset(asset, output_dir / asset.name, github_token=github_token)
print(f"Downloaded: {asset.name} ({asset.size} bytes) -> {saved_to}")
return saved_to
except FileNotFoundError:
# This release doesn't have the matching asset, try the next one
continue
raise FileNotFoundError(
f"No release in the last 30 releases has an asset matching 'openairframes_adsb_.*\\.csv(\\.gz)?$'"
)
assets = get_latest_release_assets(repo, github_token=github_token)
asset = pick_asset(assets, name_regex=r"^openairframes_adsb_.*\.csv(\.gz)?$")
saved_to = download_asset(asset, output_dir / asset.name, github_token=github_token)
print(f"Downloaded: {asset.name} ({asset.size} bytes) -> {saved_to}")
return saved_to
import polars as pl
def get_latest_aircraft_adsb_csv_df():
"""Download and load the latest ADS-B CSV from GitHub releases.
Returns:
tuple: (df, start_date, end_date) where dates are in YYYY-MM-DD format
"""
"""Download and load the latest ADS-B CSV from GitHub releases."""
import re
csv_path = download_latest_aircraft_adsb_csv()
@@ -235,19 +198,17 @@ def get_latest_aircraft_adsb_csv_df():
if df[col].dtype == pl.Utf8:
df = df.with_columns(pl.col(col).fill_null(""))
# Extract start and end dates from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv[.gz]
match = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})\.csv", str(csv_path))
# Extract start date from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv[.gz]
match = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path))
if not match:
raise ValueError(f"Could not extract dates from filename: {csv_path.name}")
raise ValueError(f"Could not extract date from filename: {csv_path.name}")
start_date = match.group(1)
end_date = match.group(2)
date_str = match.group(1)
print(df.columns)
print(df.dtypes)
return df, start_date, end_date
return df, date_str
if __name__ == "__main__":
download_latest_aircraft_csv()
download_latest_aircraft_adsb_csv()