Merge pull request #35 from PlaneQuery/develop

update readme.md
2026-05-03 08:15:08 +02:00 · 2026-03-18 14:31:29 -04:00 · 2026-03-18 14:29:13 -04:00 · 2026-03-10 05:12:11 -04:00 · 2026-03-10 05:08:19 -04:00 · 2026-03-10 05:08:19 -04:00
11 changed files with 533 additions and 63 deletions
@@ -49,11 +49,38 @@ jobs:
          python -m src.adsb.download_and_list_icaos --date "$DATE"
          ls -lah data/output/adsb_archives/"$DATE" || true

-      - name: Upload archives
+      - name: Upload archive part 0
        uses: actions/upload-artifact@v4
        with:
-          name: adsb-archives-${{ inputs.date }}
-          path: data/output/adsb_archives/${{ inputs.date }}
+          name: adsb-archive-${{ inputs.date }}-part-0
+          path: data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_0.tar.gz
+          retention-days: 1
+          compression-level: 0
+          if-no-files-found: error
+
+      - name: Upload archive part 1
+        uses: actions/upload-artifact@v4
+        with:
+          name: adsb-archive-${{ inputs.date }}-part-1
+          path: data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_1.tar.gz
+          retention-days: 1
+          compression-level: 0
+          if-no-files-found: error
+
+      - name: Upload archive part 2
+        uses: actions/upload-artifact@v4
+        with:
+          name: adsb-archive-${{ inputs.date }}-part-2
+          path: data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_2.tar.gz
+          retention-days: 1
+          compression-level: 0
+          if-no-files-found: error
+
+      - name: Upload archive part 3
+        uses: actions/upload-artifact@v4
+        with:
+          name: adsb-archive-${{ inputs.date }}-part-3
+          path: data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_3.tar.gz
          retention-days: 1
          compression-level: 0
          if-no-files-found: error
@@ -79,12 +106,22 @@ jobs:
          python -m pip install --upgrade pip
          pip install -r requirements.txt

-      - name: Download archives
+      - name: Download archive part
        uses: actions/download-artifact@v4
        with:
-          name: adsb-archives-${{ inputs.date }}
+          name: adsb-archive-${{ inputs.date }}-part-${{ matrix.part_id }}
          path: data/output/adsb_archives/${{ inputs.date }}

+      - name: Verify archive
+        run: |
+          FILE="data/output/adsb_archives/${{ inputs.date }}/${{ inputs.date }}_part_${{ matrix.part_id }}.tar.gz"
+          ls -lah data/output/adsb_archives/${{ inputs.date }}/
+          if [ ! -f "$FILE" ]; then
+            echo "::error::Archive not found: $FILE"
+            exit 1
+          fi
+          echo "Verified: $(du -h "$FILE")"
+
      - name: Process part
        env:
          DATE: ${{ inputs.date }}
@@ -140,6 +177,6 @@ jobs:
        uses: actions/upload-artifact@v4
        with:
          name: openairframes_adsb-${{ inputs.date }}
-          path: data/output/openairframes_adsb_${{ inputs.date }}*
+          path: data/output/openairframes_adsb_*
          retention-days: 30
          if-no-files-found: error
@@ -101,6 +101,51 @@ jobs:
      date: ${{ needs.resolve-dates.outputs.adsb_date }}
      concat_with_latest_csv: true

+  adsb-reduce:
+    needs: [resolve-dates, adsb-to-aircraft]
+    if: always() && github.event_name != 'schedule' && needs.adsb-to-aircraft.result == 'failure'
+    runs-on: ubuntu-24.04-arm
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v6
+
+      - name: Setup Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: '3.12'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Download compressed outputs
+        uses: actions/download-artifact@v4
+        with:
+          pattern: adsb-compressed-${{ needs.resolve-dates.outputs.adsb_date }}-part-*
+          path: data/output/compressed/${{ needs.resolve-dates.outputs.adsb_date }}
+          merge-multiple: true
+
+      - name: Concatenate final outputs
+        env:
+          DATE: ${{ needs.resolve-dates.outputs.adsb_date }}
+          CONCAT_WITH_LATEST_CSV: true
+        run: |
+          EXTRA=""
+          if [ "$CONCAT_WITH_LATEST_CSV" = "true" ]; then
+            EXTRA="--concat_with_latest_csv"
+          fi
+          python -m src.adsb.concat_parquet_to_final --date "$DATE" $EXTRA
+          ls -lah data/output/ || true
+
+      - name: Upload final artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: openairframes_adsb-${{ needs.resolve-dates.outputs.adsb_date }}
+          path: data/output/openairframes_adsb_*
+          retention-days: 30
+          if-no-files-found: error
+
  build-community:
    runs-on: ubuntu-latest
    if: github.event_name != 'schedule'
@@ -188,13 +233,13 @@ jobs:

  create-release:
    runs-on: ubuntu-latest
-    needs: [resolve-dates, build-faa, adsb-to-aircraft, build-community, build-adsbexchange-json, build-mictronics-db]
+    needs: [resolve-dates, build-faa, adsb-to-aircraft, adsb-reduce, build-community, build-adsbexchange-json, build-mictronics-db]
    if: github.event_name != 'schedule' && !cancelled()
    steps:
-      - name: Check adsb-to-aircraft status
-        if: needs.adsb-to-aircraft.result != 'success'
+      - name: Check ADS-B workflow status
+        if: needs.adsb-to-aircraft.result != 'success' && needs.adsb-reduce.result != 'success'
        run: |
-          echo "WARNING: adsb-to-aircraft result was '${{ needs.adsb-to-aircraft.result }}', will continue without ADS-B artifacts"
+          echo "WARNING: ADS-B workflow failed (adsb-to-aircraft='${{ needs.adsb-to-aircraft.result }}', adsb-reduce='${{ needs.adsb-reduce.result }}'), will continue without ADS-B artifacts"

      - name: Checkout for gh CLI
        uses: actions/checkout@v4
@@ -211,7 +256,7 @@ jobs:

      - name: Download ADS-B artifacts
        uses: actions/download-artifact@v5
-        if: needs.adsb-to-aircraft.result == 'success'
+        if: needs.adsb-to-aircraft.result == 'success' || needs.adsb-reduce.result == 'success'
        continue-on-error: true
        with:
          name: openairframes_adsb-${{ needs.resolve-dates.outputs.adsb_date }}
@@ -266,7 +311,11 @@ jobs:
          
          # Find files from artifacts using find (handles nested structures)
          CSV_FILE_FAA=$(find artifacts/faa -name "openairframes_faa_*.csv" -type f 2>/dev/null | head -1)
-          CSV_FILE_ADSB=$(find artifacts/adsb -name "openairframes_adsb_*.csv.gz" -type f 2>/dev/null | head -1)
+          # Prefer concatenated file (with date range) over single-day file
+          CSV_FILE_ADSB=$(find artifacts/adsb -name "openairframes_adsb_*_*.csv.gz" -type f 2>/dev/null | head -1)
+          if [ -z "$CSV_FILE_ADSB" ]; then
+            CSV_FILE_ADSB=$(find artifacts/adsb -name "openairframes_adsb_*.csv.gz" -type f 2>/dev/null | head -1)
+          fi
          CSV_FILE_COMMUNITY=$(find artifacts/community -name "openairframes_community_*.csv" -type f 2>/dev/null | head -1)
          ZIP_FILE=$(find artifacts/faa -name "ReleasableAircraft_*.zip" -type f 2>/dev/null | head -1)
          JSON_FILE_ADSBX=$(find artifacts/adsbexchange -name "basic-ac-db_*.json.gz" -type f 2>/dev/null | head -1)
@@ -16,11 +16,19 @@ A daily release is created at **06:00 UTC** and includes:
 - **openairframes_community.csv**  
  All community submissions

+- **openairframes_adsb.csv**  
+  Airframes dataset derived from ADSB.lol network data. For each UTC day, a row is created for every icao observed in that day’s ADS-B messages, using registration data from [tar1090-db](https://github.com/wiedehopf/tar1090-db) (ADSBExchange & Mictronics).
+Example Usage:
+```python
+import pandas as pd
+url = "https://github.com/PlaneQuery/OpenAirframes/releases/download/openairframes-2026-03-18-main/openairframes_adsb_2024-01-01_2026-03-17.csv.gz" # 1GB
+df = pd.read_csv(url)
+df
+```
+![](docs/images/df_adsb_example_0.png)
 - **openairframes_faa.csv**  
  All [FAA registration data](https://www.faa.gov/licenses_certificates/aircraft_certification/aircraft_registry/releasable_aircraft_download) from 2023-08-16 to present (~260 MB)

- **openairframes_adsb.csv**  
-  Airframe information derived from ADS-B messages on the [ADSB.lol](https://www.adsb.lol/) network, from 2026-02-12 to present (will be from 2024-01-01 soon). The airframe information originates from [mictronics aircraft database](https://www.mictronics.de/aircraft-database/) (~5 MB).

 - **ReleasableAircraft_{date}.zip**  
  A daily snapshot of the FAA database, which updates at **05:30 UTC**
@@ -194,7 +194,7 @@ def main():
    if triggered_runs and not args.dry_run:
        import json
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
-        runs_file = f"./triggered_runs_{timestamp}.json"
+        runs_file = f"./output/triggered_runs_{timestamp}.json"
        with open(runs_file, 'w') as f:
            json.dump({
                'start_date': args.start_date,
@@ -0,0 +1,242 @@
+#!/usr/bin/env python3
+"""
+Parse TheAirTraffic Database CSV and produce community_submission.v1 JSON.
+
+Source: "TheAirTraffic Database - Aircraft 2.csv"
+Output: community/YYYY-MM-DD/theairtraffic_<date>_<hash>.json
+
+Categories in the spreadsheet columns (paired: name, registrations, separator):
+  Col  1-3:  Business
+  Col  4-6:  Government
+  Col  7-9:  People
+  Col 10-12: Sports
+  Col 13-15: Celebrity
+  Col 16-18: State Govt./Law
+  Col 19-21: Other
+  Col 22-24: Test Aircraft
+  Col 25-27: YouTubers
+  Col 28-30: Formula 1 VIP's
+  Col 31-33: Active GII's and GIII's  (test/demo aircraft)
+  Col 34-37: Russia & Ukraine          (extra col for old/new)
+  Col 38-40: Helicopters & Blimps
+  Col 41-43: Unique Reg's
+  Col 44-46: Saudi & UAE
+  Col 47-49: Schools
+  Col 50-52: Special Charter
+  Col 53-55: Unknown Owners
+  Col 56-59: Frequent Flyers           (extra cols: name, aircraft, logged, hours)
+"""
+
+import csv
+import json
+import hashlib
+import re
+import sys
+import uuid
+from datetime import datetime, timezone
+from pathlib import Path
+
+# ── Category mapping ────────────────────────────────────────────────────────
+# Each entry: (name_col, reg_col, owner_category_tags)
+# owner_category_tags is a dict of tag keys to add beyond "owner"
+CATEGORY_COLUMNS = [
+    # (name_col, reg_col, {tag_key: tag_value, ...})
+    (1,  2,  {"owner_category_0": "business"}),
+    (4,  5,  {"owner_category_0": "government"}),
+    (7,  8,  {"owner_category_0": "celebrity"}),
+    (10, 11, {"owner_category_0": "sports"}),
+    (13, 14, {"owner_category_0": "celebrity"}),
+    (16, 17, {"owner_category_0": "government", "owner_category_1": "law_enforcement"}),
+    (19, 20, {"owner_category_0": "other"}),
+    (22, 23, {"owner_category_0": "test_aircraft"}),
+    (25, 26, {"owner_category_0": "youtuber", "owner_category_1": "celebrity"}),
+    (28, 29, {"owner_category_0": "celebrity", "owner_category_1": "motorsport"}),
+    (31, 32, {"owner_category_0": "test_aircraft"}),
+    # Russia & Ukraine: col 34=name, col 35 or 36 may have reg
+    (34, 35, {"owner_category_0": "russia_ukraine"}),
+    (38, 39, {"owner_category_0": "celebrity", "category": "helicopter_or_blimp"}),
+    (41, 42, {"owner_category_0": "other"}),
+    (44, 45, {"owner_category_0": "government", "owner_category_1": "royal_family"}),
+    (47, 48, {"owner_category_0": "education"}),
+    (50, 51, {"owner_category_0": "charter"}),
+    (53, 54, {"owner_category_0": "unknown"}),
+    (56, 57, {"owner_category_0": "celebrity"}),   # Frequent Flyers name col, aircraft col
+]
+
+# First data row index (0-based) in the CSV
+DATA_START_ROW = 4
+
+# ── Contributor info ────────────────────────────────────────────────────────
+CONTRIBUTOR_NAME = "TheAirTraffic"
+# Deterministic UUID v5 from contributor name
+CONTRIBUTOR_UUID = str(uuid.uuid5(uuid.NAMESPACE_URL, "https://theairtraffic.com"))
+
+# Citation
+CITATION = "https://docs.google.com/spreadsheets/d/1JHhfJBnJPNBA6TgiSHjkXFkHBdVTTz_nXxaUDRWcHpk"
+
+
+def looks_like_military_serial(reg: str) -> bool:
+    """
+    Detect military-style serials like 92-9000, 82-8000, 98-0001
+    or pure numeric IDs like 929000, 828000, 980001.
+    These aren't standard civil registrations; use openairframes_id.
+    """
+    # Pattern: NN-NNNN
+    if re.match(r'^\d{2}-\d{4}$', reg):
+        return True
+    # Pure 6-digit numbers (likely ICAO hex or military mode-S)
+    if re.match(r'^\d{6}$', reg):
+        return True
+    # Short numeric-only (1-5 digits) like "01", "02", "676"
+    if re.match(r'^\d{1,5}$', reg):
+        return True
+    return False
+
+
+def normalize_reg(raw: str) -> str:
+    """Clean up a registration string."""
+    reg = raw.strip().rstrip(',').strip()
+    # Remove carriage returns and other whitespace
+    reg = reg.replace('\r', '').replace('\n', '').strip()
+    return reg
+
+
+def parse_regs(cell_value: str) -> list[str]:
+    """
+    Parse a cell that may contain one or many registrations,
+    separated by commas, possibly wrapped in quotes.
+    """
+    if not cell_value or not cell_value.strip():
+        return []
+
+    # Some cells have ADS-B exchange URLs – skip those
+    if 'globe.adsbexchange.com' in cell_value:
+        return []
+    if cell_value.strip() in ('.', ',', ''):
+        return []
+
+    results = []
+    # Split on comma
+    parts = cell_value.split(',')
+    for part in parts:
+        reg = normalize_reg(part)
+        if not reg:
+            continue
+        # Skip URLs, section labels, etc.
+        if reg.startswith('http') or reg.startswith('Link') or reg == 'Section 1':
+            continue
+        # Skip if it's just whitespace or dots
+        if reg in ('.', '..', '...'):
+            continue
+        results.append(reg)
+    return results
+
+
+def make_submission(
+    reg: str,
+    owner: str,
+    category_tags: dict[str, str],
+) -> dict:
+    """Build a single community_submission.v1 object."""
+
+    entry: dict = {}
+
+    # Decide identifier field
+    if looks_like_military_serial(reg):
+        entry["openairframes_id"] = reg
+    else:
+        entry["registration_number"] = reg
+
+    # Tags
+    tags: dict = {
+        "citation_0": CITATION,
+    }
+    if owner:
+        tags["owner"] = owner.strip()
+    tags.update(category_tags)
+    entry["tags"] = tags
+
+    return entry
+
+
+def main():
+    csv_path = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(
+        "/Users/jonahgoode/Downloads/TheAirTraffic Database - Aircraft 2.csv"
+    )
+
+    if not csv_path.exists():
+        print(f"ERROR: CSV not found at {csv_path}", file=sys.stderr)
+        sys.exit(1)
+
+    # Read CSV
+    with open(csv_path, 'r', encoding='utf-8-sig') as f:
+        reader = csv.reader(f)
+        rows = list(reader)
+
+    print(f"Read {len(rows)} rows from {csv_path.name}")
+
+    date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
+
+    submissions: list[dict] = []
+    seen: set[tuple] = set()  # (reg, owner) dedup
+
+    for row_idx in range(DATA_START_ROW, len(rows)):
+        row = rows[row_idx]
+        if len(row) < 3:
+            continue
+
+        for name_col, reg_col, cat_tags in CATEGORY_COLUMNS:
+            if reg_col >= len(row) or name_col >= len(row):
+                continue
+
+            owner_raw = row[name_col].strip().rstrip(',').strip()
+            reg_raw = row[reg_col]
+
+            # Clean owner name
+            owner = owner_raw.replace('\r', '').replace('\n', '').strip()
+            if not owner or owner in ('.', ',', 'Section 1'):
+                continue
+            # Skip header-like values
+            if owner.startswith('http') or owner.startswith('Link '):
+                continue
+
+            regs = parse_regs(reg_raw)
+            if not regs:
+                # For Russia & Ukraine, try the next column too (col 35 might have old reg, col 36 new)
+                if name_col == 34 and reg_col + 1 < len(row):
+                    regs = parse_regs(row[reg_col + 1])
+
+            for reg in regs:
+                key = (reg, owner)
+                if key in seen:
+                    continue
+                seen.add(key)
+                submissions.append(make_submission(reg, owner, cat_tags))
+
+    print(f"Generated {len(submissions)} submissions")
+
+    # Write output
+    proj_root = Path(__file__).resolve().parent.parent
+    out_dir = proj_root / "community" / date_str
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    out_file = out_dir / f"theairtraffic_{date_str}.json"
+
+    with open(out_file, 'w', encoding='utf-8') as f:
+        json.dump(submissions, f, indent=2, ensure_ascii=False)
+
+    print(f"Written to {out_file}")
+    print(f"Sample entry:\n{json.dumps(submissions[0], indent=2)}")
+
+    # Quick stats
+    cats = {}
+    for s in submissions:
+        c = s['tags'].get('owner_category_0', 'NONE')
+        cats[c] = cats.get(c, 0) + 1
+    print("\nCategory breakdown:")
+    for c, n in sorted(cats.items(), key=lambda x: -x[1]):
+        print(f"  {c}: {n}")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+"""Validate the generated theairtraffic JSON output."""
+import json
+import glob
+import sys
+
+# Find the latest output
+files = sorted(glob.glob("community/2026-02-*/theairtraffic_*.json"))
+if not files:
+    print("No output files found!")
+    sys.exit(1)
+
+path = files[-1]
+print(f"Validating: {path}")
+
+with open(path) as f:
+    data = json.load(f)
+
+print(f"Total entries: {len(data)}")
+
+# Check military serial handling
+mil = [d for d in data if "openairframes_id" in d]
+print(f"\nEntries using openairframes_id: {len(mil)}")
+for m in mil[:10]:
+    print(f"  {m['openairframes_id']} -> owner: {m['tags'].get('owner','?')}")
+
+# Check youtuber entries
+yt = [d for d in data if d["tags"].get("owner_category_0") == "youtuber"]
+print(f"\nYouTuber entries: {len(yt)}")
+for y in yt[:5]:
+    reg = y.get("registration_number", y.get("openairframes_id"))
+    c0 = y["tags"].get("owner_category_0")
+    c1 = y["tags"].get("owner_category_1")
+    print(f"  {reg} -> owner: {y['tags']['owner']}, cat0: {c0}, cat1: {c1}")
+
+# Check US Govt / military
+gov = [d for d in data if d["tags"].get("owner") == "United States of America 747/757"]
+print(f"\nUSA 747/757 entries: {len(gov)}")
+for g in gov:
+    oid = g.get("openairframes_id", g.get("registration_number"))
+    print(f"  {oid}")
+
+# Schema validation
+issues = 0
+for i, d in enumerate(data):
+    has_id = any(k in d for k in ["registration_number", "transponder_code_hex", "openairframes_id"])
+    if not has_id:
+        print(f"  Entry {i}: no identifier!")
+        issues += 1
+    if "tags" not in d:
+        print(f"  Entry {i}: no tags!")
+        issues += 1
+    # Check tag key format
+    for k in d.get("tags", {}):
+        import re
+        if not re.match(r"^[a-z][a-z0-9_]{0,63}$", k):
+            print(f"  Entry {i}: invalid tag key '{k}'")
+            issues += 1
+
+print(f"\nSchema issues: {issues}")
+
+# Category breakdown
+cats = {}
+for s in data:
+    c = s["tags"].get("owner_category_0", "NONE")
+    cats[c] = cats.get(c, 0) + 1
+print("\nCategory breakdown:")
+for c, n in sorted(cats.items(), key=lambda x: -x[1]):
+    print(f"  {c}: {n}")
@@ -1,7 +1,7 @@
 from pathlib import Path
 import polars as pl
 import argparse
-
+import os
 OUTPUT_DIR = Path("./data/output")
 CORRECT_ORDER_OF_COLUMNS = ["time", "icao", "r", "t", "dbFlags", "ownOp", "year", "desc", "aircraft_category"]

@@ -13,38 +13,55 @@ def main():

    compressed_dir = OUTPUT_DIR / "compressed"
    date_dir = compressed_dir / args.date
-    if not date_dir.is_dir():
-        raise FileNotFoundError(f"No date folder found: {date_dir}")

    parquet_files = sorted(date_dir.glob("*.parquet"))
-    if not parquet_files:
-        raise FileNotFoundError(f"No parquet files found in {date_dir}")
+    df = None
+    if parquet_files: # TODO: This logic could be updated slightly.
+        print(f"No parquet files found in {date_dir}")

-    frames = [pl.read_parquet(p) for p in parquet_files]
-    df = pl.concat(frames, how="vertical", rechunk=True)
+        frames = [pl.read_parquet(p) for p in parquet_files]
+        df = pl.concat(frames, how="vertical", rechunk=True)

-    df = df.sort(["time", "icao"])
-    df = df.select(CORRECT_ORDER_OF_COLUMNS)
-    
-    output_path = OUTPUT_DIR / f"openairframes_adsb_{args.date}.parquet"
-    print(f"Writing combined parquet to {output_path} with {df.height} rows")
-    df.write_parquet(output_path)
+        df = df.sort(["time", "icao"])
+        df = df.select(CORRECT_ORDER_OF_COLUMNS)
+        
+        output_path = OUTPUT_DIR / f"openairframes_adsb_{args.date}.parquet"
+        print(f"Writing combined parquet to {output_path} with {df.height} rows")
+        df.write_parquet(output_path)

-    csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{args.date}.csv.gz"
-    print(f"Writing combined csv.gz to {csv_output_path} with {df.height} rows")
-    df.write_csv(csv_output_path, compression="gzip")
+        csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{args.date}.csv.gz"
+        print(f"Writing combined csv.gz to {csv_output_path} with {df.height} rows")
+        df.write_csv(csv_output_path, compression="gzip")

    if args.concat_with_latest_csv:
        print("Loading latest CSV from GitHub releases to concatenate with...")
        from src.get_latest_release import get_latest_aircraft_adsb_csv_df
-        df_latest_csv, csv_date = get_latest_aircraft_adsb_csv_df()
-        # Ensure column order matches before concatenating
-        df_latest_csv = df_latest_csv.select(CORRECT_ORDER_OF_COLUMNS)
-        from src.adsb.compress_adsb_to_aircraft_data import concat_compressed_dfs
-        df_final = concat_compressed_dfs(df_latest_csv, df)
-        df_final = df_final.select(CORRECT_ORDER_OF_COLUMNS)
-        final_csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{csv_date}_{args.date}.csv.gz"
-        df_final.write_csv(final_csv_output_path, compression="gzip")
+        from datetime import datetime
+        
+        df_latest_csv, csv_start_date, csv_end_date = get_latest_aircraft_adsb_csv_df()
+        
+        # Compare dates: end_date is exclusive, so if csv_end_date > args.date, 
+        # the latest CSV already includes this day's data
+        csv_end_dt = datetime.strptime(csv_end_date, "%Y-%m-%d")
+        args_dt = datetime.strptime(args.date, "%Y-%m-%d")
+        
+        if df is None or csv_end_dt >= args_dt:
+            print(f"Latest CSV already includes data through {args.date} (end_date={csv_end_date} is exclusive)")
+            print("Writing latest CSV directly without concatenation to avoid duplicates")
+            os.makedirs(OUTPUT_DIR, exist_ok=True)
+            final_csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{csv_start_date}_{csv_end_date}.csv.gz"
+            df_latest_csv = df_latest_csv.select(CORRECT_ORDER_OF_COLUMNS)
+            df_latest_csv.write_csv(final_csv_output_path, compression="gzip")
+        else:
+            print(f"Concatenating latest CSV (through {csv_end_date}) with new data ({args.date})")
+            # Ensure column order matches before concatenating
+            df_latest_csv = df_latest_csv.select(CORRECT_ORDER_OF_COLUMNS)
+            from src.adsb.compress_adsb_to_aircraft_data import concat_compressed_dfs
+            df_final = concat_compressed_dfs(df_latest_csv, df)
+            df_final = df_final.select(CORRECT_ORDER_OF_COLUMNS)
+            final_csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{csv_start_date}_{args.date}.csv.gz"
+            df_final.write_csv(final_csv_output_path, compression="gzip")
+        print(f"Final CSV written to {final_csv_output_path}")

 if __name__ == "__main__":
    main()
@@ -123,7 +123,16 @@ def main():
    print(f"Processing part {args.part_id} for {args.date}")
    
    # Get specific archive file for this part
-    archive_path = os.path.join(OUTPUT_DIR, "adsb_archives", args.date, f"{args.date}_part_{args.part_id}.tar.gz")
+    archive_dir = os.path.join(OUTPUT_DIR, "adsb_archives", args.date)
+    archive_path = os.path.join(archive_dir, f"{args.date}_part_{args.part_id}.tar.gz")
+    
+    if not os.path.isfile(archive_path):
+        print(f"ERROR: Archive not found: {archive_path}")
+        if os.path.isdir(archive_dir):
+            print(f"Files in {archive_dir}: {os.listdir(archive_dir)}")
+        else:
+            print(f"Directory does not exist: {archive_dir}")
+        sys.exit(1)
    
    # Extract and collect trace files
    trace_map = build_trace_file_map(archive_path)
@@ -24,7 +24,7 @@ def read_all_submissions(community_dir: Path) -> list[dict]:
    """Read all JSON submissions from the community directory."""
    all_submissions = []
    
-    for json_file in sorted(community_dir.glob("*.json")):
+    for json_file in sorted(community_dir.glob("**/*.json")):
        try:
            with open(json_file) as f:
                data = json.load(f)
@@ -27,6 +27,33 @@ def _http_get_json(url: str, headers: dict[str, str]) -> dict:
    return json.loads(data.decode("utf-8"))


+def get_releases(repo: str = REPO, github_token: Optional[str] = None, per_page: int = 30) -> list[dict]:
+    """Get a list of releases from the repository."""
+    url = f"https://api.github.com/repos/{repo}/releases?per_page={per_page}"
+    headers = {
+        "Accept": "application/vnd.github+json",
+        "User-Agent": "openairframes-downloader/1.0",
+    }
+    if github_token:
+        headers["Authorization"] = f"Bearer {github_token}"
+
+    return _http_get_json(url, headers=headers)
+
+
+def get_release_assets_from_release_data(release_data: dict) -> list[ReleaseAsset]:
+    """Extract assets from a release data dictionary."""
+    assets = []
+    for a in release_data.get("assets", []):
+        assets.append(
+            ReleaseAsset(
+                name=a["name"],
+                download_url=a["browser_download_url"],
+                size=int(a.get("size", 0)),
+            )
+        )
+    return assets
+
+
 def get_latest_release_assets(repo: str = REPO, github_token: Optional[str] = None) -> list[ReleaseAsset]:
    url = f"https://api.github.com/repos/{repo}/releases/latest"
    headers = {
@@ -37,16 +64,7 @@ def get_latest_release_assets(repo: str = REPO, github_token: Optional[str] = No
        headers["Authorization"] = f"Bearer {github_token}"

    payload = _http_get_json(url, headers=headers)
-    assets = []
-    for a in payload.get("assets", []):
-        assets.append(
-            ReleaseAsset(
-                name=a["name"],
-                download_url=a["browser_download_url"],
-                size=int(a.get("size", 0)),
-            )
-        )
-    return assets
+    return get_release_assets_from_release_data(payload)


 def pick_asset(
@@ -155,7 +173,8 @@ def download_latest_aircraft_adsb_csv(
    repo: str = REPO,
 ) -> Path:
    """
-    Download the latest openairframes_adsb_*.csv file from the latest GitHub release.
+    Download the latest openairframes_adsb_*.csv file from GitHub releases.
+    If the latest release doesn't have the file, searches previous releases.

    Args:
        output_dir: Directory to save the downloaded file (default: "downloads")
@@ -166,15 +185,33 @@ def download_latest_aircraft_adsb_csv(
        Path to the downloaded file
    """
    output_dir = Path(output_dir)
-    assets = get_latest_release_assets(repo, github_token=github_token)
-    asset = pick_asset(assets, name_regex=r"^openairframes_adsb_.*\.csv(\.gz)?$")
-    saved_to = download_asset(asset, output_dir / asset.name, github_token=github_token)
-    print(f"Downloaded: {asset.name} ({asset.size} bytes) -> {saved_to}")
-    return saved_to
+    
+    # Get multiple releases
+    releases = get_releases(repo, github_token=github_token, per_page=30)
+    
+    # Try each release until we find one with the matching asset
+    for release in releases:
+        assets = get_release_assets_from_release_data(release)
+        try:
+            asset = pick_asset(assets, name_regex=r"^openairframes_adsb_.*\.csv(\.gz)?$")
+            saved_to = download_asset(asset, output_dir / asset.name, github_token=github_token)
+            print(f"Downloaded: {asset.name} ({asset.size} bytes) -> {saved_to}")
+            return saved_to
+        except FileNotFoundError:
+            # This release doesn't have the matching asset, try the next one
+            continue
+    
+    raise FileNotFoundError(
+        f"No release in the last 30 releases has an asset matching 'openairframes_adsb_.*\\.csv(\\.gz)?$'"
+    )

 import polars as pl
 def get_latest_aircraft_adsb_csv_df():
-    """Download and load the latest ADS-B CSV from GitHub releases."""
+    """Download and load the latest ADS-B CSV from GitHub releases.
+    
+    Returns:
+        tuple: (df, start_date, end_date) where dates are in YYYY-MM-DD format
+    """
    import re
    
    csv_path = download_latest_aircraft_adsb_csv()
@@ -198,17 +235,19 @@ def get_latest_aircraft_adsb_csv_df():
        if df[col].dtype == pl.Utf8:
            df = df.with_columns(pl.col(col).fill_null(""))
    
-    # Extract start date from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv[.gz]
-    match = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path))
+    # Extract start and end dates from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv[.gz]
+    match = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})\.csv", str(csv_path))
    if not match:
-        raise ValueError(f"Could not extract date from filename: {csv_path.name}")
+        raise ValueError(f"Could not extract dates from filename: {csv_path.name}")
    
-    date_str = match.group(1)
+    start_date = match.group(1)
+    end_date = match.group(2)
    print(df.columns)
    print(df.dtypes)
-    return df, date_str
+    return df, start_date, end_date



 if __name__ == "__main__":
    download_latest_aircraft_csv()
+    download_latest_aircraft_adsb_csv()
Author	SHA1	Message	Date
JG	2829e5fb6e	Merge pull request #35 from PlaneQuery/develop update readme.md	2026-03-18 14:31:29 -04:00
ggman12	9c744b0baf	update readme.md	2026-03-18 14:29:13 -04:00
JG	ebda04767f	Merge pull request #34 from PlaneQuery/develop Develop to main: theairtraffic google sheet	2026-03-10 05:12:11 -04:00
ggman12	3fdf443894	add russia_ukraine	2026-03-10 05:08:19 -04:00
ggman12	24313603c5	works	2026-03-10 05:08:19 -04:00
JG	2bb0a5eac3	Merge pull request #33 from PlaneQuery/develop Develop to Main: Handle ADSB when ADSB.lol has not released any data for day. Just rerelease latest adsb	2026-02-26 15:32:59 -05:00
ggman12	b54f33aa56	Handle ADSB when ADSB.lol has not released any data for day. Just rerelease latest adsb	2026-02-26 15:31:47 -05:00
JG	2dda3d341c	Merge pull request #32 from PlaneQuery/develop Develop to Main: Fix Community Submission export. Fix CSV concatenation logic to prevent duplicates when there is no new ADSB.lol data.	2026-02-24 15:37:54 -05:00
ggman12	b0526f0a95	Fix Community Submission export. Fix CSV concatenation logic to prevent duplicates when there is no new ADSB.lol data.	2026-02-24 15:36:10 -05:00
JG	4b6a043a9d	Merge pull request #31 from PlaneQuery/develop Develop to Main Fix adsb asset retrival to be more fault tolerant. Fix download issue	2026-02-24 02:17:08 -05:00
ggman12	55c464aad7	Fix adsb asset retrival to be more fault tolerant. Fix download issue for 2024-07-03	2026-02-24 02:12:55 -05:00
ggman12	aa509e8560	attempt to fix download issue for 2024-07-03	2026-02-19 17:51:49 -05:00