Compare commits

...

15 Commits

Author SHA1 Message Date
ggman12 50267f3c57 make faa work with no new data 2026-02-12 17:26:48 -05:00
ggman12 dd323f6e55 delete old files 2026-02-12 17:25:50 -05:00
ggman12 0e8b21daf9 rename from planequery to openairframes 2026-02-12 17:24:08 -05:00
ggman12 3960e6936c use start_date_end_date for adsb naming 2026-02-12 17:13:06 -05:00
ggman12 48623ef79e delete existing release 2026-02-12 17:12:09 -05:00
ggman12 5affe8937c rename to openairframes 2026-02-12 17:09:07 -05:00
ggman12 d0254146f3 update release to fix not grabbing FAA file 2026-02-12 16:42:47 -05:00
ggman12 1699ad6d8a rename file 2026-02-12 16:12:03 -05:00
ggman12 2a6892c347 fix download 2026-02-12 16:08:08 -05:00
ggman12 47ccecb9ba set fail-fast to true 2026-02-12 16:07:42 -05:00
ggman12 2826dfd450 remove notebook 2026-02-12 16:07:28 -05:00
ggman12 fecf9ff0ea format properly 2026-02-12 16:01:14 -05:00
ggman12 7e0a396fc7 only modify key parts of schemas/community_submission.v1.schema.json schema. Lowest diffs 2026-02-12 15:55:44 -05:00
ggman12 b0503bb3b2 fix: should update schema now 2026-02-12 15:46:11 -05:00
ggman12 0b89138daf modify existing json schema instead of creating a new file every time 2026-02-12 15:40:01 -05:00
23 changed files with 398 additions and 849 deletions
@@ -14,7 +14,7 @@ body:
- Each object must include **at least one** of:
- `registration_number`
- `transponder_code_hex` (6 uppercase hex chars, e.g., `ABC123`)
- `planequery_airframe_id`
- `openairframes_id`
- Your contributor name (entered below) will be applied to all objects.
- `contributor_uuid` is derived from your GitHub account automatically.
- `creation_timestamp` is created by the system (you may omit it).
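
A minimal submission satisfying these rules might look like the sketch below. The values are made up; the field names follow schemas/community_submission.v1.schema.json as shown later in this diff, and the `jsonschema` package installed by the workflow is assumed for validation.

# Sketch only: a hypothetical minimal submission validated against the v1 schema.
# The registration value and tag name are illustrative, not real data.
import json
from pathlib import Path

from jsonschema import validate  # the workflow installs this via `pip install jsonschema`

submission = {
    "registration_number": "N4131T",            # at least one identifier is required
    "tags": {"operator_note": "example only"},  # tag names must be lowercase snake_case
}

schema = json.loads(Path("schemas/community_submission.v1.schema.json").read_text())
validate(instance=submission, schema=schema)    # raises ValidationError if invalid
print("submission is valid against the v1 schema")
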
+4 -4
@@ -48,7 +48,7 @@ jobs:
matrix:
chunk: ${{ fromJson(needs.generate-matrix.outputs.chunks) }}
max-parallel: 3
fail-fast: false
fail-fast: true
steps:
- name: Checkout
uses: actions/checkout@v4
@@ -220,11 +220,11 @@ jobs:
END_DATE: ${{ needs.generate-matrix.outputs.global_end }}
run: |
python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks --start-date "$START_DATE" --end-date "$END_DATE" --skip-base --stream
ls -lah data/planequery_aircraft/
ls -lah data/openairframes/
- name: Upload final artifact
uses: actions/upload-artifact@v4
with:
name: planequery_aircraft_adsb-${{ needs.generate-matrix.outputs.global_start }}-${{ needs.generate-matrix.outputs.global_end }}
path: data/planequery_aircraft/*.csv
name: openairframes_adsb-${{ needs.generate-matrix.outputs.global_start }}-${{ needs.generate-matrix.outputs.global_end }}
path: data/openairframes/*.csv
retention-days: 30
@@ -1,4 +1,4 @@
name: planequery-aircraft Daily Release
name: OpenAirframes Daily Release
on:
schedule:
@@ -22,7 +22,7 @@ jobs:
await github.rest.actions.createWorkflowDispatch({
owner: context.repo.owner,
repo: context.repo.repo,
workflow_id: 'planequery-aircraft-daily-release.yaml',
workflow_id: 'openairframes-daily-release.yaml',
ref: 'main'
});
@@ -33,7 +33,7 @@ jobs:
await github.rest.actions.createWorkflowDispatch({
owner: context.repo.owner,
repo: context.repo.repo,
workflow_id: 'planequery-aircraft-daily-release.yaml',
workflow_id: 'openairframes-daily-release.yaml',
ref: 'develop'
});
@@ -58,16 +58,16 @@ jobs:
- name: Run FAA release script
run: |
python src/create_daily_planequery_aircraft_release.py
python src/create_daily_faa_release.py
ls -lah data/faa_releasable
ls -lah data/planequery_aircraft
ls -lah data/openairframes
- name: Upload FAA artifacts
uses: actions/upload-artifact@v4
with:
name: faa-release
path: |
data/planequery_aircraft/planequery_aircraft_faa_*.csv
data/openairframes/openairframes_faa_*.csv
data/faa_releasable/ReleasableAircraft_*.zip
retention-days: 1
@@ -214,13 +214,13 @@ jobs:
mkdir -p data/output/adsb_chunks
ls -lah data/output/adsb_chunks/ || echo "Directory empty or does not exist"
python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks
ls -lah data/planequery_aircraft/
ls -lah data/openairframes/
- name: Upload ADS-B artifacts
uses: actions/upload-artifact@v4
with:
name: adsb-release
path: data/planequery_aircraft/planequery_aircraft_adsb_*.csv
path: data/openairframes/openairframes_adsb_*.csv
retention-days: 1
build-community:
@@ -245,13 +245,13 @@ jobs:
- name: Run Community release script
run: |
python -m src.contributions.create_daily_community_release
ls -lah data/planequery_aircraft
ls -lah data/openairframes
- name: Upload Community artifacts
uses: actions/upload-artifact@v4
with:
name: community-release
path: data/planequery_aircraft/planequery_aircraft_community_*.csv
path: data/openairframes/openairframes_community_*.csv
retention-days: 1
create-release:
@@ -297,16 +297,16 @@ jobs:
elif [ "$BRANCH_NAME" = "develop" ]; then
BRANCH_SUFFIX="-develop"
fi
TAG="planequery-aircraft-${DATE}${BRANCH_SUFFIX}"
TAG="openairframes-${DATE}${BRANCH_SUFFIX}"
# Find files from artifacts
CSV_FILE_FAA=$(ls artifacts/faa/data/planequery_aircraft/planequery_aircraft_faa_*.csv | head -1)
# Find files from artifacts using find (handles nested structures)
CSV_FILE_FAA=$(find artifacts/faa -name "openairframes_faa_*.csv" | head -1)
CSV_BASENAME_FAA=$(basename "$CSV_FILE_FAA")
CSV_FILE_ADSB=$(ls artifacts/adsb/planequery_aircraft_adsb_*.csv | head -1)
CSV_FILE_ADSB=$(find artifacts/adsb -name "openairframes_adsb_*.csv" | head -1)
CSV_BASENAME_ADSB=$(basename "$CSV_FILE_ADSB")
CSV_FILE_COMMUNITY=$(ls artifacts/community/planequery_aircraft_community_*.csv 2>/dev/null | head -1 || echo "")
CSV_FILE_COMMUNITY=$(find artifacts/community -name "openairframes_community_*.csv" 2>/dev/null | head -1 || echo "")
CSV_BASENAME_COMMUNITY=$(basename "$CSV_FILE_COMMUNITY" 2>/dev/null || echo "")
ZIP_FILE=$(ls artifacts/faa/data/faa_releasable/ReleasableAircraft_*.zip | head -1)
ZIP_FILE=$(find artifacts/faa -name "ReleasableAircraft_*.zip" | head -1)
ZIP_BASENAME=$(basename "$ZIP_FILE")
echo "date=$DATE" >> "$GITHUB_OUTPUT"
@@ -319,12 +319,15 @@ jobs:
echo "csv_basename_community=$CSV_BASENAME_COMMUNITY" >> "$GITHUB_OUTPUT"
echo "zip_file=$ZIP_FILE" >> "$GITHUB_OUTPUT"
echo "zip_basename=$ZIP_BASENAME" >> "$GITHUB_OUTPUT"
echo "name=planequery-aircraft snapshot ($DATE)${BRANCH_SUFFIX}" >> "$GITHUB_OUTPUT"
echo "name=OpenAirframes snapshot ($DATE)${BRANCH_SUFFIX}" >> "$GITHUB_OUTPUT"
- name: Checkout for gh CLI
uses: actions/checkout@v4
- name: Delete existing release if exists
run: |
gh release delete "${{ steps.meta.outputs.tag }}" --yes 2>/dev/null || true
git push --delete origin "refs/tags/${{ steps.meta.outputs.tag }}" 2>/dev/null || true
echo "Attempting to delete release: ${{ steps.meta.outputs.tag }}"
gh release delete "${{ steps.meta.outputs.tag }}" --yes --cleanup-tag || echo "No existing release to delete"
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -0,0 +1,171 @@
name: Process Historical FAA Data
on:
workflow_dispatch: # Manual trigger
jobs:
generate-matrix:
runs-on: ubuntu-latest
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
steps:
- name: Generate date ranges
id: set-matrix
run: |
python3 << 'EOF'
import json
from datetime import datetime, timedelta
start = datetime(2023, 8, 16)
end = datetime(2026, 1, 1)
ranges = []
current = start
# Process in 4-day chunks
while current < end:
chunk_end = current + timedelta(days=4)
# Don't go past the end date
if chunk_end > end:
chunk_end = end
ranges.append({
"since": current.strftime("%Y-%m-%d"),
"until": chunk_end.strftime("%Y-%m-%d")
})
current = chunk_end
print(f"::set-output name=matrix::{json.dumps(ranges)}")
EOF
clone-faa-repo:
runs-on: ubuntu-latest
steps:
- name: Cache FAA repository
id: cache-faa-repo
uses: actions/cache@v4
with:
path: data/scrape-faa-releasable-aircraft
key: faa-repo-v1
- name: Clone FAA repository
if: steps.cache-faa-repo.outputs.cache-hit != 'true'
run: |
mkdir -p data
git clone https://github.com/simonw/scrape-faa-releasable-aircraft data/scrape-faa-releasable-aircraft
echo "Repository cloned successfully"
process-chunk:
needs: [generate-matrix, clone-faa-repo]
runs-on: ubuntu-latest
strategy:
max-parallel: 5 # Process 5 chunks at a time
matrix:
range: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.12'
- name: Restore FAA repository cache
uses: actions/cache/restore@v4
with:
path: data/scrape-faa-releasable-aircraft
key: faa-repo-v1
fail-on-cache-miss: true
- name: Install dependencies
run: |
pip install -r requirements.txt
- name: Process chunk ${{ matrix.range.since }} to ${{ matrix.range.until }}
run: |
python src/get_historical_faa.py "${{ matrix.range.since }}" "${{ matrix.range.until }}"
- name: Upload CSV artifact
uses: actions/upload-artifact@v4
with:
name: csv-${{ matrix.range.since }}-to-${{ matrix.range.until }}
path: data/faa_releasable_historical/*.csv
retention-days: 1
create-release:
needs: process-chunk
runs-on: ubuntu-latest
permissions:
contents: write
steps:
- name: Download all artifacts
uses: actions/download-artifact@v4
with:
path: artifacts
- name: Prepare release files
run: |
mkdir -p release-files
find artifacts -name "*.csv" -exec cp {} release-files/ \;
ls -lh release-files/
- name: Create Release
uses: softprops/action-gh-release@v1
with:
tag_name: historical-faa-${{ github.run_number }}
name: Historical FAA Data Release ${{ github.run_number }}
body: |
Automated release of historical FAA aircraft data
Processing period: 2023-08-16 to 2026-01-01
Generated: ${{ github.event.repository.updated_at }}
files: release-files/*.csv
draft: false
prerelease: false
concatenate-and-release:
needs: process-chunk
runs-on: ubuntu-latest
permissions:
contents: write
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.12'
- name: Install dependencies
run: |
pip install -r requirements.txt
- name: Download all artifacts
uses: actions/download-artifact@v4
with:
path: artifacts
- name: Prepare CSVs for concatenation
run: |
mkdir -p data/faa_releasable_historical
find artifacts -name "*.csv" -exec cp {} data/faa_releasable_historical/ \;
ls -lh data/faa_releasable_historical/
- name: Concatenate all CSVs
run: |
python scripts/concat_csvs.py
- name: Create Combined Release
uses: softprops/action-gh-release@v1
with:
tag_name: historical-faa-combined-${{ github.run_number }}
name: Historical FAA Data Combined Release ${{ github.run_number }}
body: |
Combined historical FAA aircraft data (all chunks concatenated)
Processing period: 2023-08-16 to 2026-01-01
Generated: ${{ github.event.repository.updated_at }}
files: data/openairframes/*.csv
draft: false
prerelease: false
+20 -36
@@ -5,7 +5,7 @@ on:
branches: [main]
paths:
- 'community/**'
- 'schemas/community_submission.*.schema.json'
- 'schemas/community_submission.v1.schema.json'
permissions:
contents: write
@@ -29,18 +29,9 @@ jobs:
- name: Install dependencies
run: pip install jsonschema
- name: Get current schema version
id: schema
run: |
# Find the latest schema version on main
latest=$(ls schemas/community_submission.v*.schema.json 2>/dev/null | sed 's/.*\.v\([0-9]*\)\.schema\.json/\1/' | sort -n | tail -1)
echo "latest_version=${latest:-1}" >> "$GITHUB_OUTPUT"
echo "Latest schema version: ${latest:-1}"
- name: Find and update open community PRs
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
LATEST_SCHEMA_VERSION: ${{ steps.schema.outputs.latest_version }}
run: |
# Get list of open community PRs
prs=$(gh pr list --label community --state open --json number,headRefName --jq '.[] | "\(.number) \(.headRefName)"')
@@ -57,35 +48,28 @@ jobs:
git fetch origin "$branch_name"
git checkout "$branch_name"
# Check if this PR has a schema file that needs updating
pr_schema=$(ls schemas/community_submission.v*.schema.json 2>/dev/null | sed 's/.*\.v\([0-9]*\)\.schema\.json/\1/' | sort -n | tail -1)
# Merge main into PR branch
git config user.name "github-actions[bot]"
git config user.email "github-actions[bot]@users.noreply.github.com"
if [ "$pr_schema" -le "$LATEST_SCHEMA_VERSION" ] 2>/dev/null; then
echo " PR schema version ($pr_schema) <= main version ($LATEST_SCHEMA_VERSION)"
if git merge origin/main -m "Merge main to update schema"; then
# Regenerate schema for this PR's submission (adds any new tags)
python -m src.contributions.regenerate_pr_schema || true
# Merge main into PR branch
git config user.name "github-actions[bot]"
git config user.email "github-actions[bot]@users.noreply.github.com"
if git merge origin/main -m "Merge main to update schema baseline"; then
# Regenerate schema for this PR's submission
python -m src.contributions.regenerate_pr_schema || true
# If there are changes, commit and push
if [ -n "$(git status --porcelain schemas/)" ]; then
new_version=$((LATEST_SCHEMA_VERSION + 1))
git add schemas/
git commit -m "Update schema to v${new_version} (rebased on main)"
git push origin "$branch_name"
echo " Updated PR #$pr_number with new schema"
else
git push origin "$branch_name"
echo " Merged main into PR #$pr_number"
fi
# If there are changes, commit and push
if [ -n "$(git status --porcelain schemas/)" ]; then
git add schemas/
git commit -m "Update schema with new tags"
git push origin "$branch_name"
echo " Updated PR #$pr_number with schema changes"
else
echo " Merge conflict in PR #$pr_number, adding comment"
gh pr comment "$pr_number" --body $'⚠️ **Merge Conflict**\n\nAnother community submission was merged and this PR has conflicts.\n\nA maintainer may need to:\n1. Close this PR\n2. Remove the `approved` label from the original issue\n3. Re-add the `approved` label to regenerate the PR'
git merge --abort
git push origin "$branch_name"
echo " Merged main into PR #$pr_number"
fi
else
echo " Merge conflict in PR #$pr_number, adding comment"
gh pr comment "$pr_number" --body $'⚠️ **Merge Conflict**\n\nAnother community submission was merged and this PR has conflicts.\n\nA maintainer may need to:\n1. Close this PR\n2. Remove the `approved` label from the original issue\n3. Re-add the `approved` label to regenerate the PR'
git merge --abort
fi
fi
+1 -1
@@ -1,6 +1,6 @@
MIT License
Copyright (c) 2026 PlaneQuery
Copyright (c) 2026 OpenAirframes
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
+1 -1
@@ -23,7 +23,7 @@ class AdsbProcessingStack(Stack):
# --- S3 bucket for intermediate and final results ---
bucket = s3.Bucket(
self, "ResultsBucket",
bucket_name="planequery-aircraft-dev",
bucket_name="openairframes-dev",
removal_policy=RemovalPolicy.DESTROY,
auto_delete_objects=True,
lifecycle_rules=[
-640
@@ -1,640 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "06ae0319",
"metadata": {},
"outputs": [],
"source": [
"import clickhouse_connect\n",
"client = clickhouse_connect.get_client(\n",
" host=os.environ[\"CLICKHOUSE_HOST\"],\n",
" username=os.environ[\"CLICKHOUSE_USERNAME\"],\n",
" password=os.environ[\"CLICKHOUSE_PASSWORD\"],\n",
" secure=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "779710f0",
"metadata": {},
"outputs": [],
"source": [
"df = client.query_df(\"SELECT time, icao,r,t,dbFlags,ownOp,year,desc,aircraft FROM adsb_messages Where time > '2024-01-01 00:00:00' AND time < '2024-01-02 00:00:00'\")\n",
"df_copy = df.copy()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bf024da8",
"metadata": {},
"outputs": [],
"source": [
"# -- military = dbFlags & 1; interesting = dbFlags & 2; PIA = dbFlags & 4; LADD = dbFlags & 8;"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "270607b5",
"metadata": {},
"outputs": [],
"source": [
"df = load_raw_adsb_for_day(datetime(2024,1,1))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ac06a30e",
"metadata": {},
"outputs": [],
"source": [
"df['aircraft']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "91edab3e",
"metadata": {},
"outputs": [],
"source": [
"COLUMNS = ['dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category', 'r', 't']\n",
"def compress_df(df):\n",
" icao = df.name\n",
" df[\"_signature\"] = df[COLUMNS].astype(str).agg('|'.join, axis=1)\n",
" original_df = df.copy()\n",
" df = df.groupby(\"_signature\", as_index=False).last() # check if it works with both last and first.\n",
" # For each row, create a dict of non-empty column values. This is using sets and subsets...\n",
" def get_non_empty_dict(row):\n",
" return {col: row[col] for col in COLUMNS if row[col] != ''}\n",
" \n",
" df['_non_empty_dict'] = df.apply(get_non_empty_dict, axis=1)\n",
" df['_non_empty_count'] = df['_non_empty_dict'].apply(len)\n",
" \n",
" # Check if row i's non-empty values are a subset of row j's non-empty values\n",
" def is_subset_of_any(idx):\n",
" row_dict = df.loc[idx, '_non_empty_dict']\n",
" row_count = df.loc[idx, '_non_empty_count']\n",
" \n",
" for other_idx in df.index:\n",
" if idx == other_idx:\n",
" continue\n",
" other_dict = df.loc[other_idx, '_non_empty_dict']\n",
" other_count = df.loc[other_idx, '_non_empty_count']\n",
" \n",
" # Check if all non-empty values in current row match those in other row\n",
" if all(row_dict.get(k) == other_dict.get(k) for k in row_dict.keys()):\n",
" # If they match and other has more defined columns, current row is redundant\n",
" if other_count > row_count:\n",
" return True\n",
" return False\n",
" \n",
" # Keep rows that are not subsets of any other row\n",
" keep_mask = ~df.index.to_series().apply(is_subset_of_any)\n",
" df = df[keep_mask]\n",
"\n",
" if len(df) > 1:\n",
" original_df = original_df[original_df['_signature'].isin(df['_signature'])]\n",
" value_counts = original_df[\"_signature\"].value_counts()\n",
" max_signature = value_counts.idxmax()\n",
" df = df[df['_signature'] == max_signature]\n",
"\n",
" df['icao'] = icao\n",
" df = df.drop(columns=['_non_empty_dict', '_non_empty_count', '_signature'])\n",
" return df\n",
"\n",
"# df = df_copy\n",
"# df = df_copy.iloc[0:100000]\n",
"# df = df[df['r'] == \"N4131T\"]\n",
"# df = df[(df['icao'] == \"008081\")]\n",
"# df = df.iloc[0:500]\n",
"df['aircraft_category'] = df['aircraft'].apply(lambda x: x.get('category') if isinstance(x, dict) else None)\n",
"df = df.drop(columns=['aircraft'])\n",
"df = df.sort_values(['icao', 'time'])\n",
"df[COLUMNS] = df[COLUMNS].fillna('')\n",
"ORIGINAL_COLUMNS = df.columns.tolist()\n",
"df_compressed = df.groupby('icao',group_keys=False).apply(compress_df)\n",
"cols = df_compressed.columns.tolist()\n",
"cols.remove(\"icao\")\n",
"cols.insert(1, \"icao\")\n",
"df_compressed = df_compressed[cols]\n",
"df_compressed"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "efdfcb2c",
"metadata": {},
"outputs": [],
"source": [
"df['aircraft_category'] = df['aircraft'].apply(lambda x: x.get('category') if isinstance(x, dict) else None)\n",
"df[~df['aircraft_category'].isna()]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "495c5025",
"metadata": {},
"outputs": [],
"source": [
"# SOME KIND OF MAP REDUCE SYSTEM\n",
"import os\n",
"\n",
"COLUMNS = ['dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category', 'r', 't']\n",
"def compress_df(df):\n",
" icao = df.name\n",
" df[\"_signature\"] = df[COLUMNS].astype(str).agg('|'.join, axis=1)\n",
" \n",
" # Compute signature counts before grouping (avoid copy)\n",
" signature_counts = df[\"_signature\"].value_counts()\n",
" \n",
" df = df.groupby(\"_signature\", as_index=False).first() # check if it works with both last and first.\n",
" # For each row, create a dict of non-empty column values. This is using sets and subsets...\n",
" def get_non_empty_dict(row):\n",
" return {col: row[col] for col in COLUMNS if row[col] != ''}\n",
" \n",
" df['_non_empty_dict'] = df.apply(get_non_empty_dict, axis=1)\n",
" df['_non_empty_count'] = df['_non_empty_dict'].apply(len)\n",
" \n",
" # Check if row i's non-empty values are a subset of row j's non-empty values\n",
" def is_subset_of_any(idx):\n",
" row_dict = df.loc[idx, '_non_empty_dict']\n",
" row_count = df.loc[idx, '_non_empty_count']\n",
" \n",
" for other_idx in df.index:\n",
" if idx == other_idx:\n",
" continue\n",
" other_dict = df.loc[other_idx, '_non_empty_dict']\n",
" other_count = df.loc[other_idx, '_non_empty_count']\n",
" \n",
" # Check if all non-empty values in current row match those in other row\n",
" if all(row_dict.get(k) == other_dict.get(k) for k in row_dict.keys()):\n",
" # If they match and other has more defined columns, current row is redundant\n",
" if other_count > row_count:\n",
" return True\n",
" return False\n",
" \n",
" # Keep rows that are not subsets of any other row\n",
" keep_mask = ~df.index.to_series().apply(is_subset_of_any)\n",
" df = df[keep_mask]\n",
"\n",
" if len(df) > 1:\n",
" # Use pre-computed signature counts instead of original_df\n",
" remaining_sigs = df['_signature']\n",
" sig_counts = signature_counts[remaining_sigs]\n",
" max_signature = sig_counts.idxmax()\n",
" df = df[df['_signature'] == max_signature]\n",
"\n",
" df['icao'] = icao\n",
" df = df.drop(columns=['_non_empty_dict', '_non_empty_count', '_signature'])\n",
" return df\n",
"\n",
"# names of releases something like\n",
"# planequery_aircraft_adsb_2024-06-01T00-00-00Z.csv.gz\n",
"\n",
"# Let's build historical first. \n",
"\n",
"_ch_client = None\n",
"\n",
"def _get_clickhouse_client():\n",
" \"\"\"Return a reusable ClickHouse client, with retry/backoff for transient DNS or connection errors.\"\"\"\n",
" global _ch_client\n",
" if _ch_client is not None:\n",
" return _ch_client\n",
"\n",
" import clickhouse_connect\n",
" import time\n",
"\n",
" max_retries = 5\n",
" for attempt in range(1, max_retries + 1):\n",
" try:\n",
" _ch_client = clickhouse_connect.get_client(\n",
" host=os.environ[\"CLICKHOUSE_HOST\"],\n",
" username=os.environ[\"CLICKHOUSE_USERNAME\"],\n",
" password=os.environ[\"CLICKHOUSE_PASSWORD\"],\n",
" secure=True,\n",
" )\n",
" return _ch_client\n",
" except Exception as e:\n",
" wait = min(2 ** attempt, 30)\n",
" print(f\" ClickHouse connect attempt {attempt}/{max_retries} failed: {e}\")\n",
" if attempt == max_retries:\n",
" raise\n",
" print(f\" Retrying in {wait}s...\")\n",
" time.sleep(wait)\n",
"\n",
"\n",
"def load_raw_adsb_for_day(day):\n",
" \"\"\"Load raw ADS-B data for a day from cache or ClickHouse.\"\"\"\n",
" from datetime import timedelta\n",
" from pathlib import Path\n",
" import pandas as pd\n",
" import time\n",
" \n",
" start_time = day.replace(hour=0, minute=0, second=0, microsecond=0)\n",
" end_time = start_time + timedelta(days=1)\n",
" \n",
" # Set up caching\n",
" cache_dir = Path(\"data/adsb\")\n",
" cache_dir.mkdir(parents=True, exist_ok=True)\n",
" cache_file = cache_dir / f\"adsb_raw_{start_time.strftime('%Y-%m-%d')}.csv.zst\"\n",
" \n",
" # Check if cache exists\n",
" if cache_file.exists():\n",
" print(f\" Loading from cache: {cache_file}\")\n",
" df = pd.read_csv(cache_file, compression='zstd')\n",
" df['time'] = pd.to_datetime(df['time'])\n",
" else:\n",
" # Format dates for the query\n",
" start_str = start_time.strftime('%Y-%m-%d %H:%M:%S')\n",
" end_str = end_time.strftime('%Y-%m-%d %H:%M:%S')\n",
" \n",
" max_retries = 3\n",
" for attempt in range(1, max_retries + 1):\n",
" try:\n",
" client = _get_clickhouse_client()\n",
" print(f\" Querying ClickHouse for {start_time.strftime('%Y-%m-%d')}\")\n",
" df = client.query_df(f\"SELECT time, icao,r,t,dbFlags,ownOp,year,desc,aircraft FROM adsb_messages Where time > '{start_str}' AND time < '{end_str}'\")\n",
" break\n",
" except Exception as e:\n",
" wait = min(2 ** attempt, 30)\n",
" print(f\" Query attempt {attempt}/{max_retries} failed: {e}\")\n",
" if attempt == max_retries:\n",
" raise\n",
" # Reset client in case connection is stale\n",
" global _ch_client\n",
" _ch_client = None\n",
" print(f\" Retrying in {wait}s...\")\n",
" time.sleep(wait)\n",
" \n",
" # Save to cache\n",
" df.to_csv(cache_file, index=False, compression='zstd')\n",
" print(f\" Saved to cache: {cache_file}\")\n",
" \n",
" return df\n",
"\n",
"def load_historical_for_day(day):\n",
" from pathlib import Path\n",
" import pandas as pd\n",
" \n",
" df = load_raw_adsb_for_day(day)\n",
" print(df)\n",
" df['aircraft_category'] = df['aircraft'].apply(lambda x: x.get('category') if isinstance(x, dict) else None)\n",
" df = df.drop(columns=['aircraft'])\n",
" df = df.sort_values(['icao', 'time'])\n",
" df[COLUMNS] = df[COLUMNS].fillna('')\n",
" df_compressed = df.groupby('icao',group_keys=False).apply(compress_df)\n",
" cols = df_compressed.columns.tolist()\n",
" cols.remove('time')\n",
" cols.insert(0, 'time')\n",
" cols.remove(\"icao\")\n",
" cols.insert(1, \"icao\")\n",
" df_compressed = df_compressed[cols]\n",
" return df_compressed\n",
"\n",
"\n",
"def concat_compressed_dfs(df_base, df_new):\n",
" \"\"\"Concatenate base and new compressed dataframes, keeping the most informative row per ICAO.\"\"\"\n",
" import pandas as pd\n",
" \n",
" # Combine both dataframes\n",
" df_combined = pd.concat([df_base, df_new], ignore_index=True)\n",
" \n",
" # Sort by ICAO and time\n",
" df_combined = df_combined.sort_values(['icao', 'time'])\n",
" \n",
" # Fill NaN values\n",
" df_combined[COLUMNS] = df_combined[COLUMNS].fillna('')\n",
" \n",
" # Apply compression logic per ICAO to get the best row\n",
" df_compressed = df_combined.groupby('icao', group_keys=False).apply(compress_df)\n",
" \n",
" # Sort by time\n",
" df_compressed = df_compressed.sort_values('time')\n",
" \n",
" return df_compressed\n",
"\n",
"\n",
"def get_latest_aircraft_adsb_csv_df():\n",
" \"\"\"Download and load the latest ADS-B CSV from GitHub releases.\"\"\"\n",
" from get_latest_planequery_aircraft_release import download_latest_aircraft_adsb_csv\n",
" \n",
" import pandas as pd\n",
" import re\n",
" \n",
" csv_path = download_latest_aircraft_adsb_csv()\n",
" df = pd.read_csv(csv_path)\n",
" df = df.fillna(\"\")\n",
" \n",
" # Extract start date from filename pattern: planequery_aircraft_adsb_{start_date}_{end_date}.csv\n",
" match = re.search(r\"planequery_aircraft_adsb_(\\d{4}-\\d{2}-\\d{2})_\", str(csv_path))\n",
" if not match:\n",
" raise ValueError(f\"Could not extract date from filename: {csv_path.name}\")\n",
" \n",
" date_str = match.group(1)\n",
" return df, date_str\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7f66acf7",
"metadata": {},
"outputs": [],
"source": [
"# SOME KIND OF MAP REDUCE SYSTEM\n",
"\n",
"\n",
"COLUMNS = ['dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category', 'r', 't']\n",
"def compress_df(df):\n",
" icao = df.name\n",
" df[\"_signature\"] = df[COLUMNS].astype(str).agg('|'.join, axis=1)\n",
" original_df = df.copy()\n",
" df = df.groupby(\"_signature\", as_index=False).first() # check if it works with both last and first.\n",
" # For each row, create a dict of non-empty column values. This is using sets and subsets...\n",
" def get_non_empty_dict(row):\n",
" return {col: row[col] for col in COLUMNS if row[col] != ''}\n",
" \n",
" df['_non_empty_dict'] = df.apply(get_non_empty_dict, axis=1)\n",
" df['_non_empty_count'] = df['_non_empty_dict'].apply(len)\n",
" \n",
" # Check if row i's non-empty values are a subset of row j's non-empty values\n",
" def is_subset_of_any(idx):\n",
" row_dict = df.loc[idx, '_non_empty_dict']\n",
" row_count = df.loc[idx, '_non_empty_count']\n",
" \n",
" for other_idx in df.index:\n",
" if idx == other_idx:\n",
" continue\n",
" other_dict = df.loc[other_idx, '_non_empty_dict']\n",
" other_count = df.loc[other_idx, '_non_empty_count']\n",
" \n",
" # Check if all non-empty values in current row match those in other row\n",
" if all(row_dict.get(k) == other_dict.get(k) for k in row_dict.keys()):\n",
" # If they match and other has more defined columns, current row is redundant\n",
" if other_count > row_count:\n",
" return True\n",
" return False\n",
" \n",
" # Keep rows that are not subsets of any other row\n",
" keep_mask = ~df.index.to_series().apply(is_subset_of_any)\n",
" df = df[keep_mask]\n",
"\n",
" if len(df) > 1:\n",
" original_df = original_df[original_df['_signature'].isin(df['_signature'])]\n",
" value_counts = original_df[\"_signature\"].value_counts()\n",
" max_signature = value_counts.idxmax()\n",
" df = df[df['_signature'] == max_signature]\n",
"\n",
" df['icao'] = icao\n",
" df = df.drop(columns=['_non_empty_dict', '_non_empty_count', '_signature'])\n",
" return df\n",
"\n",
"# names of releases something like\n",
"# planequery_aircraft_adsb_2024-06-01T00-00-00Z.csv.gz\n",
"\n",
"# Let's build historical first. \n",
"\n",
"def load_raw_adsb_for_day(day):\n",
" \"\"\"Load raw ADS-B data for a day from cache or ClickHouse.\"\"\"\n",
" from datetime import timedelta\n",
" import clickhouse_connect\n",
" from pathlib import Path\n",
" import pandas as pd\n",
" \n",
" start_time = day.replace(hour=0, minute=0, second=0, microsecond=0)\n",
" end_time = start_time + timedelta(days=1)\n",
" \n",
" # Set up caching\n",
" cache_dir = Path(\"data/adsb\")\n",
" cache_dir.mkdir(parents=True, exist_ok=True)\n",
" cache_file = cache_dir / f\"adsb_raw_{start_time.strftime('%Y-%m-%d')}.csv.zst\"\n",
" \n",
" # Check if cache exists\n",
" if cache_file.exists():\n",
" print(f\" Loading from cache: {cache_file}\")\n",
" df = pd.read_csv(cache_file, compression='zstd')\n",
" df['time'] = pd.to_datetime(df['time'])\n",
" else:\n",
" # Format dates for the query\n",
" start_str = start_time.strftime('%Y-%m-%d %H:%M:%S')\n",
" end_str = end_time.strftime('%Y-%m-%d %H:%M:%S')\n",
" \n",
" client = clickhouse_connect.get_client(\n",
" host=os.environ[\"CLICKHOUSE_HOST\"],\n",
" username=os.environ[\"CLICKHOUSE_USERNAME\"],\n",
" password=os.environ[\"CLICKHOUSE_PASSWORD\"],\n",
" secure=True,\n",
" )\n",
" print(f\" Querying ClickHouse for {start_time.strftime('%Y-%m-%d')}\")\n",
" df = client.query_df(f\"SELECT time, icao,r,t,dbFlags,ownOp,year,desc,aircraft FROM adsb_messages Where time > '{start_str}' AND time < '{end_str}'\")\n",
" \n",
" # Save to cache\n",
" df.to_csv(cache_file, index=False, compression='zstd')\n",
" print(f\" Saved to cache: {cache_file}\")\n",
" \n",
" return df\n",
"\n",
"def load_historical_for_day(day):\n",
" from pathlib import Path\n",
" import pandas as pd\n",
" \n",
" df = load_raw_adsb_for_day(day)\n",
" \n",
" df['aircraft_category'] = df['aircraft'].apply(lambda x: x.get('category') if isinstance(x, dict) else None)\n",
" df = df.drop(columns=['aircraft'])\n",
" df = df.sort_values(['icao', 'time'])\n",
" df[COLUMNS] = df[COLUMNS].fillna('')\n",
" df_compressed = df.groupby('icao',group_keys=False).apply(compress_df)\n",
" cols = df_compressed.columns.tolist()\n",
" cols.remove('time')\n",
" cols.insert(0, 'time')\n",
" cols.remove(\"icao\")\n",
" cols.insert(1, \"icao\")\n",
" df_compressed = df_compressed[cols]\n",
" return df_compressed\n",
"\n",
"\n",
"def concat_compressed_dfs(df_base, df_new):\n",
" \"\"\"Concatenate base and new compressed dataframes, keeping the most informative row per ICAO.\"\"\"\n",
" import pandas as pd\n",
" \n",
" # Combine both dataframes\n",
" df_combined = pd.concat([df_base, df_new], ignore_index=True)\n",
" \n",
" # Sort by ICAO and time\n",
" df_combined = df_combined.sort_values(['icao', 'time'])\n",
" \n",
" # Fill NaN values\n",
" df_combined[COLUMNS] = df_combined[COLUMNS].fillna('')\n",
" \n",
" # Apply compression logic per ICAO to get the best row\n",
" df_compressed = df_combined.groupby('icao', group_keys=False).apply(compress_df)\n",
" \n",
" # Sort by time\n",
" df_compressed = df_compressed.sort_values('time')\n",
" \n",
" return df_compressed\n",
"\n",
"\n",
"def get_latest_aircraft_adsb_csv_df():\n",
" \"\"\"Download and load the latest ADS-B CSV from GitHub releases.\"\"\"\n",
" from get_latest_planequery_aircraft_release import download_latest_aircraft_adsb_csv\n",
" \n",
" import pandas as pd\n",
" import re\n",
" \n",
" csv_path = download_latest_aircraft_adsb_csv()\n",
" df = pd.read_csv(csv_path)\n",
" df = df.fillna(\"\")\n",
" \n",
" # Extract start date from filename pattern: planequery_aircraft_adsb_{start_date}_{end_date}.csv\n",
" match = re.search(r\"planequery_aircraft_adsb_(\\d{4}-\\d{2}-\\d{2})_\", str(csv_path))\n",
" if not match:\n",
" raise ValueError(f\"Could not extract date from filename: {csv_path.name}\")\n",
" \n",
" date_str = match.group(1)\n",
" return df, date_str\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e14c8363",
"metadata": {},
"outputs": [],
"source": [
"from datetime import datetime\n",
"df = load_historical_for_day(datetime(2024,1,1))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3874ba4d",
"metadata": {},
"outputs": [],
"source": [
"len(df)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bcae50ad",
"metadata": {},
"outputs": [],
"source": [
"df[(df['icao'] == \"008081\")]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "50921c86",
"metadata": {},
"outputs": [],
"source": [
"df[df['icao'] == \"a4e1d2\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8194d9aa",
"metadata": {},
"outputs": [],
"source": [
"df[df['r'] == \"N4131T\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1e3b7aa2",
"metadata": {},
"outputs": [],
"source": [
"df_compressed[df_compressed['icao'].duplicated(keep=False)]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "40613bc1",
"metadata": {},
"outputs": [],
"source": [
"import gzip\n",
"import json\n",
"\n",
"path = \"/Users/jonahgoode/Downloads/test_extract/traces/fb/trace_full_acbbfb.json\"\n",
"\n",
"with gzip.open(path, \"rt\", encoding=\"utf-8\") as f:\n",
" data = json.load(f)\n",
"\n",
"print(type(data))\n",
"# use `data` here\n",
"import json\n",
"print(json.dumps(data, indent=2)[:2000])\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "320109b2",
"metadata": {},
"outputs": [],
"source": [
"# First, load the JSON to inspect its structure\n",
"import json\n",
"with open(\"/Users/jonahgoode/Documents/PlaneQuery/Other-Code/readsb-protobuf/webapp/src/db/aircrafts.json\", 'r') as f:\n",
" data = json.load(f)\n",
"\n",
"# Check the structure\n",
"print(type(data))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "590134f4",
"metadata": {},
"outputs": [],
"source": [
"data['AC97E3']"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
+38 -18
@@ -1,9 +1,8 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"title": "PlaneQuery Aircraft Community Submission (v1)",
"title": "OpenAirframes Community Submission (v1)",
"type": "object",
"additionalProperties": false,
"properties": {
"registration_number": {
"type": "string",
@@ -13,11 +12,10 @@
"type": "string",
"pattern": "^[0-9A-F]{6}$"
},
"planequery_airframe_id": {
"openairframes_id": {
"type": "string",
"minLength": 1
},
"contributor_uuid": {
"type": "string",
"format": "uuid"
@@ -28,14 +26,12 @@
"maxLength": 150,
"description": "Display name (may be blank)"
},
"creation_timestamp": {
"type": "string",
"format": "date-time",
"description": "Set by the system when the submission is persisted/approved.",
"readOnly": true
},
"start_date": {
"type": "string",
"format": "date",
@@ -48,7 +44,6 @@
"pattern": "^\\d{4}-\\d{2}-\\d{2}$",
"description": "Optional end date for when this submission's tags are valid (ISO 8601, e.g., 2025-07-03)."
},
"tags": {
"type": "object",
"description": "Additional community-defined tags as key/value pairs (values may be scalar, array, or object).",
@@ -56,38 +51,63 @@
"type": "string",
"pattern": "^[a-z][a-z0-9_]{0,63}$"
},
"additionalProperties": { "$ref": "#/$defs/tagValue" }
"additionalProperties": {
"$ref": "#/$defs/tagValue"
},
"properties": {}
}
},
"allOf": [
{
"anyOf": [
{ "required": ["registration_number"] },
{ "required": ["transponder_code_hex"] },
{ "required": ["planequery_airframe_id"] }
{
"required": [
"registration_number"
]
},
{
"required": [
"transponder_code_hex"
]
},
{
"required": [
"openairframes_id"
]
}
]
}
],
"$defs": {
"tagScalar": {
"type": ["string", "number", "integer", "boolean", "null"]
"type": [
"string",
"number",
"integer",
"boolean",
"null"
]
},
"tagValue": {
"anyOf": [
{ "$ref": "#/$defs/tagScalar" },
{
"$ref": "#/$defs/tagScalar"
},
{
"type": "array",
"maxItems": 50,
"items": { "$ref": "#/$defs/tagScalar" }
"items": {
"$ref": "#/$defs/tagScalar"
}
},
{
"type": "object",
"maxProperties": 50,
"additionalProperties": { "$ref": "#/$defs/tagScalar" }
"additionalProperties": {
"$ref": "#/$defs/tagScalar"
}
}
]
}
}
}
}
+6 -6
@@ -27,7 +27,7 @@ from src.adsb.compress_adsb_to_aircraft_data import compress_multi_icao_df, COLU
DEFAULT_CHUNK_DIR = os.path.join(OUTPUT_DIR, "adsb_chunks")
FINAL_OUTPUT_DIR = "./data/planequery_aircraft"
FINAL_OUTPUT_DIR = "./data/openairframes"
os.makedirs(FINAL_OUTPUT_DIR, exist_ok=True)
@@ -85,12 +85,12 @@ def combine_compressed_chunks(compressed_dfs: list[pl.DataFrame]) -> pl.DataFram
def download_and_merge_base_release(compressed_df: pl.DataFrame) -> pl.DataFrame:
"""Download base release and merge with new data."""
from src.get_latest_planequery_aircraft_release import download_latest_aircraft_adsb_csv
from src.get_latest_release import download_latest_aircraft_adsb_csv
print("Downloading base ADS-B release...")
try:
base_path = download_latest_aircraft_adsb_csv(
output_dir="./data/planequery_aircraft_base"
output_dir="./data/openairframes_base"
)
print(f"Download returned: {base_path}")
@@ -176,10 +176,10 @@ def main():
if args.start_date and args.end_date:
# Historical mode
output_id = f"{args.start_date}_{args.end_date}"
output_filename = f"planequery_aircraft_adsb_{args.start_date}_{args.end_date}.csv"
output_filename = f"openairframes_adsb_{args.start_date}_{args.end_date}.csv"
print(f"Combining chunks for date range: {args.start_date} to {args.end_date}")
else:
# Daily mode
# Daily mode - use same date for start and end
if args.date:
target_day = datetime.strptime(args.date, "%Y-%m-%d")
else:
@@ -187,7 +187,7 @@ def main():
date_str = target_day.strftime("%Y-%m-%d")
output_id = date_str
output_filename = f"planequery_aircraft_adsb_{date_str}.csv"
output_filename = f"openairframes_adsb_{date_str}_{date_str}.csv"
print(f"Combining chunks for {date_str}")
chunks_dir = args.chunks_dir
+3 -3
@@ -253,7 +253,7 @@ def concat_compressed_dfs(df_base, df_new):
def get_latest_aircraft_adsb_csv_df():
"""Download and load the latest ADS-B CSV from GitHub releases."""
from get_latest_planequery_aircraft_release import download_latest_aircraft_adsb_csv
from get_latest_release import download_latest_aircraft_adsb_csv
import re
csv_path = download_latest_aircraft_adsb_csv()
@@ -264,8 +264,8 @@ def get_latest_aircraft_adsb_csv_df():
if df[col].dtype == pl.Utf8:
df = df.with_columns(pl.col(col).fill_null(""))
# Extract start date from filename pattern: planequery_aircraft_adsb_{start_date}_{end_date}.csv
match = re.search(r"planequery_aircraft_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path))
# Extract start date from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv
match = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path))
if not match:
raise ValueError(f"Could not extract date from filename: {csv_path.name}")
+13 -5
@@ -82,7 +82,8 @@ def fetch_releases(version_date: str) -> list:
if version_date == "v2024.12.31":
year = "2025"
BASE_URL = f"https://api.github.com/repos/adsblol/globe_history_{year}/releases"
PATTERN = f"{version_date}-planes-readsb-prod-0"
# Match exact release name, exclude tmp releases
PATTERN = rf"^{re.escape(version_date)}-planes-readsb-prod-\d+$"
releases = []
page = 1
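
A rough illustration of how the anchored pattern behaves, assuming it is applied to release names with re.search(); the release names below are invented in the style of the adsblol/globe_history_* releases.

# Illustration only: the anchored PATTERN keeps exact prod release names
# and rejects tmp/staging variants. Names below are made up.
import re

version_date = "v2024.12.31"
PATTERN = rf"^{re.escape(version_date)}-planes-readsb-prod-\d+$"

candidates = [
    "v2024.12.31-planes-readsb-prod-0",      # matches
    "v2024.12.31-planes-readsb-prod-12",     # matches (any chunk index)
    "v2024.12.31-planes-readsb-prod-0.tmp",  # rejected: trailing suffix
    "v2024.12.31-planes-readsb-staging-0",   # rejected: different job name
]
for name in candidates:
    print(name, "->", bool(re.search(PATTERN, name)))
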
@@ -187,19 +188,23 @@ def extract_split_archive(file_paths: list, extract_dir: str) -> bool:
cat_proc = subprocess.Popen(
["cat"] + file_paths,
stdout=subprocess.PIPE,
stderr=subprocess.DEVNULL
stderr=subprocess.PIPE
)
tar_cmd = ["tar", "xf", "-", "-C", extract_dir, "--strip-components=1"]
subprocess.run(
result = subprocess.run(
tar_cmd,
stdin=cat_proc.stdout,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
check=True
)
cat_proc.stdout.close()
cat_stderr = cat_proc.stderr.read().decode() if cat_proc.stderr else ""
cat_proc.wait()
if cat_stderr:
print(f"cat stderr: {cat_stderr}")
print(f"Successfully extracted archive to {extract_dir}")
# Delete tar files immediately after extraction
@@ -217,7 +222,10 @@ def extract_split_archive(file_paths: list, extract_dir: str) -> bool:
return True
except subprocess.CalledProcessError as e:
stderr_output = e.stderr.decode() if e.stderr else ""
print(f"Failed to extract split archive: {e}")
if stderr_output:
print(f"tar stderr: {stderr_output}")
return False
+2 -2
@@ -76,8 +76,8 @@ def main():
print(f"After dedup: {df_accumulated.height} rows")
# Write and upload final result
output_name = f"planequery_aircraft_adsb_{global_start}_{global_end}.csv.gz"
csv_output = Path(f"/tmp/planequery_aircraft_adsb_{global_start}_{global_end}.csv")
output_name = f"openairframes_adsb_{global_start}_{global_end}.csv.gz"
csv_output = Path(f"/tmp/openairframes_adsb_{global_start}_{global_end}.csv")
gz_output = Path(f"/tmp/{output_name}")
df_accumulated.write_csv(csv_output)
+28 -15
@@ -21,13 +21,13 @@ import urllib.request
import urllib.error
from datetime import datetime, timezone
from .schema import extract_json_from_issue_body, extract_contributor_name_from_issue_body, parse_and_validate, get_latest_schema_version, load_schema
from .schema import extract_json_from_issue_body, extract_contributor_name_from_issue_body, parse_and_validate, load_schema, SCHEMAS_DIR
from .contributor import (
generate_contributor_uuid,
generate_submission_filename,
compute_content_hash,
)
from .update_schema import generate_new_schema, check_for_new_tags, get_existing_tag_definitions
from .update_schema import generate_updated_schema, check_for_new_tags, get_existing_tag_definitions
from .read_community_data import build_tag_type_registry
@@ -100,14 +100,30 @@ def create_branch(branch_name: str, sha: str) -> None:
raise
def get_file_sha(path: str, branch: str) -> str | None:
"""Get the SHA of an existing file, or None if it doesn't exist."""
try:
response = github_api_request("GET", f"/contents/{path}?ref={branch}")
return response.get("sha")
except Exception:
return None
def create_or_update_file(path: str, content: str, message: str, branch: str) -> None:
"""Create or update a file in the repository."""
content_b64 = base64.b64encode(content.encode()).decode()
github_api_request("PUT", f"/contents/{path}", {
payload = {
"message": message,
"content": content_b64,
"branch": branch,
})
}
# If file exists, we need to include its SHA to update it
sha = get_file_sha(path, branch)
if sha:
payload["sha"] = sha
github_api_request("PUT", f"/contents/{path}", payload)
def create_pull_request(title: str, head: str, base: str, body: str) -> dict:
@@ -190,17 +206,15 @@ def process_submission(
commit_message = f"Add community submission from @{author_username} (closes #{issue_number})"
create_or_update_file(file_path, content_json, commit_message, branch_name)
# Update schema with any new tags (creates new version if needed)
# Update schema with any new tags (modifies v1 in place)
schema_updated = False
new_version = None
new_tags = []
try:
# Build tag registry from new submissions
tag_registry = build_tag_type_registry(submissions)
# Get current schema and merge existing tags
current_version = get_latest_schema_version()
current_schema = load_schema(current_version)
current_schema = load_schema()
existing_tags = get_existing_tag_definitions(current_schema)
# Merge existing tags into registry
@@ -213,15 +227,14 @@ def process_submission(
new_tags = check_for_new_tags(tag_registry, current_schema)
if new_tags:
# Generate new schema version
new_version = current_version + 1
new_schema = generate_new_schema(current_schema, tag_registry, new_version)
schema_json = json.dumps(new_schema, indent=2) + "\n"
# Generate updated schema
updated_schema = generate_updated_schema(current_schema, tag_registry)
schema_json = json.dumps(updated_schema, indent=2) + "\n"
create_or_update_file(
f"schemas/community_submission.v{new_version}.schema.json",
"schemas/community_submission.v1.schema.json",
schema_json,
f"Create schema v{new_version} with new tags: {', '.join(new_tags)}",
f"Update schema with new tags: {', '.join(new_tags)}",
branch_name
)
schema_updated = True
@@ -231,7 +244,7 @@ def process_submission(
# Create PR
schema_note = ""
if schema_updated:
schema_note = f"\n**Schema Updated:** Created v{new_version} with new tags: `{', '.join(new_tags)}`\n"
schema_note = f"\n**Schema Updated:** Added new tags: `{', '.join(new_tags)}`\n"
pr_body = f"""## Community Submission
@@ -17,7 +17,7 @@ import pandas as pd
COMMUNITY_DIR = Path(__file__).parent.parent.parent / "community"
OUT_ROOT = Path("data/planequery_aircraft")
OUT_ROOT = Path("data/openairframes")
def read_all_submissions(community_dir: Path) -> list[dict]:
@@ -47,7 +47,7 @@ def submissions_to_dataframe(submissions: list[dict]) -> pd.DataFrame:
- creation_timestamp (first)
- transponder_code_hex
- registration_number
- planequery_airframe_id
- openairframes_id
- contributor_name
- [other columns alphabetically]
- contributor_uuid (last)
@@ -62,7 +62,7 @@ def submissions_to_dataframe(submissions: list[dict]) -> pd.DataFrame:
"creation_timestamp",
"transponder_code_hex",
"registration_number",
"planequery_airframe_id",
"openairframes_id",
"contributor_name",
"contributor_uuid",
]
@@ -78,7 +78,7 @@ def submissions_to_dataframe(submissions: list[dict]) -> pd.DataFrame:
"creation_timestamp",
"transponder_code_hex",
"registration_number",
"planequery_airframe_id",
"openairframes_id",
"contributor_name",
]
last_cols = ["contributor_uuid"]
@@ -108,7 +108,7 @@ def main():
"creation_timestamp",
"transponder_code_hex",
"registration_number",
"planequery_airframe_id",
"openairframes_id",
"contributor_name",
"tags",
"contributor_uuid",
@@ -127,7 +127,7 @@ def main():
# Output
OUT_ROOT.mkdir(parents=True, exist_ok=True)
output_file = OUT_ROOT / f"planequery_aircraft_community_{start_date_str}_{date_str}.csv"
output_file = OUT_ROOT / f"openairframes_community_{start_date_str}_{date_str}.csv"
df.to_csv(output_file, index=False)
+2 -2
@@ -112,8 +112,8 @@ def group_by_identifier(submissions: list[dict]) -> dict[str, list[dict]]:
key = f"reg:{submission['registration_number']}"
elif "transponder_code_hex" in submission:
key = f"icao:{submission['transponder_code_hex']}"
elif "planequery_airframe_id" in submission:
key = f"id:{submission['planequery_airframe_id']}"
elif "openairframes_id" in submission:
key = f"id:{submission['openairframes_id']}"
else:
key = "_unknown"
+14 -16
@@ -1,8 +1,8 @@
#!/usr/bin/env python3
"""
Regenerate schema for a PR branch after main has been merged in.
This script looks at the submission files in this branch and generates
an updated schema version if new tags were introduced.
This script looks at the submission files in this branch and updates
the schema if new tags were introduced.
Usage: python -m src.contributions.regenerate_pr_schema
"""
@@ -18,15 +18,14 @@ from src.contributions.read_community_data import read_all_submissions, build_ta
from src.contributions.update_schema import (
get_existing_tag_definitions,
check_for_new_tags,
generate_new_schema,
generate_updated_schema,
)
from src.contributions.schema import get_latest_schema_version, load_schema, SCHEMAS_DIR
from src.contributions.schema import load_schema, SCHEMAS_DIR
def main():
"""Main entry point."""
# Get current schema version and load it
current_version = get_latest_schema_version()
# Load current schema
current_schema = load_schema()
# Get existing tag definitions from schema
@@ -46,20 +45,19 @@ def main():
new_tags = check_for_new_tags(tag_registry, current_schema)
if new_tags:
# Generate new schema version
new_version = current_version + 1
print(f"Found new tags: {new_tags}")
print(f"Generating schema v{new_version}")
print("Updating schema...")
# Generate new schema with updated tag definitions
new_schema = generate_new_schema(current_schema, tag_registry, new_version)
# Generate updated schema
updated_schema = generate_updated_schema(current_schema, tag_registry)
# Write new schema version
new_schema_path = SCHEMAS_DIR / f"community_submission.v{new_version}.schema.json"
with open(new_schema_path, 'w') as f:
json.dump(new_schema, f, indent=2)
# Write updated schema (in place)
schema_path = SCHEMAS_DIR / "community_submission.v1.schema.json"
with open(schema_path, 'w') as f:
json.dump(updated_schema, f, indent=2)
f.write("\n")
print(f"Created {new_schema_path}")
print(f"Updated {schema_path}")
else:
print("No new tags found, schema is up to date")
+1 -1
@@ -111,7 +111,7 @@ def download_github_attachment(url: str) -> str | None:
import urllib.error
try:
req = urllib.request.Request(url, headers={"User-Agent": "PlaneQuery-Bot"})
req = urllib.request.Request(url, headers={"User-Agent": "OpenAirframes-Bot"})
with urllib.request.urlopen(req, timeout=30) as response:
return response.read().decode("utf-8")
except (urllib.error.URLError, urllib.error.HTTPError, UnicodeDecodeError) as e:
+31 -46
@@ -40,40 +40,27 @@ def type_name_to_json_schema(type_name: str) -> dict:
return type_map.get(type_name, {"$ref": "#/$defs/tagValue"})
def generate_new_schema(base_schema: dict, tag_registry: dict[str, str], new_version: int) -> dict:
def generate_updated_schema(base_schema: dict, tag_registry: dict[str, str]) -> dict:
"""
Generate a new schema version with explicit tag definitions.
Generate an updated schema with explicit tag definitions.
Args:
base_schema: The current schema to base the new one on
base_schema: The current schema to update
tag_registry: Dict mapping tag name to type name
new_version: The new version number
Returns:
Complete new schema dict
Updated schema dict
"""
schema = json.loads(json.dumps(base_schema)) # Deep copy
# Update title with new version
schema["title"] = f"PlaneQuery Aircraft Community Submission (v{new_version})"
# Build tag properties with explicit types
tag_properties = {}
for tag_name, type_name in sorted(tag_registry.items()):
tag_properties[tag_name] = type_name_to_json_schema(type_name)
# Update tags definition
schema["properties"]["tags"] = {
"type": "object",
"description": "Community-defined tags. New tags can be added, but must use consistent types.",
"propertyNames": {
"type": "string",
"pattern": "^[a-z][a-z0-9_]{0,63}$"
},
"properties": tag_properties,
# Still allow additional properties for new tags
"additionalProperties": {"$ref": "#/$defs/tagValue"}
}
# Only add/update the properties key within tags, preserve everything else
if "properties" in schema and "tags" in schema["properties"]:
schema["properties"]["tags"]["properties"] = tag_properties
return schema
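
A quick sketch of how generate_updated_schema might be exercised, assuming the import paths used elsewhere in this diff and a made-up tag registry:

# Sketch only: load_schema() and generate_updated_schema() come from this diff;
# the tag names and their types below are hypothetical.
import json

from src.contributions.schema import load_schema
from src.contributions.update_schema import generate_updated_schema

current_schema = load_schema()  # the in-place v1 schema
tag_registry = {
    "engine_count": "integer",  # hypothetical community tags
    "home_base": "string",
}

updated = generate_updated_schema(current_schema, tag_registry)
print(json.dumps(updated["properties"]["tags"]["properties"], indent=2))
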
@@ -89,57 +76,55 @@ def check_for_new_tags(tag_registry: dict[str, str], current_schema: dict) -> li
return [tag for tag in tag_registry if tag not in existing_tags]
def create_new_schema_version(
def update_schema_file(
tag_registry: dict[str, str],
check_only: bool = False
) -> tuple[int | None, list[str]]:
) -> tuple[bool, list[str]]:
"""
Create a new schema version if there are new tags.
Update the v1 schema file with new tag definitions.
Args:
tag_registry: Dict mapping tag name to type name
check_only: If True, only check if update is needed without writing
Returns:
Tuple of (new_version or None if no update, list_of_new_tags)
Tuple of (was_updated, list_of_new_tags)
"""
current_version = get_latest_schema_version()
current_schema = load_schema(current_version)
current_schema = load_schema()
# Find new tags
new_tags = check_for_new_tags(tag_registry, current_schema)
if not new_tags:
return None, []
return False, []
if check_only:
return current_version + 1, new_tags
return True, new_tags
# Generate and write new schema
new_version = current_version + 1
new_schema = generate_new_schema(current_schema, tag_registry, new_version)
new_schema_path = get_schema_path(new_version)
# Generate and write updated schema (in place)
updated_schema = generate_updated_schema(current_schema, tag_registry)
schema_path = get_schema_path()
with open(new_schema_path, "w") as f:
json.dump(new_schema, f, indent=2)
with open(schema_path, "w") as f:
json.dump(updated_schema, f, indent=2)
f.write("\n")
return new_version, new_tags
return True, new_tags
def update_schema_from_submissions(check_only: bool = False) -> tuple[int | None, list[str]]:
def update_schema_from_submissions(check_only: bool = False) -> tuple[bool, list[str]]:
"""
Read all submissions and create a new schema version if needed.
Read all submissions and update the schema if needed.
Args:
check_only: If True, only check if update is needed without writing
Returns:
Tuple of (new_version or None if no update, list_of_new_tags)
Tuple of (was_updated, list_of_new_tags)
"""
submissions = read_all_submissions()
tag_registry = build_tag_type_registry(submissions)
return create_new_schema_version(tag_registry, check_only)
return update_schema_file(tag_registry, check_only)
def main():
@@ -148,21 +133,21 @@ def main():
args = parser.parse_args()
new_version, new_tags = update_schema_from_submissions(check_only=args.check)
was_updated, new_tags = update_schema_from_submissions(check_only=args.check)
if args.check:
if new_version:
print(f"Schema update needed -> v{new_version}. New tags: {', '.join(new_tags)}")
if was_updated:
print(f"Schema update needed. New tags: {', '.join(new_tags)}")
sys.exit(1)
else:
print(f"Schema is up to date (v{get_latest_schema_version()})")
print("Schema is up to date")
sys.exit(0)
else:
if new_version:
print(f"Created {get_schema_path(new_version)}")
if was_updated:
print(f"Updated {get_schema_path()}")
print(f"Added tags: {', '.join(new_tags)}")
else:
print(f"No update needed (v{get_latest_schema_version()})")
print("No update needed")
if __name__ == "__main__":
@@ -74,10 +74,10 @@ if __name__ == '__main__':
)
# Save the result
OUT_ROOT = Path("data/planequery_aircraft")
OUT_ROOT = Path("data/openairframes")
OUT_ROOT.mkdir(parents=True, exist_ok=True)
output_file = OUT_ROOT / f"planequery_aircraft_adsb_{start_date_str}_{date_str}.csv"
output_file = OUT_ROOT / f"openairframes_adsb_{start_date_str}_{date_str}.csv"
df_combined.write_csv(output_file)
print(f"Saved: {output_file}")
@@ -22,12 +22,19 @@ if not zip_path.exists():
body = r.read()
zip_path.write_bytes(body)
OUT_ROOT = Path("data/planequery_aircraft")
OUT_ROOT = Path("data/openairframes")
OUT_ROOT.mkdir(parents=True, exist_ok=True)
from derive_from_faa_master_txt import convert_faa_master_txt_to_df, concat_faa_historical_df
from get_latest_planequery_aircraft_release import get_latest_aircraft_faa_csv_df
from get_latest_release import get_latest_aircraft_faa_csv_df
df_new = convert_faa_master_txt_to_df(zip_path, date_str)
df_base, start_date_str = get_latest_aircraft_faa_csv_df()
df_base = concat_faa_historical_df(df_base, df_new)
assert df_base['download_date'].is_monotonic_increasing, "download_date is not monotonic increasing"
df_base.to_csv(OUT_ROOT / f"planequery_aircraft_faa_{start_date_str}_{date_str}.csv", index=False)
try:
df_base, start_date_str = get_latest_aircraft_faa_csv_df()
df_base = concat_faa_historical_df(df_base, df_new)
assert df_base['download_date'].is_monotonic_increasing, "download_date is not monotonic increasing"
except Exception as e:
print(f"No existing FAA release found, using only new data: {e}")
df_base = df_new
start_date_str = date_str
df_base.to_csv(OUT_ROOT / f"openairframes_faa_{start_date_str}_{date_str}.csv", index=False)
+5 -5
@@ -29,8 +29,8 @@ def convert_faa_master_txt_to_df(zip_path: Path, date: str):
certification = pd.json_normalize(df["certification"].where(df["certification"].notna(), {})).add_prefix("certificate_")
df = df.drop(columns="certification").join(certification)
# Create planequery_airframe_id
df["planequery_airframe_id"] = (
# Create openairframes_id
df["openairframes_id"] = (
normalize(df["aircraft_manufacturer"])
+ "|"
+ normalize(df["aircraft_model"])
@@ -38,11 +38,11 @@ def convert_faa_master_txt_to_df(zip_path: Path, date: str):
+ normalize(df["serial_number"])
)
# Move planequery_airframe_id to come after registration_number
# Move openairframes_id to come after registration_number
cols = df.columns.tolist()
cols.remove("planequery_airframe_id")
cols.remove("openairframes_id")
reg_idx = cols.index("registration_number")
cols.insert(reg_idx + 1, "planequery_airframe_id")
cols.insert(reg_idx + 1, "openairframes_id")
df = df[cols]
# Convert all NaN to empty strings
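
As a purely hypothetical illustration of the identifier built above (normalize() is not shown in this diff, so a lowercase/strip behaviour is assumed here):

# Hypothetical example of an openairframes_id; normalize() is assumed to
# lowercase and strip whitespace, which may differ from the real helper.
def normalize(value: str) -> str:
    return value.strip().lower().replace(" ", "-")

aircraft_manufacturer = "CESSNA"   # made-up FAA master-file values
aircraft_model = "172S"
serial_number = "172S12345"

openairframes_id = "|".join(
    normalize(v) for v in (aircraft_manufacturer, aircraft_model, serial_number)
)
print(openairframes_id)  # -> cessna|172s|172s12345
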
@@ -9,7 +9,7 @@ import urllib.error
import json
REPO = "PlaneQuery/planequery-aircraft"
REPO = "PlaneQuery/openairframes"
LATEST_RELEASE_URL = f"https://api.github.com/repos/{REPO}/releases/latest"
@@ -31,7 +31,7 @@ def get_latest_release_assets(repo: str = REPO, github_token: Optional[str] = No
url = f"https://api.github.com/repos/{repo}/releases/latest"
headers = {
"Accept": "application/vnd.github+json",
"User-Agent": "planequery-aircraft-downloader/1.0",
"User-Agent": "openairframes-downloader/1.0",
}
if github_token:
headers["Authorization"] = f"Bearer {github_token}"
@@ -80,7 +80,7 @@ def download_asset(asset: ReleaseAsset, out_path: Path, github_token: Optional[s
out_path.parent.mkdir(parents=True, exist_ok=True)
headers = {
"User-Agent": "planequery-aircraft-downloader/1.0",
"User-Agent": "openairframes-downloader/1.0",
"Accept": "application/octet-stream",
}
if github_token:
@@ -109,7 +109,7 @@ def download_latest_aircraft_csv(
repo: str = REPO,
) -> Path:
"""
Download the latest planequery_aircraft_faa_*.csv file from the latest GitHub release.
Download the latest openairframes_faa_*.csv file from the latest GitHub release.
Args:
output_dir: Directory to save the downloaded file (default: "downloads")
@@ -121,10 +121,10 @@ def download_latest_aircraft_csv(
"""
assets = get_latest_release_assets(repo, github_token=github_token)
try:
asset = pick_asset(assets, name_regex=r"^planequery_aircraft_faa_.*\.csv$")
asset = pick_asset(assets, name_regex=r"^openairframes_faa_.*\.csv$")
except FileNotFoundError:
# Fallback to old naming pattern
asset = pick_asset(assets, name_regex=r"^planequery_aircraft_\d{4}-\d{2}-\d{2}_.*\.csv$")
asset = pick_asset(assets, name_regex=r"^openairframes_\d{4}-\d{2}-\d{2}_.*\.csv$")
saved_to = download_asset(asset, output_dir / asset.name, github_token=github_token)
print(f"Downloaded: {asset.name} ({asset.size} bytes) -> {saved_to}")
return saved_to
@@ -136,11 +136,11 @@ def get_latest_aircraft_faa_csv_df():
'unique_regulatory_id': str,
'registrant_county': str})
df = df.fillna("")
# Extract start date from filename pattern: planequery_aircraft_faa_{start_date}_{end_date}.csv
match = re.search(r"planequery_aircraft_faa_(\d{4}-\d{2}-\d{2})_", str(csv_path))
# Extract start date from filename pattern: openairframes_faa_{start_date}_{end_date}.csv
match = re.search(r"openairframes_faa_(\d{4}-\d{2}-\d{2})_", str(csv_path))
if not match:
# Fallback to old naming pattern: planequery_aircraft_{start_date}_{end_date}.csv
match = re.search(r"planequery_aircraft_(\d{4}-\d{2}-\d{2})_", str(csv_path))
# Fallback to old naming pattern: openairframes_{start_date}_{end_date}.csv
match = re.search(r"openairframes_(\d{4}-\d{2}-\d{2})_", str(csv_path))
if not match:
raise ValueError(f"Could not extract date from filename: {csv_path.name}")
@@ -154,7 +154,7 @@ def download_latest_aircraft_adsb_csv(
repo: str = REPO,
) -> Path:
"""
Download the latest planequery_aircraft_adsb_*.csv file from the latest GitHub release.
Download the latest openairframes_adsb_*.csv file from the latest GitHub release.
Args:
output_dir: Directory to save the downloaded file (default: "downloads")
@@ -165,7 +165,7 @@ def download_latest_aircraft_adsb_csv(
Path to the downloaded file
"""
assets = get_latest_release_assets(repo, github_token=github_token)
asset = pick_asset(assets, name_regex=r"^planequery_aircraft_adsb_.*\.csv$")
asset = pick_asset(assets, name_regex=r"^openairframes_adsb_.*\.csv$")
saved_to = download_asset(asset, output_dir / asset.name, github_token=github_token)
print(f"Downloaded: {asset.name} ({asset.size} bytes) -> {saved_to}")
return saved_to
@@ -176,8 +176,8 @@ def get_latest_aircraft_adsb_csv_df():
import pandas as pd
df = pd.read_csv(csv_path)
df = df.fillna("")
# Extract start date from filename pattern: planequery_aircraft_adsb_{start_date}_{end_date}.csv
match = re.search(r"planequery_aircraft_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path))
# Extract start date from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv
match = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path))
if not match:
raise ValueError(f"Could not extract date from filename: {csv_path.name}")