Compare commits

...

35 Commits

Author SHA1 Message Date
ggman12 50267f3c57 make faa work with no new data 2026-02-12 17:26:48 -05:00
ggman12 dd323f6e55 delete old files 2026-02-12 17:25:50 -05:00
ggman12 0e8b21daf9 rename from planequery to openairframes 2026-02-12 17:24:08 -05:00
ggman12 3960e6936c use start_date_end_date for adsb naming 2026-02-12 17:13:06 -05:00
ggman12 48623ef79e delete existing release 2026-02-12 17:12:09 -05:00
ggman12 5affe8937c rename to openairframes 2026-02-12 17:09:07 -05:00
ggman12 d0254146f3 update release to fix not grabbing FAA file 2026-02-12 16:42:47 -05:00
ggman12 1699ad6d8a rename file 2026-02-12 16:12:03 -05:00
ggman12 2a6892c347 fix download 2026-02-12 16:08:08 -05:00
ggman12 47ccecb9ba set fail-fast to true 2026-02-12 16:07:42 -05:00
ggman12 2826dfd450 remove notebook 2026-02-12 16:07:28 -05:00
ggman12 fecf9ff0ea format properly 2026-02-12 16:01:14 -05:00
ggman12 7e0a396fc7 only modify key parts of schemas/community_submission.v1.schema.json schema. Lowest diffs 2026-02-12 15:55:44 -05:00
ggman12 b0503bb3b2 fix: should update schema now 2026-02-12 15:46:11 -05:00
ggman12 0b89138daf modify existing json schema instead of creating a new file every time 2026-02-12 15:40:01 -05:00
ggman12 4b756cdaef fix syntax error 2026-02-12 15:32:37 -05:00
ggman12 9acffe1e56 handle multiple PRs with schema changes 2026-02-12 15:31:53 -05:00
ggman12 1694fe0b46 allow fileupload in submission 2026-02-12 15:26:45 -05:00
ggman12 c6d9e59d01 update template 2026-02-12 13:29:45 -05:00
ggman12 dd6cd7b6fd update schema with optional start_date and end_date scope 2026-02-12 13:28:43 -05:00
ggman12 f543b671f8 updating schema 2026-02-12 13:22:56 -05:00
ggman12 efb4cbb953 update example 2026-02-12 13:22:43 -05:00
ggman12 5578133a99 update schema to be uppercase only 2026-02-12 12:36:50 -05:00
ggman12 eace7d5a63 update folder 2026-02-12 12:34:27 -05:00
ggman12 82f47b662c make blank username work 2026-02-12 12:32:41 -05:00
ggman12 787796c3ab update approve_submission 2026-02-12 12:26:54 -05:00
ggman12 61aae586ee fix approve 2026-02-12 12:18:28 -05:00
ggman12 5abfa6b226 update submission validation 2026-02-12 12:15:04 -05:00
ggman12 a743b74ae5 Merge branch 'develop' 2026-02-12 12:10:24 -05:00
ggman12 53a020ab73 add jsonschema to requirements.txt 2026-02-12 12:09:03 -05:00
ggman12 2de41c9883 update historical to check tar and fail fast if any maps fail 2026-02-12 12:01:13 -05:00
ggman12 bccc634158 remove existing release 2026-02-12 11:50:45 -05:00
ggman12 43b07942b0 add needed permissions 2026-02-12 11:42:49 -05:00
ggman12 2c9e994a12 add debug for FAA 2026-02-12 11:06:38 -05:00
ggman12 99b680476a delete parquet chunk after load to not use so much space for big historical run 2026-02-12 10:52:42 -05:00
27 changed files with 1037 additions and 780 deletions
@@ -13,29 +13,42 @@ body:
**Rules (enforced on review/automation):** **Rules (enforced on review/automation):**
- Each object must include **at least one** of: - Each object must include **at least one** of:
- `registration_number` - `registration_number`
- `transponder_code_hex` (6 hex chars) - `transponder_code_hex` (6 uppercase hex chars, e.g., `ABC123`)
- `planequery_airframe_id` - `openairframes_id`
- Your contributor name (entered below) will be applied to all objects. - Your contributor name (entered below) will be applied to all objects.
- `contributor_uuid` is derived from your GitHub account automatically. - `contributor_uuid` is derived from your GitHub account automatically.
- `creation_timestamp` is created by the system (you may omit it). - `creation_timestamp` is created by the system (you may omit it).
**Optional date scoping:**
- `start_date` - When the tags become valid (ISO 8601: `YYYY-MM-DD`)
- `end_date` - When the tags stop being valid (ISO 8601: `YYYY-MM-DD`)
**Example: single object** **Example: single object**
```json ```json
{ {
"transponder_code_hex": "a1b2c3" "registration_number": "N12345",
"tags": {"owner": "John Doe"},
"start_date": "2025-01-01"
} }
``` ```
**Example: multiple objects (array)** **Example: multiple objects (array)**
```json ```json
[ [
{ {
"registration_number": "N123AB" "registration_number": "N12345",
}, "tags": {"internet": "starlink"},
{ "start_date": "2025-05-01"
"planequery_airframe_id": "cessna|172s|12345", },
"transponder_code_hex": "0f1234" {
} "registration_number": "N12345",
"tags": {"owner": "John Doe"},
"start_date": "2025-01-01",
"end_date": "2025-07-20"
},
{
"transponder_code_hex": "ABC123",
"tags": {"internet": "viasat", "owner": "John Doe"}
}
] ]
``` ```
@@ -52,9 +65,11 @@ body:
id: submission_json id: submission_json
attributes: attributes:
label: Submission JSON label: Submission JSON
description: Paste either one JSON object or an array of JSON objects. Must be valid JSON. Do not include contributor_name or contributor_uuid in your JSON. description: |
Paste JSON directly, OR drag-and-drop a .json file here.
Must be valid JSON. Do not include contributor_name or contributor_uuid.
placeholder: | placeholder: |
Paste JSON here... Paste JSON here, or drag-and-drop a .json file...
validations: validations:
required: true required: true
@@ -38,9 +38,10 @@ jobs:
env: env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_REPOSITORY: ${{ github.repository }} GITHUB_REPOSITORY: ${{ github.repository }}
ISSUE_BODY: ${{ github.event.issue.body }}
run: | run: |
python -m src.contributions.approve_submission \ python -m src.contributions.approve_submission \
--issue-number ${{ github.event.issue.number }} \ --issue-number ${{ github.event.issue.number }} \
--issue-body "${{ github.event.issue.body }}" \ --issue-body "$ISSUE_BODY" \
--author "${{ steps.author.outputs.username }}" \ --author "${{ steps.author.outputs.username }}" \
--author-id ${{ steps.author.outputs.user_id }} --author-id ${{ steps.author.outputs.user_id }}
+33 -12
@@ -48,7 +48,7 @@ jobs:
matrix: matrix:
chunk: ${{ fromJson(needs.generate-matrix.outputs.chunks) }} chunk: ${{ fromJson(needs.generate-matrix.outputs.chunks) }}
max-parallel: 3 max-parallel: 3
fail-fast: false fail-fast: true
steps: steps:
- name: Checkout - name: Checkout
uses: actions/checkout@v4 uses: actions/checkout@v4
@@ -81,8 +81,22 @@ jobs:
- name: Create tar of extracted data - name: Create tar of extracted data
run: | run: |
cd data/output cd data/output
tar -cf extracted_data.tar *-planes-readsb-prod-0.tar_0 icao_manifest_*.txt 2>/dev/null || echo "Some files may not exist" echo "=== Disk space before tar ==="
ls -lah extracted_data.tar || echo "No tar created" df -h .
echo "=== Files to tar ==="
ls -lah *-planes-readsb-prod-0.tar_0 icao_manifest_*.txt 2>/dev/null || echo "No files found"
# Create tar with explicit error checking
if ls *-planes-readsb-prod-0.tar_0 1>/dev/null 2>&1; then
tar -cvf extracted_data.tar *-planes-readsb-prod-0.tar_0 icao_manifest_*.txt
echo "=== Tar file created ==="
ls -lah extracted_data.tar
# Verify tar integrity
tar -tf extracted_data.tar > /dev/null && echo "Tar integrity check passed" || { echo "Tar integrity check FAILED"; exit 1; }
else
echo "ERROR: No extracted directories found, cannot create tar"
exit 1
fi
- name: Upload extracted data - name: Upload extracted data
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
@@ -97,7 +111,7 @@ jobs:
needs: [generate-matrix, adsb-extract] needs: [generate-matrix, adsb-extract]
runs-on: ubuntu-24.04-arm runs-on: ubuntu-24.04-arm
strategy: strategy:
fail-fast: false fail-fast: true
matrix: matrix:
chunk: ${{ fromJson(needs.generate-matrix.outputs.chunks) }} chunk: ${{ fromJson(needs.generate-matrix.outputs.chunks) }}
icao_chunk: [0, 1, 2, 3] icao_chunk: [0, 1, 2, 3]
@@ -134,7 +148,12 @@ jobs:
run: | run: |
cd data/output cd data/output
if [ -f extracted_data.tar ]; then if [ -f extracted_data.tar ]; then
tar -xf extracted_data.tar echo "=== Tar file info ==="
ls -lah extracted_data.tar
echo "=== Verifying tar integrity ==="
tar -tf extracted_data.tar > /dev/null || { echo "ERROR: Tar file is corrupted"; exit 1; }
echo "=== Extracting ==="
tar -xvf extracted_data.tar
rm extracted_data.tar rm extracted_data.tar
echo "has_data=true" >> "$GITHUB_OUTPUT" echo "has_data=true" >> "$GITHUB_OUTPUT"
echo "=== Contents of data/output ===" echo "=== Contents of data/output ==="
@@ -188,22 +207,24 @@ jobs:
- name: Debug downloaded files - name: Debug downloaded files
run: | run: |
echo "=== Disk space before processing ==="
df -h
echo "=== Listing data/output/adsb_chunks/ ===" echo "=== Listing data/output/adsb_chunks/ ==="
find data/output/adsb_chunks/ -type f 2>/dev/null | head -50 || echo "No files found" find data/output/adsb_chunks/ -type f 2>/dev/null | wc -l
echo "=== Looking for parquet files ===" echo "=== Total parquet size ==="
find . -name "*.parquet" 2>/dev/null | head -20 || echo "No parquet files found" du -sh data/output/adsb_chunks/ || echo "No chunks dir"
- name: Combine chunks to CSV - name: Combine chunks to CSV
env: env:
START_DATE: ${{ needs.generate-matrix.outputs.global_start }} START_DATE: ${{ needs.generate-matrix.outputs.global_start }}
END_DATE: ${{ needs.generate-matrix.outputs.global_end }} END_DATE: ${{ needs.generate-matrix.outputs.global_end }}
run: | run: |
python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks --start-date "$START_DATE" --end-date "$END_DATE" --skip-base python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks --start-date "$START_DATE" --end-date "$END_DATE" --skip-base --stream
ls -lah data/planequery_aircraft/ ls -lah data/openairframes/
- name: Upload final artifact - name: Upload final artifact
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
with: with:
name: planequery_aircraft_adsb-${{ needs.generate-matrix.outputs.global_start }}-${{ needs.generate-matrix.outputs.global_end }} name: openairframes_adsb-${{ needs.generate-matrix.outputs.global_start }}-${{ needs.generate-matrix.outputs.global_end }}
path: data/planequery_aircraft/*.csv path: data/openairframes/*.csv
retention-days: 30 retention-days: 30
@@ -1,4 +1,4 @@
name: planequery-aircraft Daily Release name: OpenAirframes Daily Release
on: on:
schedule: schedule:
@@ -22,7 +22,7 @@ jobs:
await github.rest.actions.createWorkflowDispatch({ await github.rest.actions.createWorkflowDispatch({
owner: context.repo.owner, owner: context.repo.owner,
repo: context.repo.repo, repo: context.repo.repo,
workflow_id: 'planequery-aircraft-daily-release.yaml', workflow_id: 'openairframes-daily-release.yaml',
ref: 'main' ref: 'main'
}); });
@@ -33,7 +33,7 @@ jobs:
await github.rest.actions.createWorkflowDispatch({ await github.rest.actions.createWorkflowDispatch({
owner: context.repo.owner, owner: context.repo.owner,
repo: context.repo.repo, repo: context.repo.repo,
workflow_id: 'planequery-aircraft-daily-release.yaml', workflow_id: 'openairframes-daily-release.yaml',
ref: 'develop' ref: 'develop'
}); });
@@ -58,16 +58,16 @@ jobs:
- name: Run FAA release script - name: Run FAA release script
run: | run: |
python src/create_daily_planequery_aircraft_release.py python src/create_daily_faa_release.py
ls -lah data/faa_releasable ls -lah data/faa_releasable
ls -lah data/planequery_aircraft ls -lah data/openairframes
- name: Upload FAA artifacts - name: Upload FAA artifacts
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
with: with:
name: faa-release name: faa-release
path: | path: |
data/planequery_aircraft/planequery_aircraft_faa_*.csv data/openairframes/openairframes_faa_*.csv
data/faa_releasable/ReleasableAircraft_*.zip data/faa_releasable/ReleasableAircraft_*.zip
retention-days: 1 retention-days: 1
@@ -214,13 +214,13 @@ jobs:
mkdir -p data/output/adsb_chunks mkdir -p data/output/adsb_chunks
ls -lah data/output/adsb_chunks/ || echo "Directory empty or does not exist" ls -lah data/output/adsb_chunks/ || echo "Directory empty or does not exist"
python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks
ls -lah data/planequery_aircraft/ ls -lah data/openairframes/
- name: Upload ADS-B artifacts - name: Upload ADS-B artifacts
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
with: with:
name: adsb-release name: adsb-release
path: data/planequery_aircraft/planequery_aircraft_adsb_*.csv path: data/openairframes/openairframes_adsb_*.csv
retention-days: 1 retention-days: 1
build-community: build-community:
@@ -245,13 +245,13 @@ jobs:
- name: Run Community release script - name: Run Community release script
run: | run: |
python -m src.contributions.create_daily_community_release python -m src.contributions.create_daily_community_release
ls -lah data/planequery_aircraft ls -lah data/openairframes
- name: Upload Community artifacts - name: Upload Community artifacts
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
with: with:
name: community-release name: community-release
path: data/planequery_aircraft/planequery_aircraft_community_*.csv path: data/openairframes/openairframes_community_*.csv
retention-days: 1 retention-days: 1
create-release: create-release:
@@ -277,6 +277,15 @@ jobs:
name: community-release name: community-release
path: artifacts/community path: artifacts/community
- name: Debug artifact structure
run: |
echo "=== FAA artifacts ==="
find artifacts/faa -type f 2>/dev/null || echo "No files found in artifacts/faa"
echo "=== ADS-B artifacts ==="
find artifacts/adsb -type f 2>/dev/null || echo "No files found in artifacts/adsb"
echo "=== Community artifacts ==="
find artifacts/community -type f 2>/dev/null || echo "No files found in artifacts/community"
- name: Prepare release metadata - name: Prepare release metadata
id: meta id: meta
run: | run: |
@@ -288,16 +297,16 @@ jobs:
elif [ "$BRANCH_NAME" = "develop" ]; then elif [ "$BRANCH_NAME" = "develop" ]; then
BRANCH_SUFFIX="-develop" BRANCH_SUFFIX="-develop"
fi fi
TAG="planequery-aircraft-${DATE}${BRANCH_SUFFIX}" TAG="openairframes-${DATE}${BRANCH_SUFFIX}"
# Find files from artifacts # Find files from artifacts using find (handles nested structures)
CSV_FILE_FAA=$(ls artifacts/faa/data/planequery_aircraft/planequery_aircraft_faa_*.csv | head -1) CSV_FILE_FAA=$(find artifacts/faa -name "openairframes_faa_*.csv" | head -1)
CSV_BASENAME_FAA=$(basename "$CSV_FILE_FAA") CSV_BASENAME_FAA=$(basename "$CSV_FILE_FAA")
CSV_FILE_ADSB=$(ls artifacts/adsb/planequery_aircraft_adsb_*.csv | head -1) CSV_FILE_ADSB=$(find artifacts/adsb -name "openairframes_adsb_*.csv" | head -1)
CSV_BASENAME_ADSB=$(basename "$CSV_FILE_ADSB") CSV_BASENAME_ADSB=$(basename "$CSV_FILE_ADSB")
CSV_FILE_COMMUNITY=$(ls artifacts/community/planequery_aircraft_community_*.csv 2>/dev/null | head -1 || echo "") CSV_FILE_COMMUNITY=$(find artifacts/community -name "openairframes_community_*.csv" 2>/dev/null | head -1 || echo "")
CSV_BASENAME_COMMUNITY=$(basename "$CSV_FILE_COMMUNITY" 2>/dev/null || echo "") CSV_BASENAME_COMMUNITY=$(basename "$CSV_FILE_COMMUNITY" 2>/dev/null || echo "")
ZIP_FILE=$(ls artifacts/faa/data/faa_releasable/ReleasableAircraft_*.zip | head -1) ZIP_FILE=$(find artifacts/faa -name "ReleasableAircraft_*.zip" | head -1)
ZIP_BASENAME=$(basename "$ZIP_FILE") ZIP_BASENAME=$(basename "$ZIP_FILE")
echo "date=$DATE" >> "$GITHUB_OUTPUT" echo "date=$DATE" >> "$GITHUB_OUTPUT"
@@ -310,7 +319,17 @@ jobs:
echo "csv_basename_community=$CSV_BASENAME_COMMUNITY" >> "$GITHUB_OUTPUT" echo "csv_basename_community=$CSV_BASENAME_COMMUNITY" >> "$GITHUB_OUTPUT"
echo "zip_file=$ZIP_FILE" >> "$GITHUB_OUTPUT" echo "zip_file=$ZIP_FILE" >> "$GITHUB_OUTPUT"
echo "zip_basename=$ZIP_BASENAME" >> "$GITHUB_OUTPUT" echo "zip_basename=$ZIP_BASENAME" >> "$GITHUB_OUTPUT"
echo "name=planequery-aircraft snapshot ($DATE)${BRANCH_SUFFIX}" >> "$GITHUB_OUTPUT" echo "name=OpenAirframes snapshot ($DATE)${BRANCH_SUFFIX}" >> "$GITHUB_OUTPUT"
- name: Checkout for gh CLI
uses: actions/checkout@v4
- name: Delete existing release if exists
run: |
echo "Attempting to delete release: ${{ steps.meta.outputs.tag }}"
gh release delete "${{ steps.meta.outputs.tag }}" --yes --cleanup-tag || echo "No existing release to delete"
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Create GitHub Release and upload assets - name: Create GitHub Release and upload assets
uses: softprops/action-gh-release@v2 uses: softprops/action-gh-release@v2
@@ -0,0 +1,171 @@
name: Process Historical FAA Data
on:
workflow_dispatch: # Manual trigger
jobs:
generate-matrix:
runs-on: ubuntu-latest
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
steps:
- name: Generate date ranges
id: set-matrix
run: |
python3 << 'EOF'
import json
from datetime import datetime, timedelta
start = datetime(2023, 8, 16)
end = datetime(2026, 1, 1)
ranges = []
current = start
# Process in 4-day chunks
while current < end:
chunk_end = current + timedelta(days=4)
# Don't go past the end date
if chunk_end > end:
chunk_end = end
ranges.append({
"since": current.strftime("%Y-%m-%d"),
"until": chunk_end.strftime("%Y-%m-%d")
})
current = chunk_end
print(f"::set-output name=matrix::{json.dumps(ranges)}")
EOF
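Note that the heredoc above still emits the matrix via the deprecated `::set-output` workflow command; current runners expect step outputs to be appended to the file named by `GITHUB_OUTPUT`. A sketch of the same 4-day chunking written against that mechanism, assuming it runs on a GitHub-hosted runner where the variable is set:

```python
import json
import os
from datetime import datetime, timedelta

start = datetime(2023, 8, 16)
end = datetime(2026, 1, 1)

ranges = []
current = start
while current < end:
    chunk_end = min(current + timedelta(days=4), end)  # 4-day chunks, clipped to end
    ranges.append({"since": current.strftime("%Y-%m-%d"),
                   "until": chunk_end.strftime("%Y-%m-%d")})
    current = chunk_end

# ::set-output is deprecated; append the output to the step's GITHUB_OUTPUT file instead
with open(os.environ["GITHUB_OUTPUT"], "a") as fh:
    fh.write(f"matrix={json.dumps(ranges)}\n")
```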
clone-faa-repo:
runs-on: ubuntu-latest
steps:
- name: Cache FAA repository
id: cache-faa-repo
uses: actions/cache@v4
with:
path: data/scrape-faa-releasable-aircraft
key: faa-repo-v1
- name: Clone FAA repository
if: steps.cache-faa-repo.outputs.cache-hit != 'true'
run: |
mkdir -p data
git clone https://github.com/simonw/scrape-faa-releasable-aircraft data/scrape-faa-releasable-aircraft
echo "Repository cloned successfully"
process-chunk:
needs: [generate-matrix, clone-faa-repo]
runs-on: ubuntu-latest
strategy:
max-parallel: 5 # Process 5 chunks at a time
matrix:
range: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.12'
- name: Restore FAA repository cache
uses: actions/cache/restore@v4
with:
path: data/scrape-faa-releasable-aircraft
key: faa-repo-v1
fail-on-cache-miss: true
- name: Install dependencies
run: |
pip install -r requirements.txt
- name: Process chunk ${{ matrix.range.since }} to ${{ matrix.range.until }}
run: |
python src/get_historical_faa.py "${{ matrix.range.since }}" "${{ matrix.range.until }}"
- name: Upload CSV artifact
uses: actions/upload-artifact@v4
with:
name: csv-${{ matrix.range.since }}-to-${{ matrix.range.until }}
path: data/faa_releasable_historical/*.csv
retention-days: 1
create-release:
needs: process-chunk
runs-on: ubuntu-latest
permissions:
contents: write
steps:
- name: Download all artifacts
uses: actions/download-artifact@v4
with:
path: artifacts
- name: Prepare release files
run: |
mkdir -p release-files
find artifacts -name "*.csv" -exec cp {} release-files/ \;
ls -lh release-files/
- name: Create Release
uses: softprops/action-gh-release@v1
with:
tag_name: historical-faa-${{ github.run_number }}
name: Historical FAA Data Release ${{ github.run_number }}
body: |
Automated release of historical FAA aircraft data
Processing period: 2023-08-16 to 2026-01-01
Generated: ${{ github.event.repository.updated_at }}
files: release-files/*.csv
draft: false
prerelease: false
concatenate-and-release:
needs: process-chunk
runs-on: ubuntu-latest
permissions:
contents: write
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.12'
- name: Install dependencies
run: |
pip install -r requirements.txt
- name: Download all artifacts
uses: actions/download-artifact@v4
with:
path: artifacts
- name: Prepare CSVs for concatenation
run: |
mkdir -p data/faa_releasable_historical
find artifacts -name "*.csv" -exec cp {} data/faa_releasable_historical/ \;
ls -lh data/faa_releasable_historical/
- name: Concatenate all CSVs
run: |
python scripts/concat_csvs.py
- name: Create Combined Release
uses: softprops/action-gh-release@v1
with:
tag_name: historical-faa-combined-${{ github.run_number }}
name: Historical FAA Data Combined Release ${{ github.run_number }}
body: |
Combined historical FAA aircraft data (all chunks concatenated)
Processing period: 2023-08-16 to 2026-01-01
Generated: ${{ github.event.repository.updated_at }}
files: data/openairframes/*.csv
draft: false
prerelease: false
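The `scripts/concat_csvs.py` that the concatenate-and-release job calls is not part of this diff, so its behaviour can only be inferred from the surrounding workflow: it reads the per-chunk CSVs from `data/faa_releasable_historical/` and must leave a combined CSV under `data/openairframes/` for the release step's `files:` glob. A hypothetical sketch along those lines using polars (already pinned in requirements.txt); the output file name is an assumption:

```python
# Hypothetical sketch of scripts/concat_csvs.py; the real script is not shown in this diff.
from pathlib import Path
import polars as pl

chunk_dir = Path("data/faa_releasable_historical")
out_dir = Path("data/openairframes")
out_dir.mkdir(parents=True, exist_ok=True)

frames = [pl.read_csv(p) for p in sorted(chunk_dir.glob("*.csv"))]
if not frames:
    raise SystemExit("No chunk CSVs found to concatenate")

# Stack all chunks and drop exact duplicate rows across overlapping date ranges
combined = pl.concat(frames, how="vertical").unique()
combined.write_csv(out_dir / "openairframes_faa_historical_combined.csv")  # assumed name
print(f"Wrote {combined.height} rows")
```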
@@ -0,0 +1,77 @@
name: Update Community PRs After Merge
on:
push:
branches: [main]
paths:
- 'community/**'
- 'schemas/community_submission.v1.schema.json'
permissions:
contents: write
pull-requests: write
jobs:
update-open-prs:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
token: ${{ secrets.GITHUB_TOKEN }}
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: "3.12"
- name: Install dependencies
run: pip install jsonschema
- name: Find and update open community PRs
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
# Get list of open community PRs
prs=$(gh pr list --label community --state open --json number,headRefName --jq '.[] | "\(.number) \(.headRefName)"')
if [ -z "$prs" ]; then
echo "No open community PRs found"
exit 0
fi
echo "$prs" | while read pr_number branch_name; do
echo "Processing PR #$pr_number (branch: $branch_name)"
# Checkout PR branch
git fetch origin "$branch_name"
git checkout "$branch_name"
# Merge main into PR branch
git config user.name "github-actions[bot]"
git config user.email "github-actions[bot]@users.noreply.github.com"
if git merge origin/main -m "Merge main to update schema"; then
# Regenerate schema for this PR's submission (adds any new tags)
python -m src.contributions.regenerate_pr_schema || true
# If there are changes, commit and push
if [ -n "$(git status --porcelain schemas/)" ]; then
git add schemas/
git commit -m "Update schema with new tags"
git push origin "$branch_name"
echo " Updated PR #$pr_number with schema changes"
else
git push origin "$branch_name"
echo " Merged main into PR #$pr_number"
fi
else
echo " Merge conflict in PR #$pr_number, adding comment"
gh pr comment "$pr_number" --body $'⚠️ **Merge Conflict**\n\nAnother community submission was merged and this PR has conflicts.\n\nA maintainer may need to:\n1. Close this PR\n2. Remove the `approved` label from the original issue\n3. Re-add the `approved` label to regenerate the PR'
git merge --abort
fi
fi
git checkout main
done
@@ -4,6 +4,9 @@ on:
issues: issues:
types: [opened, edited] types: [opened, edited]
permissions:
issues: write
jobs: jobs:
validate: validate:
if: contains(github.event.issue.labels.*.name, 'submission') if: contains(github.event.issue.labels.*.name, 'submission')
@@ -20,11 +23,24 @@ jobs:
- name: Install dependencies - name: Install dependencies
run: pip install jsonschema run: pip install jsonschema
- name: Debug issue body
run: |
echo "=== Issue Body ==="
cat << 'ISSUE_BODY_EOF'
${{ github.event.issue.body }}
ISSUE_BODY_EOF
- name: Save issue body to file
run: |
cat << 'ISSUE_BODY_EOF' > /tmp/issue_body.txt
${{ github.event.issue.body }}
ISSUE_BODY_EOF
- name: Validate submission - name: Validate submission
env: env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_REPOSITORY: ${{ github.repository }} GITHUB_REPOSITORY: ${{ github.repository }}
run: | run: |
python -m src.contributions.validate_submission \ python -m src.contributions.validate_submission \
--issue-body "${{ github.event.issue.body }}" \ --issue-body-file /tmp/issue_body.txt \
--issue-number ${{ github.event.issue.number }} --issue-number ${{ github.event.issue.number }}
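Writing the issue body to a file through a quoted heredoc, and then passing only the file path on the command line, keeps untrusted issue content out of shell interpolation entirely (the approve workflow does the same thing with the `ISSUE_BODY` environment variable). A minimal sketch of how the consuming side might read it; only the `--issue-body-file` and `--issue-number` flags come from the diff, the rest is illustrative:

```python
# Illustrative sketch of the consuming side of --issue-body-file.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--issue-body-file", required=True)
parser.add_argument("--issue-number", type=int, required=True)
args = parser.parse_args()

with open(args.issue_body_file, encoding="utf-8") as fh:
    issue_body = fh.read()

# The body is handled entirely in Python from here on, so quotes, backticks and
# newlines in the submission can no longer break the shell command line.
print(f"Validating issue #{args.issue_number}, body length {len(issue_body)}")
```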
+1 -1
@@ -1,6 +1,6 @@
MIT License MIT License
Copyright (c) 2026 PlaneQuery Copyright (c) 2026 OpenAirframes
Permission is hereby granted, free of charge, to any person obtaining a copy Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal of this software and associated documentation files (the "Software"), to deal
+1 -1
@@ -23,7 +23,7 @@ class AdsbProcessingStack(Stack):
# --- S3 bucket for intermediate and final results --- # --- S3 bucket for intermediate and final results ---
bucket = s3.Bucket( bucket = s3.Bucket(
self, "ResultsBucket", self, "ResultsBucket",
bucket_name="planequery-aircraft-dev", bucket_name="openairframes-dev",
removal_policy=RemovalPolicy.DESTROY, removal_policy=RemovalPolicy.DESTROY,
auto_delete_objects=True, auto_delete_objects=True,
lifecycle_rules=[ lifecycle_rules=[
-640
@@ -1,640 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "06ae0319",
"metadata": {},
"outputs": [],
"source": [
"import clickhouse_connect\n",
"client = clickhouse_connect.get_client(\n",
" host=os.environ[\"CLICKHOUSE_HOST\"],\n",
" username=os.environ[\"CLICKHOUSE_USERNAME\"],\n",
" password=os.environ[\"CLICKHOUSE_PASSWORD\"],\n",
" secure=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "779710f0",
"metadata": {},
"outputs": [],
"source": [
"df = client.query_df(\"SELECT time, icao,r,t,dbFlags,ownOp,year,desc,aircraft FROM adsb_messages Where time > '2024-01-01 00:00:00' AND time < '2024-01-02 00:00:00'\")\n",
"df_copy = df.copy()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bf024da8",
"metadata": {},
"outputs": [],
"source": [
"# -- military = dbFlags & 1; interesting = dbFlags & 2; PIA = dbFlags & 4; LADD = dbFlags & 8;"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "270607b5",
"metadata": {},
"outputs": [],
"source": [
"df = load_raw_adsb_for_day(datetime(2024,1,1))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ac06a30e",
"metadata": {},
"outputs": [],
"source": [
"df['aircraft']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "91edab3e",
"metadata": {},
"outputs": [],
"source": [
"COLUMNS = ['dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category', 'r', 't']\n",
"def compress_df(df):\n",
" icao = df.name\n",
" df[\"_signature\"] = df[COLUMNS].astype(str).agg('|'.join, axis=1)\n",
" original_df = df.copy()\n",
" df = df.groupby(\"_signature\", as_index=False).last() # check if it works with both last and first.\n",
" # For each row, create a dict of non-empty column values. This is using sets and subsets...\n",
" def get_non_empty_dict(row):\n",
" return {col: row[col] for col in COLUMNS if row[col] != ''}\n",
" \n",
" df['_non_empty_dict'] = df.apply(get_non_empty_dict, axis=1)\n",
" df['_non_empty_count'] = df['_non_empty_dict'].apply(len)\n",
" \n",
" # Check if row i's non-empty values are a subset of row j's non-empty values\n",
" def is_subset_of_any(idx):\n",
" row_dict = df.loc[idx, '_non_empty_dict']\n",
" row_count = df.loc[idx, '_non_empty_count']\n",
" \n",
" for other_idx in df.index:\n",
" if idx == other_idx:\n",
" continue\n",
" other_dict = df.loc[other_idx, '_non_empty_dict']\n",
" other_count = df.loc[other_idx, '_non_empty_count']\n",
" \n",
" # Check if all non-empty values in current row match those in other row\n",
" if all(row_dict.get(k) == other_dict.get(k) for k in row_dict.keys()):\n",
" # If they match and other has more defined columns, current row is redundant\n",
" if other_count > row_count:\n",
" return True\n",
" return False\n",
" \n",
" # Keep rows that are not subsets of any other row\n",
" keep_mask = ~df.index.to_series().apply(is_subset_of_any)\n",
" df = df[keep_mask]\n",
"\n",
" if len(df) > 1:\n",
" original_df = original_df[original_df['_signature'].isin(df['_signature'])]\n",
" value_counts = original_df[\"_signature\"].value_counts()\n",
" max_signature = value_counts.idxmax()\n",
" df = df[df['_signature'] == max_signature]\n",
"\n",
" df['icao'] = icao\n",
" df = df.drop(columns=['_non_empty_dict', '_non_empty_count', '_signature'])\n",
" return df\n",
"\n",
"# df = df_copy\n",
"# df = df_copy.iloc[0:100000]\n",
"# df = df[df['r'] == \"N4131T\"]\n",
"# df = df[(df['icao'] == \"008081\")]\n",
"# df = df.iloc[0:500]\n",
"df['aircraft_category'] = df['aircraft'].apply(lambda x: x.get('category') if isinstance(x, dict) else None)\n",
"df = df.drop(columns=['aircraft'])\n",
"df = df.sort_values(['icao', 'time'])\n",
"df[COLUMNS] = df[COLUMNS].fillna('')\n",
"ORIGINAL_COLUMNS = df.columns.tolist()\n",
"df_compressed = df.groupby('icao',group_keys=False).apply(compress_df)\n",
"cols = df_compressed.columns.tolist()\n",
"cols.remove(\"icao\")\n",
"cols.insert(1, \"icao\")\n",
"df_compressed = df_compressed[cols]\n",
"df_compressed"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "efdfcb2c",
"metadata": {},
"outputs": [],
"source": [
"df['aircraft_category'] = df['aircraft'].apply(lambda x: x.get('category') if isinstance(x, dict) else None)\n",
"df[~df['aircraft_category'].isna()]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "495c5025",
"metadata": {},
"outputs": [],
"source": [
"# SOME KIND OF MAP REDUCE SYSTEM\n",
"import os\n",
"\n",
"COLUMNS = ['dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category', 'r', 't']\n",
"def compress_df(df):\n",
" icao = df.name\n",
" df[\"_signature\"] = df[COLUMNS].astype(str).agg('|'.join, axis=1)\n",
" \n",
" # Compute signature counts before grouping (avoid copy)\n",
" signature_counts = df[\"_signature\"].value_counts()\n",
" \n",
" df = df.groupby(\"_signature\", as_index=False).first() # check if it works with both last and first.\n",
" # For each row, create a dict of non-empty column values. This is using sets and subsets...\n",
" def get_non_empty_dict(row):\n",
" return {col: row[col] for col in COLUMNS if row[col] != ''}\n",
" \n",
" df['_non_empty_dict'] = df.apply(get_non_empty_dict, axis=1)\n",
" df['_non_empty_count'] = df['_non_empty_dict'].apply(len)\n",
" \n",
" # Check if row i's non-empty values are a subset of row j's non-empty values\n",
" def is_subset_of_any(idx):\n",
" row_dict = df.loc[idx, '_non_empty_dict']\n",
" row_count = df.loc[idx, '_non_empty_count']\n",
" \n",
" for other_idx in df.index:\n",
" if idx == other_idx:\n",
" continue\n",
" other_dict = df.loc[other_idx, '_non_empty_dict']\n",
" other_count = df.loc[other_idx, '_non_empty_count']\n",
" \n",
" # Check if all non-empty values in current row match those in other row\n",
" if all(row_dict.get(k) == other_dict.get(k) for k in row_dict.keys()):\n",
" # If they match and other has more defined columns, current row is redundant\n",
" if other_count > row_count:\n",
" return True\n",
" return False\n",
" \n",
" # Keep rows that are not subsets of any other row\n",
" keep_mask = ~df.index.to_series().apply(is_subset_of_any)\n",
" df = df[keep_mask]\n",
"\n",
" if len(df) > 1:\n",
" # Use pre-computed signature counts instead of original_df\n",
" remaining_sigs = df['_signature']\n",
" sig_counts = signature_counts[remaining_sigs]\n",
" max_signature = sig_counts.idxmax()\n",
" df = df[df['_signature'] == max_signature]\n",
"\n",
" df['icao'] = icao\n",
" df = df.drop(columns=['_non_empty_dict', '_non_empty_count', '_signature'])\n",
" return df\n",
"\n",
"# names of releases something like\n",
"# planequery_aircraft_adsb_2024-06-01T00-00-00Z.csv.gz\n",
"\n",
"# Let's build historical first. \n",
"\n",
"_ch_client = None\n",
"\n",
"def _get_clickhouse_client():\n",
" \"\"\"Return a reusable ClickHouse client, with retry/backoff for transient DNS or connection errors.\"\"\"\n",
" global _ch_client\n",
" if _ch_client is not None:\n",
" return _ch_client\n",
"\n",
" import clickhouse_connect\n",
" import time\n",
"\n",
" max_retries = 5\n",
" for attempt in range(1, max_retries + 1):\n",
" try:\n",
" _ch_client = clickhouse_connect.get_client(\n",
" host=os.environ[\"CLICKHOUSE_HOST\"],\n",
" username=os.environ[\"CLICKHOUSE_USERNAME\"],\n",
" password=os.environ[\"CLICKHOUSE_PASSWORD\"],\n",
" secure=True,\n",
" )\n",
" return _ch_client\n",
" except Exception as e:\n",
" wait = min(2 ** attempt, 30)\n",
" print(f\" ClickHouse connect attempt {attempt}/{max_retries} failed: {e}\")\n",
" if attempt == max_retries:\n",
" raise\n",
" print(f\" Retrying in {wait}s...\")\n",
" time.sleep(wait)\n",
"\n",
"\n",
"def load_raw_adsb_for_day(day):\n",
" \"\"\"Load raw ADS-B data for a day from cache or ClickHouse.\"\"\"\n",
" from datetime import timedelta\n",
" from pathlib import Path\n",
" import pandas as pd\n",
" import time\n",
" \n",
" start_time = day.replace(hour=0, minute=0, second=0, microsecond=0)\n",
" end_time = start_time + timedelta(days=1)\n",
" \n",
" # Set up caching\n",
" cache_dir = Path(\"data/adsb\")\n",
" cache_dir.mkdir(parents=True, exist_ok=True)\n",
" cache_file = cache_dir / f\"adsb_raw_{start_time.strftime('%Y-%m-%d')}.csv.zst\"\n",
" \n",
" # Check if cache exists\n",
" if cache_file.exists():\n",
" print(f\" Loading from cache: {cache_file}\")\n",
" df = pd.read_csv(cache_file, compression='zstd')\n",
" df['time'] = pd.to_datetime(df['time'])\n",
" else:\n",
" # Format dates for the query\n",
" start_str = start_time.strftime('%Y-%m-%d %H:%M:%S')\n",
" end_str = end_time.strftime('%Y-%m-%d %H:%M:%S')\n",
" \n",
" max_retries = 3\n",
" for attempt in range(1, max_retries + 1):\n",
" try:\n",
" client = _get_clickhouse_client()\n",
" print(f\" Querying ClickHouse for {start_time.strftime('%Y-%m-%d')}\")\n",
" df = client.query_df(f\"SELECT time, icao,r,t,dbFlags,ownOp,year,desc,aircraft FROM adsb_messages Where time > '{start_str}' AND time < '{end_str}'\")\n",
" break\n",
" except Exception as e:\n",
" wait = min(2 ** attempt, 30)\n",
" print(f\" Query attempt {attempt}/{max_retries} failed: {e}\")\n",
" if attempt == max_retries:\n",
" raise\n",
" # Reset client in case connection is stale\n",
" global _ch_client\n",
" _ch_client = None\n",
" print(f\" Retrying in {wait}s...\")\n",
" time.sleep(wait)\n",
" \n",
" # Save to cache\n",
" df.to_csv(cache_file, index=False, compression='zstd')\n",
" print(f\" Saved to cache: {cache_file}\")\n",
" \n",
" return df\n",
"\n",
"def load_historical_for_day(day):\n",
" from pathlib import Path\n",
" import pandas as pd\n",
" \n",
" df = load_raw_adsb_for_day(day)\n",
" print(df)\n",
" df['aircraft_category'] = df['aircraft'].apply(lambda x: x.get('category') if isinstance(x, dict) else None)\n",
" df = df.drop(columns=['aircraft'])\n",
" df = df.sort_values(['icao', 'time'])\n",
" df[COLUMNS] = df[COLUMNS].fillna('')\n",
" df_compressed = df.groupby('icao',group_keys=False).apply(compress_df)\n",
" cols = df_compressed.columns.tolist()\n",
" cols.remove('time')\n",
" cols.insert(0, 'time')\n",
" cols.remove(\"icao\")\n",
" cols.insert(1, \"icao\")\n",
" df_compressed = df_compressed[cols]\n",
" return df_compressed\n",
"\n",
"\n",
"def concat_compressed_dfs(df_base, df_new):\n",
" \"\"\"Concatenate base and new compressed dataframes, keeping the most informative row per ICAO.\"\"\"\n",
" import pandas as pd\n",
" \n",
" # Combine both dataframes\n",
" df_combined = pd.concat([df_base, df_new], ignore_index=True)\n",
" \n",
" # Sort by ICAO and time\n",
" df_combined = df_combined.sort_values(['icao', 'time'])\n",
" \n",
" # Fill NaN values\n",
" df_combined[COLUMNS] = df_combined[COLUMNS].fillna('')\n",
" \n",
" # Apply compression logic per ICAO to get the best row\n",
" df_compressed = df_combined.groupby('icao', group_keys=False).apply(compress_df)\n",
" \n",
" # Sort by time\n",
" df_compressed = df_compressed.sort_values('time')\n",
" \n",
" return df_compressed\n",
"\n",
"\n",
"def get_latest_aircraft_adsb_csv_df():\n",
" \"\"\"Download and load the latest ADS-B CSV from GitHub releases.\"\"\"\n",
" from get_latest_planequery_aircraft_release import download_latest_aircraft_adsb_csv\n",
" \n",
" import pandas as pd\n",
" import re\n",
" \n",
" csv_path = download_latest_aircraft_adsb_csv()\n",
" df = pd.read_csv(csv_path)\n",
" df = df.fillna(\"\")\n",
" \n",
" # Extract start date from filename pattern: planequery_aircraft_adsb_{start_date}_{end_date}.csv\n",
" match = re.search(r\"planequery_aircraft_adsb_(\\d{4}-\\d{2}-\\d{2})_\", str(csv_path))\n",
" if not match:\n",
" raise ValueError(f\"Could not extract date from filename: {csv_path.name}\")\n",
" \n",
" date_str = match.group(1)\n",
" return df, date_str\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7f66acf7",
"metadata": {},
"outputs": [],
"source": [
"# SOME KIND OF MAP REDUCE SYSTEM\n",
"\n",
"\n",
"COLUMNS = ['dbFlags', 'ownOp', 'year', 'desc', 'aircraft_category', 'r', 't']\n",
"def compress_df(df):\n",
" icao = df.name\n",
" df[\"_signature\"] = df[COLUMNS].astype(str).agg('|'.join, axis=1)\n",
" original_df = df.copy()\n",
" df = df.groupby(\"_signature\", as_index=False).first() # check if it works with both last and first.\n",
" # For each row, create a dict of non-empty column values. This is using sets and subsets...\n",
" def get_non_empty_dict(row):\n",
" return {col: row[col] for col in COLUMNS if row[col] != ''}\n",
" \n",
" df['_non_empty_dict'] = df.apply(get_non_empty_dict, axis=1)\n",
" df['_non_empty_count'] = df['_non_empty_dict'].apply(len)\n",
" \n",
" # Check if row i's non-empty values are a subset of row j's non-empty values\n",
" def is_subset_of_any(idx):\n",
" row_dict = df.loc[idx, '_non_empty_dict']\n",
" row_count = df.loc[idx, '_non_empty_count']\n",
" \n",
" for other_idx in df.index:\n",
" if idx == other_idx:\n",
" continue\n",
" other_dict = df.loc[other_idx, '_non_empty_dict']\n",
" other_count = df.loc[other_idx, '_non_empty_count']\n",
" \n",
" # Check if all non-empty values in current row match those in other row\n",
" if all(row_dict.get(k) == other_dict.get(k) for k in row_dict.keys()):\n",
" # If they match and other has more defined columns, current row is redundant\n",
" if other_count > row_count:\n",
" return True\n",
" return False\n",
" \n",
" # Keep rows that are not subsets of any other row\n",
" keep_mask = ~df.index.to_series().apply(is_subset_of_any)\n",
" df = df[keep_mask]\n",
"\n",
" if len(df) > 1:\n",
" original_df = original_df[original_df['_signature'].isin(df['_signature'])]\n",
" value_counts = original_df[\"_signature\"].value_counts()\n",
" max_signature = value_counts.idxmax()\n",
" df = df[df['_signature'] == max_signature]\n",
"\n",
" df['icao'] = icao\n",
" df = df.drop(columns=['_non_empty_dict', '_non_empty_count', '_signature'])\n",
" return df\n",
"\n",
"# names of releases something like\n",
"# planequery_aircraft_adsb_2024-06-01T00-00-00Z.csv.gz\n",
"\n",
"# Let's build historical first. \n",
"\n",
"def load_raw_adsb_for_day(day):\n",
" \"\"\"Load raw ADS-B data for a day from cache or ClickHouse.\"\"\"\n",
" from datetime import timedelta\n",
" import clickhouse_connect\n",
" from pathlib import Path\n",
" import pandas as pd\n",
" \n",
" start_time = day.replace(hour=0, minute=0, second=0, microsecond=0)\n",
" end_time = start_time + timedelta(days=1)\n",
" \n",
" # Set up caching\n",
" cache_dir = Path(\"data/adsb\")\n",
" cache_dir.mkdir(parents=True, exist_ok=True)\n",
" cache_file = cache_dir / f\"adsb_raw_{start_time.strftime('%Y-%m-%d')}.csv.zst\"\n",
" \n",
" # Check if cache exists\n",
" if cache_file.exists():\n",
" print(f\" Loading from cache: {cache_file}\")\n",
" df = pd.read_csv(cache_file, compression='zstd')\n",
" df['time'] = pd.to_datetime(df['time'])\n",
" else:\n",
" # Format dates for the query\n",
" start_str = start_time.strftime('%Y-%m-%d %H:%M:%S')\n",
" end_str = end_time.strftime('%Y-%m-%d %H:%M:%S')\n",
" \n",
" client = clickhouse_connect.get_client(\n",
" host=os.environ[\"CLICKHOUSE_HOST\"],\n",
" username=os.environ[\"CLICKHOUSE_USERNAME\"],\n",
" password=os.environ[\"CLICKHOUSE_PASSWORD\"],\n",
" secure=True,\n",
" )\n",
" print(f\" Querying ClickHouse for {start_time.strftime('%Y-%m-%d')}\")\n",
" df = client.query_df(f\"SELECT time, icao,r,t,dbFlags,ownOp,year,desc,aircraft FROM adsb_messages Where time > '{start_str}' AND time < '{end_str}'\")\n",
" \n",
" # Save to cache\n",
" df.to_csv(cache_file, index=False, compression='zstd')\n",
" print(f\" Saved to cache: {cache_file}\")\n",
" \n",
" return df\n",
"\n",
"def load_historical_for_day(day):\n",
" from pathlib import Path\n",
" import pandas as pd\n",
" \n",
" df = load_raw_adsb_for_day(day)\n",
" \n",
" df['aircraft_category'] = df['aircraft'].apply(lambda x: x.get('category') if isinstance(x, dict) else None)\n",
" df = df.drop(columns=['aircraft'])\n",
" df = df.sort_values(['icao', 'time'])\n",
" df[COLUMNS] = df[COLUMNS].fillna('')\n",
" df_compressed = df.groupby('icao',group_keys=False).apply(compress_df)\n",
" cols = df_compressed.columns.tolist()\n",
" cols.remove('time')\n",
" cols.insert(0, 'time')\n",
" cols.remove(\"icao\")\n",
" cols.insert(1, \"icao\")\n",
" df_compressed = df_compressed[cols]\n",
" return df_compressed\n",
"\n",
"\n",
"def concat_compressed_dfs(df_base, df_new):\n",
" \"\"\"Concatenate base and new compressed dataframes, keeping the most informative row per ICAO.\"\"\"\n",
" import pandas as pd\n",
" \n",
" # Combine both dataframes\n",
" df_combined = pd.concat([df_base, df_new], ignore_index=True)\n",
" \n",
" # Sort by ICAO and time\n",
" df_combined = df_combined.sort_values(['icao', 'time'])\n",
" \n",
" # Fill NaN values\n",
" df_combined[COLUMNS] = df_combined[COLUMNS].fillna('')\n",
" \n",
" # Apply compression logic per ICAO to get the best row\n",
" df_compressed = df_combined.groupby('icao', group_keys=False).apply(compress_df)\n",
" \n",
" # Sort by time\n",
" df_compressed = df_compressed.sort_values('time')\n",
" \n",
" return df_compressed\n",
"\n",
"\n",
"def get_latest_aircraft_adsb_csv_df():\n",
" \"\"\"Download and load the latest ADS-B CSV from GitHub releases.\"\"\"\n",
" from get_latest_planequery_aircraft_release import download_latest_aircraft_adsb_csv\n",
" \n",
" import pandas as pd\n",
" import re\n",
" \n",
" csv_path = download_latest_aircraft_adsb_csv()\n",
" df = pd.read_csv(csv_path)\n",
" df = df.fillna(\"\")\n",
" \n",
" # Extract start date from filename pattern: planequery_aircraft_adsb_{start_date}_{end_date}.csv\n",
" match = re.search(r\"planequery_aircraft_adsb_(\\d{4}-\\d{2}-\\d{2})_\", str(csv_path))\n",
" if not match:\n",
" raise ValueError(f\"Could not extract date from filename: {csv_path.name}\")\n",
" \n",
" date_str = match.group(1)\n",
" return df, date_str\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e14c8363",
"metadata": {},
"outputs": [],
"source": [
"from datetime import datetime\n",
"df = load_historical_for_day(datetime(2024,1,1))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3874ba4d",
"metadata": {},
"outputs": [],
"source": [
"len(df)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bcae50ad",
"metadata": {},
"outputs": [],
"source": [
"df[(df['icao'] == \"008081\")]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "50921c86",
"metadata": {},
"outputs": [],
"source": [
"df[df['icao'] == \"a4e1d2\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8194d9aa",
"metadata": {},
"outputs": [],
"source": [
"df[df['r'] == \"N4131T\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1e3b7aa2",
"metadata": {},
"outputs": [],
"source": [
"df_compressed[df_compressed['icao'].duplicated(keep=False)]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "40613bc1",
"metadata": {},
"outputs": [],
"source": [
"import gzip\n",
"import json\n",
"\n",
"path = \"/Users/jonahgoode/Downloads/test_extract/traces/fb/trace_full_acbbfb.json\"\n",
"\n",
"with gzip.open(path, \"rt\", encoding=\"utf-8\") as f:\n",
" data = json.load(f)\n",
"\n",
"print(type(data))\n",
"# use `data` here\n",
"import json\n",
"print(json.dumps(data, indent=2)[:2000])\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "320109b2",
"metadata": {},
"outputs": [],
"source": [
"# First, load the JSON to inspect its structure\n",
"import json\n",
"with open(\"/Users/jonahgoode/Documents/PlaneQuery/Other-Code/readsb-protobuf/webapp/src/db/aircrafts.json\", 'r') as f:\n",
" data = json.load(f)\n",
"\n",
"# Check the structure\n",
"print(type(data))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "590134f4",
"metadata": {},
"outputs": [],
"source": [
"data['AC97E3']"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
+1
@@ -3,3 +3,4 @@ pandas==3.0.0
pyarrow==23.0.0 pyarrow==23.0.0
orjson==3.11.7 orjson==3.11.7
polars==1.38.1 polars==1.38.1
jsonschema==4.26.0
+51 -18
@@ -1,9 +1,8 @@
{ {
"$schema": "https://json-schema.org/draft/2020-12/schema", "$schema": "https://json-schema.org/draft/2020-12/schema",
"title": "PlaneQuery Aircraft Community Submission (v1)", "title": "OpenAirframes Community Submission (v1)",
"type": "object", "type": "object",
"additionalProperties": false, "additionalProperties": false,
"properties": { "properties": {
"registration_number": { "registration_number": {
"type": "string", "type": "string",
@@ -11,13 +10,12 @@
}, },
"transponder_code_hex": { "transponder_code_hex": {
"type": "string", "type": "string",
"pattern": "^[0-9A-Fa-f]{6}$" "pattern": "^[0-9A-F]{6}$"
}, },
"planequery_airframe_id": { "openairframes_id": {
"type": "string", "type": "string",
"minLength": 1 "minLength": 1
}, },
"contributor_uuid": { "contributor_uuid": {
"type": "string", "type": "string",
"format": "uuid" "format": "uuid"
@@ -28,14 +26,24 @@
"maxLength": 150, "maxLength": 150,
"description": "Display name (may be blank)" "description": "Display name (may be blank)"
}, },
"creation_timestamp": { "creation_timestamp": {
"type": "string", "type": "string",
"format": "date-time", "format": "date-time",
"description": "Set by the system when the submission is persisted/approved.", "description": "Set by the system when the submission is persisted/approved.",
"readOnly": true "readOnly": true
}, },
"start_date": {
"type": "string",
"format": "date",
"pattern": "^\\d{4}-\\d{2}-\\d{2}$",
"description": "Optional start date for when this submission's tags are valid (ISO 8601, e.g., 2025-05-01)."
},
"end_date": {
"type": "string",
"format": "date",
"pattern": "^\\d{4}-\\d{2}-\\d{2}$",
"description": "Optional end date for when this submission's tags are valid (ISO 8601, e.g., 2025-07-03)."
},
"tags": { "tags": {
"type": "object", "type": "object",
"description": "Additional community-defined tags as key/value pairs (values may be scalar, array, or object).", "description": "Additional community-defined tags as key/value pairs (values may be scalar, array, or object).",
@@ -43,38 +51,63 @@
"type": "string", "type": "string",
"pattern": "^[a-z][a-z0-9_]{0,63}$" "pattern": "^[a-z][a-z0-9_]{0,63}$"
}, },
"additionalProperties": { "$ref": "#/$defs/tagValue" } "additionalProperties": {
"$ref": "#/$defs/tagValue"
},
"properties": {}
} }
}, },
"allOf": [ "allOf": [
{ {
"anyOf": [ "anyOf": [
{ "required": ["registration_number"] }, {
{ "required": ["transponder_code_hex"] }, "required": [
{ "required": ["planequery_airframe_id"] } "registration_number"
]
},
{
"required": [
"transponder_code_hex"
]
},
{
"required": [
"openairframes_id"
]
}
] ]
} }
], ],
"$defs": { "$defs": {
"tagScalar": { "tagScalar": {
"type": ["string", "number", "integer", "boolean", "null"] "type": [
"string",
"number",
"integer",
"boolean",
"null"
]
}, },
"tagValue": { "tagValue": {
"anyOf": [ "anyOf": [
{ "$ref": "#/$defs/tagScalar" }, {
"$ref": "#/$defs/tagScalar"
},
{ {
"type": "array", "type": "array",
"maxItems": 50, "maxItems": 50,
"items": { "$ref": "#/$defs/tagScalar" } "items": {
"$ref": "#/$defs/tagScalar"
}
}, },
{ {
"type": "object", "type": "object",
"maxProperties": 50, "maxProperties": 50,
"additionalProperties": { "$ref": "#/$defs/tagScalar" } "additionalProperties": {
"$ref": "#/$defs/tagScalar"
}
} }
] ]
} }
} }
} }
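With `jsonschema` now pinned in requirements.txt, a submission can be checked against this schema directly. A minimal validation sketch, assuming the schema is stored at `schemas/community_submission.v1.schema.json` as referenced by the workflows above; the sample submission mirrors the issue-template example:

```python
import json
from jsonschema import Draft202012Validator

with open("schemas/community_submission.v1.schema.json", encoding="utf-8") as fh:
    schema = json.load(fh)

submission = {
    "registration_number": "N12345",
    "tags": {"owner": "John Doe"},
    "start_date": "2025-01-01",
}

# Collect all violations instead of stopping at the first one
validator = Draft202012Validator(schema)
errors = list(validator.iter_errors(submission))
for err in errors:
    print(f"{list(err.path)}: {err.message}")
if not errors:
    print("Submission is valid")
```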
+24 -9
@@ -27,7 +27,7 @@ from src.adsb.compress_adsb_to_aircraft_data import compress_multi_icao_df, COLU
DEFAULT_CHUNK_DIR = os.path.join(OUTPUT_DIR, "adsb_chunks") DEFAULT_CHUNK_DIR = os.path.join(OUTPUT_DIR, "adsb_chunks")
FINAL_OUTPUT_DIR = "./data/planequery_aircraft" FINAL_OUTPUT_DIR = "./data/openairframes"
os.makedirs(FINAL_OUTPUT_DIR, exist_ok=True) os.makedirs(FINAL_OUTPUT_DIR, exist_ok=True)
@@ -36,8 +36,13 @@ def get_target_day() -> datetime:
return datetime.utcnow() - timedelta(days=1) return datetime.utcnow() - timedelta(days=1)
def process_single_chunk(chunk_path: str) -> pl.DataFrame: def process_single_chunk(chunk_path: str, delete_after_load: bool = False) -> pl.DataFrame:
"""Load and compress a single chunk parquet file.""" """Load and compress a single chunk parquet file.
Args:
chunk_path: Path to parquet file
delete_after_load: If True, delete the parquet file after loading to free disk space
"""
print(f"Processing {os.path.basename(chunk_path)}... | {get_resource_usage()}") print(f"Processing {os.path.basename(chunk_path)}... | {get_resource_usage()}")
# Load chunk - only columns we need # Load chunk - only columns we need
@@ -45,6 +50,14 @@ def process_single_chunk(chunk_path: str) -> pl.DataFrame:
df = pl.read_parquet(chunk_path, columns=needed_columns) df = pl.read_parquet(chunk_path, columns=needed_columns)
print(f" Loaded {len(df)} rows") print(f" Loaded {len(df)} rows")
# Delete file immediately after loading to free disk space
if delete_after_load:
try:
os.remove(chunk_path)
print(f" Deleted {chunk_path} to free disk space")
except Exception as e:
print(f" Warning: Failed to delete {chunk_path}: {e}")
# Compress to aircraft records (one per ICAO) using shared function # Compress to aircraft records (one per ICAO) using shared function
compressed = compress_multi_icao_df(df, verbose=True) compressed = compress_multi_icao_df(df, verbose=True)
print(f" Compressed to {len(compressed)} aircraft records") print(f" Compressed to {len(compressed)} aircraft records")
@@ -72,12 +85,12 @@ def combine_compressed_chunks(compressed_dfs: list[pl.DataFrame]) -> pl.DataFram
def download_and_merge_base_release(compressed_df: pl.DataFrame) -> pl.DataFrame: def download_and_merge_base_release(compressed_df: pl.DataFrame) -> pl.DataFrame:
"""Download base release and merge with new data.""" """Download base release and merge with new data."""
from src.get_latest_planequery_aircraft_release import download_latest_aircraft_adsb_csv from src.get_latest_release import download_latest_aircraft_adsb_csv
print("Downloading base ADS-B release...") print("Downloading base ADS-B release...")
try: try:
base_path = download_latest_aircraft_adsb_csv( base_path = download_latest_aircraft_adsb_csv(
output_dir="./data/planequery_aircraft_base" output_dir="./data/openairframes_base"
) )
print(f"Download returned: {base_path}") print(f"Download returned: {base_path}")
@@ -156,16 +169,17 @@ def main():
parser.add_argument("--chunks-dir", type=str, default=DEFAULT_CHUNK_DIR, help="Directory containing chunk parquet files") parser.add_argument("--chunks-dir", type=str, default=DEFAULT_CHUNK_DIR, help="Directory containing chunk parquet files")
parser.add_argument("--skip-base", action="store_true", help="Skip downloading and merging base release") parser.add_argument("--skip-base", action="store_true", help="Skip downloading and merging base release")
parser.add_argument("--keep-chunks", action="store_true", help="Keep chunk files after merging") parser.add_argument("--keep-chunks", action="store_true", help="Keep chunk files after merging")
parser.add_argument("--stream", action="store_true", help="Delete parquet files immediately after loading to save disk space")
args = parser.parse_args() args = parser.parse_args()
# Determine output ID and filename based on mode # Determine output ID and filename based on mode
if args.start_date and args.end_date: if args.start_date and args.end_date:
# Historical mode # Historical mode
output_id = f"{args.start_date}_{args.end_date}" output_id = f"{args.start_date}_{args.end_date}"
output_filename = f"planequery_aircraft_adsb_{args.start_date}_{args.end_date}.csv" output_filename = f"openairframes_adsb_{args.start_date}_{args.end_date}.csv"
print(f"Combining chunks for date range: {args.start_date} to {args.end_date}") print(f"Combining chunks for date range: {args.start_date} to {args.end_date}")
else: else:
# Daily mode # Daily mode - use same date for start and end
if args.date: if args.date:
target_day = datetime.strptime(args.date, "%Y-%m-%d") target_day = datetime.strptime(args.date, "%Y-%m-%d")
else: else:
@@ -173,7 +187,7 @@ def main():
date_str = target_day.strftime("%Y-%m-%d") date_str = target_day.strftime("%Y-%m-%d")
output_id = date_str output_id = date_str
output_filename = f"planequery_aircraft_adsb_{date_str}.csv" output_filename = f"openairframes_adsb_{date_str}_{date_str}.csv"
print(f"Combining chunks for {date_str}") print(f"Combining chunks for {date_str}")
chunks_dir = args.chunks_dir chunks_dir = args.chunks_dir
@@ -190,9 +204,10 @@ def main():
print(f"Found {len(chunk_files)} chunk files") print(f"Found {len(chunk_files)} chunk files")
# Process each chunk separately to save memory # Process each chunk separately to save memory
# With --stream, delete parquet files immediately after loading to save disk space
compressed_chunks = [] compressed_chunks = []
for chunk_path in chunk_files: for chunk_path in chunk_files:
compressed = process_single_chunk(chunk_path) compressed = process_single_chunk(chunk_path, delete_after_load=args.stream)
compressed_chunks.append(compressed) compressed_chunks.append(compressed)
gc.collect() gc.collect()
+3 -3
@@ -253,7 +253,7 @@ def concat_compressed_dfs(df_base, df_new):
def get_latest_aircraft_adsb_csv_df(): def get_latest_aircraft_adsb_csv_df():
"""Download and load the latest ADS-B CSV from GitHub releases.""" """Download and load the latest ADS-B CSV from GitHub releases."""
from get_latest_planequery_aircraft_release import download_latest_aircraft_adsb_csv from get_latest_release import download_latest_aircraft_adsb_csv
import re import re
csv_path = download_latest_aircraft_adsb_csv() csv_path = download_latest_aircraft_adsb_csv()
@@ -264,8 +264,8 @@ def get_latest_aircraft_adsb_csv_df():
if df[col].dtype == pl.Utf8: if df[col].dtype == pl.Utf8:
df = df.with_columns(pl.col(col).fill_null("")) df = df.with_columns(pl.col(col).fill_null(""))
# Extract start date from filename pattern: planequery_aircraft_adsb_{start_date}_{end_date}.csv # Extract start date from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv
match = re.search(r"planequery_aircraft_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path)) match = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path))
if not match: if not match:
raise ValueError(f"Could not extract date from filename: {csv_path.name}") raise ValueError(f"Could not extract date from filename: {csv_path.name}")
+13 -5
@@ -82,7 +82,8 @@ def fetch_releases(version_date: str) -> list:
if version_date == "v2024.12.31": if version_date == "v2024.12.31":
year = "2025" year = "2025"
BASE_URL = f"https://api.github.com/repos/adsblol/globe_history_{year}/releases" BASE_URL = f"https://api.github.com/repos/adsblol/globe_history_{year}/releases"
PATTERN = f"{version_date}-planes-readsb-prod-0" # Match exact release name, exclude tmp releases
PATTERN = rf"^{re.escape(version_date)}-planes-readsb-prod-\d+$"
releases = [] releases = []
page = 1 page = 1
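Anchoring the pattern and escaping the version string makes the release-name match exact: any prod index is accepted, while names with extra suffixes (such as temporary releases) are rejected. A small check of that behaviour; the candidate names are illustrative:

```python
import re

version_date = "v2024.12.31"
PATTERN = rf"^{re.escape(version_date)}-planes-readsb-prod-\d+$"

candidates = [
    "v2024.12.31-planes-readsb-prod-0",      # matches
    "v2024.12.31-planes-readsb-prod-12",     # matches (any index)
    "v2024.12.31-planes-readsb-prod-0.tmp",  # rejected by the trailing anchor
]
for name in candidates:
    print(name, bool(re.match(PATTERN, name)))
```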
@@ -187,19 +188,23 @@ def extract_split_archive(file_paths: list, extract_dir: str) -> bool:
cat_proc = subprocess.Popen( cat_proc = subprocess.Popen(
["cat"] + file_paths, ["cat"] + file_paths,
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.DEVNULL stderr=subprocess.PIPE
) )
tar_cmd = ["tar", "xf", "-", "-C", extract_dir, "--strip-components=1"] tar_cmd = ["tar", "xf", "-", "-C", extract_dir, "--strip-components=1"]
subprocess.run( result = subprocess.run(
tar_cmd, tar_cmd,
stdin=cat_proc.stdout, stdin=cat_proc.stdout,
stdout=subprocess.DEVNULL, stdout=subprocess.PIPE,
stderr=subprocess.DEVNULL, stderr=subprocess.PIPE,
check=True check=True
) )
cat_proc.stdout.close() cat_proc.stdout.close()
cat_stderr = cat_proc.stderr.read().decode() if cat_proc.stderr else ""
cat_proc.wait() cat_proc.wait()
if cat_stderr:
print(f"cat stderr: {cat_stderr}")
print(f"Successfully extracted archive to {extract_dir}") print(f"Successfully extracted archive to {extract_dir}")
# Delete tar files immediately after extraction # Delete tar files immediately after extraction
@@ -217,7 +222,10 @@ def extract_split_archive(file_paths: list, extract_dir: str) -> bool:
return True return True
except subprocess.CalledProcessError as e: except subprocess.CalledProcessError as e:
stderr_output = e.stderr.decode() if e.stderr else ""
print(f"Failed to extract split archive: {e}") print(f"Failed to extract split archive: {e}")
if stderr_output:
print(f"tar stderr: {stderr_output}")
return False return False
+2 -2
@@ -76,8 +76,8 @@ def main():
print(f"After dedup: {df_accumulated.height} rows") print(f"After dedup: {df_accumulated.height} rows")
# Write and upload final result # Write and upload final result
output_name = f"planequery_aircraft_adsb_{global_start}_{global_end}.csv.gz" output_name = f"openairframes_adsb_{global_start}_{global_end}.csv.gz"
csv_output = Path(f"/tmp/planequery_aircraft_adsb_{global_start}_{global_end}.csv") csv_output = Path(f"/tmp/openairframes_adsb_{global_start}_{global_end}.csv")
gz_output = Path(f"/tmp/{output_name}") gz_output = Path(f"/tmp/{output_name}")
df_accumulated.write_csv(csv_output) df_accumulated.write_csv(csv_output)
+73 -13
@@ -21,12 +21,14 @@ import urllib.request
import urllib.error import urllib.error
from datetime import datetime, timezone from datetime import datetime, timezone
from .schema import extract_json_from_issue_body, extract_contributor_name_from_issue_body, parse_and_validate from .schema import extract_json_from_issue_body, extract_contributor_name_from_issue_body, parse_and_validate, load_schema, SCHEMAS_DIR
from .contributor import ( from .contributor import (
generate_contributor_uuid, generate_contributor_uuid,
generate_submission_filename, generate_submission_filename,
compute_content_hash, compute_content_hash,
) )
from .update_schema import generate_updated_schema, check_for_new_tags, get_existing_tag_definitions
from .read_community_data import build_tag_type_registry
def github_api_request( def github_api_request(
@@ -54,7 +56,11 @@ def github_api_request(
try: try:
with urllib.request.urlopen(req) as response: with urllib.request.urlopen(req) as response:
return json.loads(response.read()) response_body = response.read()
# DELETE requests return empty body (204 No Content)
if not response_body:
return {}
return json.loads(response_body)
except urllib.error.HTTPError as e: except urllib.error.HTTPError as e:
error_body = e.read().decode() if e.fp else "" error_body = e.read().decode() if e.fp else ""
print(f"GitHub API error: {e.code} {e.reason}: {error_body}", file=sys.stderr) print(f"GitHub API error: {e.code} {e.reason}: {error_body}", file=sys.stderr)
@@ -94,14 +100,30 @@ def create_branch(branch_name: str, sha: str) -> None:
raise raise
def get_file_sha(path: str, branch: str) -> str | None:
"""Get the SHA of an existing file, or None if it doesn't exist."""
try:
response = github_api_request("GET", f"/contents/{path}?ref={branch}")
return response.get("sha")
except Exception:
return None
def create_or_update_file(path: str, content: str, message: str, branch: str) -> None: def create_or_update_file(path: str, content: str, message: str, branch: str) -> None:
"""Create or update a file in the repository.""" """Create or update a file in the repository."""
content_b64 = base64.b64encode(content.encode()).decode() content_b64 = base64.b64encode(content.encode()).decode()
github_api_request("PUT", f"/contents/{path}", { payload = {
"message": message, "message": message,
"content": content_b64, "content": content_b64,
"branch": branch, "branch": branch,
}) }
# If file exists, we need to include its SHA to update it
sha = get_file_sha(path, branch)
if sha:
payload["sha"] = sha
github_api_request("PUT", f"/contents/{path}", payload)
def create_pull_request(title: str, head: str, base: str, body: str) -> dict: def create_pull_request(title: str, head: str, base: str, body: str) -> dict:
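For context, the GitHub Contents API treats a PUT without a `sha` as a file creation and rejects it if the path already exists, which is why `get_file_sha` is consulted first. A rough sketch of the two payload shapes (all values here are placeholders, not real content or SHAs):

```python
# Placeholder values only; real content is base64-encoded file text and a real blob SHA.
create_payload = {
    "message": "Add community submission",
    "content": "eyJmb28iOiAiYmFyIn0=",
    "branch": "community-submission-42",
}
# Updating an existing file reuses the same payload plus the current blob SHA.
update_payload = {**create_payload, "sha": "0123456789abcdef0123456789abcdef01234567"}
```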
@@ -144,21 +166,19 @@ def process_submission(
return False return False
data, errors = parse_and_validate(json_str) data, errors = parse_and_validate(json_str)
if errors: if errors or data is None:
error_list = "\n".join(f"- {e}" for e in errors) error_list = "\n".join(f"- {e}" for e in errors) if errors else "Unknown error"
add_issue_comment(issue_number, f"❌ **Validation Failed**\n\n{error_list}") add_issue_comment(issue_number, f"❌ **Validation Failed**\n\n{error_list}")
return False return False
# Normalize to list # Normalize to list
submissions = data if isinstance(data, list) else [data] submissions: list[dict] = data if isinstance(data, list) else [data]
# Generate contributor UUID from GitHub ID # Generate contributor UUID from GitHub ID
contributor_uuid = generate_contributor_uuid(author_id) contributor_uuid = generate_contributor_uuid(author_id)
# Extract contributor name from issue form (or default to GitHub username) # Extract contributor name from issue form (None means user opted out of attribution)
contributor_name = extract_contributor_name_from_issue_body(issue_body) contributor_name = extract_contributor_name_from_issue_body(issue_body)
if not contributor_name:
contributor_name = f"@{author_username}"
# Add metadata to each submission # Add metadata to each submission
now = datetime.now(timezone.utc) now = datetime.now(timezone.utc)
@@ -167,14 +187,15 @@ def process_submission(
for submission in submissions: for submission in submissions:
submission["contributor_uuid"] = contributor_uuid submission["contributor_uuid"] = contributor_uuid
submission["contributor_name"] = contributor_name if contributor_name:
submission["contributor_name"] = contributor_name
submission["creation_timestamp"] = timestamp_str submission["creation_timestamp"] = timestamp_str
# Generate unique filename # Generate unique filename
content_json = json.dumps(submissions, indent=2, sort_keys=True) content_json = json.dumps(submissions, indent=2, sort_keys=True)
content_hash = compute_content_hash(content_json) content_hash = compute_content_hash(content_json)
filename = generate_submission_filename(author_username, date_str, content_hash) filename = generate_submission_filename(author_username, date_str, content_hash)
file_path = f"community/{filename}" file_path = f"community/{date_str}/{filename}"
# Create branch # Create branch
branch_name = f"community-submission-{issue_number}" branch_name = f"community-submission-{issue_number}"
@@ -185,14 +206,53 @@ def process_submission(
commit_message = f"Add community submission from @{author_username} (closes #{issue_number})" commit_message = f"Add community submission from @{author_username} (closes #{issue_number})"
create_or_update_file(file_path, content_json, commit_message, branch_name) create_or_update_file(file_path, content_json, commit_message, branch_name)
# Update schema with any new tags (modifies v1 in place)
schema_updated = False
new_tags = []
try:
# Build tag registry from new submissions
tag_registry = build_tag_type_registry(submissions)
# Get current schema and merge existing tags
current_schema = load_schema()
existing_tags = get_existing_tag_definitions(current_schema)
# Merge existing tags into registry
for tag_name, tag_def in existing_tags.items():
if tag_name not in tag_registry:
tag_type = tag_def.get("type", "string")
tag_registry[tag_name] = tag_type
# Check for new tags
new_tags = check_for_new_tags(tag_registry, current_schema)
if new_tags:
# Generate updated schema
updated_schema = generate_updated_schema(current_schema, tag_registry)
schema_json = json.dumps(updated_schema, indent=2) + "\n"
create_or_update_file(
"schemas/community_submission.v1.schema.json",
schema_json,
f"Update schema with new tags: {', '.join(new_tags)}",
branch_name
)
schema_updated = True
except Exception as e:
print(f"Warning: Could not update schema: {e}", file=sys.stderr)
# Create PR # Create PR
schema_note = ""
if schema_updated:
schema_note = f"\n**Schema Updated:** Added new tags: `{', '.join(new_tags)}`\n"
pr_body = f"""## Community Submission pr_body = f"""## Community Submission
Adds {len(submissions)} submission(s) from @{author_username}. Adds {len(submissions)} submission(s) from @{author_username}.
**File:** `{file_path}` **File:** `{file_path}`
**Contributor UUID:** `{contributor_uuid}` **Contributor UUID:** `{contributor_uuid}`
{schema_note}
Closes #{issue_number} Closes #{issue_number}
--- ---
@@ -17,7 +17,7 @@ import pandas as pd
COMMUNITY_DIR = Path(__file__).parent.parent.parent / "community" COMMUNITY_DIR = Path(__file__).parent.parent.parent / "community"
OUT_ROOT = Path("data/planequery_aircraft") OUT_ROOT = Path("data/openairframes")
def read_all_submissions(community_dir: Path) -> list[dict]: def read_all_submissions(community_dir: Path) -> list[dict]:
@@ -47,7 +47,7 @@ def submissions_to_dataframe(submissions: list[dict]) -> pd.DataFrame:
- creation_timestamp (first) - creation_timestamp (first)
- transponder_code_hex - transponder_code_hex
- registration_number - registration_number
- planequery_airframe_id - openairframes_id
- contributor_name - contributor_name
- [other columns alphabetically] - [other columns alphabetically]
- contributor_uuid (last) - contributor_uuid (last)
@@ -62,7 +62,7 @@ def submissions_to_dataframe(submissions: list[dict]) -> pd.DataFrame:
"creation_timestamp", "creation_timestamp",
"transponder_code_hex", "transponder_code_hex",
"registration_number", "registration_number",
"planequery_airframe_id", "openairframes_id",
"contributor_name", "contributor_name",
"contributor_uuid", "contributor_uuid",
] ]
@@ -78,7 +78,7 @@ def submissions_to_dataframe(submissions: list[dict]) -> pd.DataFrame:
"creation_timestamp", "creation_timestamp",
"transponder_code_hex", "transponder_code_hex",
"registration_number", "registration_number",
"planequery_airframe_id", "openairframes_id",
"contributor_name", "contributor_name",
] ]
last_cols = ["contributor_uuid"] last_cols = ["contributor_uuid"]
@@ -108,7 +108,7 @@ def main():
"creation_timestamp", "creation_timestamp",
"transponder_code_hex", "transponder_code_hex",
"registration_number", "registration_number",
"planequery_airframe_id", "openairframes_id",
"contributor_name", "contributor_name",
"tags", "tags",
"contributor_uuid", "contributor_uuid",
@@ -127,7 +127,7 @@ def main():
# Output # Output
OUT_ROOT.mkdir(parents=True, exist_ok=True) OUT_ROOT.mkdir(parents=True, exist_ok=True)
output_file = OUT_ROOT / f"planequery_aircraft_community_{start_date_str}_{date_str}.csv" output_file = OUT_ROOT / f"openairframes_community_{start_date_str}_{date_str}.csv"
df.to_csv(output_file, index=False) df.to_csv(output_file, index=False)
+50 -3
@@ -30,7 +30,8 @@ def read_all_submissions(community_dir: Path | None = None) -> list[dict]:
all_submissions = [] all_submissions = []
for json_file in sorted(community_dir.glob("*.json")): # Search both root directory and date subdirectories (e.g., 2026-02-12/)
for json_file in sorted(community_dir.glob("**/*.json")):
try: try:
with open(json_file) as f: with open(json_file) as f:
data = json.load(f) data = json.load(f)
@@ -50,6 +51,52 @@ def read_all_submissions(community_dir: Path | None = None) -> list[dict]:
return all_submissions return all_submissions
def get_python_type_name(value) -> str:
"""Get a normalized type name for a value."""
if value is None:
return "null"
if isinstance(value, bool):
return "boolean"
if isinstance(value, int):
return "integer"
if isinstance(value, float):
return "number"
if isinstance(value, str):
return "string"
if isinstance(value, list):
return "array"
if isinstance(value, dict):
return "object"
return type(value).__name__
def build_tag_type_registry(submissions: list[dict]) -> dict[str, str]:
"""
Build a registry of tag names to their expected types from existing submissions.
Args:
submissions: List of existing submission dictionaries
Returns:
Dict mapping tag name to expected type (e.g., {"internet": "string", "year_built": "integer"})
"""
tag_types = {}
for submission in submissions:
tags = submission.get("tags", {})
if not isinstance(tags, dict):
continue
for key, value in tags.items():
inferred_type = get_python_type_name(value)
if key not in tag_types:
tag_types[key] = inferred_type
# If there's a conflict, keep the first type (it's already in use)
return tag_types
def group_by_identifier(submissions: list[dict]) -> dict[str, list[dict]]: def group_by_identifier(submissions: list[dict]) -> dict[str, list[dict]]:
""" """
Group submissions by their identifier (registration, transponder, or airframe ID). Group submissions by their identifier (registration, transponder, or airframe ID).
@@ -65,8 +112,8 @@ def group_by_identifier(submissions: list[dict]) -> dict[str, list[dict]]:
key = f"reg:{submission['registration_number']}" key = f"reg:{submission['registration_number']}"
elif "transponder_code_hex" in submission: elif "transponder_code_hex" in submission:
key = f"icao:{submission['transponder_code_hex']}" key = f"icao:{submission['transponder_code_hex']}"
elif "planequery_airframe_id" in submission: elif "openairframes_id" in submission:
key = f"id:{submission['planequery_airframe_id']}" key = f"id:{submission['openairframes_id']}"
else: else:
key = "_unknown" key = "_unknown"
+66
@@ -0,0 +1,66 @@
#!/usr/bin/env python3
"""
Regenerate schema for a PR branch after main has been merged in.
This script looks at the submission files in this branch and updates
the schema if new tags were introduced.
Usage: python -m src.contributions.regenerate_pr_schema
"""
import json
import sys
from pathlib import Path
# Add parent to path for imports when running as script
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from src.contributions.read_community_data import read_all_submissions, build_tag_type_registry
from src.contributions.update_schema import (
get_existing_tag_definitions,
check_for_new_tags,
generate_updated_schema,
)
from src.contributions.schema import load_schema, SCHEMAS_DIR
def main():
"""Main entry point."""
# Load current schema
current_schema = load_schema()
# Get existing tag definitions from schema
existing_tags = get_existing_tag_definitions(current_schema)
# Read all submissions (including ones from this PR branch)
submissions = read_all_submissions()
if not submissions:
print("No submissions found")
return
# Build tag registry from all submissions
tag_registry = build_tag_type_registry(submissions)
# Check for new tags not in the current schema
new_tags = check_for_new_tags(tag_registry, current_schema)
if new_tags:
print(f"Found new tags: {new_tags}")
print("Updating schema...")
# Generate updated schema
updated_schema = generate_updated_schema(current_schema, tag_registry)
# Write updated schema (in place)
schema_path = SCHEMAS_DIR / "community_submission.v1.schema.json"
with open(schema_path, 'w') as f:
json.dump(updated_schema, f, indent=2)
f.write("\n")
print(f"Updated {schema_path}")
else:
print("No new tags found, schema is up to date")
if __name__ == "__main__":
main()
+116 -8
@@ -10,12 +10,59 @@ except ImportError:
Draft202012Validator = None Draft202012Validator = None
SCHEMA_PATH = Path(__file__).parent.parent.parent / "schemas" / "community_submission.v1.schema.json" SCHEMAS_DIR = Path(__file__).parent.parent.parent / "schemas"
# For backwards compatibility
SCHEMA_PATH = SCHEMAS_DIR / "community_submission.v1.schema.json"
def load_schema() -> dict: def get_latest_schema_version() -> int:
"""Load the community submission schema.""" """
with open(SCHEMA_PATH) as f: Find the latest schema version number.
Returns:
Latest version number (e.g., 1, 2, 3)
"""
import re
pattern = re.compile(r"community_submission\.v(\d+)\.schema\.json$")
max_version = 0
for path in SCHEMAS_DIR.glob("community_submission.v*.schema.json"):
match = pattern.search(path.name)
if match:
version = int(match.group(1))
max_version = max(max_version, version)
return max_version
def get_schema_path(version: int | None = None) -> Path:
"""
Get path to a specific schema version, or latest if version is None.
Args:
version: Schema version number, or None for latest
Returns:
Path to schema file
"""
if version is None:
version = get_latest_schema_version()
return SCHEMAS_DIR / f"community_submission.v{version}.schema.json"
def load_schema(version: int | None = None) -> dict:
"""
Load the community submission schema.
Args:
version: Schema version to load. If None, loads the latest version.
Returns:
Schema dict
"""
schema_path = get_schema_path(version)
with open(schema_path) as f:
return json.load(f) return json.load(f)
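The version lookup just scans the schemas directory for the highest `v<N>` suffix; a toy sketch of the same regex logic over hypothetical filenames:

```python
import re

names = [
    "community_submission.v1.schema.json",
    "community_submission.v2.schema.json",
    "community_submission.v10.schema.json",
]
pattern = re.compile(r"community_submission\.v(\d+)\.schema\.json$")
latest = max(int(m.group(1)) for n in names if (m := pattern.search(n)))
print(latest)  # 10 -> get_schema_path() would resolve community_submission.v10.schema.json
```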
@@ -50,11 +97,36 @@ def validate_submission(data: dict | list, schema: dict | None = None) -> list[s
return errors return errors
def download_github_attachment(url: str) -> str | None:
"""
Download content from a GitHub attachment URL.
Args:
url: GitHub attachment URL (e.g., https://github.com/user-attachments/files/...)
Returns:
File content as string, or None if download failed
"""
import urllib.request
import urllib.error
try:
req = urllib.request.Request(url, headers={"User-Agent": "OpenAirframes-Bot"})
with urllib.request.urlopen(req, timeout=30) as response:
return response.read().decode("utf-8")
except (urllib.error.URLError, urllib.error.HTTPError, UnicodeDecodeError) as e:
print(f"Failed to download attachment from {url}: {e}")
return None
def extract_json_from_issue_body(body: str) -> str | None: def extract_json_from_issue_body(body: str) -> str | None:
""" """
Extract JSON from GitHub issue body. Extract JSON from GitHub issue body.
Looks for JSON in the 'Submission JSON' section wrapped in code blocks. Looks for JSON in the 'Submission JSON' section, either:
- A GitHub file attachment URL (drag-and-drop .json file)
- Wrapped in code blocks (```json ... ``` or ``` ... ```)
- Or raw JSON after the header
Args: Args:
body: The issue body text body: The issue body text
@@ -62,13 +134,49 @@ def extract_json_from_issue_body(body: str) -> str | None:
Returns: Returns:
Extracted JSON string or None if not found Extracted JSON string or None if not found
""" """
# Match JSON in "### Submission JSON" section # Try: GitHub attachment URL in the Submission JSON section
pattern = r"### Submission JSON\s*\n\s*```(?:json)?\s*\n([\s\S]*?)\n\s*```" # Format: [filename.json](https://github.com/user-attachments/files/...)
match = re.search(pattern, body) # Or just the raw URL
pattern_attachment = r"### Submission JSON\s*\n[\s\S]*?(https://github\.com/(?:user-attachments/files|.*?/files)/[^\s\)\]]+\.json)"
match = re.search(pattern_attachment, body)
if match:
url = match.group(1)
content = download_github_attachment(url)
if content:
return content.strip()
# Also check for GitHub user-attachments URL anywhere in submission section
pattern_attachment_alt = r"\[.*?\.json\]\((https://github\.com/[^\)]+)\)"
match = re.search(pattern_attachment_alt, body)
if match:
url = match.group(1)
if ".json" in url or "user-attachments" in url:
content = download_github_attachment(url)
if content:
return content.strip()
# Try: JSON in code blocks after "### Submission JSON"
pattern_codeblock = r"### Submission JSON\s*\n\s*```(?:json)?\s*\n([\s\S]*?)\n\s*```"
match = re.search(pattern_codeblock, body)
if match: if match:
return match.group(1).strip() return match.group(1).strip()
# Try: Raw JSON after "### Submission JSON" until next section or end
pattern_raw = r"### Submission JSON\s*\n\s*([\[{][\s\S]*?[\]}])(?=\n###|\n\n###|$)"
match = re.search(pattern_raw, body)
if match:
return match.group(1).strip()
# Try: Any JSON object/array in the body (fallback)
pattern_any = r"([\[{][\s\S]*?[\]}])"
for match in re.finditer(pattern_any, body):
candidate = match.group(1).strip()
# Validate it looks like JSON
if candidate.startswith('{') and candidate.endswith('}'):
return candidate
if candidate.startswith('[') and candidate.endswith(']'):
return candidate
return None return None
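A quick usage sketch of the fallback chain; the issue body below is invented and exercises the "raw JSON after the header" case rather than an attachment or code block:

```python
from src.contributions.schema import extract_json_from_issue_body

body = (
    "### Submission JSON\n\n"
    '{"registration_number": "N12345", "tags": {"internet": "starlink"}}\n'
)
# No attachment URL and no code fence, so the raw-JSON pattern picks it up.
print(extract_json_from_issue_body(body))
```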
+154
@@ -0,0 +1,154 @@
#!/usr/bin/env python3
"""
Update the schema with tag type definitions from existing submissions.
This script reads all community submissions and generates a new schema version
that includes explicit type definitions for all known tags.
When new tags are introduced, a new schema version is created (e.g., v1 -> v2 -> v3).
Usage:
python -m src.contributions.update_schema
python -m src.contributions.update_schema --check # Check if update needed
"""
import argparse
import json
import sys
from pathlib import Path
from .read_community_data import read_all_submissions, build_tag_type_registry
from .schema import SCHEMAS_DIR, get_latest_schema_version, get_schema_path, load_schema
def get_existing_tag_definitions(schema: dict) -> dict[str, dict]:
"""Extract existing tag property definitions from schema."""
tags_props = schema.get("properties", {}).get("tags", {}).get("properties", {})
return tags_props
def type_name_to_json_schema(type_name: str) -> dict:
"""Convert a type name to a JSON Schema type definition."""
type_map = {
"string": {"type": "string"},
"integer": {"type": "integer"},
"number": {"type": "number"},
"boolean": {"type": "boolean"},
"null": {"type": "null"},
"array": {"type": "array", "items": {"$ref": "#/$defs/tagScalar"}},
"object": {"type": "object", "additionalProperties": {"$ref": "#/$defs/tagScalar"}},
}
return type_map.get(type_name, {"$ref": "#/$defs/tagValue"})
def generate_updated_schema(base_schema: dict, tag_registry: dict[str, str]) -> dict:
"""
Generate an updated schema with explicit tag definitions.
Args:
base_schema: The current schema to update
tag_registry: Dict mapping tag name to type name
Returns:
Updated schema dict
"""
schema = json.loads(json.dumps(base_schema)) # Deep copy
# Build tag properties with explicit types
tag_properties = {}
for tag_name, type_name in sorted(tag_registry.items()):
tag_properties[tag_name] = type_name_to_json_schema(type_name)
# Only add/update the properties key within tags, preserve everything else
if "properties" in schema and "tags" in schema["properties"]:
schema["properties"]["tags"]["properties"] = tag_properties
return schema
def check_for_new_tags(tag_registry: dict[str, str], current_schema: dict) -> list[str]:
"""
Check which tags in the registry are not yet defined in the schema.
Returns:
List of new tag names
"""
existing_tags = get_existing_tag_definitions(current_schema)
return [tag for tag in tag_registry if tag not in existing_tags]
def update_schema_file(
tag_registry: dict[str, str],
check_only: bool = False
) -> tuple[bool, list[str]]:
"""
Update the v1 schema file with new tag definitions.
Args:
tag_registry: Dict mapping tag name to type name
check_only: If True, only check if update is needed without writing
Returns:
Tuple of (was_updated, list_of_new_tags)
"""
current_schema = load_schema()
# Find new tags
new_tags = check_for_new_tags(tag_registry, current_schema)
if not new_tags:
return False, []
if check_only:
return True, new_tags
# Generate and write updated schema (in place)
updated_schema = generate_updated_schema(current_schema, tag_registry)
schema_path = get_schema_path()
with open(schema_path, "w") as f:
json.dump(updated_schema, f, indent=2)
f.write("\n")
return True, new_tags
def update_schema_from_submissions(check_only: bool = False) -> tuple[bool, list[str]]:
"""
Read all submissions and update the schema if needed.
Args:
check_only: If True, only check if update is needed without writing
Returns:
Tuple of (was_updated, list_of_new_tags)
"""
submissions = read_all_submissions()
tag_registry = build_tag_type_registry(submissions)
return update_schema_file(tag_registry, check_only)
def main():
parser = argparse.ArgumentParser(description="Update schema with tag definitions")
parser.add_argument("--check", action="store_true", help="Check if update needed without writing")
args = parser.parse_args()
was_updated, new_tags = update_schema_from_submissions(check_only=args.check)
if args.check:
if was_updated:
print(f"Schema update needed. New tags: {', '.join(new_tags)}")
sys.exit(1)
else:
print("Schema is up to date")
sys.exit(0)
else:
if was_updated:
print(f"Updated {get_schema_path()}")
print(f"Added tags: {', '.join(new_tags)}")
else:
print("No update needed")
if __name__ == "__main__":
main()
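Beyond the CLI entry point, the two core helpers can be exercised directly; a toy schema and registry (both invented) illustrate the in-place merge:

```python
from src.contributions.update_schema import check_for_new_tags, generate_updated_schema

base_schema = {
    "properties": {
        "tags": {"type": "object", "properties": {"internet": {"type": "string"}}}
    }
}
tag_registry = {"internet": "string", "year_built": "integer"}

print(check_for_new_tags(tag_registry, base_schema))  # ['year_built']
updated = generate_updated_schema(base_schema, tag_registry)
print(updated["properties"]["tags"]["properties"]["year_built"])  # {'type': 'integer'}
```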
+78
@@ -7,6 +7,7 @@ submissions when issues are opened or edited.
Usage: Usage:
python -m src.contributions.validate_submission --issue-body "..." python -m src.contributions.validate_submission --issue-body "..."
python -m src.contributions.validate_submission --issue-body-file /path/to/body.txt
python -m src.contributions.validate_submission --file submission.json python -m src.contributions.validate_submission --file submission.json
echo '{"registration_number": "N12345"}' | python -m src.contributions.validate_submission --stdin echo '{"registration_number": "N12345"}' | python -m src.contributions.validate_submission --stdin
@@ -23,6 +24,7 @@ import urllib.request
import urllib.error import urllib.error
from .schema import extract_json_from_issue_body, parse_and_validate, load_schema from .schema import extract_json_from_issue_body, parse_and_validate, load_schema
from .read_community_data import read_all_submissions, build_tag_type_registry, get_python_type_name
def github_api_request(method: str, endpoint: str, data: dict | None = None) -> dict: def github_api_request(method: str, endpoint: str, data: dict | None = None) -> dict:
@@ -65,6 +67,40 @@ def remove_issue_label(issue_number: int, label: str) -> None:
pass # Label might not exist pass # Label might not exist
def validate_tag_consistency(data: dict | list, tag_registry: dict[str, str]) -> list[str]:
"""
Check that tag types in new submissions match existing tag types.
Args:
data: Single submission dict or list of submissions
tag_registry: Dict mapping tag name to expected type
Returns:
List of error messages. Empty list means validation passed.
"""
errors = []
submissions = data if isinstance(data, list) else [data]
for i, submission in enumerate(submissions):
prefix = f"[{i}] " if len(submissions) > 1 else ""
tags = submission.get("tags", {})
if not isinstance(tags, dict):
continue
for key, value in tags.items():
actual_type = get_python_type_name(value)
if key in tag_registry:
expected_type = tag_registry[key]
if actual_type != expected_type:
errors.append(
f"{prefix}tags.{key}: expected type '{expected_type}', got '{actual_type}'"
)
return errors
def validate_and_report(json_str: str, issue_number: int | None = None) -> bool: def validate_and_report(json_str: str, issue_number: int | None = None) -> bool:
""" """
Validate JSON and optionally report to GitHub issue. Validate JSON and optionally report to GitHub issue.
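A minimal call of the new consistency check against a registry built from existing submissions; the values here are made up:

```python
from src.contributions.validate_submission import validate_tag_consistency

tag_registry = {"year_built": "integer"}
submission = {"registration_number": "N12345", "tags": {"year_built": "1998"}}
# The string "1998" conflicts with the integer type already in use for this tag.
print(validate_tag_consistency(submission, tag_registry))
# ["tags.year_built: expected type 'integer', got 'string'"]
```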
@@ -90,6 +126,33 @@ def validate_and_report(json_str: str, issue_number: int | None = None) -> bool:
return False return False
# Check tag type consistency against existing submissions
if data is not None:
try:
existing_submissions = read_all_submissions()
tag_registry = build_tag_type_registry(existing_submissions)
tag_errors = validate_tag_consistency(data, tag_registry)
if tag_errors:
error_list = "\n".join(f"- {e}" for e in tag_errors)
message = (
f"❌ **Tag Type Mismatch**\n\n"
f"Your submission uses tags with types that don't match existing submissions:\n\n"
f"{error_list}\n\n"
f"Please use the same type as existing tags, or use a different tag name."
)
print(message, file=sys.stderr)
if issue_number:
add_issue_comment(issue_number, message)
remove_issue_label(issue_number, "validated")
return False
except Exception as e:
# Don't fail validation if we can't read existing submissions
print(f"Warning: Could not check tag consistency: {e}", file=sys.stderr)
count = len(data) if isinstance(data, list) else 1 count = len(data) if isinstance(data, list) else 1
message = f"✅ **Validation Passed**\n\n{count} submission(s) validated successfully against the schema.\n\nA maintainer can approve this submission by adding the `approved` label." message = f"✅ **Validation Passed**\n\n{count} submission(s) validated successfully against the schema.\n\nA maintainer can approve this submission by adding the `approved` label."
@@ -106,6 +169,7 @@ def main():
parser = argparse.ArgumentParser(description="Validate community submission JSON") parser = argparse.ArgumentParser(description="Validate community submission JSON")
source_group = parser.add_mutually_exclusive_group(required=True) source_group = parser.add_mutually_exclusive_group(required=True)
source_group.add_argument("--issue-body", help="Issue body text containing JSON") source_group.add_argument("--issue-body", help="Issue body text containing JSON")
source_group.add_argument("--issue-body-file", help="File containing issue body text")
source_group.add_argument("--file", help="JSON file to validate") source_group.add_argument("--file", help="JSON file to validate")
source_group.add_argument("--stdin", action="store_true", help="Read JSON from stdin") source_group.add_argument("--stdin", action="store_true", help="Read JSON from stdin")
@@ -125,6 +189,20 @@ def main():
"Please ensure your JSON is in the 'Submission JSON' field wrapped in code blocks." "Please ensure your JSON is in the 'Submission JSON' field wrapped in code blocks."
) )
sys.exit(1) sys.exit(1)
elif args.issue_body_file:
with open(args.issue_body_file) as f:
issue_body = f.read()
json_str = extract_json_from_issue_body(issue_body)
if not json_str:
print("❌ Could not extract JSON from issue body", file=sys.stderr)
print(f"Issue body:\n{issue_body}", file=sys.stderr)
if args.issue_number:
add_issue_comment(
args.issue_number,
"❌ **Validation Failed**\n\nCould not extract JSON from submission. "
"Please ensure your JSON is in the 'Submission JSON' field."
)
sys.exit(1)
elif args.file: elif args.file:
with open(args.file) as f: with open(args.file) as f:
json_str = f.read() json_str = f.read()
@@ -74,10 +74,10 @@ if __name__ == '__main__':
) )
# Save the result # Save the result
OUT_ROOT = Path("data/planequery_aircraft") OUT_ROOT = Path("data/openairframes")
OUT_ROOT.mkdir(parents=True, exist_ok=True) OUT_ROOT.mkdir(parents=True, exist_ok=True)
output_file = OUT_ROOT / f"planequery_aircraft_adsb_{start_date_str}_{date_str}.csv" output_file = OUT_ROOT / f"openairframes_adsb_{start_date_str}_{date_str}.csv"
df_combined.write_csv(output_file) df_combined.write_csv(output_file)
print(f"Saved: {output_file}") print(f"Saved: {output_file}")
@@ -22,12 +22,19 @@ if not zip_path.exists():
body = r.read() body = r.read()
zip_path.write_bytes(body) zip_path.write_bytes(body)
OUT_ROOT = Path("data/planequery_aircraft") OUT_ROOT = Path("data/openairframes")
OUT_ROOT.mkdir(parents=True, exist_ok=True) OUT_ROOT.mkdir(parents=True, exist_ok=True)
from derive_from_faa_master_txt import convert_faa_master_txt_to_df, concat_faa_historical_df from derive_from_faa_master_txt import convert_faa_master_txt_to_df, concat_faa_historical_df
from get_latest_planequery_aircraft_release import get_latest_aircraft_faa_csv_df from get_latest_release import get_latest_aircraft_faa_csv_df
df_new = convert_faa_master_txt_to_df(zip_path, date_str) df_new = convert_faa_master_txt_to_df(zip_path, date_str)
df_base, start_date_str = get_latest_aircraft_faa_csv_df()
df_base = concat_faa_historical_df(df_base, df_new) try:
assert df_base['download_date'].is_monotonic_increasing, "download_date is not monotonic increasing" df_base, start_date_str = get_latest_aircraft_faa_csv_df()
df_base.to_csv(OUT_ROOT / f"planequery_aircraft_faa_{start_date_str}_{date_str}.csv", index=False) df_base = concat_faa_historical_df(df_base, df_new)
assert df_base['download_date'].is_monotonic_increasing, "download_date is not monotonic increasing"
except Exception as e:
print(f"No existing FAA release found, using only new data: {e}")
df_base = df_new
start_date_str = date_str
df_base.to_csv(OUT_ROOT / f"openairframes_faa_{start_date_str}_{date_str}.csv", index=False)
+5 -5
@@ -29,8 +29,8 @@ def convert_faa_master_txt_to_df(zip_path: Path, date: str):
certification = pd.json_normalize(df["certification"].where(df["certification"].notna(), {})).add_prefix("certificate_") certification = pd.json_normalize(df["certification"].where(df["certification"].notna(), {})).add_prefix("certificate_")
df = df.drop(columns="certification").join(certification) df = df.drop(columns="certification").join(certification)
# Create planequery_airframe_id # Create openairframes_id
df["planequery_airframe_id"] = ( df["openairframes_id"] = (
normalize(df["aircraft_manufacturer"]) normalize(df["aircraft_manufacturer"])
+ "|" + "|"
+ normalize(df["aircraft_model"]) + normalize(df["aircraft_model"])
@@ -38,11 +38,11 @@ def convert_faa_master_txt_to_df(zip_path: Path, date: str):
+ normalize(df["serial_number"]) + normalize(df["serial_number"])
) )
# Move planequery_airframe_id to come after registration_number # Move openairframes_id to come after registration_number
cols = df.columns.tolist() cols = df.columns.tolist()
cols.remove("planequery_airframe_id") cols.remove("openairframes_id")
reg_idx = cols.index("registration_number") reg_idx = cols.index("registration_number")
cols.insert(reg_idx + 1, "planequery_airframe_id") cols.insert(reg_idx + 1, "openairframes_id")
df = df[cols] df = df[cols]
# Convert all NaN to empty strings # Convert all NaN to empty strings
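The identifier is the pipe-joined, normalized airframe fields (manufacturer, model, serial number, plus whatever else the full expression includes). A sketch of the shape only, with a stand-in `normalize` and invented values:

```python
# Stand-in normalization for illustration; the repository's normalize() may differ.
def normalize(value: str) -> str:
    return str(value).strip().upper().replace(" ", "")

parts = [normalize("Cessna"), normalize("172S"), normalize("172S12345")]
print("|".join(parts))  # CESSNA|172S|172S12345
```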
@@ -9,7 +9,7 @@ import urllib.error
import json import json
REPO = "PlaneQuery/planequery-aircraft" REPO = "PlaneQuery/openairframes"
LATEST_RELEASE_URL = f"https://api.github.com/repos/{REPO}/releases/latest" LATEST_RELEASE_URL = f"https://api.github.com/repos/{REPO}/releases/latest"
@@ -31,7 +31,7 @@ def get_latest_release_assets(repo: str = REPO, github_token: Optional[str] = No
url = f"https://api.github.com/repos/{repo}/releases/latest" url = f"https://api.github.com/repos/{repo}/releases/latest"
headers = { headers = {
"Accept": "application/vnd.github+json", "Accept": "application/vnd.github+json",
"User-Agent": "planequery-aircraft-downloader/1.0", "User-Agent": "openairframes-downloader/1.0",
} }
if github_token: if github_token:
headers["Authorization"] = f"Bearer {github_token}" headers["Authorization"] = f"Bearer {github_token}"
@@ -80,7 +80,7 @@ def download_asset(asset: ReleaseAsset, out_path: Path, github_token: Optional[s
out_path.parent.mkdir(parents=True, exist_ok=True) out_path.parent.mkdir(parents=True, exist_ok=True)
headers = { headers = {
"User-Agent": "planequery-aircraft-downloader/1.0", "User-Agent": "openairframes-downloader/1.0",
"Accept": "application/octet-stream", "Accept": "application/octet-stream",
} }
if github_token: if github_token:
@@ -109,7 +109,7 @@ def download_latest_aircraft_csv(
repo: str = REPO, repo: str = REPO,
) -> Path: ) -> Path:
""" """
Download the latest planequery_aircraft_faa_*.csv file from the latest GitHub release. Download the latest openairframes_faa_*.csv file from the latest GitHub release.
Args: Args:
output_dir: Directory to save the downloaded file (default: "downloads") output_dir: Directory to save the downloaded file (default: "downloads")
@@ -121,10 +121,10 @@ def download_latest_aircraft_csv(
""" """
assets = get_latest_release_assets(repo, github_token=github_token) assets = get_latest_release_assets(repo, github_token=github_token)
try: try:
asset = pick_asset(assets, name_regex=r"^planequery_aircraft_faa_.*\.csv$") asset = pick_asset(assets, name_regex=r"^openairframes_faa_.*\.csv$")
except FileNotFoundError: except FileNotFoundError:
# Fallback to old naming pattern # Fallback to old naming pattern
asset = pick_asset(assets, name_regex=r"^planequery_aircraft_\d{4}-\d{2}-\d{2}_.*\.csv$") asset = pick_asset(assets, name_regex=r"^openairframes_\d{4}-\d{2}-\d{2}_.*\.csv$")
saved_to = download_asset(asset, output_dir / asset.name, github_token=github_token) saved_to = download_asset(asset, output_dir / asset.name, github_token=github_token)
print(f"Downloaded: {asset.name} ({asset.size} bytes) -> {saved_to}") print(f"Downloaded: {asset.name} ({asset.size} bytes) -> {saved_to}")
return saved_to return saved_to
@@ -136,11 +136,11 @@ def get_latest_aircraft_faa_csv_df():
'unique_regulatory_id': str, 'unique_regulatory_id': str,
'registrant_county': str}) 'registrant_county': str})
df = df.fillna("") df = df.fillna("")
# Extract start date from filename pattern: planequery_aircraft_faa_{start_date}_{end_date}.csv # Extract start date from filename pattern: openairframes_faa_{start_date}_{end_date}.csv
match = re.search(r"planequery_aircraft_faa_(\d{4}-\d{2}-\d{2})_", str(csv_path)) match = re.search(r"openairframes_faa_(\d{4}-\d{2}-\d{2})_", str(csv_path))
if not match: if not match:
# Fallback to old naming pattern: planequery_aircraft_{start_date}_{end_date}.csv # Fallback to old naming pattern: openairframes_{start_date}_{end_date}.csv
match = re.search(r"planequery_aircraft_(\d{4}-\d{2}-\d{2})_", str(csv_path)) match = re.search(r"openairframes_(\d{4}-\d{2}-\d{2})_", str(csv_path))
if not match: if not match:
raise ValueError(f"Could not extract date from filename: {csv_path.name}") raise ValueError(f"Could not extract date from filename: {csv_path.name}")
@@ -154,7 +154,7 @@ def download_latest_aircraft_adsb_csv(
repo: str = REPO, repo: str = REPO,
) -> Path: ) -> Path:
""" """
Download the latest planequery_aircraft_adsb_*.csv file from the latest GitHub release. Download the latest openairframes_adsb_*.csv file from the latest GitHub release.
Args: Args:
output_dir: Directory to save the downloaded file (default: "downloads") output_dir: Directory to save the downloaded file (default: "downloads")
@@ -165,7 +165,7 @@ def download_latest_aircraft_adsb_csv(
Path to the downloaded file Path to the downloaded file
""" """
assets = get_latest_release_assets(repo, github_token=github_token) assets = get_latest_release_assets(repo, github_token=github_token)
asset = pick_asset(assets, name_regex=r"^planequery_aircraft_adsb_.*\.csv$") asset = pick_asset(assets, name_regex=r"^openairframes_adsb_.*\.csv$")
saved_to = download_asset(asset, output_dir / asset.name, github_token=github_token) saved_to = download_asset(asset, output_dir / asset.name, github_token=github_token)
print(f"Downloaded: {asset.name} ({asset.size} bytes) -> {saved_to}") print(f"Downloaded: {asset.name} ({asset.size} bytes) -> {saved_to}")
return saved_to return saved_to
@@ -176,8 +176,8 @@ def get_latest_aircraft_adsb_csv_df():
import pandas as pd import pandas as pd
df = pd.read_csv(csv_path) df = pd.read_csv(csv_path)
df = df.fillna("") df = df.fillna("")
# Extract start date from filename pattern: planequery_aircraft_adsb_{start_date}_{end_date}.csv # Extract start date from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv
match = re.search(r"planequery_aircraft_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path)) match = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path))
if not match: if not match:
raise ValueError(f"Could not extract date from filename: {csv_path.name}") raise ValueError(f"Could not extract date from filename: {csv_path.name}")