Compare commits

...

7 Commits

Author SHA1 Message Date
ggman12 6a250a63fb fix None value comparison 2026-02-14 20:21:32 -05:00
ggman12 9e24fcbc63 update integrity checker. Hopefully solve issue. 2026-02-14 19:56:25 -05:00
ggman12 8ce04f1f83 Revert "update for historical run"
This reverts commit ccf55b2308.
2026-02-14 18:44:21 -05:00
ggman12 9441761ac9 use temp release too. 2026-02-14 18:43:25 -05:00
ggman12 ccf55b2308 update for historical run 2026-02-14 15:57:16 -05:00
ggman12 76eaf118ef add run_local.py 2026-02-14 15:54:36 -05:00
ggman12 0fcbad0fbc let mictronics retry 2026-02-14 15:07:08 -05:00
7 changed files with 247 additions and 30 deletions
+32 -14
View File
@@ -95,20 +95,27 @@ jobs:
# Verify tar integrity # Verify tar integrity
tar -tf extracted_data.tar > /dev/null && echo "Tar integrity check passed" || { echo "Tar integrity check FAILED"; exit 1; } tar -tf extracted_data.tar > /dev/null && echo "Tar integrity check passed" || { echo "Tar integrity check FAILED"; exit 1; }
# Create checksum of the FULL tar before splitting (for verification after reassembly) # Record tar size and checksum for verification after reassembly
echo "=== Creating checksum of full tar ===" echo "=== Recording tar metadata ==="
sha256sum extracted_data.tar > full_tar.sha256 ORIGINAL_SIZE=$(stat --format=%s extracted_data.tar)
cat full_tar.sha256 ORIGINAL_SHA=$(sha256sum extracted_data.tar | awk '{print $1}')
echo "Size: $ORIGINAL_SIZE"
echo "SHA256: $ORIGINAL_SHA"
# Split into 500MB chunks to avoid artifact upload issues # Split into 500MB chunks to avoid artifact upload issues
echo "=== Splitting tar into 500MB chunks ===" echo "=== Splitting tar into 500MB chunks ==="
mkdir -p tar_chunks mkdir -p tar_chunks
split -b 500M extracted_data.tar tar_chunks/extracted_data.tar.part_ split -b 500M extracted_data.tar tar_chunks/extracted_data.tar.part_
rm extracted_data.tar rm extracted_data.tar
mv full_tar.sha256 tar_chunks/
# Write metadata file (plain text so artifact upload won't skip it)
echo "$ORIGINAL_SHA extracted_data.tar" > tar_chunks/checksum.txt
echo "$ORIGINAL_SIZE" >> tar_chunks/checksum.txt
echo "=== Chunks created ===" echo "=== Chunks created ==="
ls -lah tar_chunks/ ls -lah tar_chunks/
echo "=== Checksum file ==="
cat tar_chunks/checksum.txt
else else
echo "ERROR: No extracted directories found, cannot create tar" echo "ERROR: No extracted directories found, cannot create tar"
exit 1 exit 1
@@ -179,19 +186,30 @@ jobs:
echo "=== Reassembled tar file info ===" echo "=== Reassembled tar file info ==="
ls -lah extracted_data.tar ls -lah extracted_data.tar
# Verify checksum of reassembled tar matches original # Verify integrity
echo "=== Verifying reassembled tar checksum ===" echo "=== Verifying reassembled tar ==="
echo "Original checksum:" if [ -f tar_chunks/checksum.txt ]; then
cat tar_chunks/full_tar.sha256 EXPECTED_SHA=$(head -1 tar_chunks/checksum.txt | awk '{print $1}')
echo "Reassembled checksum:" EXPECTED_SIZE=$(sed -n '2p' tar_chunks/checksum.txt)
sha256sum extracted_data.tar ACTUAL_SHA=$(sha256sum extracted_data.tar | awk '{print $1}')
sha256sum -c tar_chunks/full_tar.sha256 || { echo "ERROR: Reassembled tar checksum mismatch - data corrupted during transfer"; exit 1; } ACTUAL_SIZE=$(stat --format=%s extracted_data.tar)
echo "Checksum verified - data integrity confirmed" echo "Expected: SHA=$EXPECTED_SHA Size=$EXPECTED_SIZE"
echo "Actual: SHA=$ACTUAL_SHA Size=$ACTUAL_SIZE"
if [ "$EXPECTED_SHA" != "$ACTUAL_SHA" ] || [ "$EXPECTED_SIZE" != "$ACTUAL_SIZE" ]; then
echo "ERROR: Reassembled tar does not match original - data corrupted during transfer"
exit 1
fi
echo "Checksum and size verified"
else
echo "WARNING: No checksum file found, falling back to tar integrity check"
tar -tf extracted_data.tar > /dev/null || { echo "ERROR: Tar file is corrupted"; exit 1; }
echo "Tar integrity check passed"
fi
rm -rf tar_chunks rm -rf tar_chunks
echo "=== Extracting ===" echo "=== Extracting ==="
tar -xvf extracted_data.tar tar -xf extracted_data.tar
rm extracted_data.tar rm extracted_data.tar
echo "has_data=true" >> "$GITHUB_OUTPUT" echo "has_data=true" >> "$GITHUB_OUTPUT"
echo "=== Contents of data/output ===" echo "=== Contents of data/output ==="
@@ -302,6 +302,7 @@ jobs:
python-version: "3.14" python-version: "3.14"
- name: Run Mictronics DB release script - name: Run Mictronics DB release script
continue-on-error: true
run: | run: |
python -m src.contributions.create_daily_microtonics_release ${{ inputs.date && format('--date {0}', inputs.date) || '' }} python -m src.contributions.create_daily_microtonics_release ${{ inputs.date && format('--date {0}', inputs.date) || '' }}
ls -lah data/openairframes ls -lah data/openairframes
@@ -312,11 +313,12 @@ jobs:
name: mictronics-db name: mictronics-db
path: data/openairframes/mictronics-db_*.zip path: data/openairframes/mictronics-db_*.zip
retention-days: 1 retention-days: 1
if-no-files-found: ignore
create-release: create-release:
runs-on: ubuntu-latest runs-on: ubuntu-latest
needs: [build-faa, adsb-reduce, build-community] needs: [build-faa, adsb-reduce, build-community, build-adsbexchange-json, build-mictronics-db]
if: github.event_name != 'schedule' if: github.event_name != 'schedule' && !failure() && !cancelled()
steps: steps:
- name: Checkout for gh CLI - name: Checkout for gh CLI
uses: actions/checkout@v4 uses: actions/checkout@v4
@@ -351,6 +353,7 @@ jobs:
- name: Download Mictronics DB artifact - name: Download Mictronics DB artifact
uses: actions/download-artifact@v4 uses: actions/download-artifact@v4
continue-on-error: true
with: with:
name: mictronics-db name: mictronics-db
path: artifacts/mictronics path: artifacts/mictronics
@@ -405,8 +408,12 @@ jobs:
if [ -z "$JSON_FILE_ADSBX" ] || [ ! -f "$JSON_FILE_ADSBX" ]; then if [ -z "$JSON_FILE_ADSBX" ] || [ ! -f "$JSON_FILE_ADSBX" ]; then
MISSING_FILES="$MISSING_FILES ADSBX_JSON" MISSING_FILES="$MISSING_FILES ADSBX_JSON"
fi fi
# Optional files - warn but don't fail
OPTIONAL_MISSING=""
if [ -z "$ZIP_FILE_MICTRONICS" ] || [ ! -f "$ZIP_FILE_MICTRONICS" ]; then if [ -z "$ZIP_FILE_MICTRONICS" ] || [ ! -f "$ZIP_FILE_MICTRONICS" ]; then
MISSING_FILES="$MISSING_FILES MICTRONICS_ZIP" OPTIONAL_MISSING="$OPTIONAL_MISSING MICTRONICS_ZIP"
ZIP_FILE_MICTRONICS=""
fi fi
if [ -n "$MISSING_FILES" ]; then if [ -n "$MISSING_FILES" ]; then
@@ -425,7 +432,14 @@ jobs:
CSV_BASENAME_COMMUNITY=$(basename "$CSV_FILE_COMMUNITY" 2>/dev/null || echo "") CSV_BASENAME_COMMUNITY=$(basename "$CSV_FILE_COMMUNITY" 2>/dev/null || echo "")
ZIP_BASENAME=$(basename "$ZIP_FILE") ZIP_BASENAME=$(basename "$ZIP_FILE")
JSON_BASENAME_ADSBX=$(basename "$JSON_FILE_ADSBX") JSON_BASENAME_ADSBX=$(basename "$JSON_FILE_ADSBX")
ZIP_BASENAME_MICTRONICS=$(basename "$ZIP_FILE_MICTRONICS") ZIP_BASENAME_MICTRONICS=""
if [ -n "$ZIP_FILE_MICTRONICS" ]; then
ZIP_BASENAME_MICTRONICS=$(basename "$ZIP_FILE_MICTRONICS")
fi
if [ -n "$OPTIONAL_MISSING" ]; then
echo "WARNING: Optional files missing:$OPTIONAL_MISSING (will continue without them)"
fi
echo "date=$DATE" >> "$GITHUB_OUTPUT" echo "date=$DATE" >> "$GITHUB_OUTPUT"
echo "tag=$TAG" >> "$GITHUB_OUTPUT" echo "tag=$TAG" >> "$GITHUB_OUTPUT"
@@ -463,7 +477,7 @@ jobs:
with: with:
tag_name: ${{ steps.meta.outputs.tag }} tag_name: ${{ steps.meta.outputs.tag }}
name: ${{ steps.meta.outputs.name }} name: ${{ steps.meta.outputs.name }}
fail_on_unmatched_files: true fail_on_unmatched_files: false
body: | body: |
Automated daily snapshot generated at 06:00 UTC for ${{ steps.meta.outputs.date }}. Automated daily snapshot generated at 06:00 UTC for ${{ steps.meta.outputs.date }}.
@@ -473,7 +487,7 @@ jobs:
- ${{ steps.meta.outputs.csv_basename_community }} - ${{ steps.meta.outputs.csv_basename_community }}
- ${{ steps.meta.outputs.zip_basename }} - ${{ steps.meta.outputs.zip_basename }}
- ${{ steps.meta.outputs.json_basename_adsbx }} - ${{ steps.meta.outputs.json_basename_adsbx }}
- ${{ steps.meta.outputs.zip_basename_mictronics }} ${{ steps.meta.outputs.zip_basename_mictronics && format('- {0}', steps.meta.outputs.zip_basename_mictronics) || '' }}
files: | files: |
${{ steps.meta.outputs.csv_file_faa }} ${{ steps.meta.outputs.csv_file_faa }}
${{ steps.meta.outputs.csv_file_adsb }} ${{ steps.meta.outputs.csv_file_adsb }}
+8 -2
View File
@@ -82,8 +82,8 @@ def fetch_releases(version_date: str) -> list:
if version_date == "v2024.12.31": if version_date == "v2024.12.31":
year = "2025" year = "2025"
BASE_URL = f"https://api.github.com/repos/adsblol/globe_history_{year}/releases" BASE_URL = f"https://api.github.com/repos/adsblol/globe_history_{year}/releases"
# Match exact release name, exclude tmp releases # Match both normal and tmp releases
PATTERN = rf"^{re.escape(version_date)}-planes-readsb-prod-\d+$" PATTERN = rf"^{re.escape(version_date)}-planes-readsb-prod-\d+(tmp)?$"
releases = [] releases = []
page = 1 page = 1
@@ -582,6 +582,12 @@ def process_version_date(version_date: str, keep_folders: bool = False):
print(f"No releases found for {vd}.") print(f"No releases found for {vd}.")
return None return None
# Prefer non-tmp releases; only use tmp if no normal releases exist
normal_releases = [r for r in releases if "tmp" not in r["tag_name"]]
tmp_releases = [r for r in releases if "tmp" in r["tag_name"]]
releases = normal_releases if normal_releases else tmp_releases
print(f"Using {'normal' if normal_releases else 'tmp'} releases ({len(releases)} found)")
downloaded_files = [] downloaded_files = []
for release in releases: for release in releases:
tag_name = release["tag_name"] tag_name = release["tag_name"]
+6
View File
@@ -59,6 +59,12 @@ def download_and_extract(version_date: str) -> str | None:
print(f"No releases found for {version_date}") print(f"No releases found for {version_date}")
return None return None
# Prefer non-tmp releases; only use tmp if no normal releases exist
normal_releases = [r for r in releases if "tmp" not in r["tag_name"]]
tmp_releases = [r for r in releases if "tmp" in r["tag_name"]]
releases = normal_releases if normal_releases else tmp_releases
print(f"Using {'normal' if normal_releases else 'tmp'} releases ({len(releases)} found)")
downloaded_files = [] downloaded_files = []
for release in releases: for release in releases:
tag_name = release["tag_name"] tag_name = release["tag_name"]
+155
View File
@@ -0,0 +1,155 @@
#!/usr/bin/env python3
"""
Run the full ADS-B processing pipeline locally.
Downloads adsb.lol data, processes trace files, and outputs openairframes_adsb CSV.
Usage:
# Single day (yesterday by default)
python -m src.adsb.run_local
# Single day (specific date)
python -m src.adsb.run_local 2024-01-15
# Date range (inclusive)
python -m src.adsb.run_local 2024-01-01 2024-01-07
"""
import argparse
import os
import subprocess
import sys
from datetime import datetime, timedelta
def run_cmd(cmd: list[str], description: str) -> None:
    """Run a command and exit the process if it fails.

    Args:
        cmd: Program and arguments, passed to ``subprocess.run`` as a list
            (no shell is involved).
        description: Human-readable step name used in the error message.

    Raises:
        SystemExit: With the child's return code when it is non-zero, or
            with code 1 when the command could not be started at all.
    """
    # Flush so the banner appears before the child's own output when stdout
    # is piped; the child writes straight to the inherited stream.
    print(f"\n>>> {' '.join(cmd)}", flush=True)
    try:
        result = subprocess.run(cmd)
    except OSError as exc:
        # Missing or non-executable program: report it like any other failed
        # step instead of letting a traceback escape.
        print(f"ERROR: {description} could not be started: {exc}")
        sys.exit(1)
    if result.returncode != 0:
        print(f"ERROR: {description} failed with exit code {result.returncode}")
        sys.exit(result.returncode)
def _parse_date(value: str) -> datetime:
    """Parse a YYYY-MM-DD command-line value, reporting bad input via argparse."""
    try:
        return datetime.strptime(value, "%Y-%m-%d")
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"invalid date {value!r} (expected YYYY-MM-DD)"
        ) from None


def _print_banner(title: str) -> None:
    """Print a step title framed by separator lines, preceded by a blank line."""
    print("\n" + "=" * 60)
    print(title)
    print("=" * 60)


def main():
    """Run the three pipeline steps for a single day or an inclusive date range.

    Parses the command line, then invokes in order, each through ``run_cmd``
    (which exits the process on the first failing step):

    1. ``src.adsb.download_and_list_icaos``
    2. ``src.adsb.process_icao_chunk`` once per chunk id
    3. ``src.adsb.combine_chunks_to_csv``

    Finally prints the path and size of the expected CSV under
    ``./data/openairframes`` if that file exists.

    Raises:
        SystemExit: With code 2 on invalid arguments (malformed date,
            end date before start date, ``--chunks`` below 1), or with the
            failing step's code via ``run_cmd``.
    """
    # Function-scope import: the module header only imports datetime/timedelta.
    from datetime import timezone

    parser = argparse.ArgumentParser(
        description="Run full ADS-B processing pipeline locally",
        usage="python -m src.adsb.run_local [start_date] [end_date]"
    )
    parser.add_argument(
        "start_date",
        nargs="?",
        type=_parse_date,
        help="Start date (YYYY-MM-DD). Default: yesterday"
    )
    parser.add_argument(
        "end_date",
        nargs="?",
        type=_parse_date,
        help="End date (YYYY-MM-DD, inclusive). If omitted, processes single day"
    )
    parser.add_argument(
        "--chunks",
        type=int,
        default=4,
        help="Number of parallel chunks (default: 4)"
    )
    parser.add_argument(
        "--skip-base",
        action="store_true",
        help="Skip downloading and merging with base release"
    )
    args = parser.parse_args()

    if args.chunks < 1:
        parser.error("--chunks must be at least 1")

    # Determine dates. Default is yesterday in UTC; datetime.utcnow() is
    # deprecated, so an aware "now" is used instead (only the date is kept).
    if args.start_date is not None:
        start_date = args.start_date
    else:
        start_date = datetime.now(timezone.utc) - timedelta(days=1)

    # end_date is the second positional, so it can only be present when
    # start_date was given too; both are then naive and safely comparable.
    end_date = args.end_date
    if end_date is not None and end_date < start_date:
        parser.error("end_date must not be earlier than start_date")

    start_str = start_date.strftime("%Y-%m-%d")
    end_str = end_date.strftime("%Y-%m-%d") if end_date else None

    # The same date selector is passed to every sub-step.
    if end_str:
        date_args = ["--start-date", start_str, "--end-date", end_str]
    else:
        date_args = ["--date", start_str]

    # Launch sub-steps with the interpreter running this script so they see
    # the same environment; fall back to PATH lookup if it is unknown.
    python_exe = sys.executable or "python"

    print("=" * 60)
    print("ADS-B Processing Pipeline")
    print("=" * 60)
    if end_str:
        print(f"Date range: {start_str} to {end_str}")
    else:
        print(f"Date: {start_str}")
    print(f"Chunks: {args.chunks}")
    print("=" * 60)

    # Step 1: Download and extract
    _print_banner("Step 1: Download and Extract")
    cmd = [python_exe, "-m", "src.adsb.download_and_list_icaos", *date_args]
    run_cmd(cmd, "Download and extract")

    # Step 2: Process chunks (sequentially, one subprocess per chunk id)
    _print_banner("Step 2: Process Chunks")
    for chunk_id in range(args.chunks):
        print(f"\n--- Chunk {chunk_id + 1}/{args.chunks} ---")
        cmd = [python_exe, "-m", "src.adsb.process_icao_chunk",
               "--chunk-id", str(chunk_id),
               "--total-chunks", str(args.chunks),
               *date_args]
        run_cmd(cmd, f"Process chunk {chunk_id}")

    # Step 3: Combine chunks to CSV
    _print_banner("Step 3: Combine to CSV")
    chunks_dir = "./data/output/adsb_chunks"
    cmd = [python_exe, "-m", "src.adsb.combine_chunks_to_csv",
           "--chunks-dir", chunks_dir,
           *date_args]
    if args.skip_base:
        cmd.append("--skip-base")
    run_cmd(cmd, "Combine chunks")

    _print_banner("Done!")

    # Show output; a single day is named with the start date used twice.
    output_dir = "./data/openairframes"
    output_file = f"openairframes_adsb_{start_str}_{end_str or start_str}.csv"
    output_path = os.path.join(output_dir, output_file)
    if os.path.exists(output_path):
        size_mb = os.path.getsize(output_path) / (1024 * 1024)
        print(f"Output: {output_path}")
        print(f"Size: {size_mb:.1f} MB")


if __name__ == "__main__":
    main()
@@ -9,12 +9,17 @@ from __future__ import annotations
import argparse import argparse
import shutil import shutil
import sys
import time
from datetime import datetime, timezone from datetime import datetime, timezone
from pathlib import Path from pathlib import Path
from urllib.error import URLError
from urllib.request import Request, urlopen from urllib.request import Request, urlopen
URL = "https://www.mictronics.de/aircraft-database/indexedDB_old.php" URL = "https://www.mictronics.de/aircraft-database/indexedDB_old.php"
OUT_ROOT = Path("data/openairframes") OUT_ROOT = Path("data/openairframes")
MAX_RETRIES = 3
RETRY_DELAY = 30 # seconds
def main() -> None: def main() -> None:
@@ -28,12 +33,22 @@ def main() -> None:
zip_path = OUT_ROOT / f"mictronics-db_{date_str}.zip" zip_path = OUT_ROOT / f"mictronics-db_{date_str}.zip"
print(f"Downloading {URL}...") for attempt in range(1, MAX_RETRIES + 1):
req = Request(URL, headers={"User-Agent": "openairframes-downloader/1.0"}, method="GET") try:
with urlopen(req, timeout=300) as r, zip_path.open("wb") as f: print(f"Downloading {URL} (attempt {attempt}/{MAX_RETRIES})...")
shutil.copyfileobj(r, f) req = Request(URL, headers={"User-Agent": "Mozilla/5.0 (compatible; openairframes-downloader/1.0)"}, method="GET")
with urlopen(req, timeout=120) as r, zip_path.open("wb") as f:
print(f"Wrote: {zip_path}") shutil.copyfileobj(r, f)
print(f"Wrote: {zip_path}")
return
except (URLError, TimeoutError) as e:
print(f"Attempt {attempt} failed: {e}")
if attempt < MAX_RETRIES:
print(f"Retrying in {RETRY_DELAY} seconds...")
time.sleep(RETRY_DELAY)
else:
print("All retries exhausted. Mictronics download failed.")
sys.exit(1)
if __name__ == "__main__": if __name__ == "__main__":
+5 -2
View File
@@ -47,6 +47,9 @@ def convert_faa_master_txt_to_df(zip_path: Path, date: str):
# Convert all NaN to empty strings # Convert all NaN to empty strings
df = df.fillna("") df = df.fillna("")
# The FAA parser can produce the literal string "None" for missing values;
# replace those so they match the empty-string convention used everywhere else.
df = df.replace("None", "")
return df return df
@@ -84,8 +87,8 @@ def concat_faa_historical_df(df_base, df_new):
# Convert to string # Convert to string
val_str = str(val).strip() val_str = str(val).strip()
# Handle empty strings # Handle empty strings and null-like literals
if val_str == "" or val_str == "nan": if val_str == "" or val_str == "nan" or val_str == "None":
return "" return ""
# Check if it looks like a list representation (starts with [ ) # Check if it looks like a list representation (starts with [ )