From 99b680476adb429c252bb1daf3c881ed38a832f2 Mon Sep 17 00:00:00 2001 From: ggman12 Date: Thu, 12 Feb 2026 10:52:42 -0500 Subject: [PATCH 1/6] delete parquet chunk after load to not use so much space for big historical run --- .github/workflows/historical-adsb.yaml | 10 ++++++---- src/adsb/combine_chunks_to_csv.py | 21 ++++++++++++++++++--- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/.github/workflows/historical-adsb.yaml b/.github/workflows/historical-adsb.yaml index 0bb99a1..456d500 100644 --- a/.github/workflows/historical-adsb.yaml +++ b/.github/workflows/historical-adsb.yaml @@ -188,17 +188,19 @@ jobs: - name: Debug downloaded files run: | + echo "=== Disk space before processing ===" + df -h echo "=== Listing data/output/adsb_chunks/ ===" - find data/output/adsb_chunks/ -type f 2>/dev/null | head -50 || echo "No files found" - echo "=== Looking for parquet files ===" - find . -name "*.parquet" 2>/dev/null | head -20 || echo "No parquet files found" + find data/output/adsb_chunks/ -type f 2>/dev/null | wc -l + echo "=== Total parquet size ===" + du -sh data/output/adsb_chunks/ || echo "No chunks dir" - name: Combine chunks to CSV env: START_DATE: ${{ needs.generate-matrix.outputs.global_start }} END_DATE: ${{ needs.generate-matrix.outputs.global_end }} run: | - python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks --start-date "$START_DATE" --end-date "$END_DATE" --skip-base + python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks --start-date "$START_DATE" --end-date "$END_DATE" --skip-base --stream ls -lah data/planequery_aircraft/ - name: Upload final artifact diff --git a/src/adsb/combine_chunks_to_csv.py b/src/adsb/combine_chunks_to_csv.py index 2fe8b4e..9b6eaab 100644 --- a/src/adsb/combine_chunks_to_csv.py +++ b/src/adsb/combine_chunks_to_csv.py @@ -36,8 +36,13 @@ def get_target_day() -> datetime: return datetime.utcnow() - timedelta(days=1) -def process_single_chunk(chunk_path: str) -> 
pl.DataFrame: - """Load and compress a single chunk parquet file.""" +def process_single_chunk(chunk_path: str, delete_after_load: bool = False) -> pl.DataFrame: + """Load and compress a single chunk parquet file. + + Args: + chunk_path: Path to parquet file + delete_after_load: If True, delete the parquet file after loading to free disk space + """ print(f"Processing {os.path.basename(chunk_path)}... | {get_resource_usage()}") # Load chunk - only columns we need @@ -45,6 +50,14 @@ def process_single_chunk(chunk_path: str) -> pl.DataFrame: df = pl.read_parquet(chunk_path, columns=needed_columns) print(f" Loaded {len(df)} rows") + # Delete file immediately after loading to free disk space + if delete_after_load: + try: + os.remove(chunk_path) + print(f" Deleted {chunk_path} to free disk space") + except Exception as e: + print(f" Warning: Failed to delete {chunk_path}: {e}") + # Compress to aircraft records (one per ICAO) using shared function compressed = compress_multi_icao_df(df, verbose=True) print(f" Compressed to {len(compressed)} aircraft records") @@ -156,6 +169,7 @@ def main(): parser.add_argument("--chunks-dir", type=str, default=DEFAULT_CHUNK_DIR, help="Directory containing chunk parquet files") parser.add_argument("--skip-base", action="store_true", help="Skip downloading and merging base release") parser.add_argument("--keep-chunks", action="store_true", help="Keep chunk files after merging") + parser.add_argument("--stream", action="store_true", help="Delete parquet files immediately after loading to save disk space") args = parser.parse_args() # Determine output ID and filename based on mode @@ -190,9 +204,10 @@ def main(): print(f"Found {len(chunk_files)} chunk files") # Process each chunk separately to save memory + # With --stream, delete parquet files immediately after loading to save disk space compressed_chunks = [] for chunk_path in chunk_files: - compressed = process_single_chunk(chunk_path) + compressed = process_single_chunk(chunk_path, 
delete_after_load=args.stream) compressed_chunks.append(compressed) gc.collect() From 2c9e994a12dbd7e9188d6bdd6b60721bc394713a Mon Sep 17 00:00:00 2001 From: ggman12 Date: Thu, 12 Feb 2026 11:06:38 -0500 Subject: [PATCH 2/6] add debug for FAA --- .github/workflows/planequery-aircraft-daily-release.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/planequery-aircraft-daily-release.yaml b/.github/workflows/planequery-aircraft-daily-release.yaml index 00838cb..8dc1c96 100644 --- a/.github/workflows/planequery-aircraft-daily-release.yaml +++ b/.github/workflows/planequery-aircraft-daily-release.yaml @@ -277,6 +277,15 @@ jobs: name: community-release path: artifacts/community + - name: Debug artifact structure + run: | + echo "=== FAA artifacts ===" + find artifacts/faa -type f 2>/dev/null || echo "No files found in artifacts/faa" + echo "=== ADS-B artifacts ===" + find artifacts/adsb -type f 2>/dev/null || echo "No files found in artifacts/adsb" + echo "=== Community artifacts ===" + find artifacts/community -type f 2>/dev/null || echo "No files found in artifacts/community" + - name: Prepare release metadata id: meta run: | From 43b07942b05b06986ea52e688f37061f891bcf69 Mon Sep 17 00:00:00 2001 From: ggman12 Date: Thu, 12 Feb 2026 11:42:49 -0500 Subject: [PATCH 3/6] add needed permissions --- .github/workflows/validate-community-submission.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/validate-community-submission.yaml b/.github/workflows/validate-community-submission.yaml index e217401..dbb1a34 100644 --- a/.github/workflows/validate-community-submission.yaml +++ b/.github/workflows/validate-community-submission.yaml @@ -4,6 +4,9 @@ on: issues: types: [opened, edited] +permissions: + issues: write + jobs: validate: if: contains(github.event.issue.labels.*.name, 'submission') @@ -20,6 +23,13 @@ jobs: - name: Install dependencies run: pip install jsonschema + - name: Debug issue body + run: | + echo 
"=== Issue Body ===" + cat << 'ISSUE_BODY_EOF' + ${{ github.event.issue.body }} + ISSUE_BODY_EOF + - name: Validate submission env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From bccc634158026d6f4c48f7f31a9d3103ef2c3cd4 Mon Sep 17 00:00:00 2001 From: ggman12 Date: Thu, 12 Feb 2026 11:50:45 -0500 Subject: [PATCH 4/6] remove existing release --- .github/workflows/planequery-aircraft-daily-release.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/planequery-aircraft-daily-release.yaml b/.github/workflows/planequery-aircraft-daily-release.yaml index 8dc1c96..ac62132 100644 --- a/.github/workflows/planequery-aircraft-daily-release.yaml +++ b/.github/workflows/planequery-aircraft-daily-release.yaml @@ -321,6 +321,13 @@ jobs: echo "zip_basename=$ZIP_BASENAME" >> "$GITHUB_OUTPUT" echo "name=planequery-aircraft snapshot ($DATE)${BRANCH_SUFFIX}" >> "$GITHUB_OUTPUT" + - name: Delete existing release if exists + run: | + gh release delete "${{ steps.meta.outputs.tag }}" --yes 2>/dev/null || true + git push --delete origin "refs/tags/${{ steps.meta.outputs.tag }}" 2>/dev/null || true + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Create GitHub Release and upload assets uses: softprops/action-gh-release@v2 with: From 2de41c98835fd0e97845f0626dc4b98c0df9e811 Mon Sep 17 00:00:00 2001 From: ggman12 Date: Thu, 12 Feb 2026 12:01:13 -0500 Subject: [PATCH 5/6] update historical. 
To check tar and fail fast if any maps fail --- .github/workflows/historical-adsb.yaml | 27 ++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/.github/workflows/historical-adsb.yaml b/.github/workflows/historical-adsb.yaml index 456d500..596833a 100644 --- a/.github/workflows/historical-adsb.yaml +++ b/.github/workflows/historical-adsb.yaml @@ -81,8 +81,22 @@ jobs: - name: Create tar of extracted data run: | cd data/output - tar -cf extracted_data.tar *-planes-readsb-prod-0.tar_0 icao_manifest_*.txt 2>/dev/null || echo "Some files may not exist" - ls -lah extracted_data.tar || echo "No tar created" + echo "=== Disk space before tar ===" + df -h . + echo "=== Files to tar ===" + ls -lah *-planes-readsb-prod-0.tar_0 icao_manifest_*.txt 2>/dev/null || echo "No files found" + + # Create tar with explicit error checking + if ls *-planes-readsb-prod-0.tar_0 1>/dev/null 2>&1; then + tar -cvf extracted_data.tar *-planes-readsb-prod-0.tar_0 icao_manifest_*.txt + echo "=== Tar file created ===" + ls -lah extracted_data.tar + # Verify tar integrity + tar -tf extracted_data.tar > /dev/null && echo "Tar integrity check passed" || { echo "Tar integrity check FAILED"; exit 1; } + else + echo "ERROR: No extracted directories found, cannot create tar" + exit 1 + fi - name: Upload extracted data uses: actions/upload-artifact@v4 @@ -97,7 +111,7 @@ jobs: needs: [generate-matrix, adsb-extract] runs-on: ubuntu-24.04-arm strategy: - fail-fast: false + fail-fast: true matrix: chunk: ${{ fromJson(needs.generate-matrix.outputs.chunks) }} icao_chunk: [0, 1, 2, 3] @@ -134,7 +148,12 @@ jobs: run: | cd data/output if [ -f extracted_data.tar ]; then - tar -xf extracted_data.tar + echo "=== Tar file info ===" + ls -lah extracted_data.tar + echo "=== Verifying tar integrity ===" + tar -tf extracted_data.tar > /dev/null || { echo "ERROR: Tar file is corrupted"; exit 1; } + echo "=== Extracting ===" + tar -xvf extracted_data.tar rm extracted_data.tar echo 
"has_data=true" >> "$GITHUB_OUTPUT" echo "=== Contents of data/output ===" From 53a020ab7329deb8bbce0f4f32d792f2dfd02395 Mon Sep 17 00:00:00 2001 From: ggman12 Date: Thu, 12 Feb 2026 12:09:03 -0500 Subject: [PATCH 6/6] add jsonschema to requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 6a4ec9a..5d93f27 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ pandas==3.0.0 pyarrow==23.0.0 orjson==3.11.7 polars==1.38.1 +jsonschema==4.26.0 \ No newline at end of file