diff --git a/.github/workflows/historical-adsb.yaml b/.github/workflows/historical-adsb.yaml
index 0bb99a1..596833a 100644
--- a/.github/workflows/historical-adsb.yaml
+++ b/.github/workflows/historical-adsb.yaml
@@ -81,8 +81,22 @@ jobs:
       - name: Create tar of extracted data
         run: |
           cd data/output
-          tar -cf extracted_data.tar *-planes-readsb-prod-0.tar_0 icao_manifest_*.txt 2>/dev/null || echo "Some files may not exist"
-          ls -lah extracted_data.tar || echo "No tar created"
+          echo "=== Disk space before tar ==="
+          df -h .
+          echo "=== Files to tar ==="
+          ls -lah *-planes-readsb-prod-0.tar_0 icao_manifest_*.txt 2>/dev/null || echo "No files found"
+
+          # Create tar with explicit error checking
+          if ls *-planes-readsb-prod-0.tar_0 1>/dev/null 2>&1; then
+            tar -cvf extracted_data.tar *-planes-readsb-prod-0.tar_0 icao_manifest_*.txt
+            echo "=== Tar file created ==="
+            ls -lah extracted_data.tar
+            # Verify tar integrity
+            tar -tf extracted_data.tar > /dev/null && echo "Tar integrity check passed" || { echo "Tar integrity check FAILED"; exit 1; }
+          else
+            echo "ERROR: No extracted directories found, cannot create tar"
+            exit 1
+          fi
 
       - name: Upload extracted data
         uses: actions/upload-artifact@v4
@@ -97,7 +111,7 @@
     needs: [generate-matrix, adsb-extract]
     runs-on: ubuntu-24.04-arm
     strategy:
-      fail-fast: false
+      fail-fast: true
       matrix:
         chunk: ${{ fromJson(needs.generate-matrix.outputs.chunks) }}
         icao_chunk: [0, 1, 2, 3]
@@ -134,7 +148,12 @@
         run: |
           cd data/output
           if [ -f extracted_data.tar ]; then
-            tar -xf extracted_data.tar
+            echo "=== Tar file info ==="
+            ls -lah extracted_data.tar
+            echo "=== Verifying tar integrity ==="
+            tar -tf extracted_data.tar > /dev/null || { echo "ERROR: Tar file is corrupted"; exit 1; }
+            echo "=== Extracting ==="
+            tar -xvf extracted_data.tar
             rm extracted_data.tar
             echo "has_data=true" >> "$GITHUB_OUTPUT"
             echo "=== Contents of data/output ==="
@@ -188,17 +207,19 @@
 
       - name: Debug downloaded files
         run: |
+          echo "=== Disk space before processing ==="
+          df -h
           echo "=== Listing data/output/adsb_chunks/ ==="
-          find data/output/adsb_chunks/ -type f 2>/dev/null | head -50 || echo "No files found"
-          echo "=== Looking for parquet files ==="
-          find . -name "*.parquet" 2>/dev/null | head -20 || echo "No parquet files found"
+          find data/output/adsb_chunks/ -type f 2>/dev/null | wc -l
+          echo "=== Total parquet size ==="
+          du -sh data/output/adsb_chunks/ || echo "No chunks dir"
 
       - name: Combine chunks to CSV
         env:
           START_DATE: ${{ needs.generate-matrix.outputs.global_start }}
           END_DATE: ${{ needs.generate-matrix.outputs.global_end }}
         run: |
-          python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks --start-date "$START_DATE" --end-date "$END_DATE" --skip-base
+          python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks --start-date "$START_DATE" --end-date "$END_DATE" --skip-base --stream
           ls -lah data/planequery_aircraft/
 
       - name: Upload final artifact
diff --git a/.github/workflows/planequery-aircraft-daily-release.yaml b/.github/workflows/planequery-aircraft-daily-release.yaml
index 00838cb..ac62132 100644
--- a/.github/workflows/planequery-aircraft-daily-release.yaml
+++ b/.github/workflows/planequery-aircraft-daily-release.yaml
@@ -277,6 +277,15 @@
           name: community-release
           path: artifacts/community
 
+      - name: Debug artifact structure
+        run: |
+          echo "=== FAA artifacts ==="
+          find artifacts/faa -type f 2>/dev/null || echo "No files found in artifacts/faa"
+          echo "=== ADS-B artifacts ==="
+          find artifacts/adsb -type f 2>/dev/null || echo "No files found in artifacts/adsb"
+          echo "=== Community artifacts ==="
+          find artifacts/community -type f 2>/dev/null || echo "No files found in artifacts/community"
+
       - name: Prepare release metadata
         id: meta
         run: |
@@ -312,6 +321,13 @@
           echo "zip_basename=$ZIP_BASENAME" >> "$GITHUB_OUTPUT"
           echo "name=planequery-aircraft snapshot ($DATE)${BRANCH_SUFFIX}" >> "$GITHUB_OUTPUT"
 
+      - name: Delete existing release if exists
+        run: |
+          gh release delete "${{ steps.meta.outputs.tag }}" --yes 2>/dev/null || true
+          git push --delete origin "refs/tags/${{ steps.meta.outputs.tag }}" 2>/dev/null || true
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
       - name: Create GitHub Release and upload assets
         uses: softprops/action-gh-release@v2
         with:
diff --git a/.github/workflows/validate-community-submission.yaml b/.github/workflows/validate-community-submission.yaml
index e217401..dbb1a34 100644
--- a/.github/workflows/validate-community-submission.yaml
+++ b/.github/workflows/validate-community-submission.yaml
@@ -4,6 +4,9 @@ on:
   issues:
     types: [opened, edited]
 
+permissions:
+  issues: write
+
 jobs:
   validate:
     if: contains(github.event.issue.labels.*.name, 'submission')
@@ -20,6 +23,13 @@
       - name: Install dependencies
         run: pip install jsonschema
 
+      - name: Debug issue body
+        env:
+          ISSUE_BODY: ${{ github.event.issue.body }}
+        run: |
+          echo "=== Issue Body ==="
+          printf '%s\n' "$ISSUE_BODY"
+
       - name: Validate submission
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/requirements.txt b/requirements.txt
index 6a4ec9a..5d93f27 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,4 @@ pandas==3.0.0
 pyarrow==23.0.0
 orjson==3.11.7
 polars==1.38.1
+jsonschema==4.26.0
diff --git a/src/adsb/combine_chunks_to_csv.py b/src/adsb/combine_chunks_to_csv.py
index 2fe8b4e..9b6eaab 100644
--- a/src/adsb/combine_chunks_to_csv.py
+++ b/src/adsb/combine_chunks_to_csv.py
@@ -36,8 +36,13 @@ def get_target_day() -> datetime:
     return datetime.utcnow() - timedelta(days=1)
 
 
-def process_single_chunk(chunk_path: str) -> pl.DataFrame:
-    """Load and compress a single chunk parquet file."""
+def process_single_chunk(chunk_path: str, delete_after_load: bool = False) -> pl.DataFrame:
+    """Load and compress a single chunk parquet file.
+
+    Args:
+        chunk_path: Path to parquet file
+        delete_after_load: If True, delete the parquet file after loading to free disk space
+    """
     print(f"Processing {os.path.basename(chunk_path)}... | {get_resource_usage()}")
 
     # Load chunk - only columns we need
@@ -45,6 +50,14 @@
     df = pl.read_parquet(chunk_path, columns=needed_columns)
     print(f"  Loaded {len(df)} rows")
 
+    # Delete file immediately after loading to free disk space
+    if delete_after_load:
+        try:
+            os.remove(chunk_path)
+            print(f"  Deleted {chunk_path} to free disk space")
+        except Exception as e:
+            print(f"  Warning: Failed to delete {chunk_path}: {e}")
+
     # Compress to aircraft records (one per ICAO) using shared function
     compressed = compress_multi_icao_df(df, verbose=True)
     print(f"  Compressed to {len(compressed)} aircraft records")
@@ -156,6 +169,7 @@ def main():
     parser.add_argument("--chunks-dir", type=str, default=DEFAULT_CHUNK_DIR, help="Directory containing chunk parquet files")
     parser.add_argument("--skip-base", action="store_true", help="Skip downloading and merging base release")
     parser.add_argument("--keep-chunks", action="store_true", help="Keep chunk files after merging")
+    parser.add_argument("--stream", action="store_true", help="Delete parquet files immediately after loading to save disk space")
     args = parser.parse_args()
 
     # Determine output ID and filename based on mode
@@ -190,9 +204,10 @@ def main():
     print(f"Found {len(chunk_files)} chunk files")
 
     # Process each chunk separately to save memory
+    # With --stream, delete parquet files immediately after loading to save disk space
     compressed_chunks = []
     for chunk_path in chunk_files:
-        compressed = process_single_chunk(chunk_path)
+        compressed = process_single_chunk(chunk_path, delete_after_load=args.stream)
         compressed_chunks.append(compressed)
         gc.collect()