Merge branch 'develop'

This commit is contained in:
ggman12
2026-02-12 12:10:24 -05:00
5 changed files with 74 additions and 11 deletions
+29 -8
View File
@@ -81,8 +81,22 @@ jobs:
- name: Create tar of extracted data
run: |
cd data/output
tar -cf extracted_data.tar *-planes-readsb-prod-0.tar_0 icao_manifest_*.txt 2>/dev/null || echo "Some files may not exist"
ls -lah extracted_data.tar || echo "No tar created"
echo "=== Disk space before tar ==="
df -h .
echo "=== Files to tar ==="
ls -lah *-planes-readsb-prod-0.tar_0 icao_manifest_*.txt 2>/dev/null || echo "No files found"
# Create tar with explicit error checking
if ls *-planes-readsb-prod-0.tar_0 1>/dev/null 2>&1; then
tar -cvf extracted_data.tar *-planes-readsb-prod-0.tar_0 icao_manifest_*.txt
echo "=== Tar file created ==="
ls -lah extracted_data.tar
# Verify tar integrity
tar -tf extracted_data.tar > /dev/null && echo "Tar integrity check passed" || { echo "Tar integrity check FAILED"; exit 1; }
else
echo "ERROR: No extracted directories found, cannot create tar"
exit 1
fi
- name: Upload extracted data
uses: actions/upload-artifact@v4
@@ -97,7 +111,7 @@ jobs:
needs: [generate-matrix, adsb-extract]
runs-on: ubuntu-24.04-arm
strategy:
fail-fast: false
fail-fast: true
matrix:
chunk: ${{ fromJson(needs.generate-matrix.outputs.chunks) }}
icao_chunk: [0, 1, 2, 3]
@@ -134,7 +148,12 @@ jobs:
run: |
cd data/output
if [ -f extracted_data.tar ]; then
tar -xf extracted_data.tar
echo "=== Tar file info ==="
ls -lah extracted_data.tar
echo "=== Verifying tar integrity ==="
tar -tf extracted_data.tar > /dev/null || { echo "ERROR: Tar file is corrupted"; exit 1; }
echo "=== Extracting ==="
tar -xvf extracted_data.tar
rm extracted_data.tar
echo "has_data=true" >> "$GITHUB_OUTPUT"
echo "=== Contents of data/output ==="
@@ -188,17 +207,19 @@ jobs:
- name: Debug downloaded files
run: |
echo "=== Disk space before processing ==="
df -h
echo "=== Listing data/output/adsb_chunks/ ==="
find data/output/adsb_chunks/ -type f 2>/dev/null | head -50 || echo "No files found"
echo "=== Looking for parquet files ==="
find . -name "*.parquet" 2>/dev/null | head -20 || echo "No parquet files found"
find data/output/adsb_chunks/ -type f 2>/dev/null | wc -l
echo "=== Total parquet size ==="
du -sh data/output/adsb_chunks/ || echo "No chunks dir"
- name: Combine chunks to CSV
env:
START_DATE: ${{ needs.generate-matrix.outputs.global_start }}
END_DATE: ${{ needs.generate-matrix.outputs.global_end }}
run: |
python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks --start-date "$START_DATE" --end-date "$END_DATE" --skip-base
python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks --start-date "$START_DATE" --end-date "$END_DATE" --skip-base --stream
ls -lah data/planequery_aircraft/
- name: Upload final artifact
@@ -277,6 +277,15 @@ jobs:
name: community-release
path: artifacts/community
- name: Debug artifact structure
run: |
echo "=== FAA artifacts ==="
find artifacts/faa -type f 2>/dev/null || echo "No files found in artifacts/faa"
echo "=== ADS-B artifacts ==="
find artifacts/adsb -type f 2>/dev/null || echo "No files found in artifacts/adsb"
echo "=== Community artifacts ==="
find artifacts/community -type f 2>/dev/null || echo "No files found in artifacts/community"
- name: Prepare release metadata
id: meta
run: |
@@ -312,6 +321,13 @@ jobs:
echo "zip_basename=$ZIP_BASENAME" >> "$GITHUB_OUTPUT"
echo "name=planequery-aircraft snapshot ($DATE)${BRANCH_SUFFIX}" >> "$GITHUB_OUTPUT"
- name: Delete existing release if exists
run: |
gh release delete "${{ steps.meta.outputs.tag }}" --yes 2>/dev/null || true
git push --delete origin "refs/tags/${{ steps.meta.outputs.tag }}" 2>/dev/null || true
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Create GitHub Release and upload assets
uses: softprops/action-gh-release@v2
with:
@@ -4,6 +4,9 @@ on:
issues:
types: [opened, edited]
permissions:
issues: write
jobs:
validate:
if: contains(github.event.issue.labels.*.name, 'submission')
@@ -20,6 +23,13 @@ jobs:
- name: Install dependencies
run: pip install jsonschema
- name: Debug issue body
run: |
echo "=== Issue Body ==="
cat << 'ISSUE_BODY_EOF'
${{ github.event.issue.body }}
ISSUE_BODY_EOF
- name: Validate submission
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+1
View File
@@ -3,3 +3,4 @@ pandas==3.0.0
pyarrow==23.0.0
orjson==3.11.7
polars==1.38.1
jsonschema==4.26.0
+18 -3
View File
@@ -36,8 +36,13 @@ def get_target_day() -> datetime:
return datetime.utcnow() - timedelta(days=1)
def process_single_chunk(chunk_path: str) -> pl.DataFrame:
"""Load and compress a single chunk parquet file."""
def process_single_chunk(chunk_path: str, delete_after_load: bool = False) -> pl.DataFrame:
"""Load and compress a single chunk parquet file.
Args:
chunk_path: Path to parquet file
delete_after_load: If True, delete the parquet file after loading to free disk space
"""
print(f"Processing {os.path.basename(chunk_path)}... | {get_resource_usage()}")
# Load chunk - only columns we need
@@ -45,6 +50,14 @@ def process_single_chunk(chunk_path: str) -> pl.DataFrame:
df = pl.read_parquet(chunk_path, columns=needed_columns)
print(f" Loaded {len(df)} rows")
# Delete file immediately after loading to free disk space
if delete_after_load:
try:
os.remove(chunk_path)
print(f" Deleted {chunk_path} to free disk space")
except Exception as e:
print(f" Warning: Failed to delete {chunk_path}: {e}")
# Compress to aircraft records (one per ICAO) using shared function
compressed = compress_multi_icao_df(df, verbose=True)
print(f" Compressed to {len(compressed)} aircraft records")
@@ -156,6 +169,7 @@ def main():
parser.add_argument("--chunks-dir", type=str, default=DEFAULT_CHUNK_DIR, help="Directory containing chunk parquet files")
parser.add_argument("--skip-base", action="store_true", help="Skip downloading and merging base release")
parser.add_argument("--keep-chunks", action="store_true", help="Keep chunk files after merging")
parser.add_argument("--stream", action="store_true", help="Delete parquet files immediately after loading to save disk space")
args = parser.parse_args()
# Determine output ID and filename based on mode
@@ -190,9 +204,10 @@ def main():
print(f"Found {len(chunk_files)} chunk files")
# Process each chunk separately to save memory
# With --stream, delete parquet files immediately after loading to save disk space
compressed_chunks = []
for chunk_path in chunk_files:
compressed = process_single_chunk(chunk_path)
compressed = process_single_chunk(chunk_path, delete_after_load=args.stream)
compressed_chunks.append(compressed)
gc.collect()