mirror of
https://github.com/PlaneQuery/OpenAirframes.git
synced 2026-04-23 19:46:09 +02:00
183 lines
5.4 KiB
Python
183 lines
5.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Download and concatenate artifacts from a specific set of workflow runs.
|
|
|
|
Usage:
|
|
python scripts/download_and_concat_runs.py triggered_runs_20260216_123456.json
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import subprocess
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
|
|
def download_run_artifact(run_id, output_dir):
|
|
"""Download artifact from a specific workflow run."""
|
|
print(f" Downloading artifacts from run {run_id}...")
|
|
|
|
cmd = [
|
|
'gh', 'run', 'download', str(run_id),
|
|
'--pattern', 'openairframes_adsb-*',
|
|
'--dir', output_dir
|
|
]
|
|
|
|
result = subprocess.run(cmd, capture_output=True, text=True)
|
|
|
|
if result.returncode == 0:
|
|
print(f" ✓ Downloaded")
|
|
return True
|
|
else:
|
|
if "no artifacts" in result.stderr.lower():
|
|
print(f" ⚠ No artifacts found (workflow may still be running)")
|
|
else:
|
|
print(f" ✗ Failed: {result.stderr}")
|
|
return False
|
|
|
|
|
|
def find_csv_files(download_dir):
|
|
"""Find all CSV.gz files in the download directory."""
|
|
csv_files = []
|
|
for root, dirs, files in os.walk(download_dir):
|
|
for file in files:
|
|
if file.endswith('.csv.gz'):
|
|
csv_files.append(os.path.join(root, file))
|
|
return sorted(csv_files)
|
|
|
|
|
|
def concatenate_csv_files(csv_files, output_file):
|
|
"""Concatenate CSV files in order, preserving headers."""
|
|
import gzip
|
|
|
|
print(f"\nConcatenating {len(csv_files)} CSV files...")
|
|
|
|
with gzip.open(output_file, 'wt') as outf:
|
|
header_written = False
|
|
|
|
for i, csv_file in enumerate(csv_files, 1):
|
|
print(f" [{i}/{len(csv_files)}] Processing {os.path.basename(csv_file)}")
|
|
|
|
with gzip.open(csv_file, 'rt') as inf:
|
|
lines = inf.readlines()
|
|
|
|
if not header_written:
|
|
# Write header from first file
|
|
outf.writelines(lines)
|
|
header_written = True
|
|
else:
|
|
# Skip header for subsequent files
|
|
outf.writelines(lines[1:])
|
|
|
|
print(f"\n✓ Concatenated CSV saved to: {output_file}")
|
|
|
|
# Show file size
|
|
size_mb = os.path.getsize(output_file) / (1024 * 1024)
|
|
print(f" Size: {size_mb:.1f} MB")
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(
|
|
description='Download and concatenate artifacts from workflow runs'
|
|
)
|
|
parser.add_argument(
|
|
'runs_file',
|
|
help='JSON file containing run IDs (from run_historical_adsb_action.py)'
|
|
)
|
|
parser.add_argument(
|
|
'--output-dir',
|
|
default='./downloads/historical_concat',
|
|
help='Directory for downloads (default: ./downloads/historical_concat)'
|
|
)
|
|
parser.add_argument(
|
|
'--wait',
|
|
action='store_true',
|
|
help='Wait for workflows to complete before downloading'
|
|
)
|
|
|
|
args = parser.parse_args()
|
|
|
|
# Load run IDs
|
|
if not os.path.exists(args.runs_file):
|
|
print(f"Error: File not found: {args.runs_file}")
|
|
sys.exit(1)
|
|
|
|
with open(args.runs_file, 'r') as f:
|
|
data = json.load(f)
|
|
|
|
runs = data['runs']
|
|
start_date = data['start_date']
|
|
end_date = data['end_date']
|
|
|
|
print("=" * 60)
|
|
print("Download and Concatenate Historical Artifacts")
|
|
print("=" * 60)
|
|
print(f"Date range: {start_date} to {end_date}")
|
|
print(f"Workflow runs: {len(runs)}")
|
|
print(f"Output directory: {args.output_dir}")
|
|
print("=" * 60)
|
|
|
|
# Create output directory
|
|
os.makedirs(args.output_dir, exist_ok=True)
|
|
|
|
# Wait for workflows to complete if requested
|
|
if args.wait:
|
|
print("\nWaiting for workflows to complete...")
|
|
for run_info in runs:
|
|
run_id = run_info['run_id']
|
|
print(f" Checking run {run_id}...")
|
|
|
|
cmd = ['gh', 'run', 'watch', str(run_id)]
|
|
subprocess.run(cmd)
|
|
|
|
# Download artifacts
|
|
print("\nDownloading artifacts...")
|
|
successful_downloads = 0
|
|
|
|
for i, run_info in enumerate(runs, 1):
|
|
run_id = run_info['run_id']
|
|
print(f"\n[{i}/{len(runs)}] Run {run_id} ({run_info['start']} to {run_info['end']})")
|
|
|
|
if download_run_artifact(run_id, args.output_dir):
|
|
successful_downloads += 1
|
|
|
|
print(f"\n\nDownload Summary: {successful_downloads}/{len(runs)} artifacts downloaded")
|
|
|
|
if successful_downloads == 0:
|
|
print("\nNo artifacts downloaded. Workflows may still be running.")
|
|
print("Use --wait to wait for completion, or try again later.")
|
|
sys.exit(1)
|
|
|
|
# Find all CSV files
|
|
csv_files = find_csv_files(args.output_dir)
|
|
|
|
if not csv_files:
|
|
print("\nError: No CSV files found in download directory")
|
|
sys.exit(1)
|
|
|
|
print(f"\nFound {len(csv_files)} CSV file(s):")
|
|
for csv_file in csv_files:
|
|
print(f" - {os.path.basename(csv_file)}")
|
|
|
|
# Concatenate
|
|
# Calculate actual end date for filename (end_date - 1 day since it's exclusive)
|
|
from datetime import datetime, timedelta
|
|
end_dt = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=1)
|
|
actual_end = end_dt.strftime('%Y-%m-%d')
|
|
|
|
output_file = os.path.join(
|
|
args.output_dir,
|
|
f"openairframes_adsb_{start_date}_{actual_end}.csv.gz"
|
|
)
|
|
|
|
concatenate_csv_files(csv_files, output_file)
|
|
|
|
print("\n" + "=" * 60)
|
|
print("Done!")
|
|
print("=" * 60)
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|