Compare commits

...

7 Commits

Author SHA1 Message Date
ggman12 8999a943a9 update histoircal 2026-02-13 00:12:18 -05:00
ggman12 74625b9bc9 split large file into chuncks 2026-02-12 20:22:36 -05:00
ggman12 f2728d6156 delete aws 2026-02-12 20:13:40 -05:00
ggman12 5ed10ec42e update 2026-02-12 19:32:34 -05:00
ggman12 3b8a14a4b9 add ability for custom run input date 2026-02-12 19:09:35 -05:00
ggman12 e5f124428f use github token for adsb.lol downlaods 2026-02-12 19:03:23 -05:00
ggman12 d5039fb766 update to fix files 2026-02-12 19:01:02 -05:00
8 changed files with 113 additions and 343 deletions
+49 -11
View File
@@ -74,11 +74,12 @@ jobs:
env:
START_DATE: ${{ matrix.chunk.start_date }}
END_DATE: ${{ matrix.chunk.end_date }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
python -m src.adsb.download_and_list_icaos --start-date "$START_DATE" --end-date "$END_DATE"
ls -lah data/output/
- name: Create tar of extracted data
- name: Create tar of extracted data and split into chunks
run: |
cd data/output
echo "=== Disk space before tar ==="
@@ -93,16 +94,31 @@ jobs:
ls -lah extracted_data.tar
# Verify tar integrity
tar -tf extracted_data.tar > /dev/null && echo "Tar integrity check passed" || { echo "Tar integrity check FAILED"; exit 1; }
# Create checksum of the FULL tar before splitting (for verification after reassembly)
echo "=== Creating checksum of full tar ==="
sha256sum extracted_data.tar > full_tar.sha256
cat full_tar.sha256
# Split into 500MB chunks to avoid artifact upload issues
echo "=== Splitting tar into 500MB chunks ==="
mkdir -p tar_chunks
split -b 500M extracted_data.tar tar_chunks/extracted_data.tar.part_
rm extracted_data.tar
mv full_tar.sha256 tar_chunks/
echo "=== Chunks created ==="
ls -lah tar_chunks/
else
echo "ERROR: No extracted directories found, cannot create tar"
exit 1
fi
- name: Upload extracted data
- name: Upload extracted data chunks
uses: actions/upload-artifact@v4
with:
name: adsb-extracted-${{ matrix.chunk.start_date }}-${{ matrix.chunk.end_date }}
path: data/output/extracted_data.tar
path: data/output/tar_chunks/
retention-days: 1
compression-level: 0
if-no-files-found: warn
@@ -140,18 +156,40 @@ jobs:
uses: actions/download-artifact@v4
with:
name: adsb-extracted-${{ matrix.chunk.start_date }}-${{ matrix.chunk.end_date }}
path: data/output/
continue-on-error: true
path: data/output/tar_chunks/
- name: Extract tar
- name: Reassemble and extract tar
id: extract
run: |
cd data/output
if [ -f extracted_data.tar ]; then
echo "=== Tar file info ==="
if [ -d tar_chunks ] && ls tar_chunks/extracted_data.tar.part_* 1>/dev/null 2>&1; then
echo "=== Chunk files info ==="
ls -lah tar_chunks/
cd tar_chunks
# Reassemble tar with explicit sorting
echo "=== Reassembling tar file ==="
ls -1 extracted_data.tar.part_?? | sort | while read part; do
echo "Appending $part..."
cat "$part" >> ../extracted_data.tar
done
cd ..
echo "=== Reassembled tar file info ==="
ls -lah extracted_data.tar
echo "=== Verifying tar integrity ==="
tar -tf extracted_data.tar > /dev/null || { echo "ERROR: Tar file is corrupted"; exit 1; }
# Verify checksum of reassembled tar matches original
echo "=== Verifying reassembled tar checksum ==="
echo "Original checksum:"
cat tar_chunks/full_tar.sha256
echo "Reassembled checksum:"
sha256sum extracted_data.tar
sha256sum -c tar_chunks/full_tar.sha256 || { echo "ERROR: Reassembled tar checksum mismatch - data corrupted during transfer"; exit 1; }
echo "Checksum verified - data integrity confirmed"
rm -rf tar_chunks
echo "=== Extracting ==="
tar -xvf extracted_data.tar
rm extracted_data.tar
@@ -159,7 +197,7 @@ jobs:
echo "=== Contents of data/output ==="
ls -lah
else
echo "No extracted_data.tar found"
echo "No tar chunks found"
echo "has_data=false" >> "$GITHUB_OUTPUT"
fi
@@ -5,6 +5,11 @@ on:
# 6:00pm UTC every day - runs on default branch, triggers both
- cron: "0 06 * * *"
workflow_dispatch:
inputs:
date:
description: 'Date to process (YYYY-MM-DD format, default: yesterday)'
required: false
type: string
permissions:
contents: write
@@ -58,7 +63,7 @@ jobs:
- name: Run FAA release script
run: |
python src/create_daily_faa_release.py
python src/create_daily_faa_release.py ${{ inputs.date && format('--date {0}', inputs.date) || '' }}
ls -lah data/faa_releasable
ls -lah data/openairframes
@@ -93,8 +98,10 @@ jobs:
pip install -r requirements.txt
- name: Download and extract ADS-B data
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
python -m src.adsb.download_and_list_icaos
python -m src.adsb.download_and_list_icaos ${{ inputs.date && format('--date {0}', inputs.date) || '' }}
ls -lah data/output/
- name: Check manifest exists
@@ -164,7 +171,7 @@ jobs:
- name: Process chunk ${{ matrix.chunk }}
run: |
python -m src.adsb.process_icao_chunk --chunk-id ${{ matrix.chunk }} --total-chunks 4
python -m src.adsb.process_icao_chunk --chunk-id ${{ matrix.chunk }} --total-chunks 4 ${{ inputs.date && format('--date {0}', inputs.date) || '' }}
mkdir -p data/output/adsb_chunks
ls -lah data/output/adsb_chunks/ || echo "No chunks created"
@@ -213,7 +220,7 @@ jobs:
run: |
mkdir -p data/output/adsb_chunks
ls -lah data/output/adsb_chunks/ || echo "Directory empty or does not exist"
python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks
python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks ${{ inputs.date && format('--date {0}', inputs.date) || '' }}
ls -lah data/openairframes/
- name: Upload ADS-B artifacts
@@ -259,6 +266,13 @@ jobs:
needs: [build-faa, adsb-reduce, build-community]
if: github.event_name != 'schedule'
steps:
- name: Checkout for gh CLI
uses: actions/checkout@v4
with:
sparse-checkout: |
.github
sparse-checkout-cone-mode: false
- name: Download FAA artifacts
uses: actions/download-artifact@v4
with:
@@ -279,6 +293,8 @@ jobs:
- name: Debug artifact structure
run: |
echo "=== Full artifacts tree ==="
find artifacts -type f 2>/dev/null || echo "No files found in artifacts"
echo "=== FAA artifacts ==="
find artifacts/faa -type f 2>/dev/null || echo "No files found in artifacts/faa"
echo "=== ADS-B artifacts ==="
@@ -300,13 +316,35 @@ jobs:
TAG="openairframes-${DATE}${BRANCH_SUFFIX}"
# Find files from artifacts using find (handles nested structures)
CSV_FILE_FAA=$(find artifacts/faa -name "openairframes_faa_*.csv" | head -1)
CSV_FILE_FAA=$(find artifacts/faa -name "openairframes_faa_*.csv" -type f 2>/dev/null | head -1)
CSV_FILE_ADSB=$(find artifacts/adsb -name "openairframes_adsb_*.csv" -type f 2>/dev/null | head -1)
CSV_FILE_COMMUNITY=$(find artifacts/community -name "openairframes_community_*.csv" -type f 2>/dev/null | head -1)
ZIP_FILE=$(find artifacts/faa -name "ReleasableAircraft_*.zip" -type f 2>/dev/null | head -1)
# Validate required files exist
MISSING_FILES=""
if [ -z "$CSV_FILE_FAA" ] || [ ! -f "$CSV_FILE_FAA" ]; then
MISSING_FILES="$MISSING_FILES FAA_CSV"
fi
if [ -z "$CSV_FILE_ADSB" ] || [ ! -f "$CSV_FILE_ADSB" ]; then
MISSING_FILES="$MISSING_FILES ADSB_CSV"
fi
if [ -z "$ZIP_FILE" ] || [ ! -f "$ZIP_FILE" ]; then
MISSING_FILES="$MISSING_FILES FAA_ZIP"
fi
if [ -n "$MISSING_FILES" ]; then
echo "ERROR: Missing required release files:$MISSING_FILES"
echo "FAA CSV: $CSV_FILE_FAA"
echo "ADSB CSV: $CSV_FILE_ADSB"
echo "ZIP: $ZIP_FILE"
exit 1
fi
# Get basenames for display
CSV_BASENAME_FAA=$(basename "$CSV_FILE_FAA")
CSV_FILE_ADSB=$(find artifacts/adsb -name "openairframes_adsb_*.csv" | head -1)
CSV_BASENAME_ADSB=$(basename "$CSV_FILE_ADSB")
CSV_FILE_COMMUNITY=$(find artifacts/community -name "openairframes_community_*.csv" 2>/dev/null | head -1 || echo "")
CSV_BASENAME_COMMUNITY=$(basename "$CSV_FILE_COMMUNITY" 2>/dev/null || echo "")
ZIP_FILE=$(find artifacts/faa -name "ReleasableAircraft_*.zip" | head -1)
ZIP_BASENAME=$(basename "$ZIP_FILE")
echo "date=$DATE" >> "$GITHUB_OUTPUT"
@@ -320,9 +358,12 @@ jobs:
echo "zip_file=$ZIP_FILE" >> "$GITHUB_OUTPUT"
echo "zip_basename=$ZIP_BASENAME" >> "$GITHUB_OUTPUT"
echo "name=OpenAirframes snapshot ($DATE)${BRANCH_SUFFIX}" >> "$GITHUB_OUTPUT"
- name: Checkout for gh CLI
uses: actions/checkout@v4
echo "Found files:"
echo " FAA CSV: $CSV_FILE_FAA"
echo " ADSB CSV: $CSV_FILE_ADSB"
echo " Community CSV: $CSV_FILE_COMMUNITY"
echo " ZIP: $ZIP_FILE"
- name: Delete existing release if exists
run: |
@@ -336,6 +377,7 @@ jobs:
with:
tag_name: ${{ steps.meta.outputs.tag }}
name: ${{ steps.meta.outputs.name }}
fail_on_unmatched_files: true
body: |
Automated daily snapshot generated at 06:00 UTC for ${{ steps.meta.outputs.date }}.
-11
View File
@@ -1,11 +0,0 @@
#!/usr/bin/env python3
import os
import aws_cdk as cdk
from stack import AdsbProcessingStack
app = cdk.App()
AdsbProcessingStack(app, "AdsbProcessingStack", env=cdk.Environment(
account=os.environ["CDK_DEFAULT_ACCOUNT"],
region=os.environ["CDK_DEFAULT_REGION"],
))
app.synth()
-3
View File
@@ -1,3 +0,0 @@
{
"app": "python3 app.py"
}
-2
View File
@@ -1,2 +0,0 @@
aws-cdk-lib>=2.170.0
constructs>=10.0.0
-213
View File
@@ -1,213 +0,0 @@
import aws_cdk as cdk
from aws_cdk import (
Stack,
Duration,
RemovalPolicy,
aws_s3 as s3,
aws_ecs as ecs,
aws_ec2 as ec2,
aws_ecr_assets,
aws_iam as iam,
aws_logs as logs,
aws_stepfunctions as sfn,
aws_stepfunctions_tasks as sfn_tasks,
)
from constructs import Construct
from pathlib import Path
class AdsbProcessingStack(Stack):
def __init__(self, scope: Construct, id: str, **kwargs):
super().__init__(scope, id, **kwargs)
# --- S3 bucket for intermediate and final results ---
bucket = s3.Bucket(
self, "ResultsBucket",
bucket_name="openairframes-dev",
removal_policy=RemovalPolicy.DESTROY,
auto_delete_objects=True,
lifecycle_rules=[
s3.LifecycleRule(
prefix="intermediate/",
expiration=Duration.days(7),
)
],
)
# --- Use default VPC (no additional cost) ---
vpc = ec2.Vpc.from_lookup(
self, "Vpc",
is_default=True,
)
# --- ECS Cluster ---
cluster = ecs.Cluster(
self, "Cluster",
vpc=vpc,
container_insights=True,
)
# --- Log group ---
log_group = logs.LogGroup(
self, "LogGroup",
log_group_name="/adsb-processing",
removal_policy=RemovalPolicy.DESTROY,
retention=logs.RetentionDays.TWO_WEEKS,
)
# --- Docker images (built from local Dockerfiles) ---
adsb_dir = str(Path(__file__).parent.parent / "src" / "adsb")
worker_image = ecs.ContainerImage.from_asset(
adsb_dir,
file="Dockerfile.worker",
platform=cdk.aws_ecr_assets.Platform.LINUX_ARM64,
)
reducer_image = ecs.ContainerImage.from_asset(
adsb_dir,
file="Dockerfile.reducer",
platform=cdk.aws_ecr_assets.Platform.LINUX_ARM64,
)
# --- Task role (shared) ---
task_role = iam.Role(
self, "TaskRole",
assumed_by=iam.ServicePrincipal("ecs-tasks.amazonaws.com"),
)
bucket.grant_read_write(task_role)
# --- MAP: worker task definition ---
map_task_def = ecs.FargateTaskDefinition(
self, "MapTaskDef",
cpu=4096, # 4 vCPU
memory_limit_mib=30720, # 30 GB
task_role=task_role,
runtime_platform=ecs.RuntimePlatform(
cpu_architecture=ecs.CpuArchitecture.ARM64,
operating_system_family=ecs.OperatingSystemFamily.LINUX,
),
)
map_container = map_task_def.add_container(
"worker",
image=worker_image,
logging=ecs.LogDrivers.aws_logs(
stream_prefix="map",
log_group=log_group,
),
environment={
"S3_BUCKET": bucket.bucket_name,
},
)
# --- REDUCE: reducer task definition ---
reduce_task_def = ecs.FargateTaskDefinition(
self, "ReduceTaskDef",
cpu=4096, # 4 vCPU
memory_limit_mib=30720, # 30 GB — must hold full year in memory
task_role=task_role,
runtime_platform=ecs.RuntimePlatform(
cpu_architecture=ecs.CpuArchitecture.ARM64,
operating_system_family=ecs.OperatingSystemFamily.LINUX,
),
)
reduce_container = reduce_task_def.add_container(
"reducer",
image=reducer_image,
logging=ecs.LogDrivers.aws_logs(
stream_prefix="reduce",
log_group=log_group,
),
environment={
"S3_BUCKET": bucket.bucket_name,
},
)
# --- Step Functions ---
# Map task: run ECS Fargate for each date chunk
map_ecs_task = sfn_tasks.EcsRunTask(
self, "ProcessChunk",
integration_pattern=sfn.IntegrationPattern.RUN_JOB,
cluster=cluster,
task_definition=map_task_def,
launch_target=sfn_tasks.EcsFargateLaunchTarget(
platform_version=ecs.FargatePlatformVersion.LATEST,
),
container_overrides=[
sfn_tasks.ContainerOverride(
container_definition=map_container,
environment=[
sfn_tasks.TaskEnvironmentVariable(
name="START_DATE",
value=sfn.JsonPath.string_at("$.start_date"),
),
sfn_tasks.TaskEnvironmentVariable(
name="END_DATE",
value=sfn.JsonPath.string_at("$.end_date"),
),
sfn_tasks.TaskEnvironmentVariable(
name="RUN_ID",
value=sfn.JsonPath.string_at("$.run_id"),
),
],
)
],
assign_public_ip=True,
subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC),
result_path="$.task_result",
)
# Map state — max 3 concurrent workers
map_state = sfn.Map(
self, "FanOutChunks",
items_path="$.chunks",
max_concurrency=3,
result_path="$.map_results",
)
map_state.item_processor(map_ecs_task)
# Reduce task: combine all chunk CSVs
reduce_ecs_task = sfn_tasks.EcsRunTask(
self, "ReduceResults",
integration_pattern=sfn.IntegrationPattern.RUN_JOB,
cluster=cluster,
task_definition=reduce_task_def,
launch_target=sfn_tasks.EcsFargateLaunchTarget(
platform_version=ecs.FargatePlatformVersion.LATEST,
),
container_overrides=[
sfn_tasks.ContainerOverride(
container_definition=reduce_container,
environment=[
sfn_tasks.TaskEnvironmentVariable(
name="RUN_ID",
value=sfn.JsonPath.string_at("$.run_id"),
),
sfn_tasks.TaskEnvironmentVariable(
name="GLOBAL_START_DATE",
value=sfn.JsonPath.string_at("$.global_start_date"),
),
sfn_tasks.TaskEnvironmentVariable(
name="GLOBAL_END_DATE",
value=sfn.JsonPath.string_at("$.global_end_date"),
),
],
)
],
assign_public_ip=True,
subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC),
)
# Chain: fan-out map → reduce
definition = map_state.next(reduce_ecs_task)
sfn.StateMachine(
self, "Pipeline",
state_machine_name="adsb-map-reduce",
definition_body=sfn.DefinitionBody.from_chainable(definition),
timeout=Duration.hours(48),
)
# --- Outputs ---
cdk.CfnOutput(self, "BucketName", value=bucket.bucket_name)
cdk.CfnOutput(self, "StateMachineName", value="adsb-map-reduce")
+11 -2
View File
@@ -1,6 +1,15 @@
from pathlib import Path
from datetime import datetime, timezone
date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
from datetime import datetime, timezone, timedelta
import argparse
parser = argparse.ArgumentParser(description="Create daily FAA release")
parser.add_argument("--date", type=str, help="Date to process (YYYY-MM-DD format, default: today)")
args = parser.parse_args()
if args.date:
date_str = args.date
else:
date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
out_dir = Path("data/faa_releasable")
out_dir.mkdir(parents=True, exist_ok=True)
-90
View File
@@ -1,90 +0,0 @@
"""
Generate Step Functions input and start the pipeline.
Usage:
python trigger_pipeline.py 2024-01-01 2025-01-01
python trigger_pipeline.py 2024-01-01 2025-01-01 --chunk-days 30
python trigger_pipeline.py 2024-01-01 2025-01-01 --dry-run
"""
import argparse
import json
import os
import uuid
from datetime import datetime, timedelta
import boto3
def generate_chunks(start_date: str, end_date: str, chunk_days: int = 1):
"""Split a date range into chunks of chunk_days."""
start = datetime.strptime(start_date, "%Y-%m-%d")
end = datetime.strptime(end_date, "%Y-%m-%d")
chunks = []
current = start
while current < end:
chunk_end = min(current + timedelta(days=chunk_days), end)
chunks.append({
"start_date": current.strftime("%Y-%m-%d"),
"end_date": chunk_end.strftime("%Y-%m-%d"),
})
current = chunk_end
return chunks
def main():
parser = argparse.ArgumentParser(description="Trigger ADS-B map-reduce pipeline")
parser.add_argument("start_date", help="Start date (YYYY-MM-DD, inclusive)")
parser.add_argument("end_date", help="End date (YYYY-MM-DD, exclusive)")
parser.add_argument("--chunk-days", type=int, default=1,
help="Days per chunk (default: 1)")
parser.add_argument("--dry-run", action="store_true",
help="Print input JSON without starting execution")
args = parser.parse_args()
run_id = f"run-{datetime.utcnow().strftime('%Y%m%dT%H%M%S')}-{uuid.uuid4().hex[:8]}"
chunks = generate_chunks(args.start_date, args.end_date, args.chunk_days)
# Inject run_id into each chunk
for chunk in chunks:
chunk["run_id"] = run_id
sfn_input = {
"run_id": run_id,
"global_start_date": args.start_date,
"global_end_date": args.end_date,
"chunks": chunks,
}
print(f"Run ID: {run_id}")
print(f"Chunks: {len(chunks)} (at {args.chunk_days} days each)")
print(f"Max concurrency: 3 (enforced by Step Functions Map state)")
print()
print(json.dumps(sfn_input, indent=2))
if args.dry_run:
print("\n--dry-run: not starting execution")
return
client = boto3.client("stepfunctions")
# Find the state machine ARN
machines = client.list_state_machines()["stateMachines"]
arn = next(
m["stateMachineArn"]
for m in machines
if m["name"] == "adsb-map-reduce"
)
response = client.start_execution(
stateMachineArn=arn,
name=run_id,
input=json.dumps(sfn_input),
)
print(f"\nStarted execution: {response['executionArn']}")
if __name__ == "__main__":
main()