update historical

split large file into chunks
delete aws
2026-06-17 18:20:06 +02:00 · 2026-02-13 00:12:18 -05:00 · 2026-02-12 20:22:36 -05:00 · 2026-02-12 20:13:40 -05:00 · 2026-02-12 19:32:34 -05:00 · 2026-02-12 19:09:35 -05:00
8 changed files with 113 additions and 343 deletions
@@ -74,11 +74,12 @@ jobs:
        env:
          START_DATE: ${{ matrix.chunk.start_date }}
          END_DATE: ${{ matrix.chunk.end_date }}
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          python -m src.adsb.download_and_list_icaos --start-date "$START_DATE" --end-date "$END_DATE"
          ls -lah data/output/

-      - name: Create tar of extracted data
+      - name: Create tar of extracted data and split into chunks
        run: |
          cd data/output
          echo "=== Disk space before tar ==="
@@ -93,16 +94,31 @@ jobs:
            ls -lah extracted_data.tar
            # Verify tar integrity
            tar -tf extracted_data.tar > /dev/null && echo "Tar integrity check passed" || { echo "Tar integrity check FAILED"; exit 1; }
+            
+            # Create checksum of the FULL tar before splitting (for verification after reassembly)
+            echo "=== Creating checksum of full tar ==="
+            sha256sum extracted_data.tar > full_tar.sha256
+            cat full_tar.sha256
+            
+            # Split into 500MB chunks to avoid artifact upload issues
+            echo "=== Splitting tar into 500MB chunks ==="
+            mkdir -p tar_chunks
+            split -b 500M extracted_data.tar tar_chunks/extracted_data.tar.part_
+            rm extracted_data.tar
+            mv full_tar.sha256 tar_chunks/
+            
+            echo "=== Chunks created ==="
+            ls -lah tar_chunks/
          else
            echo "ERROR: No extracted directories found, cannot create tar"
            exit 1
          fi

-      - name: Upload extracted data
+      - name: Upload extracted data chunks
        uses: actions/upload-artifact@v4
        with:
          name: adsb-extracted-${{ matrix.chunk.start_date }}-${{ matrix.chunk.end_date }}
-          path: data/output/extracted_data.tar
+          path: data/output/tar_chunks/
          retention-days: 1
          compression-level: 0
          if-no-files-found: warn
@@ -140,18 +156,40 @@ jobs:
        uses: actions/download-artifact@v4
        with:
          name: adsb-extracted-${{ matrix.chunk.start_date }}-${{ matrix.chunk.end_date }}
-          path: data/output/
-        continue-on-error: true
+          path: data/output/tar_chunks/

-      - name: Extract tar
+      - name: Reassemble and extract tar
        id: extract
        run: |
          cd data/output
-          if [ -f extracted_data.tar ]; then
-            echo "=== Tar file info ==="
+          if [ -d tar_chunks ] && ls tar_chunks/extracted_data.tar.part_* 1>/dev/null 2>&1; then
+            echo "=== Chunk files info ==="
+            ls -lah tar_chunks/
+            
+            cd tar_chunks
+            
+            # Reassemble tar with explicit sorting
+            echo "=== Reassembling tar file ==="
+            ls -1 extracted_data.tar.part_?? | sort | while read part; do
+              echo "Appending $part..."
+              cat "$part" >> ../extracted_data.tar
+            done
+            cd ..
+            
+            echo "=== Reassembled tar file info ==="
            ls -lah extracted_data.tar
-            echo "=== Verifying tar integrity ==="
-            tar -tf extracted_data.tar > /dev/null || { echo "ERROR: Tar file is corrupted"; exit 1; }
+            
+            # Verify checksum of reassembled tar matches original
+            echo "=== Verifying reassembled tar checksum ==="
+            echo "Original checksum:"
+            cat tar_chunks/full_tar.sha256
+            echo "Reassembled checksum:"
+            sha256sum extracted_data.tar
+            sha256sum -c tar_chunks/full_tar.sha256 || { echo "ERROR: Reassembled tar checksum mismatch - data corrupted during transfer"; exit 1; }
+            echo "Checksum verified - data integrity confirmed"
+            
+            rm -rf tar_chunks
+            
            echo "=== Extracting ==="
            tar -xvf extracted_data.tar
            rm extracted_data.tar
@@ -159,7 +197,7 @@ jobs:
            echo "=== Contents of data/output ==="
            ls -lah
          else
-            echo "No extracted_data.tar found"
+            echo "No tar chunks found"
            echo "has_data=false" >> "$GITHUB_OUTPUT"
          fi

@@ -5,6 +5,11 @@ on:
    # 6:00am UTC every day - runs on default branch, triggers both
    - cron: "0 06 * * *"
  workflow_dispatch:
+    inputs:
+      date:
+        description: 'Date to process (YYYY-MM-DD format, default: yesterday)'
+        required: false
+        type: string

 permissions:
  contents: write
@@ -58,7 +63,7 @@ jobs:

      - name: Run FAA release script
        run: |
-          python src/create_daily_faa_release.py
+          python src/create_daily_faa_release.py ${{ inputs.date && format('--date {0}', inputs.date) || '' }}
          ls -lah data/faa_releasable
          ls -lah data/openairframes

@@ -93,8 +98,10 @@ jobs:
          pip install -r requirements.txt

      - name: Download and extract ADS-B data
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
-          python -m src.adsb.download_and_list_icaos
+          python -m src.adsb.download_and_list_icaos ${{ inputs.date && format('--date {0}', inputs.date) || '' }}
          ls -lah data/output/

      - name: Check manifest exists
@@ -164,7 +171,7 @@ jobs:

      - name: Process chunk ${{ matrix.chunk }}
        run: |
-          python -m src.adsb.process_icao_chunk --chunk-id ${{ matrix.chunk }} --total-chunks 4
+          python -m src.adsb.process_icao_chunk --chunk-id ${{ matrix.chunk }} --total-chunks 4 ${{ inputs.date && format('--date {0}', inputs.date) || '' }}
          mkdir -p data/output/adsb_chunks
          ls -lah data/output/adsb_chunks/ || echo "No chunks created"

@@ -213,7 +220,7 @@ jobs:
        run: |
          mkdir -p data/output/adsb_chunks
          ls -lah data/output/adsb_chunks/ || echo "Directory empty or does not exist"
-          python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks
+          python -m src.adsb.combine_chunks_to_csv --chunks-dir data/output/adsb_chunks ${{ inputs.date && format('--date {0}', inputs.date) || '' }}
          ls -lah data/openairframes/

      - name: Upload ADS-B artifacts
@@ -259,6 +266,13 @@ jobs:
    needs: [build-faa, adsb-reduce, build-community]
    if: github.event_name != 'schedule'
    steps:
+      - name: Checkout for gh CLI
+        uses: actions/checkout@v4
+        with:
+          sparse-checkout: |
+            .github
+          sparse-checkout-cone-mode: false
+
      - name: Download FAA artifacts
        uses: actions/download-artifact@v4
        with:
@@ -279,6 +293,8 @@ jobs:

      - name: Debug artifact structure
        run: |
+          echo "=== Full artifacts tree ==="
+          find artifacts -type f 2>/dev/null || echo "No files found in artifacts"
          echo "=== FAA artifacts ==="
          find artifacts/faa -type f 2>/dev/null || echo "No files found in artifacts/faa"
          echo "=== ADS-B artifacts ==="
@@ -300,13 +316,35 @@ jobs:
          TAG="openairframes-${DATE}${BRANCH_SUFFIX}"
          
          # Find files from artifacts using find (handles nested structures)
-          CSV_FILE_FAA=$(find artifacts/faa -name "openairframes_faa_*.csv" | head -1)
+          CSV_FILE_FAA=$(find artifacts/faa -name "openairframes_faa_*.csv" -type f 2>/dev/null | head -1)
+          CSV_FILE_ADSB=$(find artifacts/adsb -name "openairframes_adsb_*.csv" -type f 2>/dev/null | head -1)
+          CSV_FILE_COMMUNITY=$(find artifacts/community -name "openairframes_community_*.csv" -type f 2>/dev/null | head -1)
+          ZIP_FILE=$(find artifacts/faa -name "ReleasableAircraft_*.zip" -type f 2>/dev/null | head -1)
+          
+          # Validate required files exist
+          MISSING_FILES=""
+          if [ -z "$CSV_FILE_FAA" ] || [ ! -f "$CSV_FILE_FAA" ]; then
+            MISSING_FILES="$MISSING_FILES FAA_CSV"
+          fi
+          if [ -z "$CSV_FILE_ADSB" ] || [ ! -f "$CSV_FILE_ADSB" ]; then
+            MISSING_FILES="$MISSING_FILES ADSB_CSV"
+          fi
+          if [ -z "$ZIP_FILE" ] || [ ! -f "$ZIP_FILE" ]; then
+            MISSING_FILES="$MISSING_FILES FAA_ZIP"
+          fi
+          
+          if [ -n "$MISSING_FILES" ]; then
+            echo "ERROR: Missing required release files:$MISSING_FILES"
+            echo "FAA CSV: $CSV_FILE_FAA"
+            echo "ADSB CSV: $CSV_FILE_ADSB"
+            echo "ZIP: $ZIP_FILE"
+            exit 1
+          fi
+          
+          # Get basenames for display
          CSV_BASENAME_FAA=$(basename "$CSV_FILE_FAA")
-          CSV_FILE_ADSB=$(find artifacts/adsb -name "openairframes_adsb_*.csv" | head -1)
          CSV_BASENAME_ADSB=$(basename "$CSV_FILE_ADSB")
-          CSV_FILE_COMMUNITY=$(find artifacts/community -name "openairframes_community_*.csv" 2>/dev/null | head -1 || echo "")
          CSV_BASENAME_COMMUNITY=$(basename "$CSV_FILE_COMMUNITY" 2>/dev/null || echo "")
-          ZIP_FILE=$(find artifacts/faa -name "ReleasableAircraft_*.zip" | head -1)
          ZIP_BASENAME=$(basename "$ZIP_FILE")
          
          echo "date=$DATE" >> "$GITHUB_OUTPUT"
@@ -320,9 +358,12 @@ jobs:
          echo "zip_file=$ZIP_FILE" >> "$GITHUB_OUTPUT"
          echo "zip_basename=$ZIP_BASENAME" >> "$GITHUB_OUTPUT"
          echo "name=OpenAirframes snapshot ($DATE)${BRANCH_SUFFIX}" >> "$GITHUB_OUTPUT"
-
-      - name: Checkout for gh CLI
-        uses: actions/checkout@v4
+          
+          echo "Found files:"
+          echo "  FAA CSV: $CSV_FILE_FAA"
+          echo "  ADSB CSV: $CSV_FILE_ADSB"
+          echo "  Community CSV: $CSV_FILE_COMMUNITY"
+          echo "  ZIP: $ZIP_FILE"

      - name: Delete existing release if exists
        run: |
@@ -336,6 +377,7 @@ jobs:
        with:
          tag_name: ${{ steps.meta.outputs.tag }}
          name: ${{ steps.meta.outputs.name }}
+          fail_on_unmatched_files: true
          body: |
            Automated daily snapshot generated at 06:00 UTC for ${{ steps.meta.outputs.date }}.

@@ -1,11 +0,0 @@
-#!/usr/bin/env python3
-import os
-import aws_cdk as cdk
-from stack import AdsbProcessingStack
-
-app = cdk.App()
-AdsbProcessingStack(app, "AdsbProcessingStack", env=cdk.Environment(
-    account=os.environ["CDK_DEFAULT_ACCOUNT"],
-    region=os.environ["CDK_DEFAULT_REGION"],
-))
-app.synth()
@@ -1,3 +0,0 @@
-{
-  "app": "python3 app.py"
-}
@@ -1,2 +0,0 @@
-aws-cdk-lib>=2.170.0
-constructs>=10.0.0
@@ -1,213 +0,0 @@
-import aws_cdk as cdk
-from aws_cdk import (
-    Stack,
-    Duration,
-    RemovalPolicy,
-    aws_s3 as s3,
-    aws_ecs as ecs,
-    aws_ec2 as ec2,
-    aws_ecr_assets,
-    aws_iam as iam,
-    aws_logs as logs,
-    aws_stepfunctions as sfn,
-    aws_stepfunctions_tasks as sfn_tasks,
-)
-from constructs import Construct
-from pathlib import Path
-
-
-class AdsbProcessingStack(Stack):
-    def __init__(self, scope: Construct, id: str, **kwargs):
-        super().__init__(scope, id, **kwargs)
-
-        # --- S3 bucket for intermediate and final results ---
-        bucket = s3.Bucket(
-            self, "ResultsBucket",
-            bucket_name="openairframes-dev",
-            removal_policy=RemovalPolicy.DESTROY,
-            auto_delete_objects=True,
-            lifecycle_rules=[
-                s3.LifecycleRule(
-                    prefix="intermediate/",
-                    expiration=Duration.days(7),
-                )
-            ],
-        )
-
-        # --- Use default VPC (no additional cost) ---
-        vpc = ec2.Vpc.from_lookup(
-            self, "Vpc",
-            is_default=True,
-        )
-
-        # --- ECS Cluster ---
-        cluster = ecs.Cluster(
-            self, "Cluster",
-            vpc=vpc,
-            container_insights=True,
-        )
-
-        # --- Log group ---
-        log_group = logs.LogGroup(
-            self, "LogGroup",
-            log_group_name="/adsb-processing",
-            removal_policy=RemovalPolicy.DESTROY,
-            retention=logs.RetentionDays.TWO_WEEKS,
-        )
-
-        # --- Docker images (built from local Dockerfiles) ---
-        adsb_dir = str(Path(__file__).parent.parent / "src" / "adsb")
-
-        worker_image = ecs.ContainerImage.from_asset(
-            adsb_dir,
-            file="Dockerfile.worker",
-            platform=cdk.aws_ecr_assets.Platform.LINUX_ARM64,
-        )
-        reducer_image = ecs.ContainerImage.from_asset(
-            adsb_dir,
-            file="Dockerfile.reducer",
-            platform=cdk.aws_ecr_assets.Platform.LINUX_ARM64,
-        )
-
-        # --- Task role (shared) ---
-        task_role = iam.Role(
-            self, "TaskRole",
-            assumed_by=iam.ServicePrincipal("ecs-tasks.amazonaws.com"),
-        )
-        bucket.grant_read_write(task_role)
-
-        # --- MAP: worker task definition ---
-        map_task_def = ecs.FargateTaskDefinition(
-            self, "MapTaskDef",
-            cpu=4096,           # 4 vCPU
-            memory_limit_mib=30720,  # 30 GB
-            task_role=task_role,
-            runtime_platform=ecs.RuntimePlatform(
-                cpu_architecture=ecs.CpuArchitecture.ARM64,
-                operating_system_family=ecs.OperatingSystemFamily.LINUX,
-            ),
-        )
-        map_container = map_task_def.add_container(
-            "worker",
-            image=worker_image,
-            logging=ecs.LogDrivers.aws_logs(
-                stream_prefix="map",
-                log_group=log_group,
-            ),
-            environment={
-                "S3_BUCKET": bucket.bucket_name,
-            },
-        )
-
-        # --- REDUCE: reducer task definition ---
-        reduce_task_def = ecs.FargateTaskDefinition(
-            self, "ReduceTaskDef",
-            cpu=4096,            # 4 vCPU
-            memory_limit_mib=30720,  # 30 GB — must hold full year in memory
-            task_role=task_role,
-            runtime_platform=ecs.RuntimePlatform(
-                cpu_architecture=ecs.CpuArchitecture.ARM64,
-                operating_system_family=ecs.OperatingSystemFamily.LINUX,
-            ),
-        )
-        reduce_container = reduce_task_def.add_container(
-            "reducer",
-            image=reducer_image,
-            logging=ecs.LogDrivers.aws_logs(
-                stream_prefix="reduce",
-                log_group=log_group,
-            ),
-            environment={
-                "S3_BUCKET": bucket.bucket_name,
-            },
-        )
-
-        # --- Step Functions ---
-
-        # Map task: run ECS Fargate for each date chunk
-        map_ecs_task = sfn_tasks.EcsRunTask(
-            self, "ProcessChunk",
-            integration_pattern=sfn.IntegrationPattern.RUN_JOB,
-            cluster=cluster,
-            task_definition=map_task_def,
-            launch_target=sfn_tasks.EcsFargateLaunchTarget(
-                platform_version=ecs.FargatePlatformVersion.LATEST,
-            ),
-            container_overrides=[
-                sfn_tasks.ContainerOverride(
-                    container_definition=map_container,
-                    environment=[
-                        sfn_tasks.TaskEnvironmentVariable(
-                            name="START_DATE",
-                            value=sfn.JsonPath.string_at("$.start_date"),
-                        ),
-                        sfn_tasks.TaskEnvironmentVariable(
-                            name="END_DATE",
-                            value=sfn.JsonPath.string_at("$.end_date"),
-                        ),
-                        sfn_tasks.TaskEnvironmentVariable(
-                            name="RUN_ID",
-                            value=sfn.JsonPath.string_at("$.run_id"),
-                        ),
-                    ],
-                )
-            ],
-            assign_public_ip=True,
-            subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC),
-            result_path="$.task_result",
-        )
-
-        # Map state — max 3 concurrent workers
-        map_state = sfn.Map(
-            self, "FanOutChunks",
-            items_path="$.chunks",
-            max_concurrency=3,
-            result_path="$.map_results",
-        )
-        map_state.item_processor(map_ecs_task)
-
-        # Reduce task: combine all chunk CSVs
-        reduce_ecs_task = sfn_tasks.EcsRunTask(
-            self, "ReduceResults",
-            integration_pattern=sfn.IntegrationPattern.RUN_JOB,
-            cluster=cluster,
-            task_definition=reduce_task_def,
-            launch_target=sfn_tasks.EcsFargateLaunchTarget(
-                platform_version=ecs.FargatePlatformVersion.LATEST,
-            ),
-            container_overrides=[
-                sfn_tasks.ContainerOverride(
-                    container_definition=reduce_container,
-                    environment=[
-                        sfn_tasks.TaskEnvironmentVariable(
-                            name="RUN_ID",
-                            value=sfn.JsonPath.string_at("$.run_id"),
-                        ),
-                        sfn_tasks.TaskEnvironmentVariable(
-                            name="GLOBAL_START_DATE",
-                            value=sfn.JsonPath.string_at("$.global_start_date"),
-                        ),
-                        sfn_tasks.TaskEnvironmentVariable(
-                            name="GLOBAL_END_DATE",
-                            value=sfn.JsonPath.string_at("$.global_end_date"),
-                        ),
-                    ],
-                )
-            ],
-            assign_public_ip=True,
-            subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC),
-        )
-
-        # Chain: fan-out map → reduce
-        definition = map_state.next(reduce_ecs_task)
-
-        sfn.StateMachine(
-            self, "Pipeline",
-            state_machine_name="adsb-map-reduce",
-            definition_body=sfn.DefinitionBody.from_chainable(definition),
-            timeout=Duration.hours(48),
-        )
-
-        # --- Outputs ---
-        cdk.CfnOutput(self, "BucketName", value=bucket.bucket_name)
-        cdk.CfnOutput(self, "StateMachineName", value="adsb-map-reduce")
@@ -1,6 +1,15 @@
 from pathlib import Path
-from datetime import datetime, timezone
-date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
+from datetime import datetime, timezone, timedelta
+import argparse
+
+parser = argparse.ArgumentParser(description="Create daily FAA release")
+parser.add_argument("--date", type=str, help="Date to process (YYYY-MM-DD format, default: today)")
+args = parser.parse_args()
+
+if args.date:
+    date_str = args.date
+else:
+    date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")

 out_dir = Path("data/faa_releasable")
 out_dir.mkdir(parents=True, exist_ok=True)
@@ -1,90 +0,0 @@
-"""
-Generate Step Functions input and start the pipeline.
-
-Usage:
-  python trigger_pipeline.py 2024-01-01 2025-01-01
-  python trigger_pipeline.py 2024-01-01 2025-01-01 --chunk-days 30
-  python trigger_pipeline.py 2024-01-01 2025-01-01 --dry-run
-"""
-import argparse
-import json
-import os
-import uuid
-from datetime import datetime, timedelta
-
-import boto3
-
-
-def generate_chunks(start_date: str, end_date: str, chunk_days: int = 1):
-    """Split a date range into chunks of chunk_days."""
-    start = datetime.strptime(start_date, "%Y-%m-%d")
-    end = datetime.strptime(end_date, "%Y-%m-%d")
-
-    chunks = []
-    current = start
-    while current < end:
-        chunk_end = min(current + timedelta(days=chunk_days), end)
-        chunks.append({
-            "start_date": current.strftime("%Y-%m-%d"),
-            "end_date": chunk_end.strftime("%Y-%m-%d"),
-        })
-        current = chunk_end
-
-    return chunks
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Trigger ADS-B map-reduce pipeline")
-    parser.add_argument("start_date", help="Start date (YYYY-MM-DD, inclusive)")
-    parser.add_argument("end_date", help="End date (YYYY-MM-DD, exclusive)")
-    parser.add_argument("--chunk-days", type=int, default=1,
-                        help="Days per chunk (default: 1)")
-    parser.add_argument("--dry-run", action="store_true",
-                        help="Print input JSON without starting execution")
-    args = parser.parse_args()
-
-    run_id = f"run-{datetime.utcnow().strftime('%Y%m%dT%H%M%S')}-{uuid.uuid4().hex[:8]}"
-    chunks = generate_chunks(args.start_date, args.end_date, args.chunk_days)
-
-    # Inject run_id into each chunk
-    for chunk in chunks:
-        chunk["run_id"] = run_id
-
-    sfn_input = {
-        "run_id": run_id,
-        "global_start_date": args.start_date,
-        "global_end_date": args.end_date,
-        "chunks": chunks,
-    }
-
-    print(f"Run ID:    {run_id}")
-    print(f"Chunks:    {len(chunks)} (at {args.chunk_days} days each)")
-    print(f"Max concurrency: 3 (enforced by Step Functions Map state)")
-    print()
-    print(json.dumps(sfn_input, indent=2))
-
-    if args.dry_run:
-        print("\n--dry-run: not starting execution")
-        return
-
-    client = boto3.client("stepfunctions")
-
-    # Find the state machine ARN
-    machines = client.list_state_machines()["stateMachines"]
-    arn = next(
-        m["stateMachineArn"]
-        for m in machines
-        if m["name"] == "adsb-map-reduce"
-    )
-
-    response = client.start_execution(
-        stateMachineArn=arn,
-        name=run_id,
-        input=json.dumps(sfn_input),
-    )
-
-    print(f"\nStarted execution: {response['executionArn']}")
-
-
-if __name__ == "__main__":
-    main()
Author	SHA1	Message	Date
ggman12	8999a943a9	update historical	2026-02-13 00:12:18 -05:00
ggman12	74625b9bc9	split large file into chunks	2026-02-12 20:22:36 -05:00
ggman12	f2728d6156	delete aws	2026-02-12 20:13:40 -05:00
ggman12	5ed10ec42e	update	2026-02-12 19:32:34 -05:00
ggman12	3b8a14a4b9	add ability for custom run input date	2026-02-12 19:09:35 -05:00
ggman12	e5f124428f	use github token for adsb.lol downloads	2026-02-12 19:03:23 -05:00
ggman12	d5039fb766	update to fix files	2026-02-12 19:01:02 -05:00