Add CSV concatenation script and combined-release job; process in weekly chunks

2026-04-23 19:46:09 +02:00 · 2026-02-01 21:47:07 -05:00
parent 2763e923fc
commit 1ea839669c
2 changed files with 143 additions and 13 deletions
@@ -22,24 +22,19 @@ jobs:
          ranges = []
          current = start
          
+          # Process in 7-day chunks (weekly)
          while current < end:
-            # Start of current month
-            month_start = current
-            # Calculate next month (handle year rollover)
-            if current.month == 12:
-              month_end = datetime(current.year + 1, 1, 1)
-            else:
-              month_end = datetime(current.year, current.month + 1, 1)
+            chunk_end = current + timedelta(days=7)
            # Don't go past the end date
-            if month_end > end:
-              month_end = end
+            if chunk_end > end:
+              chunk_end = end
            
            ranges.append({
-              "since": month_start.strftime("%Y-%m-%d"),
-              "until": month_end.strftime("%Y-%m-%d")
+              "since": current.strftime("%Y-%m-%d"),
+              "until": chunk_end.strftime("%Y-%m-%d")
            })
            
-            current = month_end
+            current = chunk_end
          
          print(f"::set-output name=matrix::{json.dumps(ranges)}")
          EOF
@@ -65,7 +60,7 @@ jobs:
    needs: [generate-matrix, clone-faa-repo]
    runs-on: ubuntu-latest
    strategy:
-      max-parallel: 5  # Process 5 chunks at a time
+      max-parallel: 10  # Process 10 chunks at a time
      matrix:
        range: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
    steps:
@@ -127,4 +122,50 @@ jobs:
            Generated: ${{ github.event.repository.updated_at }}
          files: release-files/*.csv
          draft: false
+          prerelease: false
+
+  concatenate-and-release:  # merges the per-chunk CSVs into one file and publishes it as a release
+    needs: process-chunk  # start only after the process-chunk job has completed
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write  # write access to repository contents; the release step at the end publishes a release
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+      
+      - name: Install dependencies
+        run: |
+          pip install -r requirements.txt
+      
+      - name: Download all artifacts
+        uses: actions/download-artifact@v4
+        with:
+          path: artifacts  # no artifact name is given, so the artifacts of this run are all placed under artifacts/
+      
+      - name: Prepare CSVs for concatenation  # flattens the artifact tree: every CSV is copied into one directory (same-named files overwrite each other)
+        run: |
+          mkdir -p data/faa_releasable_historical
+          find artifacts -name "*.csv" -exec cp {} data/faa_releasable_historical/ \;
+          ls -lh data/faa_releasable_historical/
+      
+      - name: Concatenate all CSVs  # NOTE(review): presumably reads data/faa_releasable_historical and writes data/planequery_aircraft - confirm in the script
+        run: |
+          python scripts/concat_csvs.py
+      
+      - name: Create Combined Release
+        uses: softprops/action-gh-release@v1
+        with:
+          tag_name: historical-faa-combined-${{ github.run_number }}
+          name: Historical FAA Data Combined Release ${{ github.run_number }}
+          body: |
+            Combined historical FAA aircraft data (all chunks concatenated)
+            Processing period: 2023-08-16 to 2026-01-01
+            Generated: ${{ github.event.repository.updated_at }}
+          files: data/planequery_aircraft/*.csv  # attach every CSV found in this directory to the release
+          draft: false
           prerelease: false
@@ -0,0 +1,89 @@
+from pathlib import Path
+import pandas as pd
+import re
+from derive_from_faa_master_txt import concat_faa_historical_df
+
+def concatenate_aircraft_csvs(
+    input_dir: Path = Path("data/faa_releasable_historical"),
+    output_dir: Path = Path("data/planequery_aircraft"),
+    filename_pattern: str = r"planequery_aircraft_(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})\.csv"
+):
+    """
+    Concatenate date-ranged aircraft CSVs from input_dir into a single CSV.
+
+    Files whose names match filename_pattern are ordered by the start and end
+    dates captured from their names (compared as strings), folded together one
+    by one with concat_faa_historical_df, and written to output_dir as
+    planequery_aircraft_<first start date>_<last end date>.csv.
+
+    Args:
+        input_dir: Directory containing the CSV files to concatenate
+        output_dir: Directory where the output CSV will be saved (created if missing)
+        filename_pattern: Regex searched in each filename; groups 1 and 2 are the start/end dates
+
+    Returns:
+        Path of the combined CSV that was written.
+
+    Raises:
+        FileNotFoundError: If no CSV in input_dir matches filename_pattern.
+        AssertionError: If download_date of the combined data is not monotonic increasing.
+    """
+    input_dir = Path(input_dir)
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # dtype overrides applied to every file read below; defined once so the reads cannot drift apart.
+    str_columns = {
+        'transponder_code': str,
+        'unique_regulatory_id': str,
+        'registrant_county': str
+    }
+
+    # Collect (start_date, end_date, path) for each matching file, then order by the two dates.
+    pattern = re.compile(filename_pattern)
+    csv_files = []
+    for csv_path in sorted(input_dir.glob("*.csv")):
+        match = pattern.search(csv_path.name)
+        if match:
+            csv_files.append((match.group(1), match.group(2), csv_path))
+    csv_files.sort(key=lambda item: item[:2])
+
+    if not csv_files:
+        raise FileNotFoundError(f"No CSV files matching pattern found in {input_dir}")
+
+    print(f"Found {len(csv_files)} CSV files to concatenate")
+
+    # Read first CSV as base
+    first_start_date, _, first_path = csv_files[0]
+    print(f"Reading base file: {first_path.name}")
+    df_base = pd.read_csv(first_path, dtype=str_columns)
+
+    # Concatenate remaining CSVs
+    for _, _, csv_path in csv_files[1:]:
+        print(f"Concatenating: {csv_path.name}")
+        df_new = pd.read_csv(csv_path, dtype=str_columns)
+        df_base = concat_faa_historical_df(df_base, df_new)
+
+    # Explicit raise: a bare `assert` is stripped under `python -O`, which would skip this check.
+    if not df_base['download_date'].is_monotonic_increasing:
+        raise AssertionError("download_date is not monotonic increasing")
+
+    # Output filename uses first start date and last end date
+    last_end_date = csv_files[-1][1]
+    output_filename = f"planequery_aircraft_{first_start_date}_{last_end_date}.csv"
+    output_path = output_dir / output_filename
+
+    print(f"Writing output to: {output_path}")
+    df_base.to_csv(output_path, index=False)
+    print(f"Successfully concatenated {len(csv_files)} files into {output_filename}")
+    print(f"Total rows: {len(df_base)}")
+
+    return output_path
+
+
+if __name__ == "__main__":
+    # Script entry point: combine the CSVs under data/faa_releasable_historical
+    # and write the single combined CSV under data/planequery_aircraft.
+    source_dir = Path("data/faa_releasable_historical")
+    target_dir = Path("data/planequery_aircraft")
+    concatenate_aircraft_csvs(input_dir=source_dir, output_dir=target_dir)