add concat csvs

2026-08-01 14:18:35 +02:00 · 2026-02-01 21:47:07 -05:00
parent 2763e923fc
commit 1ea839669c
2 changed files with 143 additions and 13 deletions
@@ -22,24 +22,19 @@ jobs:
          ranges = []
          current = start
          # Process in 7-day chunks (weekly)
          while current < end:
-            # Start of current month
+            chunk_end = current + timedelta(days=7)
            month_start = current
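            # Calculate next month (handle year rollover) -- NOTE(review): leftover from the monthly chunking; month_start/month_end are no longer read now that ranges are built from current/chunk_end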
            if current.month == 12:
              month_end = datetime(current.year + 1, 1, 1)
            else:
              month_end = datetime(current.year, current.month + 1, 1)
            # Don't go past the end date
-            if month_end > end:
+            if chunk_end > end:
-              month_end = end
+              chunk_end = end
            ranges.append({
-              "since": month_start.strftime("%Y-%m-%d"),
+              "since": current.strftime("%Y-%m-%d"),
-              "until": month_end.strftime("%Y-%m-%d")
+              "until": chunk_end.strftime("%Y-%m-%d")
            })
-            current = month_end
+            current = chunk_end
          print(f"::set-output name=matrix::{json.dumps(ranges)}")
          EOF
@@ -65,7 +60,7 @@ jobs:
    needs: [generate-matrix, clone-faa-repo]
    runs-on: ubuntu-latest
    strategy:
-      max-parallel: 5  # Process 5 chunks at a time
+      max-parallel: 10  # Process 10 chunks at a time
      matrix:
        range: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
    steps:
@@ -127,4 +122,50 @@ jobs:
            Generated: ${{ github.event.repository.updated_at }}
          files: release-files/*.csv
          draft: false
          prerelease: false
  # Final job: waits for the process-chunk job, collects the CSV artifacts it
  # uploaded, runs scripts/concat_csvs.py over them, and publishes the CSVs
  # found under data/planequery_aircraft/ as one GitHub release.
  concatenate-and-release:
    needs: process-chunk
    runs-on: ubuntu-latest
    # contents: write is requested for this job; presumably required by the
    # release step at the end to create the tag/release -- confirm.
    permissions:
      contents: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'
      - name: Install dependencies
        run: |
          pip install -r requirements.txt
      # No artifact name is given, so nothing here restricts which artifacts
      # of the run are fetched; everything lands under ./artifacts.
      - name: Download all artifacts
        uses: actions/download-artifact@v4
        with:
          path: artifacts
      # Flattens every *.csv found anywhere under ./artifacts into a single
      # directory. NOTE(review): two artifacts containing a CSV with the same
      # file name would overwrite each other in this copy.
      - name: Prepare CSVs for concatenation
        run: |
          mkdir -p data/faa_releasable_historical
          find artifacts -name "*.csv" -exec cp {} data/faa_releasable_historical/ \;
          ls -lh data/faa_releasable_historical/
      # Assumes scripts/concat_csvs.py reads data/faa_releasable_historical/
      # and writes its result under data/planequery_aircraft/ -- verify
      # against the script's defaults.
      - name: Concatenate all CSVs
        run: |
          python scripts/concat_csvs.py
      # Tag and release name are made unique per run via github.run_number.
      # NOTE(review): the "Processing period" line in the body is a hard-coded
      # date range, not derived from the matrix that was actually processed.
      - name: Create Combined Release
        uses: softprops/action-gh-release@v1
        with:
          tag_name: historical-faa-combined-${{ github.run_number }}
          name: Historical FAA Data Combined Release ${{ github.run_number }}
          body: |
            Combined historical FAA aircraft data (all chunks concatenated)
            Processing period: 2023-08-16 to 2026-01-01
            Generated: ${{ github.event.repository.updated_at }}
          files: data/planequery_aircraft/*.csv
          draft: false
          prerelease: false
@@ -0,0 +1,89 @@
 from pathlib import Path
 import pandas as pd
 import re
 from derive_from_faa_master_txt import concat_faa_historical_df
 def _read_aircraft_csv(csv_path: Path) -> pd.DataFrame:
     """Load one chunk CSV, forcing the code/id columns to be read as strings."""
     return pd.read_csv(
         csv_path,
         dtype={
             'transponder_code': str,
             'unique_regulatory_id': str,
             'registrant_county': str,
         },
     )


 def concatenate_aircraft_csvs(
     input_dir: Path = Path("data/faa_releasable_historical"),
     output_dir: Path = Path("data/planequery_aircraft"),
     filename_pattern: str = r"planequery_aircraft_(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})\.csv"
 ) -> Path:
     """
     Merge the per-chunk aircraft CSVs in input_dir into a single CSV.

     Files whose name matches filename_pattern are ordered by the start and
     end dates captured from the file name, read one after another, and folded
     together with concat_faa_historical_df. The combined frame is written to
     output_dir as planequery_aircraft_<first start>_<last end>.csv.

     Args:
         input_dir: Directory containing the CSV files to concatenate
         output_dir: Directory where the output CSV will be saved (created
             if it does not exist)
         filename_pattern: Regex searched for in each CSV file name; group 1
             must capture the chunk start date and group 2 the end date

     Returns:
         Path of the combined CSV that was written.

     Raises:
         FileNotFoundError: If no CSV in input_dir matches filename_pattern.
         ValueError: If the combined download_date column is not monotonic
             increasing.
     """
     input_dir = Path(input_dir)
     output_dir = Path(output_dir)
     output_dir.mkdir(parents=True, exist_ok=True)

     # Collect (start_date, end_date, path) for every matching file. The glob
     # result is sorted by path first so that files with identical dates keep
     # a deterministic order after the stable sort below.
     pattern = re.compile(filename_pattern)
     csv_files = []
     for csv_path in sorted(input_dir.glob("*.csv")):
         match = pattern.search(csv_path.name)
         if match:
             csv_files.append((match.group(1), match.group(2), csv_path))

     # ISO dates (YYYY-MM-DD) sort chronologically as plain strings.
     csv_files.sort(key=lambda entry: (entry[0], entry[1]))

     if not csv_files:
         raise FileNotFoundError(f"No CSV files matching pattern found in {input_dir}")

     print(f"Found {len(csv_files)} CSV files to concatenate")

     # The earliest chunk seeds the accumulator.
     first_start_date, _, first_path = csv_files[0]
     print(f"Reading base file: {first_path.name}")
     df_base = _read_aircraft_csv(first_path)

     # Fold every later chunk into the accumulator, in date order.
     for _, _, csv_path in csv_files[1:]:
         print(f"Concatenating: {csv_path.name}")
         df_base = concat_faa_historical_df(df_base, _read_aircraft_csv(csv_path))

     # Explicit raise instead of `assert`: asserts are removed under
     # `python -O`, which would silently skip this data-integrity check.
     if not df_base['download_date'].is_monotonic_increasing:
         raise ValueError("download_date is not monotonic increasing")

     # Output filename uses first start date and last end date
     last_end_date = csv_files[-1][1]
     output_filename = f"planequery_aircraft_{first_start_date}_{last_end_date}.csv"
     output_path = output_dir / output_filename

     print(f"Writing output to: {output_path}")
     df_base.to_csv(output_path, index=False)

     print(f"Successfully concatenated {len(csv_files)} files into {output_filename}")
     print(f"Total rows: {len(df_base)}")

     return output_path
 if __name__ == "__main__":
     # Script entry point: merge the chunk CSVs from the historical data
     # directory into the combined-output directory. Adjust the two paths
     # below if the data lives elsewhere.
     source_dir = Path("data/faa_releasable_historical")
     target_dir = Path("data/planequery_aircraft")
     concatenate_aircraft_csvs(input_dir=source_dir, output_dir=target_dir)