diff --git a/.github/workflows/process-historical-faa.yaml b/.github/workflows/process-historical-faa.yaml
index 358e440..0153cd8 100644
--- a/.github/workflows/process-historical-faa.yaml
+++ b/.github/workflows/process-historical-faa.yaml
@@ -22,24 +22,19 @@ jobs:
           ranges = []
           current = start
+          # Process in 7-day chunks (weekly)
           while current < end:
-              # Start of current month
-              month_start = current
-              # Calculate next month (handle year rollover)
-              if current.month == 12:
-                  month_end = datetime(current.year + 1, 1, 1)
-              else:
-                  month_end = datetime(current.year, current.month + 1, 1)
+              chunk_end = current + timedelta(days=7)
               # Don't go past the end date
-              if month_end > end:
-                  month_end = end
+              if chunk_end > end:
+                  chunk_end = end
               ranges.append({
-                  "since": month_start.strftime("%Y-%m-%d"),
-                  "until": month_end.strftime("%Y-%m-%d")
+                  "since": current.strftime("%Y-%m-%d"),
+                  "until": chunk_end.strftime("%Y-%m-%d")
               })
-              current = month_end
+              current = chunk_end
           print(f"::set-output name=matrix::{json.dumps(ranges)}")
           EOF
@@ -65,7 +60,7 @@ jobs:
     needs: [generate-matrix, clone-faa-repo]
     runs-on: ubuntu-latest
     strategy:
-      max-parallel: 5  # Process 5 chunks at a time
+      max-parallel: 10  # Process 10 chunks at a time
       matrix:
         range: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
     steps:
@@ -127,4 +122,50 @@ jobs:
           Generated: ${{ github.event.repository.updated_at }}
         files: release-files/*.csv
         draft: false
+        prerelease: false
+
+  concatenate-and-release:
+    needs: process-chunk  # waits for every matrix chunk to finish
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      - name: Install dependencies
+        run: |
+          pip install -r requirements.txt
+
+      - name: Download all artifacts
+        uses: actions/download-artifact@v4
+        with:
+          path: artifacts
+
+      - name: Prepare CSVs for concatenation
+        run: |
+          mkdir -p data/faa_releasable_historical
+          find artifacts -name "*.csv" -exec cp {} data/faa_releasable_historical/ \;
+          ls -lh data/faa_releasable_historical/
+
+      - name: Concatenate all CSVs
+        run: |
+          python scripts/concat_csvs.py
+
+      - name: Create Combined Release
+        uses: softprops/action-gh-release@v1
+        with:
+          tag_name: historical-faa-combined-${{ github.run_number }}
+          name: Historical FAA Data Combined Release ${{ github.run_number }}
+          body: |
+            Combined historical FAA aircraft data (all chunks concatenated)
+            Processing period: 2023-08-16 to 2026-01-01
+            Generated: ${{ github.event.repository.updated_at }}
+          files: data/planequery_aircraft/*.csv
+          draft: false
+        prerelease: false
diff --git a/scripts/concat_csvs.py b/scripts/concat_csvs.py
new file mode 100644
index 0000000..943e48b
--- /dev/null
+++ b/scripts/concat_csvs.py
@@ -0,0 +1,90 @@
+from pathlib import Path
+import pandas as pd
+import re
+from derive_from_faa_master_txt import concat_faa_historical_df
+
+def concatenate_aircraft_csvs(
+    input_dir: Path = Path("data/faa_releasable_historical"),
+    output_dir: Path = Path("data/planequery_aircraft"),
+    filename_pattern: str = r"planequery_aircraft_(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})\.csv"
+):
+    """
+    Read all CSVs matching the pattern from input_dir in order,
+    concatenate them using concat_faa_historical_df, and output a single CSV.
+
+    Args:
+        input_dir: Directory containing the CSV files to concatenate
+        output_dir: Directory where the output CSV will be saved
+        filename_pattern: Regex pattern to match CSV filenames
+    """
+    input_dir = Path(input_dir)
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Find all matching CSV files
+    pattern = re.compile(filename_pattern)
+    csv_files = []
+
+    for csv_path in sorted(input_dir.glob("*.csv")):
+        match = pattern.search(csv_path.name)
+        if match:
+            start_date = match.group(1)
+            end_date = match.group(2)
+            csv_files.append((start_date, end_date, csv_path))
+
+    # Sort by start date, then end date
+    csv_files.sort(key=lambda x: (x[0], x[1]))
+
+    if not csv_files:
+        raise FileNotFoundError(f"No CSV files matching pattern found in {input_dir}")
+
+    print(f"Found {len(csv_files)} CSV files to concatenate")
+
+    # Read first CSV as base
+    first_start_date, first_end_date, first_path = csv_files[0]
+    print(f"Reading base file: {first_path.name}")
+    df_base = pd.read_csv(
+        first_path,
+        dtype={
+            'transponder_code': str,
+            'unique_regulatory_id': str,
+            'registrant_county': str
+        }
+    )
+
+    # Concatenate remaining CSVs
+    for start_date, end_date, csv_path in csv_files[1:]:
+        print(f"Concatenating: {csv_path.name}")
+        df_new = pd.read_csv(
+            csv_path,
+            dtype={
+                'transponder_code': str,
+                'unique_regulatory_id': str,
+                'registrant_county': str
+            }
+        )
+        df_base = concat_faa_historical_df(df_base, df_new)
+
+    # Verify monotonic increasing download_date (explicit raise: `assert` is stripped under -O)
+    if not df_base['download_date'].is_monotonic_increasing:
+        raise ValueError("download_date is not monotonic increasing")
+
+    # Output filename uses first start date and last end date
+    last_start_date, last_end_date, _ = csv_files[-1]
+    output_filename = f"planequery_aircraft_{first_start_date}_{last_end_date}.csv"
+    output_path = output_dir / output_filename
+
+    print(f"Writing output to: {output_path}")
+    df_base.to_csv(output_path, index=False)
+    print(f"Successfully concatenated {len(csv_files)} files into {output_filename}")
+    print(f"Total rows: {len(df_base)}")
+
+    return output_path
+
+
+if __name__ == "__main__":
+    # Example usage - modify these paths as needed
+    concatenate_aircraft_csvs(
+        input_dir=Path("data/faa_releasable_historical"),
+        output_dir=Path("data/planequery_aircraft")
+    )