mirror of
https://github.com/PlaneQuery/OpenAirframes.git
synced 2026-06-08 22:23:56 +02:00
add concat csvs
This commit is contained in:
@@ -22,24 +22,19 @@ jobs:
|
|||||||
ranges = []
|
ranges = []
|
||||||
current = start
|
current = start
|
||||||
|
|
||||||
|
# Process in 7-day chunks (weekly)
|
||||||
while current < end:
|
while current < end:
|
||||||
# Start of current month
|
chunk_end = current + timedelta(days=7)
|
||||||
month_start = current
|
|
||||||
# Calculate next month (handle year rollover)
|
|
||||||
if current.month == 12:
|
|
||||||
month_end = datetime(current.year + 1, 1, 1)
|
|
||||||
else:
|
|
||||||
month_end = datetime(current.year, current.month + 1, 1)
|
|
||||||
# Don't go past the end date
|
# Don't go past the end date
|
||||||
if month_end > end:
|
if chunk_end > end:
|
||||||
month_end = end
|
chunk_end = end
|
||||||
|
|
||||||
ranges.append({
|
ranges.append({
|
||||||
"since": month_start.strftime("%Y-%m-%d"),
|
"since": current.strftime("%Y-%m-%d"),
|
||||||
"until": month_end.strftime("%Y-%m-%d")
|
"until": chunk_end.strftime("%Y-%m-%d")
|
||||||
})
|
})
|
||||||
|
|
||||||
current = month_end
|
current = chunk_end
|
||||||
|
|
||||||
print(f"::set-output name=matrix::{json.dumps(ranges)}")
|
print(f"::set-output name=matrix::{json.dumps(ranges)}")
|
||||||
EOF
|
EOF
|
||||||
@@ -65,7 +60,7 @@ jobs:
|
|||||||
needs: [generate-matrix, clone-faa-repo]
|
needs: [generate-matrix, clone-faa-repo]
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
strategy:
|
strategy:
|
||||||
max-parallel: 5 # Process 5 chunks at a time
|
max-parallel: 10 # Process 10 chunks at a time
|
||||||
matrix:
|
matrix:
|
||||||
range: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
|
range: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
|
||||||
steps:
|
steps:
|
||||||
@@ -127,4 +122,50 @@ jobs:
|
|||||||
Generated: ${{ github.event.repository.updated_at }}
|
Generated: ${{ github.event.repository.updated_at }}
|
||||||
files: release-files/*.csv
|
files: release-files/*.csv
|
||||||
draft: false
|
draft: false
|
||||||
|
prerelease: false
|
||||||
|
|
||||||
|
# Job: gather the CSVs produced by the per-chunk jobs, merge them with
# scripts/concat_csvs.py, and publish the merged CSV as a GitHub release.
concatenate-and-release:
  # Runs only after the process-chunk job has finished.
  needs: process-chunk
  runs-on: ubuntu-latest
  permissions:
    # NOTE(review): presumably required so the release step can create a
    # tag/release in this repository; confirm against the action's docs.
    contents: write
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.12'

    - name: Install dependencies
      run: |
        pip install -r requirements.txt

    # No artifact name is given; per the step name this is meant to fetch
    # every artifact of the run into ./artifacts (TODO confirm the resulting
    # directory layout for download-artifact@v4).
    - name: Download all artifacts
      uses: actions/download-artifact@v4
      with:
        path: artifacts

    # Flattens every *.csv found anywhere under ./artifacts into a single
    # directory. Files sharing a basename would overwrite each other here.
    - name: Prepare CSVs for concatenation
      run: |
        mkdir -p data/faa_releasable_historical
        find artifacts -name "*.csv" -exec cp {} data/faa_releasable_historical/ \;
        ls -lh data/faa_releasable_historical/

    # Called with no arguments, so the script runs with its built-in
    # default input/output directories.
    - name: Concatenate all CSVs
      run: |
        python scripts/concat_csvs.py

    # Publishes every CSV in data/planequery_aircraft as release assets; the
    # tag and title are made unique per run via github.run_number.
    # NOTE(review): the "Processing period" line in the body is hard-coded
    # and is not derived from the generated matrix; keep it in sync manually.
    - name: Create Combined Release
      uses: softprops/action-gh-release@v1
      with:
        tag_name: historical-faa-combined-${{ github.run_number }}
        name: Historical FAA Data Combined Release ${{ github.run_number }}
        body: |
          Combined historical FAA aircraft data (all chunks concatenated)
          Processing period: 2023-08-16 to 2026-01-01
          Generated: ${{ github.event.repository.updated_at }}
        files: data/planequery_aircraft/*.csv
        draft: false
|
||||||
prerelease: false
|
prerelease: false
|
||||||
@@ -0,0 +1,89 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
import pandas as pd
|
||||||
|
import re
|
||||||
|
from derive_from_faa_master_txt import concat_faa_historical_df
|
||||||
|
|
||||||
|
def concatenate_aircraft_csvs(
    input_dir: Path = Path("data/faa_releasable_historical"),
    output_dir: Path = Path("data/planequery_aircraft"),
    filename_pattern: str = r"planequery_aircraft_(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})\.csv"
) -> Path:
    """
    Merge the per-chunk aircraft CSVs in input_dir into a single CSV.

    Files whose name matches filename_pattern are ordered by the start and
    end dates captured from their names. The first file is read as the base
    and every following file is folded into it, in order, with
    concat_faa_historical_df (imported from derive_from_faa_master_txt).
    The result is written to output_dir as
    planequery_aircraft_<first start date>_<last end date>.csv.

    Args:
        input_dir: Directory containing the CSV files to concatenate
        output_dir: Directory where the output CSV will be saved
            (created if it does not exist)
        filename_pattern: Regex searched for in each CSV filename; group 1
            must capture the start date and group 2 the end date

    Returns:
        Path of the combined CSV that was written.

    Raises:
        FileNotFoundError: If no CSV in input_dir matches filename_pattern.
        AssertionError: If download_date in the combined frame is not
            monotonic increasing.
    """
    input_dir = Path(input_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Collect (start_date, end_date, path) for every matching CSV file
    pattern = re.compile(filename_pattern)
    csv_files = []
    for csv_path in input_dir.glob("*.csv"):
        match = pattern.search(csv_path.name)
        if match:
            csv_files.append((match.group(1), match.group(2), csv_path))

    # Sort by start date, then end date; the path itself breaks any ties,
    # so the processing order is deterministic.
    csv_files.sort()

    if not csv_files:
        raise FileNotFoundError(f"No CSV files matching pattern found in {input_dir}")

    print(f"Found {len(csv_files)} CSV files to concatenate")

    # These columns are read as text so pandas does not coerce them to
    # numbers (which would, for example, drop leading zeros).
    string_columns = {
        'transponder_code': str,
        'unique_regulatory_id': str,
        'registrant_county': str,
    }

    # Read first CSV as base
    first_start_date, _, first_path = csv_files[0]
    print(f"Reading base file: {first_path.name}")
    df_base = pd.read_csv(first_path, dtype=string_columns)

    # Fold the remaining CSVs into the base one at a time, in date order
    for _, _, csv_path in csv_files[1:]:
        print(f"Concatenating: {csv_path.name}")
        df_new = pd.read_csv(csv_path, dtype=string_columns)
        df_base = concat_faa_historical_df(df_base, df_new)

    # Verify monotonic increasing download_date. Raised explicitly rather
    # than with an `assert` statement so the check still runs under
    # `python -O`; the exception type is kept for existing callers.
    if not df_base['download_date'].is_monotonic_increasing:
        raise AssertionError("download_date is not monotonic increasing")

    # Output filename uses first start date and last end date
    _, last_end_date, _ = csv_files[-1]
    output_filename = f"planequery_aircraft_{first_start_date}_{last_end_date}.csv"
    output_path = output_dir / output_filename

    print(f"Writing output to: {output_path}")
    df_base.to_csv(output_path, index=False)
    print(f"Successfully concatenated {len(csv_files)} files into {output_filename}")
    print(f"Total rows: {len(df_base)}")

    return output_path
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: merge the chunk CSVs using the directories below.
    # Adjust these two paths if the data lives somewhere else.
    source_dir = Path("data/faa_releasable_historical")
    target_dir = Path("data/planequery_aircraft")
    concatenate_aircraft_csvs(input_dir=source_dir, output_dir=target_dir)
|
||||||
Reference in New Issue
Block a user