mirror of
https://github.com/PlaneQuery/OpenAirframes.git
synced 2026-04-24 12:06:31 +02:00
Add script to concatenate aircraft CSVs
This commit is contained in:
@@ -0,0 +1,89 @@
|
||||
from pathlib import Path
|
||||
import pandas as pd
|
||||
import re
|
||||
from derive_from_faa_master_txt import concat_faa_historical_df
|
||||
|
||||
# Columns that are always read as text so pandas does not infer a numeric
# dtype for them.
_STRING_COLUMNS = ("transponder_code", "unique_regulatory_id", "registrant_county")


def _read_aircraft_csv(csv_path: Path) -> pd.DataFrame:
    """Read one aircraft CSV with the identifier-like columns forced to ``str``."""
    return pd.read_csv(csv_path, dtype={column: str for column in _STRING_COLUMNS})


def _find_dated_csvs(input_dir: Path, pattern) -> list:
    """Return ``(start_date, end_date, path)`` tuples for matching CSVs, oldest first."""
    found = []
    # Iterating the glob in sorted order keeps the result deterministic even
    # if two files share the same (start, end) dates: list.sort() is stable.
    for csv_path in sorted(input_dir.glob("*.csv")):
        hit = pattern.search(csv_path.name)
        if hit:
            found.append((hit.group(1), hit.group(2), csv_path))

    # The dates are compared as strings; with the default pattern they are
    # zero-padded YYYY-MM-DD, for which string order equals date order.
    found.sort(key=lambda entry: (entry[0], entry[1]))
    return found


def concatenate_aircraft_csvs(
    input_dir: Path = Path("data/faa_releasable_historical"),
    output_dir: Path = Path("data/planequery_aircraft"),
    filename_pattern: str = r"planequery_aircraft_(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})\.csv",
) -> Path:
    """
    Merge every matching CSV in ``input_dir`` into a single output CSV.

    Files are selected by ``filename_pattern``, ordered by the start date and
    then the end date captured from their names, and folded together one at a
    time with ``concat_faa_historical_df``. The result is written to
    ``output_dir`` as ``planequery_aircraft_<first start>_<last end>.csv``.

    Args:
        input_dir: Directory containing the CSV files to concatenate.
        output_dir: Directory where the output CSV will be saved. It is
            created (including parents) if it does not exist.
        filename_pattern: Regex searched for in each CSV filename. Group 1
            must capture the start date and group 2 the end date.

    Returns:
        Path of the CSV file that was written.

    Raises:
        FileNotFoundError: If no CSV in ``input_dir`` matches the pattern.
        AssertionError: If ``download_date`` in the combined frame is not
            monotonically increasing.
    """
    input_dir = Path(input_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Find all matching CSV files, ordered by start date, then end date
    csv_files = _find_dated_csvs(input_dir, re.compile(filename_pattern))

    if not csv_files:
        raise FileNotFoundError(f"No CSV files matching pattern found in {input_dir}")

    print(f"Found {len(csv_files)} CSV files to concatenate")

    # Read first CSV as base
    first_start_date, _, first_path = csv_files[0]
    print(f"Reading base file: {first_path.name}")
    df_base = _read_aircraft_csv(first_path)

    # Concatenate remaining CSVs
    # NOTE(review): the merge semantics live in concat_faa_historical_df,
    # which is defined in another module - confirm there.
    for _, _, csv_path in csv_files[1:]:
        print(f"Concatenating: {csv_path.name}")
        df_base = concat_faa_historical_df(df_base, _read_aircraft_csv(csv_path))

    # Verify monotonic increasing download_date. This is raised explicitly
    # rather than written as an `assert` statement so the check still runs
    # under `python -O`; the exception type and message are unchanged.
    if not df_base['download_date'].is_monotonic_increasing:
        raise AssertionError("download_date is not monotonic increasing")

    # Output filename uses first start date and last end date
    last_end_date = csv_files[-1][1]
    output_filename = f"planequery_aircraft_{first_start_date}_{last_end_date}.csv"
    output_path = output_dir / output_filename

    print(f"Writing output to: {output_path}")
    df_base.to_csv(output_path, index=False)
    print(f"Successfully concatenated {len(csv_files)} files into {output_filename}")
    print(f"Total rows: {len(df_base)}")

    return output_path
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the concatenation on the example directories.
    # Edit the two paths below to point at other locations.
    source_dir = Path("data/faa_releasable_historical")
    target_dir = Path("data/planequery_aircraft")
    concatenate_aircraft_csvs(input_dir=source_dir, output_dir=target_dir)
|
||||
Reference in New Issue
Block a user