add concat csvs

2026-06-08 14:13:57 +02:00 · 2026-02-01 21:47:07 -05:00
parent 2763e923fc
commit 1ea839669c
2 changed files with 143 additions and 13 deletions
@@ -0,0 +1,89 @@
+from pathlib import Path
+import pandas as pd
+import re
+from derive_from_faa_master_txt import concat_faa_historical_df
+
+def concatenate_aircraft_csvs(
+    input_dir: Path = Path("data/faa_releasable_historical"),
+    output_dir: Path = Path("data/planequery_aircraft"),
+    filename_pattern: str = r"planequery_aircraft_(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})\.csv"
+):
+    """
+    Read all CSVs matching the pattern from input_dir in order,
+    concatenate them using concat_faa_historical_df, and output a single CSV.
+    
+    Args:
+        input_dir: Directory containing the CSV files to concatenate
+        output_dir: Directory where the output CSV will be saved
+        filename_pattern: Regex pattern to match CSV filenames
+    """
+    input_dir = Path(input_dir)
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    
+    # Find all matching CSV files
+    pattern = re.compile(filename_pattern)
+    csv_files = []
+    
+    for csv_path in sorted(input_dir.glob("*.csv")):
+        match = pattern.search(csv_path.name)
+        if match:
+            start_date = match.group(1)
+            end_date = match.group(2)
+            csv_files.append((start_date, end_date, csv_path))
+    
+    # Sort by start date, then end date
+    csv_files.sort(key=lambda x: (x[0], x[1]))
+    
+    if not csv_files:
+        raise FileNotFoundError(f"No CSV files matching pattern found in {input_dir}")
+    
+    print(f"Found {len(csv_files)} CSV files to concatenate")
+    
+    # Read first CSV as base
+    first_start_date, first_end_date, first_path = csv_files[0]
+    print(f"Reading base file: {first_path.name}")
+    df_base = pd.read_csv(
+        first_path,
+        dtype={
+            'transponder_code': str,
+            'unique_regulatory_id': str,
+            'registrant_county': str
+        }
+    )
+    
+    # Concatenate remaining CSVs
+    for start_date, end_date, csv_path in csv_files[1:]:
+        print(f"Concatenating: {csv_path.name}")
+        df_new = pd.read_csv(
+            csv_path,
+            dtype={
+                'transponder_code': str,
+                'unique_regulatory_id': str,
+                'registrant_county': str
+            }
+        )
+        df_base = concat_faa_historical_df(df_base, df_new)
+    
+    # Verify monotonic increasing download_date
+    assert df_base['download_date'].is_monotonic_increasing, "download_date is not monotonic increasing"
+    
+    # Output filename uses first start date and last end date
+    last_start_date, last_end_date, _ = csv_files[-1]
+    output_filename = f"planequery_aircraft_{first_start_date}_{last_end_date}.csv"
+    output_path = output_dir / output_filename
+    
+    print(f"Writing output to: {output_path}")
+    df_base.to_csv(output_path, index=False)
+    print(f"Successfully concatenated {len(csv_files)} files into {output_filename}")
+    print(f"Total rows: {len(df_base)}")
+    
+    return output_path
+
+
+if __name__ == "__main__":
+    # Example usage - modify these paths as needed
+    concatenate_aircraft_csvs(
+        input_dir=Path("data/faa_releasable_historical"),
+        output_dir=Path("data/planequery_aircraft")
+    )