Create derived csv daily. Get historical FAA data

2026-07-23 02:00:49 +02:00 · 2026-02-02 20:48:35 -05:00
parent 2e60c64f61
commit 16a0a5fec8
12 changed files with 677 additions and 88 deletions
@@ -0,0 +1,14 @@
+#unique_regulatory_id
+# 1. read historical and output
+# 2. read sequentially
+
+# Instead of reading all csvs I can read just the latest release csv to get everything.
+
+from pathlib import Path
+
+# Walk the per-day directories for February 2024 in name order, convert each
+# day's Master.txt, and print the directory name next to the converter's result.
+# NOTE(review): master_txt_to_releasable_csv is neither imported nor defined in
+# this file, so the loop raises NameError as soon as a Master.txt is found —
+# confirm which module is supposed to provide it.
+base = Path("data/faa_releasable_historical")
+for day_dir in sorted(base.glob("2024-02-*")):
+    master = day_dir / "Master.txt"
+    # Day directories without a Master.txt are skipped silently.
+    if master.exists():
+        out_csv = master_txt_to_releasable_csv(master, out_dir="data/faa_releasable_historical_csv")
+        print(day_dir.name, "->", out_csv)
@@ -0,0 +1,89 @@
+from pathlib import Path
+import pandas as pd
+import re
+from derive_from_faa_master_txt import concat_faa_historical_df
+
+def concatenate_aircraft_csvs(
+    input_dir: Path = Path("data/concat"),
+    output_dir: Path = Path("data/planequery_aircraft"),
+    filename_pattern: str = r"planequery_aircraft_(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})\.csv"
+):
+    """
+    Merge every matching CSV in input_dir into a single de-duplicated CSV.
+
+    Files are processed in order of the start date and then the end date
+    captured from their names; each one is folded into the running result
+    with concat_faa_historical_df.
+
+    Args:
+        input_dir: Directory containing the CSV files to concatenate
+        output_dir: Directory where the output CSV will be saved (created if missing)
+        filename_pattern: Regex with two capture groups (start date, end date)
+            that is searched for in each CSV file name
+
+    Returns:
+        Path of the written CSV, named
+        planequery_aircraft_<first start date>_<last end date>.csv
+
+    Raises:
+        FileNotFoundError: No file in input_dir matches filename_pattern
+        AssertionError: download_date in the merged data is not monotonic increasing
+    """
+    input_dir = Path(input_dir)
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # These columns are read as text instead of letting pandas infer a type.
+    # Defined once so the base file and every later file are parsed alike.
+    text_columns = {
+        'transponder_code': str,
+        'unique_regulatory_id': str,
+        'registrant_county': str
+    }
+
+    # Find all matching CSV files as (start_date, end_date, path)
+    pattern = re.compile(filename_pattern)
+    csv_files = []
+    for csv_path in sorted(input_dir.glob("*.csv")):
+        match = pattern.search(csv_path.name)
+        if match:
+            csv_files.append((match.group(1), match.group(2), csv_path))
+
+    # Sort by start date, then end date (stable, so ties keep path order)
+    csv_files.sort(key=lambda x: (x[0], x[1]))
+
+    if not csv_files:
+        raise FileNotFoundError(f"No CSV files matching pattern found in {input_dir}")
+
+    print(f"Found {len(csv_files)} CSV files to concatenate")
+
+    # Read first CSV as base
+    first_start_date, _, first_path = csv_files[0]
+    print(f"Reading base file: {first_path.name}")
+    df_base = pd.read_csv(first_path, dtype=text_columns)
+
+    # Fold the remaining CSVs into the base, oldest first
+    for _, _, csv_path in csv_files[1:]:
+        print(f"Concatenating: {csv_path.name}")
+        df_new = pd.read_csv(csv_path, dtype=text_columns)
+        df_base = concat_faa_historical_df(df_base, df_new)
+
+    # Verify monotonic increasing download_date. Raised explicitly (same
+    # exception type as before) so the check is not stripped by `python -O`.
+    if not df_base['download_date'].is_monotonic_increasing:
+        raise AssertionError("download_date is not monotonic increasing")
+
+    # Output filename uses first start date and last end date
+    _, last_end_date, _ = csv_files[-1]
+    output_filename = f"planequery_aircraft_{first_start_date}_{last_end_date}.csv"
+    output_path = output_dir / output_filename
+
+    print(f"Writing output to: {output_path}")
+    df_base.to_csv(output_path, index=False)
+    print(f"Successfully concatenated {len(csv_files)} files into {output_filename}")
+    print(f"Total rows: {len(df_base)}")
+
+    return output_path
+
+
+if __name__ == "__main__":
+    # Script entry point: merge data/concat into data/planequery_aircraft.
+    # Change the two paths below to run against other directories.
+    source_dir = Path("data/concat")
+    target_dir = Path("data/planequery_aircraft")
+    concatenate_aircraft_csvs(input_dir=source_dir, output_dir=target_dir)
@@ -0,0 +1,33 @@
+from pathlib import Path
+from datetime import datetime, timezone
+from urllib.request import Request, urlopen
+
+from derive_from_faa_master_txt import convert_faa_master_txt_to_df, concat_faa_historical_df
+from get_latest_planequery_aircraft_release import get_latest_aircraft_csv_df
+
+# Daily job: fetch today's FAA ReleasableAircraft.zip (once per UTC day),
+# convert it to a DataFrame, append it to the latest published
+# planequery_aircraft CSV and write the extended CSV.
+date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
+
+out_dir = Path("data/faa_releasable")
+out_dir.mkdir(parents=True, exist_ok=True)
+zip_name = f"ReleasableAircraft_{date_str}.zip"
+
+zip_path = out_dir / zip_name
+if not zip_path.exists():
+    # URL and paths
+    url = "https://registry.faa.gov/database/ReleasableAircraft.zip"
+
+    req = Request(
+        url,
+        headers={"User-Agent": "Mozilla/5.0"},
+        method="GET",
+    )
+
+    with urlopen(req, timeout=120) as r:
+        body = r.read()
+
+    # Write to a sibling temp file and rename it into place. A crash while
+    # writing must not leave a truncated zip at zip_path, because the
+    # exists() check above would then treat it as a finished download.
+    tmp_path = zip_path.with_name(zip_name + ".part")
+    tmp_path.write_bytes(body)
+    tmp_path.replace(zip_path)
+
+OUT_ROOT = Path("data/planequery_aircraft")
+OUT_ROOT.mkdir(parents=True, exist_ok=True)
+df_new = convert_faa_master_txt_to_df(zip_path, date_str)
+df_base, start_date_str = get_latest_aircraft_csv_df()
+df_base = concat_faa_historical_df(df_base, df_new)
+# Raised explicitly (same exception type as the former assert) so the check
+# still runs under `python -O`.
+if not df_base['download_date'].is_monotonic_increasing:
+    raise AssertionError("download_date is not monotonic increasing")
+df_base.to_csv(OUT_ROOT / f"planequery_aircraft_{start_date_str}_{date_str}.csv", index=False)
@@ -0,0 +1,127 @@
+from pathlib import Path
+import zipfile
+import pandas as pd
+from faa_aircraft_registry import read
+
+def convert_faa_master_txt_to_df(zip_path: Path, date: str):
+    """
+    Build one flat DataFrame from the FAA release archive at zip_path.
+
+    The archive is parsed with faa_aircraft_registry.read and the values of
+    its 'master' mapping become the rows. The nested registrant, aircraft,
+    engine and certification objects are expanded into prefixed columns.
+
+    Args:
+        zip_path: Path of the zip archive to open
+        date: Value stored in the download_date column of every row
+
+    Returns:
+        DataFrame whose first column is download_date, with a derived
+        planequery_airframe_id column and every NaN replaced by "".
+    """
+    with zipfile.ZipFile(zip_path) as z:
+        registrations = read(z)
+
+    df = pd.DataFrame(registrations['master'].values())
+    
+    df.insert(0, "download_date", date)
+    
+    registrant = pd.json_normalize(df["registrant"]).add_prefix("registrant_")
+    df = df.drop(columns="registrant").join(registrant)
+    
+    # Move transponder_code_hex to index 1, i.e. the second column, directly
+    # after download_date (which was inserted at index 0 above)
+    cols = df.columns.tolist()
+    cols.remove("transponder_code_hex")
+    cols.insert(1, "transponder_code_hex")
+    df = df[cols]
+    
+    # The top-level aircraft_type / engine_type columns are renamed before the
+    # nested objects are expanded with the "aircraft_" / "engine_" prefixes
+    # (presumably to avoid a name clash with a nested "type" key — TODO confirm).
+    # Missing nested values are swapped for {} before json_normalize is applied.
+    df = df.rename(columns={"aircraft_type": "aircraft_type_2"})
+    aircraft = pd.json_normalize(df["aircraft"].where(df["aircraft"].notna(), {})).add_prefix("aircraft_")
+    df = df.drop(columns="aircraft").join(aircraft)
+    df = df.rename(columns={"engine_type": "engine_type_2"})
+    engine = pd.json_normalize(df["engine"].where(df["engine"].notna(), {})).add_prefix("engine_")
+    df = df.drop(columns="engine").join(engine)
+    # Note the prefix here is "certificate_", not "certification_"
+    certification = pd.json_normalize(df["certification"].where(df["certification"].notna(), {})).add_prefix("certificate_")
+    df = df.drop(columns="certification").join(certification)
+    
+    # Create planequery_airframe_id as
+    # <normalized manufacturer>|<normalized model>|<normalized serial number>
+    df["planequery_airframe_id"] = (
+        normalize(df["aircraft_manufacturer"])
+        + "|"
+        + normalize(df["aircraft_model"])
+        + "|"
+        + normalize(df["serial_number"])
+    )
+    
+    # Move planequery_airframe_id to come after registration_number
+    cols = df.columns.tolist()
+    cols.remove("planequery_airframe_id")
+    reg_idx = cols.index("registration_number")
+    cols.insert(reg_idx + 1, "planequery_airframe_id")
+    df = df[cols]
+    
+    # Convert all NaN to empty strings
+    df = df.fillna("")
+    
+    return df
+
+
+
+def normalize(s: pd.Series) -> pd.Series:
+    """Return s as upper-cased text keeping only word characters and hyphens.
+
+    Missing values become "". Surrounding whitespace is stripped and inner
+    whitespace runs are squeezed to one space before the final filter, which
+    then removes those spaces along with every other non-word character.
+    """
+    as_text = s.fillna("").astype(str)
+    trimmed_upper = as_text.str.upper().str.strip()
+    # collapse whitespace
+    single_spaced = trimmed_upper.str.replace(r"\s+", " ", regex=True)
+    # remove characters that cause false mismatches
+    return single_spaced.str.replace(r"[^\w\-]", "", regex=True)
+
+
+def concat_faa_historical_df(df_base, df_new):
+    """Append df_new to df_base and drop rows whose content was already seen.
+
+    df_new is reduced to df_base's columns (in that order) and stacked under
+    df_base with a fresh integer index. Two rows count as duplicates when
+    every column except download_date has the same canonical text; the
+    first occurrence is kept, so an unchanged record keeps its earliest
+    download_date. Surviving rows keep their post-concat index labels.
+    """
+    import ast
+
+    def joined_sorted(items):
+        # Lists compare equal regardless of element order.
+        return "|".join(sorted(str(item) for item in items))
+
+    def canonical(value):
+        # Real list objects
+        if isinstance(value, list):
+            return joined_sorted(value)
+
+        text = str(value).strip()
+
+        # Empty / missing
+        if text in ("", "nan"):
+            return ""
+
+        # Text that looks like a list literal, e.g. after a CSV round trip
+        if text.startswith('[') and text.endswith(']'):
+            try:
+                literal = ast.literal_eval(text)
+            except (ValueError, SyntaxError):
+                literal = None  # not a valid literal, fall through
+            if isinstance(literal, list):
+                return joined_sorted(literal)
+
+        # Numbers: "007", "7" and "7.0" all canonicalize to "7"
+        try:
+            number = float(text)
+            if number == int(number):
+                return str(int(number))
+            return str(number)
+        except (ValueError, OverflowError):
+            # Not a (finite) number, keep the stripped text
+            return text
+
+    combined = pd.concat([df_base, df_new[df_base.columns]], ignore_index=True)
+
+    content_columns = [name for name in combined.columns if name not in {"download_date"}]
+
+    canonical_cells = combined[content_columns].apply(
+        lambda column: column.apply(canonical), axis=0
+    )
+    combined["row_fingerprint"] = canonical_cells.apply(
+        lambda cells: "|".join(cells), axis=1
+    )
+
+    unique_rows = combined.drop_duplicates(subset=["row_fingerprint"], keep="first")
+    return unique_rows.drop(columns=["row_fingerprint"])
@@ -1,63 +1,116 @@
-'''Generated with ChatGPT 5.2 prompt
-scrape-faa-releasable-aircraft
-Every day it creates a new commit that takes ReleasableAircraft zip from FAA takes Master.txt to make these files (it does this so that all files stay under 100mb). For every commit day I want to recombine all the files into one Master.txt again. It has data/commits since 2023.
-scrape-faa-releasable-aircraft % ls
-ACFTREF.txt     DOCINDEX.txt    MASTER-1.txt    MASTER-3.txt    MASTER-5.txt    MASTER-7.txt    MASTER-9.txt    RESERVED.txt
-DEALER.txt      ENGINE.txt      MASTER-2.txt    MASTER-4.txt    MASTER-6.txt    MASTER-8.txt    README.md       ardata.pdf
-'''
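+"""
+For each commit-day between the `since` and `until` command-line dates (last commit per day):
+- Write the auxiliary FAA text files from that commit into: data/faa_releasable_historical/YYYY-MM-DD/
+    ACFTREF.txt, DEALER.txt, DOCINDEX.txt, ENGINE.txt, RESERVED.txt
+- Recombine MASTER-*.txt into MASTER.txt
+- Zip the day's files, convert the zip with convert_faa_master_txt_to_df and
+  merge the result into one running DataFrame via concat_faa_historical_df
+- Delete the day directory again; after the last day, write a single
+  planequery_aircraft_<first date>_<last date>.csv
+
+Assumes the non-master files are present in every commit.
+"""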
 import subprocess, re
 from pathlib import Path
+import shutil
 from collections import OrderedDict
+from derive_from_faa_master_txt import convert_faa_master_txt_to_df, concat_faa_historical_df
+import zipfile
+import pandas as pd
+import argparse
+from datetime import datetime, timedelta

-def run(*args: str) -> str:
-    return subprocess.check_output(args, text=True).strip()
+# Parse command line arguments
+parser = argparse.ArgumentParser(description="Process historical FAA data from git commits")
+parser.add_argument("since", help="Start date (YYYY-MM-DD)")
+parser.add_argument("until", help="End date (YYYY-MM-DD)")
+args = parser.parse_args()

-# Get commits that touched any MASTER-*.txt, oldest -> newest
-log = run("git", "log", "--reverse", "--format=%H %cs", "--", ".")
-# If you want to restrict to only commits that touched the master parts, use:
-# log = run("git", "log", "--reverse", "--format=%H %cs", "--", "MASTER-1.txt")
+# Local checkout of the scraper repository (must already exist; nothing here clones it) and the output root
+REPO = Path("data/scrape-faa-releasable-aircraft")
+OUT_ROOT = Path("data/faa_releasable_historical")
+OUT_ROOT.mkdir(parents=True, exist_ok=True)

+def run_git_text(*args: str) -> str:
+    return subprocess.check_output(["git", "-C", str(REPO), *args], text=True).strip()
+
+def run_git_bytes(*args: str) -> bytes:
+    return subprocess.check_output(["git", "-C", str(REPO), *args])
+
+# Parse dates and adjust --since to the day before
+since_date = datetime.strptime(args.since, "%Y-%m-%d")
+adjusted_since = (since_date - timedelta(days=1)).strftime("%Y-%m-%d")
+
+# All commits in specified date range (oldest -> newest)
+log = run_git_text(
+    "log",
+    "--reverse",
+    "--format=%H %cs",
+    f"--since={adjusted_since}",
+    f"--until={args.until}",
+)
 lines = [ln for ln in log.splitlines() if ln.strip()]
 if not lines:
-    raise SystemExit("No commits found.")
+    raise SystemExit(f"No commits found between {args.since} and {args.until}.")

-# Map date -> last commit SHA on that date (Ordered by history)
+# date -> last SHA that day
 date_to_sha = OrderedDict()
 for ln in lines:
    sha, date = ln.split()
-    # keep last SHA per day
    date_to_sha[date] = sha

-out_root = Path("out_master_by_day")
-out_root.mkdir(exist_ok=True)
-
+OTHER_FILES = ["ACFTREF.txt", "DEALER.txt", "DOCINDEX.txt", "ENGINE.txt", "RESERVED.txt"]
 master_re = re.compile(r"^MASTER-(\d+)\.txt$")
-
+df_base = pd.DataFrame()
+start_date = None
+end_date = None
 for date, sha in date_to_sha.items():
-    # list files at this commit, filter MASTER-*.txt in repo root
-    names = run("git", "ls-tree", "--name-only", sha).splitlines()
+    if start_date is None:
+        start_date = date
+    end_date = date
+    day_dir = OUT_ROOT / date
+    day_dir.mkdir(parents=True, exist_ok=True)
+
+    # Write auxiliary files (assumed present)
+    for fname in OTHER_FILES:
+        (day_dir / fname).write_bytes(run_git_bytes("show", f"{sha}:{fname}"))
+
+    # Recombine MASTER parts
+    names = run_git_text("ls-tree", "--name-only", sha).splitlines()
    parts = []
    for n in names:
        m = master_re.match(n)
        if m:
            parts.append((int(m.group(1)), n))
    parts.sort()
-
    if not parts:
-        # no master parts in that commit/day; skip
-        continue
+        raise RuntimeError(f"{date} {sha[:7]}: no MASTER-*.txt parts found")

-    day_dir = out_root / date
-    day_dir.mkdir(parents=True, exist_ok=True)
-    out_path = day_dir / "Master.txt"
-
-    with out_path.open("wb") as w:
+    master_path = day_dir / "MASTER.txt"
+    with master_path.open("wb") as w:
        for _, fname in parts:
-            data = subprocess.check_output(["git", "show", f"{sha}:{fname}"])
+            data = run_git_bytes("show", f"{sha}:{fname}")
            w.write(data)
            if data and not data.endswith(b"\n"):
                w.write(b"\n")

-    print(f"{date} {sha[:7]} -> {out_path} ({len(parts)} parts)")
+    # 3) Zip the day's files
+    zip_path = day_dir / "ReleasableAircraft.zip"
+    # List the members before the archive is created: the zip lives inside
+    # day_dir, so iterating the directory after opening it would add the
+    # half-written archive to itself.
+    members = [p for p in day_dir.iterdir() if p != zip_path]
+    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as z:
+        for p in members:
+            z.write(p, arcname=p.name)

-print(f"\nDone. Output root: {out_root.resolve()}")
+    print(f"{date} {sha[:7]} -> {day_dir} (master parts: {len(parts)})")
+    # 4) Convert ZIP -> CSV
+    df_new = convert_faa_master_txt_to_df(zip_path, date)
+    if df_base.empty:
+        df_base = df_new
+        print(len(df_base), "total entries so far")
+        # Delete all files in the day directory
+        shutil.rmtree(day_dir)
+        continue
+    
+    df_base = concat_faa_historical_df(df_base, df_new)
+    shutil.rmtree(day_dir)
+    print(len(df_base), "total entries so far")
+
+assert df_base['download_date'].is_monotonic_increasing, "download_date is not monotonic increasing"
+df_base.to_csv(OUT_ROOT / f"planequery_aircraft_{start_date}_{end_date}.csv", index=False)
+# TODO: get average number of new rows per day.
@@ -0,0 +1,144 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Iterable, Optional
+import re
+import urllib.request
+import urllib.error
+import json
+
+
+REPO = "PlaneQuery/planequery-aircraft"
+LATEST_RELEASE_URL = f"https://api.github.com/repos/{REPO}/releases/latest"
+
+
+# One downloadable file attached to a GitHub release. Instances are immutable
+# (frozen dataclass).
+@dataclass(frozen=True)
+class ReleaseAsset:
+    # name: the asset's "name" field from the release payload
+    # download_url: the asset's "browser_download_url" field
+    name: str
+    download_url: str
+    size: int  # bytes
+
+
+def _http_get_json(url: str, headers: dict[str, str]) -> dict:
+    """GET url with the given headers and parse the UTF-8 body as JSON."""
+    request = urllib.request.Request(url, headers=headers, method="GET")
+    with urllib.request.urlopen(request, timeout=120) as response:
+        raw_body = response.read()
+    text = raw_body.decode("utf-8")
+    return json.loads(text)
+
+
+def get_latest_release_assets(repo: str = REPO, github_token: Optional[str] = None) -> list[ReleaseAsset]:
+    """List the assets attached to the latest release of repo.
+
+    Args:
+        repo: GitHub repository in "owner/repo" form
+        github_token: When given, sent as a Bearer Authorization header
+
+    Returns:
+        One ReleaseAsset per entry of the payload's "assets" list (empty if
+        the key is absent); a missing "size" is recorded as 0.
+    """
+    request_headers = {
+        "Accept": "application/vnd.github+json",
+        "User-Agent": "planequery-aircraft-downloader/1.0",
+    }
+    if github_token:
+        request_headers["Authorization"] = f"Bearer {github_token}"
+
+    release = _http_get_json(
+        f"https://api.github.com/repos/{repo}/releases/latest",
+        headers=request_headers,
+    )
+    return [
+        ReleaseAsset(
+            name=entry["name"],
+            download_url=entry["browser_download_url"],
+            size=int(entry.get("size", 0)),
+        )
+        for entry in release.get("assets", [])
+    ]
+
+
+def pick_asset(
+    assets: Iterable[ReleaseAsset],
+    *,
+    exact_name: Optional[str] = None,
+    name_regex: Optional[str] = None,
+) -> ReleaseAsset:
+    """Select exactly one asset, by exact name or by regex on the name.
+
+    exact_name takes precedence when both selectors are given.
+
+    Raises:
+        FileNotFoundError: Nothing matched the selector in use
+        FileExistsError: name_regex matched more than one asset
+        ValueError: Neither exact_name nor name_regex was provided
+    """
+    candidates = list(assets)
+
+    if exact_name:
+        same_name = [c for c in candidates if c.name == exact_name]
+        if same_name:
+            return same_name[0]
+        raise FileNotFoundError(f"No asset exactly named {exact_name!r}. Available: {[c.name for c in candidates]}")
+
+    if name_regex:
+        matcher = re.compile(name_regex)
+        hits = [c for c in candidates if matcher.search(c.name)]
+        if len(hits) == 1:
+            return hits[0]
+        if not hits:
+            raise FileNotFoundError(f"No asset matched regex {name_regex!r}. Available: {[c.name for c in candidates]}")
+        raise FileExistsError(f"Regex {name_regex!r} matched multiple assets: {[h.name for h in hits]}")
+
+    raise ValueError("Provide either exact_name=... or name_regex=...")
+
+
+def download_asset(asset: ReleaseAsset, out_path: Path, github_token: Optional[str] = None) -> Path:
+    """Stream asset.download_url to out_path and return out_path.
+
+    The body is written to a sibling "<name>.part" file and renamed into
+    place only after the whole stream has been read, so a failed or
+    interrupted download never leaves a truncated file at out_path.
+
+    Args:
+        asset: Asset whose download_url is fetched (name is used in errors)
+        out_path: Destination file; parent directories are created
+        github_token: When given, sent as a Bearer Authorization header
+
+    Raises:
+        RuntimeError: The server answered with an HTTP error status; the
+            message carries the status code and up to 500 characters of body
+    """
+    out_path = Path(out_path)
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+
+    headers = {
+        "User-Agent": "planequery-aircraft-downloader/1.0",
+        "Accept": "application/octet-stream",
+    }
+    if github_token:
+        headers["Authorization"] = f"Bearer {github_token}"
+
+    req = urllib.request.Request(asset.download_url, headers=headers, method="GET")
+    part_path = out_path.with_name(out_path.name + ".part")
+
+    try:
+        with urllib.request.urlopen(req, timeout=300) as resp, part_path.open("wb") as f:
+            # Stream download in 1 MiB chunks until read() returns b""
+            for chunk in iter(lambda: resp.read(1024 * 1024), b""):
+                f.write(chunk)
+        # Only a complete download is published under the final name
+        part_path.replace(out_path)
+    except urllib.error.HTTPError as e:
+        body = e.read().decode("utf-8", errors="replace") if hasattr(e, "read") else ""
+        raise RuntimeError(f"HTTPError {e.code} downloading {asset.name}: {body[:500]}") from e
+    finally:
+        # Remove the partial file on any failure (already gone on success)
+        if part_path.exists():
+            part_path.unlink()
+
+    return out_path
+
+
+def download_latest_aircraft_csv(
+    output_dir: Path = Path("downloads"),
+    github_token: Optional[str] = None,
+    repo: str = REPO,
+) -> Path:
+    """
+    Download the latest planequery_aircraft_*.csv file from the latest GitHub release.
+
+    Args:
+        output_dir: Directory to save the downloaded file (default: "downloads");
+            a plain string is accepted as well
+        github_token: Optional GitHub token for authentication
+        repo: GitHub repository in format "owner/repo" (default: REPO)
+
+    Returns:
+        Path to the downloaded file
+
+    Raises:
+        FileNotFoundError: The release has no planequery_aircraft_*.csv asset
+        FileExistsError: The release has more than one such asset
+    """
+    # Coerce like the sibling functions do, so a str directory does not fail on "/"
+    output_dir = Path(output_dir)
+    assets = get_latest_release_assets(repo, github_token=github_token)
+    asset = pick_asset(assets, name_regex=r"^planequery_aircraft_.*\.csv$")
+    saved_to = download_asset(asset, output_dir / asset.name, github_token=github_token)
+    print(f"Downloaded: {asset.name} ({asset.size} bytes) -> {saved_to}")
+    return saved_to
+
+def get_latest_aircraft_csv_df():
+    """Download the latest release CSV and load it as a DataFrame.
+
+    Returns:
+        (df, date_str): the CSV contents with NaN replaced by "", and the
+        first YYYY-MM-DD date embedded in the file name
+        (planequery_aircraft_{date}_{date}.csv).
+
+    Raises:
+        ValueError: The downloaded file's name does not contain a date
+    """
+    import pandas as pd
+
+    csv_path = download_latest_aircraft_csv()
+
+    # Match against the file name only (the same text the error message
+    # reports), so a parent directory can never supply the date, and fail
+    # before spending time parsing the CSV.
+    match = re.search(r"planequery_aircraft_(\d{4}-\d{2}-\d{2})_", csv_path.name)
+    if not match:
+        raise ValueError(f"Could not extract date from filename: {csv_path.name}")
+
+    df = pd.read_csv(csv_path, dtype={'transponder_code': str,
+           'unique_regulatory_id': str,
+           'registrant_county': str})
+    df = df.fillna("")
+
+    date_str = match.group(1)
+    return df, date_str
+
+# Run as a script: fetch the latest release CSV using the function's defaults.
+if __name__ == "__main__":
+    download_latest_aircraft_csv()
@@ -1,48 +0,0 @@
-from faa_aircraft_registry import read
-import pandas as pd
-import zipfile
-import zipfile
-from pathlib import Path
-from datetime import datetime, timezone
-date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
-
-out_dir = Path("data/faa_releasable")
-out_dir.mkdir(parents=True, exist_ok=True)
-zip_name = f"ReleasableAircraft_{date_str}.zip"
-csv_name = f"ReleasableAircraft_{date_str}.csv"
-
-zip_path = out_dir / zip_name
-csv_path = out_dir / csv_name
-
-# URL and paths
-url = "https://registry.faa.gov/database/ReleasableAircraft.zip"
-from urllib.request import Request, urlopen
-
-req = Request(
-    url,
-    headers={"User-Agent": "Mozilla/5.0"},
-    method="GET",
-)
-
-with urlopen(req, timeout=120) as r:
-    body = r.read()
-    zip_path.write_bytes(body)
-
-with zipfile.ZipFile(zip_path) as z:
-    registrations = read(z)
-
-df = pd.DataFrame(registrations['master'].values())
-col = "transponder_code_hex"
-df = df[[col] + [c for c in df.columns if c != col]]
-df = df.rename(columns={"transponder_code_hex": "icao"})
-registrant = pd.json_normalize(df["registrant"]).add_prefix("registrant_")
-df = df.drop(columns="registrant").join(registrant)
-df = df.rename(columns={"aircraft_type": "aircraft_type_2"})
-aircraft = pd.json_normalize(df["aircraft"]).add_prefix("aircraft_")
-df = df.drop(columns="aircraft").join(aircraft)
-df = df.rename(columns={"engine_type": "engine_type_2"})
-engine = pd.json_normalize(df["engine"].where(df["engine"].notna(), {})).add_prefix("engine_")
-df = df.drop(columns="engine").join(engine)
-df = df.sort_values(by=["icao"])
-df.to_csv(csv_path, index=False)
-