From 87e37df035927b9f9910b6f2dd79b65ef741990d Mon Sep 17 00:00:00 2001 From: ggman12 Date: Mon, 2 Feb 2026 20:28:09 -0500 Subject: [PATCH] handle duplicates much better --- .../planequery-aircraft-daily-release.yaml | 2 +- ...reate_daily_planequery_aircraft_release.py | 24 +++++----- src/derive_from_faa_master_txt.py | 45 +++++++++++++++++-- src/get_latest_planequery_aircraft_release.py | 5 ++- 4 files changed, 58 insertions(+), 18 deletions(-) diff --git a/.github/workflows/planequery-aircraft-daily-release.yaml b/.github/workflows/planequery-aircraft-daily-release.yaml index 003ffac..37309ca 100644 --- a/.github/workflows/planequery-aircraft-daily-release.yaml +++ b/.github/workflows/planequery-aircraft-daily-release.yaml @@ -39,7 +39,7 @@ jobs: id: meta run: | DATE=$(date -u +"%Y-%m-%d") - TAG="faa-${DATE}" + TAG="planequery-aircraft-${DATE}" # Find the CSV file in data/planequery_aircraft matching the pattern CSV_FILE=$(ls data/planequery_aircraft/planequery_aircraft_*_${DATE}.csv | head -1) CSV_BASENAME=$(basename "$CSV_FILE") diff --git a/src/create_daily_planequery_aircraft_release.py b/src/create_daily_planequery_aircraft_release.py index e86528f..4019aa7 100644 --- a/src/create_daily_planequery_aircraft_release.py +++ b/src/create_daily_planequery_aircraft_release.py @@ -7,20 +7,20 @@ out_dir.mkdir(parents=True, exist_ok=True) zip_name = f"ReleasableAircraft_{date_str}.zip" zip_path = out_dir / zip_name +if not zip_path.exists(): + # URL and paths + url = "https://registry.faa.gov/database/ReleasableAircraft.zip" + from urllib.request import Request, urlopen -# URL and paths -url = "https://registry.faa.gov/database/ReleasableAircraft.zip" -from urllib.request import Request, urlopen + req = Request( + url, + headers={"User-Agent": "Mozilla/5.0"}, + method="GET", + ) -req = Request( - url, - headers={"User-Agent": "Mozilla/5.0"}, - method="GET", -) - -with urlopen(req, timeout=120) as r: - body = r.read() - zip_path.write_bytes(body) + with urlopen(req, 
timeout=120) as r: + body = r.read() + zip_path.write_bytes(body) OUT_ROOT = Path("data/planequery_aircraft") OUT_ROOT.mkdir(parents=True, exist_ok=True) diff --git a/src/derive_from_faa_master_txt.py b/src/derive_from_faa_master_txt.py index 8ab519c..532d3f3 100644 --- a/src/derive_from_faa_master_txt.py +++ b/src/derive_from_faa_master_txt.py @@ -66,7 +66,7 @@ def normalize(s: pd.Series) -> pd.Series: def concat_faa_historical_df(df_base, df_new): - + df_new = df_new[df_base.columns] df_base = pd.concat([df_base, df_new], ignore_index=True) CONTENT_COLS = [ @@ -74,10 +74,49 @@ def concat_faa_historical_df(df_base, df_new): if c not in {"download_date"} ] + # Normalize values to handle numeric type, formatting, and list ordering differences + def normalize_series(series): + def normalize_value(val): + # Handle lists (sort them for consistent comparison) + if isinstance(val, list): + return "|".join(sorted(str(v) for v in val)) + + # Convert to string + val_str = str(val).strip() + + # Handle empty strings + if val_str == "" or val_str == "nan": + return "" + + # Check if it looks like a list representation (starts with [ ) + if val_str.startswith('[') and val_str.endswith(']'): + try: + # Try to parse as a list-like string + import ast + parsed = ast.literal_eval(val_str) + if isinstance(parsed, list): + return "|".join(sorted(str(v) for v in parsed)) + except (ValueError, SyntaxError): + pass # Not a valid list, continue to other checks + + # Try to normalize as number + try: + # Remove leading zeros and convert float/int representations + num_val = float(val_str) + # If it's a whole number, return as int string (no .0) + if num_val == int(num_val): + return str(int(num_val)) + # Otherwise return as float + return str(num_val) + except (ValueError, OverflowError): + # Not a number, return as-is + return val_str + + return series.apply(normalize_value) + df_base["row_fingerprint"] = ( df_base[CONTENT_COLS] - .fillna("") - .astype(str) + .apply(normalize_series, 
axis=0) .apply(lambda row: "|".join(row), axis=1) ) diff --git a/src/get_latest_planequery_aircraft_release.py b/src/get_latest_planequery_aircraft_release.py index 27f6941..7c4d4d9 100644 --- a/src/get_latest_planequery_aircraft_release.py +++ b/src/get_latest_planequery_aircraft_release.py @@ -126,12 +126,13 @@ def download_latest_aircraft_csv( return saved_to def get_latest_aircraft_csv_df(): - csv_path = download_latest_aircraft_csv() + # csv_path = download_latest_aircraft_csv() + csv_path = '/Users/jonahgoode/Documents/PlaneQuery/Code/planequery-aircraft/data/planequery_aircraft/planequery_aircraft_2023-08-16_2026-01-31.csv' import pandas as pd df = pd.read_csv(csv_path, dtype={'transponder_code': str, 'unique_regulatory_id': str, 'registrant_county': str}) - + df = df.fillna("") # Extract date from filename pattern: planequery_aircraft_{date}_{date}.csv match = re.search(r"planequery_aircraft_(\d{4}-\d{2}-\d{2})_", str(csv_path)) if not match: