mirror of
https://github.com/PlaneQuery/OpenAirframes.git
synced 2026-06-08 22:23:56 +02:00
handle duplicates much better
This commit is contained in:
@@ -39,7 +39,7 @@ jobs:
|
|||||||
id: meta
|
id: meta
|
||||||
run: |
|
run: |
|
||||||
DATE=$(date -u +"%Y-%m-%d")
|
DATE=$(date -u +"%Y-%m-%d")
|
||||||
TAG="faa-${DATE}"
|
TAG="planequery-aircraft-${DATE}"
|
||||||
# Find the CSV file in data/planequery_aircraft matching the pattern
|
# Find the CSV file in data/planequery_aircraft matching the pattern
|
||||||
CSV_FILE=$(ls data/planequery_aircraft/planequery_aircraft_*_${DATE}.csv | head -1)
|
CSV_FILE=$(ls data/planequery_aircraft/planequery_aircraft_*_${DATE}.csv | head -1)
|
||||||
CSV_BASENAME=$(basename "$CSV_FILE")
|
CSV_BASENAME=$(basename "$CSV_FILE")
|
||||||
|
|||||||
@@ -7,20 +7,20 @@ out_dir.mkdir(parents=True, exist_ok=True)
|
|||||||
zip_name = f"ReleasableAircraft_{date_str}.zip"
|
zip_name = f"ReleasableAircraft_{date_str}.zip"
|
||||||
|
|
||||||
zip_path = out_dir / zip_name
|
zip_path = out_dir / zip_name
|
||||||
|
if not zip_path.exists():
|
||||||
|
# URL and paths
|
||||||
|
url = "https://registry.faa.gov/database/ReleasableAircraft.zip"
|
||||||
|
from urllib.request import Request, urlopen
|
||||||
|
|
||||||
# URL and paths
|
req = Request(
|
||||||
url = "https://registry.faa.gov/database/ReleasableAircraft.zip"
|
url,
|
||||||
from urllib.request import Request, urlopen
|
headers={"User-Agent": "Mozilla/5.0"},
|
||||||
|
method="GET",
|
||||||
|
)
|
||||||
|
|
||||||
req = Request(
|
with urlopen(req, timeout=120) as r:
|
||||||
url,
|
body = r.read()
|
||||||
headers={"User-Agent": "Mozilla/5.0"},
|
zip_path.write_bytes(body)
|
||||||
method="GET",
|
|
||||||
)
|
|
||||||
|
|
||||||
with urlopen(req, timeout=120) as r:
|
|
||||||
body = r.read()
|
|
||||||
zip_path.write_bytes(body)
|
|
||||||
|
|
||||||
OUT_ROOT = Path("data/planequery_aircraft")
|
OUT_ROOT = Path("data/planequery_aircraft")
|
||||||
OUT_ROOT.mkdir(parents=True, exist_ok=True)
|
OUT_ROOT.mkdir(parents=True, exist_ok=True)
|
||||||
|
|||||||
@@ -66,7 +66,7 @@ def normalize(s: pd.Series) -> pd.Series:
|
|||||||
|
|
||||||
|
|
||||||
def concat_faa_historical_df(df_base, df_new):
|
def concat_faa_historical_df(df_base, df_new):
|
||||||
|
df_new = df_new[df_base.columns]
|
||||||
df_base = pd.concat([df_base, df_new], ignore_index=True)
|
df_base = pd.concat([df_base, df_new], ignore_index=True)
|
||||||
|
|
||||||
CONTENT_COLS = [
|
CONTENT_COLS = [
|
||||||
@@ -74,10 +74,49 @@ def concat_faa_historical_df(df_base, df_new):
|
|||||||
if c not in {"download_date"}
|
if c not in {"download_date"}
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Normalize values to handle numeric type, formatting, and list ordering differences
|
||||||
|
def normalize_series(series):
|
||||||
|
def normalize_value(val):
|
||||||
|
# Handle lists (sort them for consistent comparison)
|
||||||
|
if isinstance(val, list):
|
||||||
|
return "|".join(sorted(str(v) for v in val))
|
||||||
|
|
||||||
|
# Convert to string
|
||||||
|
val_str = str(val).strip()
|
||||||
|
|
||||||
|
# Handle empty strings
|
||||||
|
if val_str == "" or val_str == "nan":
|
||||||
|
return ""
|
||||||
|
|
||||||
|
# Check if it looks like a list representation (starts with [ )
|
||||||
|
if val_str.startswith('[') and val_str.endswith(']'):
|
||||||
|
try:
|
||||||
|
# Try to parse as a list-like string
|
||||||
|
import ast
|
||||||
|
parsed = ast.literal_eval(val_str)
|
||||||
|
if isinstance(parsed, list):
|
||||||
|
return "|".join(sorted(str(v) for v in parsed))
|
||||||
|
except (ValueError, SyntaxError):
|
||||||
|
pass # Not a valid list, continue to other checks
|
||||||
|
|
||||||
|
# Try to normalize as number
|
||||||
|
try:
|
||||||
|
# Remove leading zeros and convert float/int representations
|
||||||
|
num_val = float(val_str)
|
||||||
|
# If it's a whole number, return as int string (no .0)
|
||||||
|
if num_val == int(num_val):
|
||||||
|
return str(int(num_val))
|
||||||
|
# Otherwise return as float
|
||||||
|
return str(num_val)
|
||||||
|
except (ValueError, OverflowError):
|
||||||
|
# Not a number, return as-is
|
||||||
|
return val_str
|
||||||
|
|
||||||
|
return series.apply(normalize_value)
|
||||||
|
|
||||||
df_base["row_fingerprint"] = (
|
df_base["row_fingerprint"] = (
|
||||||
df_base[CONTENT_COLS]
|
df_base[CONTENT_COLS]
|
||||||
.fillna("")
|
.apply(normalize_series, axis=0)
|
||||||
.astype(str)
|
|
||||||
.apply(lambda row: "|".join(row), axis=1)
|
.apply(lambda row: "|".join(row), axis=1)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -126,12 +126,13 @@ def download_latest_aircraft_csv(
|
|||||||
return saved_to
|
return saved_to
|
||||||
|
|
||||||
def get_latest_aircraft_csv_df():
|
def get_latest_aircraft_csv_df():
|
||||||
csv_path = download_latest_aircraft_csv()
|
# csv_path = download_latest_aircraft_csv()
|
||||||
|
csv_path = '/Users/jonahgoode/Documents/PlaneQuery/Code/planequery-aircraft/data/planequery_aircraft/planequery_aircraft_2023-08-16_2026-01-31.csv'
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
df = pd.read_csv(csv_path, dtype={'transponder_code': str,
|
df = pd.read_csv(csv_path, dtype={'transponder_code': str,
|
||||||
'unique_regulatory_id': str,
|
'unique_regulatory_id': str,
|
||||||
'registrant_county': str})
|
'registrant_county': str})
|
||||||
|
df = df.fillna("")
|
||||||
# Extract date from filename pattern: planequery_aircraft_{date}_{date}.csv
|
# Extract date from filename pattern: planequery_aircraft_{date}_{date}.csv
|
||||||
match = re.search(r"planequery_aircraft_(\d{4}-\d{2}-\d{2})_", str(csv_path))
|
match = re.search(r"planequery_aircraft_(\d{4}-\d{2}-\d{2})_", str(csv_path))
|
||||||
if not match:
|
if not match:
|
||||||
|
|||||||
Reference in New Issue
Block a user