mirror of
https://github.com/PlaneQuery/OpenAirframes.git
synced 2026-06-07 13:53:52 +02:00
Fix Community Submission export. Fix CSV concatenation logic to prevent duplicates when there is no new ADSB.lol data.
This commit is contained in:
@@ -37,14 +37,31 @@ def main():
|
|||||||
if args.concat_with_latest_csv:
|
if args.concat_with_latest_csv:
|
||||||
print("Loading latest CSV from GitHub releases to concatenate with...")
|
print("Loading latest CSV from GitHub releases to concatenate with...")
|
||||||
from src.get_latest_release import get_latest_aircraft_adsb_csv_df
|
from src.get_latest_release import get_latest_aircraft_adsb_csv_df
|
||||||
df_latest_csv, csv_date = get_latest_aircraft_adsb_csv_df()
|
from datetime import datetime
|
||||||
# Ensure column order matches before concatenating
|
|
||||||
df_latest_csv = df_latest_csv.select(CORRECT_ORDER_OF_COLUMNS)
|
df_latest_csv, csv_start_date, csv_end_date = get_latest_aircraft_adsb_csv_df()
|
||||||
from src.adsb.compress_adsb_to_aircraft_data import concat_compressed_dfs
|
|
||||||
df_final = concat_compressed_dfs(df_latest_csv, df)
|
# Compare dates: end_date is exclusive, so if csv_end_date > args.date,
|
||||||
df_final = df_final.select(CORRECT_ORDER_OF_COLUMNS)
|
# the latest CSV already includes this day's data
|
||||||
final_csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{csv_date}_{args.date}.csv.gz"
|
csv_end_dt = datetime.strptime(csv_end_date, "%Y-%m-%d")
|
||||||
df_final.write_csv(final_csv_output_path, compression="gzip")
|
args_dt = datetime.strptime(args.date, "%Y-%m-%d")
|
||||||
|
|
||||||
|
if csv_end_dt >= args_dt:
|
||||||
|
print(f"Latest CSV already includes data through {args.date} (end_date={csv_end_date} is exclusive)")
|
||||||
|
print("Writing latest CSV directly without concatenation to avoid duplicates")
|
||||||
|
final_csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{csv_start_date}_{csv_end_date}.csv.gz"
|
||||||
|
df_latest_csv = df_latest_csv.select(CORRECT_ORDER_OF_COLUMNS)
|
||||||
|
df_latest_csv.write_csv(final_csv_output_path, compression="gzip")
|
||||||
|
else:
|
||||||
|
print(f"Concatenating latest CSV (through {csv_end_date}) with new data ({args.date})")
|
||||||
|
# Ensure column order matches before concatenating
|
||||||
|
df_latest_csv = df_latest_csv.select(CORRECT_ORDER_OF_COLUMNS)
|
||||||
|
from src.adsb.compress_adsb_to_aircraft_data import concat_compressed_dfs
|
||||||
|
df_final = concat_compressed_dfs(df_latest_csv, df)
|
||||||
|
df_final = df_final.select(CORRECT_ORDER_OF_COLUMNS)
|
||||||
|
final_csv_output_path = OUTPUT_DIR / f"openairframes_adsb_{csv_start_date}_{args.date}.csv.gz"
|
||||||
|
df_final.write_csv(final_csv_output_path, compression="gzip")
|
||||||
|
print(f"Final CSV written to {final_csv_output_path}")
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
@@ -24,7 +24,7 @@ def read_all_submissions(community_dir: Path) -> list[dict]:
|
|||||||
"""Read all JSON submissions from the community directory."""
|
"""Read all JSON submissions from the community directory."""
|
||||||
all_submissions = []
|
all_submissions = []
|
||||||
|
|
||||||
for json_file in sorted(community_dir.glob("*.json")):
|
for json_file in sorted(community_dir.glob("**/*.json")):
|
||||||
try:
|
try:
|
||||||
with open(json_file) as f:
|
with open(json_file) as f:
|
||||||
data = json.load(f)
|
data = json.load(f)
|
||||||
|
|||||||
@@ -207,7 +207,11 @@ def download_latest_aircraft_adsb_csv(
|
|||||||
|
|
||||||
import polars as pl
|
import polars as pl
|
||||||
def get_latest_aircraft_adsb_csv_df():
|
def get_latest_aircraft_adsb_csv_df():
|
||||||
"""Download and load the latest ADS-B CSV from GitHub releases."""
|
"""Download and load the latest ADS-B CSV from GitHub releases.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
tuple: (df, start_date, end_date) where dates are in YYYY-MM-DD format
|
||||||
|
"""
|
||||||
import re
|
import re
|
||||||
|
|
||||||
csv_path = download_latest_aircraft_adsb_csv()
|
csv_path = download_latest_aircraft_adsb_csv()
|
||||||
@@ -231,15 +235,16 @@ def get_latest_aircraft_adsb_csv_df():
|
|||||||
if df[col].dtype == pl.Utf8:
|
if df[col].dtype == pl.Utf8:
|
||||||
df = df.with_columns(pl.col(col).fill_null(""))
|
df = df.with_columns(pl.col(col).fill_null(""))
|
||||||
|
|
||||||
# Extract start date from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv[.gz]
|
# Extract start and end dates from filename pattern: openairframes_adsb_{start_date}_{end_date}.csv[.gz]
|
||||||
match = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_", str(csv_path))
|
match = re.search(r"openairframes_adsb_(\d{4}-\d{2}-\d{2})_(\d{4}-\d{2}-\d{2})\.csv", str(csv_path))
|
||||||
if not match:
|
if not match:
|
||||||
raise ValueError(f"Could not extract date from filename: {csv_path.name}")
|
raise ValueError(f"Could not extract dates from filename: {csv_path.name}")
|
||||||
|
|
||||||
date_str = match.group(1)
|
start_date = match.group(1)
|
||||||
|
end_date = match.group(2)
|
||||||
print(df.columns)
|
print(df.columns)
|
||||||
print(df.dtypes)
|
print(df.dtypes)
|
||||||
return df, date_str
|
return df, start_date, end_date
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user