works

2026-04-23 11:36:35 +02:00 · 2026-02-17 15:46:07 -05:00
parent 1afe2bed4e
commit 70ec797535
1 changed files with 27 additions and 77 deletions
@@ -1,11 +1,9 @@
 """
-Processes trace files from pre-extracted directory for a single day.
+Processes trace files from a single archive part for a single day.
 This is the map phase of the map-reduce pipeline.

-Expects extract_dir to already exist with trace files.
-
 Usage:
-    python -m src.adsb.process_icao_chunk --chunk-id 0 --date 2026-01-01
+    python -m src.adsb.process_icao_chunk --part-id 1 --date 2026-01-01
 """
 import gc
 import os
@@ -23,7 +21,6 @@ import pyarrow.parquet as pq

 from src.adsb.download_adsb_data_to_parquet import (
    OUTPUT_DIR,
-    PARQUET_DIR,
    PARQUET_SCHEMA,
    COLUMNS,
    MAX_WORKERS,
@@ -73,23 +70,13 @@ def rows_to_table(rows: list) -> pa.Table:


 def process_chunk(
-    trace_map: dict[str, str] | dict[str, list[str]],
-    chunk_id: int,
-    output_id: str,
+    trace_files: list[str],
+    part_id: int,
+    date_str: str,
 ) -> str | None:
-    """Process trace files and write to a single parquet file.
+    """Process trace files and write to a single parquet file."""
    
-    Args:
-        trace_map: Map of ICAO -> trace file path (str) or list of trace file paths (list[str])
-        chunk_id: This chunk's ID (0-indexed)
-        output_id: Identifier for output file (date or date range)
-    """
-    
-    # Get trace file paths from the map
-    trace_files = list(trace_map.values())
-    
-    # Single output file
-    output_path = os.path.join(CHUNK_OUTPUT_DIR, f"chunk_{chunk_id}_{output_id}.parquet")
+    output_path = os.path.join(CHUNK_OUTPUT_DIR, f"part_{part_id}_{date_str}.parquet")
    
    start_time = time.perf_counter()
    total_rows = 0
@@ -97,10 +84,8 @@ def process_chunk(
    writer = None
    
    try:
-        # Open writer once at the start
        writer = pq.ParquetWriter(output_path, PARQUET_SCHEMA, compression='snappy')
        
-        # Process files in batches
        files_per_batch = MAX_WORKERS * 100
        for offset in range(0, len(trace_files), files_per_batch):
            batch_files = trace_files[offset:offset + files_per_batch]
@@ -110,85 +95,50 @@ def process_chunk(
                    if rows:
                        batch_rows.extend(rows)
                        
-                        # Write when batch is full
                        if len(batch_rows) >= BATCH_SIZE:
-                            table = rows_to_table(batch_rows)
-                            writer.write_table(table)
+                            writer.write_table(rows_to_table(batch_rows))
                            total_rows += len(batch_rows)
-                            
                            batch_rows = []
-                            del table
                            gc.collect()
                            
-                            elapsed = time.perf_counter() - start_time
-                            print(f"Chunk {chunk_id}: {total_rows} rows, {elapsed:.1f}s | {get_resource_usage()}")
+                            print(f"Part {part_id}: {total_rows} rows, {time.perf_counter() - start_time:.1f}s | {get_resource_usage()}")
            
            gc.collect()
        
-        # Write remaining rows
        if batch_rows:
-            table = rows_to_table(batch_rows)
-            writer.write_table(table)
+            writer.write_table(rows_to_table(batch_rows))
            total_rows += len(batch_rows)
-            del table
    
    finally:
        if writer:
            writer.close()
    
-    elapsed = time.perf_counter() - start_time
-    print(f"Chunk {chunk_id}: Done! {total_rows} rows in {elapsed:.1f}s | {get_resource_usage()}")
+    print(f"Part {part_id}: Done! {total_rows} rows in {time.perf_counter() - start_time:.1f}s | {get_resource_usage()}")
    
-    if total_rows > 0:
-        return output_path
-    return None
-
-
-def process_single_day(
-    chunk_id: int,
-    target_day: datetime,
-) -> str | None:
-    """Process a single day for this chunk."""
-    date_str = target_day.strftime("%Y-%m-%d")
-    archive_dir = os.path.join(OUTPUT_DIR, "adsb_archives", date_str)
-    
-    archive_files = sorted([
-        os.path.join(archive_dir, f)
-        for f in os.listdir(archive_dir)
-        if f.startswith(f"{date_str}_part_") and f.endswith(".tar.gz")
-    ])
-    
-    print(f"Processing {len(archive_files)} archive files")
-    
-    all_trace_files = []
-    for archive_path in archive_files:
-        trace_map = build_trace_file_map(archive_path)
-        all_trace_files.extend(trace_map.values())
-    
-    print(f"Total trace files: {len(all_trace_files)}")
-    
-    # Convert list to dict for process_chunk compatibility
-    trace_map = {str(i): path for i, path in enumerate(all_trace_files)}
-    
-    return process_chunk(trace_map, chunk_id, date_str)
+    return output_path if total_rows > 0 else None


 def main():
-    parser = argparse.ArgumentParser(description="Process a chunk of ICAOs for a single day")
-    parser.add_argument("--chunk-id", type=int, required=True, help="Chunk ID (0-indexed)")
+    parser = argparse.ArgumentParser(description="Process a single archive part for a day")
+    parser.add_argument("--part-id", type=int, required=True, help="Part ID (1-indexed)")
    parser.add_argument("--date", type=str, required=True, help="Date in YYYY-MM-DD format")
    args = parser.parse_args()
    
-    print(f"Processing chunk {args.chunk_id} for {args.date}")
-    print(f"Resource usage: {get_resource_usage()}")
+    print(f"Processing part {args.part_id} for {args.date}")
    
-    target_day = datetime.strptime(args.date, "%Y-%m-%d")
-    output_path = process_single_day(args.chunk_id, target_day)
+    # Get specific archive file for this part
+    archive_path = os.path.join(OUTPUT_DIR, "adsb_archives", args.date, f"{args.date}_part_{args.part_id}.tar.gz")
    
-    if output_path:
-        print(f"Output: {output_path}")
-    else:
-        print("No output generated")
+    # Extract and collect trace files
+    trace_map = build_trace_file_map(archive_path)
+    all_trace_files = list(trace_map.values())
+    
+    print(f"Total trace files: {len(all_trace_files)}")
+    
+    # Process and write output
+    output_path = process_chunk(all_trace_files, args.part_id, args.date)
+    
+    print(f"Output: {output_path}" if output_path else "No output generated")


 if __name__ == "__main__":