From 6f50bd32b7345608b3d7ad192eb8d3528c2b705e Mon Sep 17 00:00:00 2001
From: AJ Collins <16407457+ajc133@users.noreply.github.com>
Date: Mon, 16 Dec 2024 08:42:58 -0800
Subject: [PATCH] Feat/segment node regions (#21)

* lint alpr_clusters.py

* feat: Segment nodes into 5x5 regions
---
 serverless/alpr_clusters/src/alpr_clusters.py | 119 +++++++++++++-----
 1 file changed, 85 insertions(+), 34 deletions(-)

diff --git a/serverless/alpr_clusters/src/alpr_clusters.py b/serverless/alpr_clusters/src/alpr_clusters.py
index f00b092..dc3015b 100644
--- a/serverless/alpr_clusters/src/alpr_clusters.py
+++ b/serverless/alpr_clusters/src/alpr_clusters.py
@@ -1,10 +1,14 @@
-import boto3
-import requests
 import json
-from sklearn.cluster import DBSCAN
-import numpy as np
+from collections import defaultdict
+from typing import Any
 
-def get_clusters():
+import boto3
+import numpy as np
+import requests
+from sklearn.cluster import DBSCAN
+
+
+def get_all_nodes():
     # Set up the Overpass API query
     query = """
     [out:json];
@@ -12,31 +16,40 @@
     out body;
     """
 
+    url = "http://overpass-api.de/api/interpreter"
+    response = requests.get(
+        url, params={"data": query}, headers={"User-Agent": "DeFlock/1.0"}
+    )
+    response.raise_for_status()
+    return response.json()["elements"]
+
+
+def get_clusters(nodes: list[Any]):
     # Request data from Overpass API
     print("Requesting data from Overpass API.")
-    url = "http://overpass-api.de/api/interpreter"
-    response = requests.get(url, params={'data': query}, headers={'User-Agent': 'DeFlock/1.0'})
-    data = response.json()
+    print("Data received. Parsing nodes...")
 
     # Parse nodes and extract lat/lon for clustering
     coordinates = []
     node_ids = []
-    for element in data['elements']:
-        if element['type'] == 'node':
-            coordinates.append([element['lat'], element['lon']])
-            node_ids.append(element['id'])
+    for element in nodes:
+        if element["type"] == "node":
+            coordinates.append([element["lat"], element["lon"]])
+            node_ids.append(element["id"])
 
     # Convert coordinates to NumPy array for DBSCAN
     coordinates = np.array(coordinates)
 
-    # Define the clustering radius (10 miles in meters)
+    # Define the clustering radius (50 miles in meters)
     radius_miles = 50
     radius_km = radius_miles * 1.60934  # 1 mile = 1.60934 km
     radius_in_radians = radius_km / 6371.0  # Earth's radius in km
 
     # Perform DBSCAN clustering
-    db = DBSCAN(eps=radius_in_radians, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coordinates))
+    db = DBSCAN(
+        eps=radius_in_radians, min_samples=1, algorithm="ball_tree", metric="haversine"
+    ).fit(np.radians(coordinates))
     labels = db.labels_
 
     # Prepare clusters and calculate centroids
     clusters = {}
@@ -45,34 +58,72 @@
         cluster_points = coordinates[labels == label]
         centroid = np.mean(cluster_points, axis=0)
         first_node_id = node_ids[labels.tolist().index(label)]
-        
+
         # Store in clusters dict with centroid and first node ID
-        clusters[label] = {
-            "lat": centroid[0],
-            "lon": centroid[1],
-            "id": first_node_id
-        }
+        clusters[label] = {"lat": centroid[0], "lon": centroid[1], "id": first_node_id}
 
     output = {"clusters": list(clusters.values())}
     print("Clustering complete.")
     return output
 
 
+def segment_regions(nodes: Any, tile_size_degrees: int) -> list[dict[str, Any]]:
+    tile_dict = defaultdict(list)
+    for node in nodes["elements"]:
+        lat, lon = node["lat"], node["lon"]
+        tile_lat = int(np.floor(lat / tile_size_degrees)) * tile_size_degrees
+        tile_lon = int(np.floor(lon / tile_size_degrees)) * tile_size_degrees
+        tile_dict[(tile_lat, tile_lon)].append(node)
+
+    tile_list = []
+    for region, nodes in tile_dict.items():
+        tile_list.append({"lat": region[0], "lon": region[1], "nodes": nodes})
+
+    return tile_list
+
+
 def lambda_handler(event, context):
-    alpr_clusters = get_clusters()
+    nodes = get_all_nodes()
+    alpr_clusters = get_clusters(nodes)
+    regions = segment_regions(nodes=nodes, tile_size_degrees=5)
 
-    s3 = boto3.client('s3')
-    bucket = 'deflock-clusters'
-    key = 'alpr_clusters.json'
+    s3 = boto3.client("s3")
+    bucket = "deflock-clusters"
+    key = "alpr_clusters.json"
 
-    s3.put_object(
-        Bucket=bucket,
-        Key=key,
-        Body=json.dumps(alpr_clusters),
-        ContentType='application/json'
-    )
+    s3.put_object(
+        Bucket=bucket,
+        Key=key,
+        Body=json.dumps(alpr_clusters),
+        ContentType="application/json",
+    )
 
-    return {
-        'statusCode': 200,
-        'body': 'Successfully fetched ALPR counts.',
-    }
+    for region in regions:
+        lat, lon = region["lat"], region["lon"]
+        s3.put_object(
+            Bucket=bucket,
+            Key=f"regions/{lat}/{lon}.json",
+            Body=json.dumps(region["nodes"]),
+            ContentType="application/json",
+        )
+
+    return {
+        "statusCode": 200,
+        "body": "Successfully fetched ALPR counts.",
+    }
+
+
+if __name__ == "__main__":
+    from pathlib import Path
+
+    nodes_file_path = Path("nodes.json")
+    if nodes_file_path.exists():
+        nodes = json.load(nodes_file_path.open())
+    else:
+        nodes = get_all_nodes()
+        with nodes_file_path.open("w") as f:
+            json.dump(nodes, f)
+
+    regions = segment_regions(nodes=nodes, tile_size_degrees=5)
+    with open("regions.json", "w") as f:
+        json.dump(regions, f)