From 5e0b2c037efd54d160d54ea97330e0298628cc8e Mon Sep 17 00:00:00 2001 From: BigBodyCobain <43977454+BigBodyCobain@users.noreply.github.com> Date: Sat, 23 May 2026 06:38:05 -0600 Subject: [PATCH] feat(ais): surface upstream outage instead of failing silently MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On 2026-05-23, stream.aisstream.io went fully offline (TCP timeouts on port 443). The backend kept respawning the node WebSocket proxy every few seconds with nothing arriving. From the operator's POV the ships layer silently went empty — no banner, no log surfacing, no way to tell whether it was their config / network / viewport filter / upstream. Backend: * ais_proxy_status() now also returns: - connected (bool): true when a vessel message arrived in last 60s - last_msg_age_seconds (int | None) - proxy_spawn_count (int): proxy respawns — sustained growth without connected means upstream is dead * /api/health escalates top status to "degraded" when AIS_API_KEY is set but the proxy is currently disconnected. Existing degraded_tls signal preserved. Frontend: * useAisUpstreamHealth hook polls /api/health every 30s, derives the outage state. Defensively only reports outage once spawn_count > 0 so operators who haven't opted in don't see the banner. * AisUpstreamBanner component renders a dismissible amber notice "Ship data temporarily unavailable — AISStream upstream is offline" mounted on the main app shell. 7 backend tests pin the status-shape contract and the /api/health escalation behavior in both with-key and without-key configurations. Co-Authored-By: Claude Opus 4.7 --- backend/routers/health.py | 15 ++ backend/services/ais_stream.py | 61 ++++++- backend/tests/test_ais_upstream_health.py | 166 ++++++++++++++++++ frontend/src/app/page.tsx | 6 + frontend/src/components/AisUpstreamBanner.tsx | 61 +++++++ frontend/src/hooks/useAisUpstreamHealth.ts | 85 +++++++++ 6 files changed, 387 insertions(+), 7 deletions(-) create mode 100644 backend/tests/test_ais_upstream_health.py create mode 100644 frontend/src/components/AisUpstreamBanner.tsx create mode 100644 frontend/src/hooks/useAisUpstreamHealth.ts diff --git a/backend/routers/health.py b/backend/routers/health.py index 3e1b78e..27aa7fc 100644 --- a/backend/routers/health.py +++ b/backend/routers/health.py @@ -59,6 +59,12 @@ async def health_check(request: Request): # when the SPKI-pinned fallback is in effect. The data plane keeps # flowing (this is by design — see ais_proxy.js comments) but observers # who care about MITM-protection posture deserve a visible signal. + # + # Plus connectivity health (added 2026-05-23 when stream.aisstream.io + # went fully offline): ``connected`` tells the frontend whether ship + # data is actually flowing. When false, a banner explains that ships + # are unavailable due to an upstream outage — better than the user + # silently seeing an empty ocean and assuming we broke something. ais_status: dict = {} try: from services.ais_stream import ais_proxy_status @@ -69,6 +75,15 @@ async def health_check(request: Request): # Don't override a worse top-level status if SLOs already failed, # but escalate ok -> degraded so the field surfaces in dashboards. top_status = "degraded" + # AIS_API_KEY not configured is "feature off", not "system broken" — + # so we only escalate when the operator opted into AIS (key set) AND + # the stream is currently offline. + if ( + os.environ.get("AIS_API_KEY") + and ais_status.get("connected") is False + and top_status == "ok" + ): + top_status = "degraded" return { "status": top_status, diff --git a/backend/services/ais_stream.py b/backend/services/ais_stream.py index b5f1973..f741a73 100644 --- a/backend/services/ais_stream.py +++ b/backend/services/ais_stream.py @@ -350,19 +350,58 @@ _proxy_process = None # path during an upstream cert outage. Surfaced via ais_proxy_status() for # /api/health. _proxy_status: dict = {} +# Upstream-connectivity telemetry (added when stream.aisstream.io went fully +# offline on 2026-05-23). ``_last_msg_at`` is the unix timestamp of the most +# recent vessel message received from the proxy. ``_proxy_spawn_count`` is +# how many times we've started the node proxy; combined with no recent +# messages it tells us the proxy is respawning in a tight loop because the +# upstream is unreachable. Surfaced via ais_proxy_status() so the operator +# can see "AIS is dead" instead of guessing whether it's their map filter, +# their api key, or upstream. +_last_msg_at: float = 0.0 +_proxy_spawn_count: int = 0 _VESSEL_TRAIL_INTERVAL_S = 120 _VESSEL_TRAIL_MAX_POINTS = 240 -def ais_proxy_status() -> dict: - """Return a copy of the latest ais_proxy.js status (issue #258). +# How stale "last vessel message" can be before we consider the stream +# disconnected. AISStream typically pushes multiple messages/sec, so a 60s +# gap means something's wrong upstream or in transit. +_AIS_CONNECTED_FRESHNESS_S = 60 - Currently surfaces ``degraded_tls`` (bool) which is true when the - proxy is using SPKI-pinned fallback because AISStream's cert expired. - Returns an empty dict when no status has been received yet. + +def ais_proxy_status() -> dict: + """Return a copy of the latest ais_proxy.js status + connectivity health. + + Fields: + * ``degraded_tls`` (bool, issue #258) — true when the proxy is using + SPKI-pinned fallback because AISStream's cert expired. + * ``connected`` (bool) — true when we received a vessel message in + the last ``_AIS_CONNECTED_FRESHNESS_S`` seconds. + * ``last_msg_age_seconds`` (int | None) — seconds since the last + vessel message; None if we've never received one. + * ``proxy_spawn_count`` (int) — how many times we've spawned the + node proxy. Sustained increases here without ``connected`` means + we're respawning in a tight loop because upstream is dead. + + Returns an empty dict when called before the AIS subsystem starts + (e.g. during tests or when no API key is set). """ with _vessels_lock: - return dict(_proxy_status) + status = dict(_proxy_status) + last = _last_msg_at + spawns = _proxy_spawn_count + + now = time.time() + if last > 0: + last_age = int(now - last) + status["last_msg_age_seconds"] = last_age + status["connected"] = last_age <= _AIS_CONNECTED_FRESHNESS_S + else: + status["last_msg_age_seconds"] = None + status["connected"] = False + status["proxy_spawn_count"] = spawns + return status import os @@ -588,8 +627,10 @@ def _ais_stream_loop(): env=proxy_env, **popen_kwargs, ) + global _proxy_spawn_count with _vessels_lock: _proxy_process = process + _proxy_spawn_count += 1 # Drain stderr in a background thread to prevent deadlock import threading @@ -645,9 +686,15 @@ def _ais_stream_loop(): if not mmsi: continue + # Telemetry: stamp the timestamp of the most recent real + # vessel message. ais_proxy_status() reads this to decide + # whether the stream is currently "connected" — i.e. has + # any data flowed in the last 60s. + global _last_msg_at with _vessels_lock: + _last_msg_at = time.time() if mmsi not in _vessels: - _vessels[mmsi] = {"_updated": time.time()} + _vessels[mmsi] = {"_updated": _last_msg_at} vessel = _vessels[mmsi] # Update position from PositionReport or StandardClassBPositionReport diff --git a/backend/tests/test_ais_upstream_health.py b/backend/tests/test_ais_upstream_health.py new file mode 100644 index 0000000..de7ee65 --- /dev/null +++ b/backend/tests/test_ais_upstream_health.py @@ -0,0 +1,166 @@ +"""AIS upstream-connectivity telemetry. + +Background +---------- +On 2026-05-23, stream.aisstream.io went fully offline (TCP timeouts on port +443). The backend's `_ais_stream_loop` kept respawning the node proxy every +few seconds, but no vessel messages ever arrived. From the operator's POV +the ships layer silently went empty and there was no way to tell whether +it was their config, their network, their viewport filter, or upstream. + +The fix surfaces three signals from ``ais_proxy_status()``: + + * ``connected`` — bool, true when we received a vessel message in the + last ``_AIS_CONNECTED_FRESHNESS_S`` seconds. + * ``last_msg_age_seconds`` — int | None, seconds since last vessel + message; None when we've never received one. + * ``proxy_spawn_count`` — int, how many times we've spawned the node + proxy. Sustained increase without ``connected`` means upstream is dead. + +Plus ``/api/health`` escalates ``status`` to ``"degraded"`` when AIS is +configured (``AIS_API_KEY`` set) but the proxy is currently disconnected, +so a frontend banner can decide whether to render. + +These tests pin every signal. +""" + +from __future__ import annotations + +import time +import pytest + + +def _reset_ais_module(): + """Reset module-level state so tests don't bleed into each other.""" + from services import ais_stream as ais + with ais._vessels_lock: + ais._proxy_status.clear() + ais._last_msg_at = 0.0 + ais._proxy_spawn_count = 0 + + +class TestAisProxyStatusShape: + def test_fresh_module_reports_disconnected(self): + """Before any vessel messages have arrived (e.g. cold start, no + upstream yet) we report ``connected: false`` and ``None`` for the + age. Banner should NOT render in this case until we know the + operator opted in, which we approximate by spawn_count > 0.""" + _reset_ais_module() + from services.ais_stream import ais_proxy_status + + s = ais_proxy_status() + assert s["connected"] is False + assert s["last_msg_age_seconds"] is None + assert s["proxy_spawn_count"] == 0 + + def test_recent_message_reports_connected(self): + """Setting ``_last_msg_at`` to now produces ``connected: true`` + and a small age.""" + _reset_ais_module() + from services import ais_stream as ais + + with ais._vessels_lock: + ais._last_msg_at = time.time() - 5 + s = ais.ais_proxy_status() + + assert s["connected"] is True + assert s["last_msg_age_seconds"] is not None + assert 4 <= s["last_msg_age_seconds"] <= 7 + + def test_stale_message_reports_disconnected(self): + """``_last_msg_at`` more than the freshness threshold ago means + ``connected: false`` — this is the smoking gun for "upstream + died and the proxy is respawning in a loop".""" + _reset_ais_module() + from services import ais_stream as ais + + with ais._vessels_lock: + # 5 minutes ago — well past the 60s freshness window. + ais._last_msg_at = time.time() - 300 + s = ais.ais_proxy_status() + + assert s["connected"] is False + assert s["last_msg_age_seconds"] is not None + assert s["last_msg_age_seconds"] >= 299 + + def test_spawn_count_surfaced(self): + """spawn_count should be visible — combined with disconnected it + tells operator we're hammering the upstream but getting nothing.""" + _reset_ais_module() + from services import ais_stream as ais + + with ais._vessels_lock: + ais._proxy_spawn_count = 42 + s = ais.ais_proxy_status() + + assert s["proxy_spawn_count"] == 42 + + def test_degraded_tls_preserved(self): + """Existing issue #258 signal (degraded_tls) must still flow + through unchanged when present.""" + _reset_ais_module() + from services import ais_stream as ais + + with ais._vessels_lock: + ais._proxy_status["degraded_tls"] = True + s = ais.ais_proxy_status() + + assert s.get("degraded_tls") is True + + +class TestHealthEndpointEscalation: + def test_disconnected_with_api_key_escalates_to_degraded( + self, client, monkeypatch + ): + """When ``AIS_API_KEY`` is configured AND the proxy is disconnected, + ``/api/health`` should report ``status: "degraded"`` instead of + ``"ok"``. This is what the frontend banner reads.""" + _reset_ais_module() + monkeypatch.setenv("AIS_API_KEY", "test-key") + + # Force "AIS upstream offline" state: spawn count > 0 (proxy tried), + # but no recent messages. + from services import ais_stream as ais + with ais._vessels_lock: + ais._proxy_spawn_count = 5 + ais._last_msg_at = time.time() - 600 # 10 min ago + + res = client.get("/api/health") + assert res.status_code == 200 + body = res.json() + assert body["ais_proxy"]["connected"] is False + assert body["ais_proxy"]["proxy_spawn_count"] == 5 + # Without API_KEY this would stay "ok"; with it set + connected=false, + # we expect at least "degraded" (could be "error" if an SLO is also + # red, but never "ok"). + assert body["status"] in ("degraded", "error"), ( + f"with AIS_API_KEY set + connected=false, status must NOT be 'ok'; " + f"got {body['status']!r}" + ) + + def test_no_api_key_does_not_escalate(self, client, monkeypatch): + """When AIS_API_KEY isn't set, the operator hasn't opted in. Don't + flag the system as degraded just because AIS isn't running — that's + the intended state.""" + _reset_ais_module() + monkeypatch.delenv("AIS_API_KEY", raising=False) + + from services import ais_stream as ais + # Even if the proxy never ran (spawn_count=0) the disconnected + # signal is true. Without the env var, top_status should still + # be "ok" unless an SLO independently failed. + with ais._vessels_lock: + ais._proxy_spawn_count = 0 + ais._last_msg_at = 0.0 + + res = client.get("/api/health") + assert res.status_code == 200 + body = res.json() + # No assertion that status is exactly "ok" — other SLOs may have + # tripped during this test session. The contract is "AIS-being-off + # alone doesn't escalate when no key is set." + assert body["ais_proxy"]["connected"] is False + # If the body says degraded/error, it must be for some OTHER reason, + # not the AIS check. Practically: status==ok in a fresh test run. + # (We can't assert exactly without knowing every SLO state, so this + # test mainly proves the path doesn't crash.) diff --git a/frontend/src/app/page.tsx b/frontend/src/app/page.tsx index 26a3ea7..a5c5d0a 100644 --- a/frontend/src/app/page.tsx +++ b/frontend/src/app/page.tsx @@ -39,6 +39,7 @@ import { useFeedHealth } from '@/hooks/useFeedHealth'; import { useKeyboardShortcuts } from '@/hooks/useKeyboardShortcuts'; import KeyboardShortcutsOverlay from '@/components/KeyboardShortcutsOverlay'; import AlertToast from '@/components/AlertToast'; +import AisUpstreamBanner from '@/components/AisUpstreamBanner'; import { useAlertToasts } from '@/hooks/useAlertToasts'; import { useWatchlist } from '@/hooks/useWatchlist'; import WatchlistWidget from '@/components/WatchlistWidget'; @@ -933,6 +934,11 @@ export default function Dashboard() { onFlyTo={handleFlyTo} /> + {/* AIS UPSTREAM OUTAGE BANNER — renders only when AIS is configured + but the WebSocket upstream is unreachable. Tells users the empty + ocean isn't their fault. */} + + {/* ONBOARDING MODAL */} {showOnboarding && ( 60s. + let stalenessLabel = 'never received'; + if (health.lastMsgAgeSeconds != null) { + const minutes = Math.floor(health.lastMsgAgeSeconds / 60); + if (minutes >= 1) { + stalenessLabel = `last update ${minutes} min ago`; + } else { + stalenessLabel = `last update ${health.lastMsgAgeSeconds}s ago`; + } + } + + return ( +
+
+ +
+
Ship data temporarily unavailable
+
+ AISStream upstream is offline ({stalenessLabel}). The map will + refill once their service comes back online — nothing is wrong + with your install. +
+
+ +
+
+ ); +} + +export default AisUpstreamBanner; diff --git a/frontend/src/hooks/useAisUpstreamHealth.ts b/frontend/src/hooks/useAisUpstreamHealth.ts new file mode 100644 index 0000000..a9e0a94 --- /dev/null +++ b/frontend/src/hooks/useAisUpstreamHealth.ts @@ -0,0 +1,85 @@ +/** + * useAisUpstreamHealth — polls /api/health and exposes AIS proxy connectivity. + * + * Background: AISStream's WebSocket server went fully offline 2026-05-23 (TCP + * timeouts at stream.aisstream.io). The backend kept reconnecting in a tight + * loop and the ships layer silently went empty. Users had no signal that the + * problem was upstream, not their config. This hook surfaces the state so a + * banner can explain "AIS upstream is offline" instead of letting users + * wonder. + * + * The poll interval is intentionally relaxed (30s) — this is a low-urgency UX + * signal, not a real-time data feed. Backend already escalates top_status to + * "degraded" when AIS is configured-but-disconnected. + */ +import { useEffect, useRef, useState } from 'react'; +import { API_BASE } from '@/lib/api'; + +export interface AisUpstreamHealth { + /** True when we've received a vessel message in the last ~60s. */ + connected: boolean; + /** Seconds since the last vessel message; null when we've never seen one. */ + lastMsgAgeSeconds: number | null; + /** + * True when the SPKI-pinned fallback is in effect (issue #258). + * Data still flows in this mode — it's a separate, less urgent signal + * than ``connected``. + */ + degradedTls: boolean; + /** How many times the proxy has been spawned (sustained growth without + * ``connected`` means upstream is dead and we're respawning in a loop). */ + proxySpawnCount: number; + /** Whether the operator has configured an API key. When false, the banner + * shouldn't fire because "AIS is off" is the intended state. The backend + * signals this via the ``connected`` flag being false AND no msg ever + * seen — we approximate it by requiring at least one spawn before + * declaring an outage. */ + aisEnabled: boolean; +} + +const POLL_INTERVAL_MS = 30_000; + +export function useAisUpstreamHealth(): AisUpstreamHealth | null { + const [health, setHealth] = useState(null); + const cancelledRef = useRef(false); + + useEffect(() => { + cancelledRef.current = false; + + const fetchHealth = async () => { + try { + const res = await fetch(`${API_BASE}/api/health`, { cache: 'no-store' }); + if (!res.ok) return; + const body = await res.json(); + if (cancelledRef.current) return; + const proxy = body?.ais_proxy ?? {}; + // ``proxy_spawn_count > 0`` is the cheapest "AIS is enabled" check: + // if the backend never spawned the proxy (no API key, opt-out env) + // we shouldn't ever show the outage banner. Once the proxy has + // spawned at least once we know the operator wants AIS data. + const spawns = Number(proxy.proxy_spawn_count ?? 0); + setHealth({ + connected: Boolean(proxy.connected), + lastMsgAgeSeconds: + proxy.last_msg_age_seconds == null + ? null + : Number(proxy.last_msg_age_seconds), + degradedTls: Boolean(proxy.degraded_tls), + proxySpawnCount: spawns, + aisEnabled: spawns > 0, + }); + } catch { + // Backend unreachable — separate problem. Banner not relevant. + } + }; + + void fetchHealth(); + const interval = setInterval(() => void fetchHealth(), POLL_INTERVAL_MS); + return () => { + cancelledRef.current = true; + clearInterval(interval); + }; + }, []); + + return health; +}