diff --git a/backend/routers/health.py b/backend/routers/health.py index 3e1b78e..27aa7fc 100644 --- a/backend/routers/health.py +++ b/backend/routers/health.py @@ -59,6 +59,12 @@ async def health_check(request: Request): # when the SPKI-pinned fallback is in effect. The data plane keeps # flowing (this is by design — see ais_proxy.js comments) but observers # who care about MITM-protection posture deserve a visible signal. + # + # Plus connectivity health (added 2026-05-23 when stream.aisstream.io + # went fully offline): ``connected`` tells the frontend whether ship + # data is actually flowing. When false, a banner explains that ships + # are unavailable due to an upstream outage — better than the user + # silently seeing an empty ocean and assuming we broke something. ais_status: dict = {} try: from services.ais_stream import ais_proxy_status @@ -69,6 +75,15 @@ async def health_check(request: Request): # Don't override a worse top-level status if SLOs already failed, # but escalate ok -> degraded so the field surfaces in dashboards. top_status = "degraded" + # AIS_API_KEY not configured is "feature off", not "system broken" — + # so we only escalate when the operator opted into AIS (key set) AND + # the stream is currently offline. + if ( + os.environ.get("AIS_API_KEY") + and ais_status.get("connected") is False + and top_status == "ok" + ): + top_status = "degraded" return { "status": top_status, diff --git a/backend/services/ais_stream.py b/backend/services/ais_stream.py index b5f1973..f741a73 100644 --- a/backend/services/ais_stream.py +++ b/backend/services/ais_stream.py @@ -350,19 +350,58 @@ _proxy_process = None # path during an upstream cert outage. Surfaced via ais_proxy_status() for # /api/health. _proxy_status: dict = {} +# Upstream-connectivity telemetry (added when stream.aisstream.io went fully +# offline on 2026-05-23). ``_last_msg_at`` is the unix timestamp of the most +# recent vessel message received from the proxy. 
# ``_proxy_spawn_count`` is how many times we've started the node proxy;
# combined with no recent messages it tells us the proxy is respawning in a
# tight loop because the upstream is unreachable. Surfaced via
# ais_proxy_status() so the operator can see "AIS is dead" instead of guessing
# whether it's their map filter, their api key, or upstream.
# ``_last_msg_at`` stays at 0.0 until the first vessel message is received.
_last_msg_at: float = 0.0
_proxy_spawn_count: int = 0

_VESSEL_TRAIL_INTERVAL_S = 120
_VESSEL_TRAIL_MAX_POINTS = 240

# How stale "last vessel message" can be before we consider the stream
# disconnected. AISStream typically pushes multiple messages/sec, so a 60s
# gap means something's wrong upstream or in transit.
_AIS_CONNECTED_FRESHNESS_S = 60


def ais_proxy_status() -> dict:
    """Return the latest ais_proxy.js status plus connectivity health.

    The result is a fresh dict: a copy of whatever the proxy last reported,
    with the connectivity fields below layered on top.

    Fields:
      * ``degraded_tls`` (bool, issue #258) — true when the proxy is using
        SPKI-pinned fallback because AISStream's cert expired. Only present
        once the proxy has reported it.
      * ``connected`` (bool) — true when we received a vessel message in
        the last ``_AIS_CONNECTED_FRESHNESS_S`` seconds.
      * ``last_msg_age_seconds`` (int | None) — whole seconds since the
        last vessel message, never negative; None if we've never received
        one.
      * ``proxy_spawn_count`` (int) — how many times we've spawned the
        node proxy. Sustained increases here without ``connected`` means
        we're respawning in a tight loop because upstream is dead.

    The three connectivity fields are always present. Before the first
    vessel message arrives (cold start, tests, no API key) the result is
    ``connected=False`` / ``last_msg_age_seconds=None`` — it is never an
    empty dict.
    """
    # Snapshot everything under the lock so the three values are mutually
    # consistent; the arithmetic below happens outside the critical section.
    with _vessels_lock:
        status = dict(_proxy_status)
        last_msg_at = _last_msg_at
        spawn_count = _proxy_spawn_count

    if last_msg_at > 0:
        # ``_last_msg_at`` is stamped with wall-clock time.time(), which can
        # step backwards (NTP correction, manual clock change). Clamp so we
        # never publish a negative age to /api/health consumers.
        age_seconds = max(0, int(time.time() - last_msg_at))
        status["last_msg_age_seconds"] = age_seconds
        status["connected"] = age_seconds <= _AIS_CONNECTED_FRESHNESS_S
    else:
        # No vessel message has ever been received.
        status["last_msg_age_seconds"] = None
        status["connected"] = False

    status["proxy_spawn_count"] = spawn_count
    return status
"""AIS upstream-connectivity telemetry.

Background
----------
On 2026-05-23, stream.aisstream.io went fully offline (TCP timeouts on port
443). The backend's `_ais_stream_loop` kept respawning the node proxy every
few seconds, but no vessel messages ever arrived. From the operator's POV
the ships layer silently went empty and there was no way to tell whether
it was their config, their network, their viewport filter, or upstream.

The fix surfaces three signals from ``ais_proxy_status()``:

  * ``connected`` — bool, true when we received a vessel message in the
    last ``_AIS_CONNECTED_FRESHNESS_S`` seconds.
  * ``last_msg_age_seconds`` — int | None, seconds since last vessel
    message; None when we've never received one.
  * ``proxy_spawn_count`` — int, how many times we've spawned the node
    proxy. Sustained increase without ``connected`` means upstream is dead.

Plus ``/api/health`` escalates ``status`` to ``"degraded"`` when AIS is
configured (``AIS_API_KEY`` set) but the proxy is currently disconnected,
so a frontend banner can decide whether to render.

These tests pin every signal.
"""

from __future__ import annotations

import time

import pytest


def _reset_ais_module():
    """Reset module-level state so tests don't bleed into each other."""
    from services import ais_stream as ais

    with ais._vessels_lock:
        ais._proxy_status.clear()
        ais._last_msg_at = 0.0
        ais._proxy_spawn_count = 0


@pytest.fixture(autouse=True)
def _isolated_ais_state():
    """Run every test from a clean AIS state and restore what was there.

    The tests below write directly to ``ais_stream`` module globals. Without
    a teardown, values such as ``_proxy_spawn_count = 42`` would leak into
    whichever test module runs next.
    """
    from services import ais_stream as ais

    with ais._vessels_lock:
        saved_status = dict(ais._proxy_status)
        saved_last_msg_at = ais._last_msg_at
        saved_spawn_count = ais._proxy_spawn_count

    _reset_ais_module()
    yield

    with ais._vessels_lock:
        ais._proxy_status.clear()
        ais._proxy_status.update(saved_status)
        ais._last_msg_at = saved_last_msg_at
        ais._proxy_spawn_count = saved_spawn_count


class TestAisProxyStatusShape:
    def test_fresh_module_reports_disconnected(self):
        """Before any vessel messages have arrived (e.g. cold start, no
        upstream yet) we report ``connected: false`` and ``None`` for the
        age. Banner should NOT render in this case until we know the
        operator opted in, which we approximate by spawn_count > 0."""
        from services.ais_stream import ais_proxy_status

        s = ais_proxy_status()
        assert s["connected"] is False
        assert s["last_msg_age_seconds"] is None
        assert s["proxy_spawn_count"] == 0

    def test_recent_message_reports_connected(self):
        """A ``_last_msg_at`` a few seconds in the past produces
        ``connected: true`` and a small age."""
        from services import ais_stream as ais

        with ais._vessels_lock:
            ais._last_msg_at = time.time() - 5
        s = ais.ais_proxy_status()

        assert s["connected"] is True
        assert s["last_msg_age_seconds"] is not None
        assert 4 <= s["last_msg_age_seconds"] <= 7

    def test_stale_message_reports_disconnected(self):
        """``_last_msg_at`` more than the freshness threshold ago means
        ``connected: false`` — this is the smoking gun for "upstream
        died and the proxy is respawning in a loop"."""
        from services import ais_stream as ais

        with ais._vessels_lock:
            # 5 minutes ago — well past the 60s freshness window.
            ais._last_msg_at = time.time() - 300
        s = ais.ais_proxy_status()

        assert s["connected"] is False
        assert s["last_msg_age_seconds"] is not None
        assert s["last_msg_age_seconds"] >= 299

    def test_spawn_count_surfaced(self):
        """spawn_count should be visible — combined with disconnected it
        tells operator we're hammering the upstream but getting nothing."""
        from services import ais_stream as ais

        with ais._vessels_lock:
            ais._proxy_spawn_count = 42
        s = ais.ais_proxy_status()

        assert s["proxy_spawn_count"] == 42

    def test_degraded_tls_preserved(self):
        """Existing issue #258 signal (degraded_tls) must still flow
        through unchanged when present."""
        from services import ais_stream as ais

        with ais._vessels_lock:
            ais._proxy_status["degraded_tls"] = True
        s = ais.ais_proxy_status()

        assert s.get("degraded_tls") is True


class TestHealthEndpointEscalation:
    def test_disconnected_with_api_key_escalates_to_degraded(
        self, client, monkeypatch
    ):
        """When ``AIS_API_KEY`` is configured AND the proxy is disconnected,
        ``/api/health`` should report ``status: "degraded"`` instead of
        ``"ok"``. This is what the frontend banner reads."""
        monkeypatch.setenv("AIS_API_KEY", "test-key")

        # Force "AIS upstream offline" state: spawn count > 0 (proxy tried),
        # but no recent messages.
        from services import ais_stream as ais

        with ais._vessels_lock:
            ais._proxy_spawn_count = 5
            ais._last_msg_at = time.time() - 600  # 10 min ago

        res = client.get("/api/health")
        assert res.status_code == 200
        body = res.json()
        assert body["ais_proxy"]["connected"] is False
        assert body["ais_proxy"]["proxy_spawn_count"] == 5
        # Without API_KEY this would stay "ok"; with it set + connected=false,
        # we expect at least "degraded" (could be "error" if an SLO is also
        # red, but never "ok").
        assert body["status"] in ("degraded", "error"), (
            f"with AIS_API_KEY set + connected=false, status must NOT be 'ok'; "
            f"got {body['status']!r}"
        )

    def test_no_api_key_does_not_escalate(self, client, monkeypatch):
        """When AIS_API_KEY isn't set, the operator hasn't opted in. Don't
        flag the system as degraded just because AIS isn't running — that's
        the intended state."""
        monkeypatch.delenv("AIS_API_KEY", raising=False)

        from services import ais_stream as ais

        # Baseline: the same app with AIS reporting fresh data. Other SLOs
        # may have tripped during this session, so we cannot assert "ok"
        # outright — but we can assert that flipping AIS to disconnected
        # does not change the top-level status when no key is set.
        with ais._vessels_lock:
            ais._last_msg_at = time.time()
        baseline = client.get("/api/health")
        assert baseline.status_code == 200
        baseline_body = baseline.json()
        assert baseline_body["ais_proxy"]["connected"] is True

        # Now the "AIS never ran" state: no spawns, no messages.
        with ais._vessels_lock:
            ais._proxy_spawn_count = 0
            ais._last_msg_at = 0.0

        res = client.get("/api/health")
        assert res.status_code == 200
        body = res.json()
        assert body["ais_proxy"]["connected"] is False
        assert body["status"] == baseline_body["status"], (
            "AIS being disconnected must not change the top-level status "
            f"when AIS_API_KEY is unset; baseline {baseline_body['status']!r}, "
            f"got {body['status']!r}"
        )
diff --git a/frontend/src/app/page.tsx b/frontend/src/app/page.tsx index 26a3ea7..a5c5d0a 100644 --- a/frontend/src/app/page.tsx +++ b/frontend/src/app/page.tsx @@ -39,6 +39,7 @@ import { useFeedHealth } from '@/hooks/useFeedHealth'; import { useKeyboardShortcuts } from '@/hooks/useKeyboardShortcuts'; import KeyboardShortcutsOverlay from '@/components/KeyboardShortcutsOverlay'; import AlertToast from '@/components/AlertToast'; +import AisUpstreamBanner from '@/components/AisUpstreamBanner'; import { useAlertToasts } from '@/hooks/useAlertToasts'; import { useWatchlist } from '@/hooks/useWatchlist'; import WatchlistWidget from '@/components/WatchlistWidget'; @@ -933,6 +934,11 @@ export default function Dashboard() { onFlyTo={handleFlyTo} /> + {/* AIS UPSTREAM OUTAGE BANNER — renders only when AIS is configured + but the WebSocket upstream is unreachable. Tells users the empty + ocean isn't their fault. */} + + {/* ONBOARDING MODAL */} {showOnboarding && ( 60s. + let stalenessLabel = 'never received'; + if (health.lastMsgAgeSeconds != null) { + const minutes = Math.floor(health.lastMsgAgeSeconds / 60); + if (minutes >= 1) { + stalenessLabel = `last update ${minutes} min ago`; + } else { + stalenessLabel = `last update ${health.lastMsgAgeSeconds}s ago`; + } + } + + return ( +
+
+ +
+
Ship data temporarily unavailable
+
+ AISStream upstream is offline ({stalenessLabel}). The map will + refill once their service comes back online — nothing is wrong + with your install. +
+
+ +
+
/**
 * useAisUpstreamHealth — polls /api/health and exposes AIS proxy connectivity.
 *
 * Background: AISStream's WebSocket server went fully offline 2026-05-23 (TCP
 * timeouts at stream.aisstream.io). The backend kept reconnecting in a tight
 * loop and the ships layer silently went empty. Users had no signal that the
 * problem was upstream, not their config. This hook surfaces the state so a
 * banner can explain "AIS upstream is offline" instead of letting users
 * wonder.
 *
 * The poll interval is intentionally relaxed (30s) — this is a low-urgency UX
 * signal, not a real-time data feed. Backend already escalates top_status to
 * "degraded" when AIS is configured-but-disconnected.
 */
import { useEffect, useState } from 'react';
import { API_BASE } from '@/lib/api';

export interface AisUpstreamHealth {
  /** True when we've received a vessel message in the last ~60s. */
  connected: boolean;
  /** Seconds since the last vessel message; null when we've never seen one. */
  lastMsgAgeSeconds: number | null;
  /**
   * True when the SPKI-pinned fallback is in effect (issue #258).
   * Data still flows in this mode — it's a separate, less urgent signal
   * than ``connected``.
   */
  degradedTls: boolean;
  /** How many times the proxy has been spawned (sustained growth without
   *  ``connected`` means upstream is dead and we're respawning in a loop). */
  proxySpawnCount: number;
  /** Whether the operator has configured an API key. When false, the banner
   *  shouldn't fire because "AIS is off" is the intended state. The backend
   *  does not report this directly — we approximate it by requiring at
   *  least one proxy spawn before declaring an outage. */
  aisEnabled: boolean;
}

const POLL_INTERVAL_MS = 30_000;

/**
 * Poll `/api/health` every POLL_INTERVAL_MS and return the AIS proxy section
 * of the response, normalised to camelCase.
 *
 * Returns null until the first successful response. A failed or non-2xx poll
 * leaves the previous value in place.
 */
export function useAisUpstreamHealth(): AisUpstreamHealth | null {
  const [health, setHealth] = useState<AisUpstreamHealth | null>(null);

  useEffect(() => {
    // One controller per effect instance. A shared ref-based flag would be
    // reset to "not cancelled" when the effect re-runs (e.g. StrictMode's
    // mount → unmount → mount), letting responses from the previous instance
    // through. Aborting also cancels the in-flight request on unmount.
    const controller = new AbortController();

    const fetchHealth = async () => {
      try {
        const res = await fetch(`${API_BASE}/api/health`, {
          cache: 'no-store',
          signal: controller.signal,
        });
        if (!res.ok) return;
        const body = await res.json();
        if (controller.signal.aborted) return;
        const proxy = body?.ais_proxy ?? {};
        // ``proxy_spawn_count > 0`` is the cheapest "AIS is enabled" check:
        // if the backend never spawned the proxy (no API key, opt-out env)
        // we shouldn't ever show the outage banner. Once the proxy has
        // spawned at least once we know the operator wants AIS data.
        const spawns = Number(proxy.proxy_spawn_count ?? 0);
        setHealth({
          connected: Boolean(proxy.connected),
          lastMsgAgeSeconds:
            proxy.last_msg_age_seconds == null
              ? null
              : Number(proxy.last_msg_age_seconds),
          degradedTls: Boolean(proxy.degraded_tls),
          proxySpawnCount: spawns,
          aisEnabled: spawns > 0,
        });
      } catch {
        // Backend unreachable (separate problem — banner not relevant) or
        // the request was aborted by the cleanup below.
      }
    };

    void fetchHealth();
    const interval = setInterval(() => void fetchHealth(), POLL_INTERVAL_MS);
    return () => {
      controller.abort();
      clearInterval(interval);
    };
  }, []);

  return health;
}