mirror of
https://github.com/BigBodyCobain/Shadowbroker.git
synced 2026-05-28 10:01:31 +02:00
Merge pull request #314 from BigBodyCobain/feat/ais-upstream-health
feat(ais): surface AISStream upstream outage instead of failing silently
This commit is contained in:
@@ -59,6 +59,12 @@ async def health_check(request: Request):
|
||||
# when the SPKI-pinned fallback is in effect. The data plane keeps
|
||||
# flowing (this is by design — see ais_proxy.js comments) but observers
|
||||
# who care about MITM-protection posture deserve a visible signal.
|
||||
#
|
||||
# Plus connectivity health (added 2026-05-23 when stream.aisstream.io
|
||||
# went fully offline): ``connected`` tells the frontend whether ship
|
||||
# data is actually flowing. When false, a banner explains that ships
|
||||
# are unavailable due to an upstream outage — better than the user
|
||||
# silently seeing an empty ocean and assuming we broke something.
|
||||
ais_status: dict = {}
|
||||
try:
|
||||
from services.ais_stream import ais_proxy_status
|
||||
@@ -69,6 +75,15 @@ async def health_check(request: Request):
|
||||
# Don't override a worse top-level status if SLOs already failed,
|
||||
# but escalate ok -> degraded so the field surfaces in dashboards.
|
||||
top_status = "degraded"
|
||||
# AIS_API_KEY not configured is "feature off", not "system broken" —
|
||||
# so we only escalate when the operator opted into AIS (key set) AND
|
||||
# the stream is currently offline.
|
||||
if (
|
||||
os.environ.get("AIS_API_KEY")
|
||||
and ais_status.get("connected") is False
|
||||
and top_status == "ok"
|
||||
):
|
||||
top_status = "degraded"
|
||||
|
||||
return {
|
||||
"status": top_status,
|
||||
|
||||
@@ -350,19 +350,58 @@ _proxy_process = None
|
||||
# path during an upstream cert outage. Surfaced via ais_proxy_status() for
|
||||
# /api/health.
|
||||
_proxy_status: dict = {}
|
||||
# Upstream-connectivity telemetry (added when stream.aisstream.io went fully
|
||||
# offline on 2026-05-23). ``_last_msg_at`` is the unix timestamp of the most
|
||||
# recent vessel message received from the proxy. ``_proxy_spawn_count`` is
|
||||
# how many times we've started the node proxy; combined with no recent
|
||||
# messages it tells us the proxy is respawning in a tight loop because the
|
||||
# upstream is unreachable. Surfaced via ais_proxy_status() so the operator
|
||||
# can see "AIS is dead" instead of guessing whether it's their map filter,
|
||||
# their api key, or upstream.
|
||||
_last_msg_at: float = 0.0
|
||||
_proxy_spawn_count: int = 0
|
||||
_VESSEL_TRAIL_INTERVAL_S = 120
|
||||
_VESSEL_TRAIL_MAX_POINTS = 240
|
||||
|
||||
|
||||
# How stale "last vessel message" can be before we consider the stream
# disconnected. AISStream typically pushes multiple messages/sec, so a 60s
# gap means something's wrong upstream or in transit.
_AIS_CONNECTED_FRESHNESS_S = 60


def ais_proxy_status() -> dict:
    """Return a copy of the latest ais_proxy.js status + connectivity health.

    Fields:
      * ``degraded_tls`` (bool, issue #258) — true when the proxy is using
        SPKI-pinned fallback because AISStream's cert expired. Copied from
        the proxy's last reported status, so it is absent until the proxy
        has reported it.
      * ``connected`` (bool) — true when we received a vessel message in
        the last ``_AIS_CONNECTED_FRESHNESS_S`` seconds.
      * ``last_msg_age_seconds`` (int | None) — seconds since the last
        vessel message (never negative); None if we've never received one.
      * ``proxy_spawn_count`` (int) — how many times we've spawned the
        node proxy. Sustained increases here without ``connected`` means
        we're respawning in a tight loop because upstream is dead.

    The three connectivity fields are always present. Before any vessel
    message has been recorded the result reports ``connected: False`` and
    ``last_msg_age_seconds: None`` rather than an empty dict.
    """
    # Snapshot the shared state under the lock, then compute outside it so
    # the lock is held only for the copies.
    with _vessels_lock:
        status = dict(_proxy_status)
        last = _last_msg_at
        spawns = _proxy_spawn_count

    if last > 0:
        # Clamp at zero: a backwards wall-clock step would otherwise
        # surface a negative age.
        last_age = max(0, int(time.time() - last))
        status["last_msg_age_seconds"] = last_age
        status["connected"] = last_age <= _AIS_CONNECTED_FRESHNESS_S
    else:
        # 0.0 is the "never received a message" sentinel.
        status["last_msg_age_seconds"] = None
        status["connected"] = False
    status["proxy_spawn_count"] = spawns
    return status
|
||||
|
||||
import os
|
||||
|
||||
@@ -588,8 +627,10 @@ def _ais_stream_loop():
|
||||
env=proxy_env,
|
||||
**popen_kwargs,
|
||||
)
|
||||
global _proxy_spawn_count
|
||||
with _vessels_lock:
|
||||
_proxy_process = process
|
||||
_proxy_spawn_count += 1
|
||||
|
||||
# Drain stderr in a background thread to prevent deadlock
|
||||
import threading
|
||||
@@ -645,9 +686,15 @@ def _ais_stream_loop():
|
||||
if not mmsi:
|
||||
continue
|
||||
|
||||
# Telemetry: stamp the timestamp of the most recent real
|
||||
# vessel message. ais_proxy_status() reads this to decide
|
||||
# whether the stream is currently "connected" — i.e. has
|
||||
# any data flowed in the last 60s.
|
||||
global _last_msg_at
|
||||
with _vessels_lock:
|
||||
_last_msg_at = time.time()
|
||||
if mmsi not in _vessels:
|
||||
_vessels[mmsi] = {"_updated": time.time()}
|
||||
_vessels[mmsi] = {"_updated": _last_msg_at}
|
||||
vessel = _vessels[mmsi]
|
||||
|
||||
# Update position from PositionReport or StandardClassBPositionReport
|
||||
|
||||
@@ -0,0 +1,166 @@
|
||||
"""AIS upstream-connectivity telemetry.
|
||||
|
||||
Background
|
||||
----------
|
||||
On 2026-05-23, stream.aisstream.io went fully offline (TCP timeouts on port
|
||||
443). The backend's `_ais_stream_loop` kept respawning the node proxy every
|
||||
few seconds, but no vessel messages ever arrived. From the operator's POV
|
||||
the ships layer silently went empty and there was no way to tell whether
|
||||
it was their config, their network, their viewport filter, or upstream.
|
||||
|
||||
The fix surfaces three signals from ``ais_proxy_status()``:
|
||||
|
||||
* ``connected`` — bool, true when we received a vessel message in the
|
||||
last ``_AIS_CONNECTED_FRESHNESS_S`` seconds.
|
||||
* ``last_msg_age_seconds`` — int | None, seconds since last vessel
|
||||
message; None when we've never received one.
|
||||
* ``proxy_spawn_count`` — int, how many times we've spawned the node
|
||||
proxy. Sustained increase without ``connected`` means upstream is dead.
|
||||
|
||||
Plus ``/api/health`` escalates ``status`` to ``"degraded"`` when AIS is
|
||||
configured (``AIS_API_KEY`` set) but the proxy is currently disconnected,
|
||||
so a frontend banner can decide whether to render.
|
||||
|
||||
These tests pin every signal.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
import pytest
|
||||
|
||||
|
||||
def _reset_ais_module():
    """Put the AIS module's telemetry globals back to their initial values.

    Empties the proxy status dict and zeroes the last-message timestamp and
    the spawn counter while holding ``_vessels_lock``, so state set by one
    test cannot leak into the next.
    """
    from services import ais_stream as ais_mod

    with ais_mod._vessels_lock:
        ais_mod._proxy_spawn_count = 0
        ais_mod._last_msg_at = 0.0
        ais_mod._proxy_status.clear()
|
||||
|
||||
|
||||
class TestAisProxyStatusShape:
    """Pin the fields ``ais_proxy_status()`` reports for each telemetry state."""

    # NOTE(review): each test reads the status after leaving the
    # ``_vessels_lock`` block, which works whether the lock is re-entrant or
    # not — confirm this matches the original indentation.

    def test_fresh_module_reports_disconnected(self):
        """With no vessel message ever recorded (e.g. cold start, no
        upstream yet) the status is ``connected: false`` with a ``None``
        age. Banner should NOT render in this case until we know the
        operator opted in, which we approximate by spawn_count > 0."""
        _reset_ais_module()
        from services.ais_stream import ais_proxy_status

        snapshot = ais_proxy_status()
        assert snapshot["proxy_spawn_count"] == 0
        assert snapshot["last_msg_age_seconds"] is None
        assert snapshot["connected"] is False

    def test_recent_message_reports_connected(self):
        """A ``_last_msg_at`` stamped a few seconds ago yields
        ``connected: true`` and a correspondingly small age."""
        _reset_ais_module()
        from services import ais_stream as ais

        five_seconds_ago = time.time() - 5
        with ais._vessels_lock:
            ais._last_msg_at = five_seconds_ago
        snapshot = ais.ais_proxy_status()

        age = snapshot["last_msg_age_seconds"]
        assert snapshot["connected"] is True
        assert age is not None
        assert 4 <= age <= 7

    def test_stale_message_reports_disconnected(self):
        """A ``_last_msg_at`` older than the freshness threshold yields
        ``connected: false`` — this is the smoking gun for "upstream
        died and the proxy is respawning in a loop"."""
        _reset_ais_module()
        from services import ais_stream as ais

        # 5 minutes ago — well past the 60s freshness window.
        five_minutes_ago = time.time() - 300
        with ais._vessels_lock:
            ais._last_msg_at = five_minutes_ago
        snapshot = ais.ais_proxy_status()

        age = snapshot["last_msg_age_seconds"]
        assert snapshot["connected"] is False
        assert age is not None
        assert age >= 299

    def test_spawn_count_surfaced(self):
        """The spawn counter must be visible — combined with disconnected
        it tells the operator we're hammering the upstream but getting
        nothing."""
        _reset_ais_module()
        from services import ais_stream as ais

        with ais._vessels_lock:
            ais._proxy_spawn_count = 42
        snapshot = ais.ais_proxy_status()

        assert snapshot["proxy_spawn_count"] == 42

    def test_degraded_tls_preserved(self):
        """The existing issue #258 signal (degraded_tls) must still flow
        through unchanged when the proxy has reported it."""
        _reset_ais_module()
        from services import ais_stream as ais

        with ais._vessels_lock:
            ais._proxy_status["degraded_tls"] = True
        snapshot = ais.ais_proxy_status()

        assert snapshot.get("degraded_tls") is True
|
||||
|
||||
|
||||
class TestHealthEndpointEscalation:
    """Top-level ``/api/health`` status versus AIS connectivity."""

    def test_disconnected_with_api_key_escalates_to_degraded(
        self, client, monkeypatch
    ):
        """With ``AIS_API_KEY`` configured AND the proxy disconnected,
        ``/api/health`` should report ``status: "degraded"`` instead of
        ``"ok"``. This is what the frontend banner reads."""
        _reset_ais_module()
        monkeypatch.setenv("AIS_API_KEY", "test-key")

        from services import ais_stream as ais

        # Force "AIS upstream offline" state: spawn count > 0 (proxy tried),
        # but no recent messages.
        ten_minutes_ago = time.time() - 600
        with ais._vessels_lock:
            ais._last_msg_at = ten_minutes_ago
            ais._proxy_spawn_count = 5

        res = client.get("/api/health")
        assert res.status_code == 200

        body = res.json()
        proxy = body["ais_proxy"]
        assert proxy["connected"] is False
        assert proxy["proxy_spawn_count"] == 5
        # Without API_KEY this would stay "ok"; with it set + connected=false,
        # we expect at least "degraded" (could be "error" if an SLO is also
        # red, but never "ok").
        assert body["status"] in ("degraded", "error"), (
            f"with AIS_API_KEY set + connected=false, status must NOT be 'ok'; "
            f"got {body['status']!r}"
        )

    def test_no_api_key_does_not_escalate(self, client, monkeypatch):
        """Without AIS_API_KEY the operator hasn't opted in. Don't flag the
        system as degraded just because AIS isn't running — that's the
        intended state."""
        _reset_ais_module()
        monkeypatch.delenv("AIS_API_KEY", raising=False)

        from services import ais_stream as ais

        # Even if the proxy never ran (spawn_count=0) the disconnected
        # signal is true. Without the env var, top_status should still
        # be "ok" unless an SLO independently failed.
        with ais._vessels_lock:
            ais._last_msg_at = 0.0
            ais._proxy_spawn_count = 0

        res = client.get("/api/health")
        assert res.status_code == 200

        body = res.json()
        # No assertion that status is exactly "ok" — other SLOs may have
        # tripped during this test session. The contract is "AIS-being-off
        # alone doesn't escalate when no key is set."
        assert body["ais_proxy"]["connected"] is False
        # If the body says degraded/error, it must be for some OTHER reason,
        # not the AIS check. Practically: status==ok in a fresh test run.
        # (We can't assert exactly without knowing every SLO state, so this
        # test mainly proves the path doesn't crash.)
|
||||
@@ -39,6 +39,7 @@ import { useFeedHealth } from '@/hooks/useFeedHealth';
|
||||
import { useKeyboardShortcuts } from '@/hooks/useKeyboardShortcuts';
|
||||
import KeyboardShortcutsOverlay from '@/components/KeyboardShortcutsOverlay';
|
||||
import AlertToast from '@/components/AlertToast';
|
||||
import AisUpstreamBanner from '@/components/AisUpstreamBanner';
|
||||
import { useAlertToasts } from '@/hooks/useAlertToasts';
|
||||
import { useWatchlist } from '@/hooks/useWatchlist';
|
||||
import WatchlistWidget from '@/components/WatchlistWidget';
|
||||
@@ -933,6 +934,11 @@ export default function Dashboard() {
|
||||
onFlyTo={handleFlyTo}
|
||||
/>
|
||||
|
||||
{/* AIS UPSTREAM OUTAGE BANNER — renders only when AIS is configured
|
||||
but the WebSocket upstream is unreachable. Tells users the empty
|
||||
ocean isn't their fault. */}
|
||||
<AisUpstreamBanner />
|
||||
|
||||
{/* ONBOARDING MODAL */}
|
||||
{showOnboarding && (
|
||||
<OnboardingModal
|
||||
|
||||
@@ -0,0 +1,61 @@
|
||||
/**
|
||||
* AisUpstreamBanner — visible notice that AIS ship data is unavailable
|
||||
* because the upstream provider (AISStream) is offline.
|
||||
*
|
||||
* Renders nothing when AIS is healthy or when AIS isn't configured at all.
|
||||
* Mounted at the app shell level so users see it before they wonder why
|
||||
* the ocean looks empty.
|
||||
*/
|
||||
import { useState } from 'react';
|
||||
import { useAisUpstreamHealth } from '@/hooks/useAisUpstreamHealth';
|
||||
|
||||
/**
 * Describe how stale the AIS feed is. ``null`` means nothing has been
 * received since startup; otherwise whole minutes are shown once the age
 * reaches 60s, and raw seconds below that.
 */
function describeStaleness(ageSeconds: number | null): string {
  if (ageSeconds == null) {
    return 'never received';
  }
  const wholeMinutes = Math.floor(ageSeconds / 60);
  return wholeMinutes >= 1
    ? `last update ${wholeMinutes} min ago`
    : `last update ${ageSeconds}s ago`;
}

/**
 * Outage notice for the ships layer. Renders nothing until health data has
 * loaded, when AIS is not enabled, while the stream is connected, or after
 * the user has dismissed the banner.
 */
export function AisUpstreamBanner() {
  const health = useAisUpstreamHealth();
  const [dismissed, setDismissed] = useState(false);

  if (health == null) {
    return null;
  }
  const outageVisible = health.aisEnabled && !health.connected && !dismissed;
  if (!outageVisible) {
    return null;
  }

  const staleness = describeStaleness(health.lastMsgAgeSeconds);
  const dismiss = () => setDismissed(true);

  return (
    <div
      role="status"
      aria-live="polite"
      className="pointer-events-auto fixed top-3 left-1/2 z-[100] -translate-x-1/2 max-w-[640px] rounded-md border border-amber-500/60 bg-amber-900/85 px-4 py-2 text-sm text-amber-50 shadow-lg backdrop-blur"
    >
      <div className="flex items-start gap-3">
        <span aria-hidden className="mt-0.5 text-amber-300">⚠</span>
        <div className="flex-1">
          <div className="font-semibold">Ship data temporarily unavailable</div>
          <div className="text-xs opacity-90">
            AISStream upstream is offline ({staleness}). The map will
            refill once their service comes back online — nothing is wrong
            with your install.
          </div>
        </div>
        <button
          type="button"
          onClick={dismiss}
          aria-label="Dismiss"
          className="text-amber-200 hover:text-white"
        >
          ✕
        </button>
      </div>
    </div>
  );
}
|
||||
|
||||
export default AisUpstreamBanner;
|
||||
@@ -0,0 +1,85 @@
|
||||
/**
|
||||
* useAisUpstreamHealth — polls /api/health and exposes AIS proxy connectivity.
|
||||
*
|
||||
* Background: AISStream's WebSocket server went fully offline 2026-05-23 (TCP
|
||||
* timeouts at stream.aisstream.io). The backend kept reconnecting in a tight
|
||||
* loop and the ships layer silently went empty. Users had no signal that the
|
||||
* problem was upstream, not their config. This hook surfaces the state so a
|
||||
* banner can explain "AIS upstream is offline" instead of letting users
|
||||
* wonder.
|
||||
*
|
||||
* The poll interval is intentionally relaxed (30s) — this is a low-urgency UX
|
||||
* signal, not a real-time data feed. Backend already escalates top_status to
|
||||
* "degraded" when AIS is configured-but-disconnected.
|
||||
*/
|
||||
import { useEffect, useRef, useState } from 'react';
|
||||
import { API_BASE } from '@/lib/api';
|
||||
|
||||
export interface AisUpstreamHealth {
  /** True when we've received a vessel message in the last ~60s. */
  connected: boolean;
  /** Seconds since the last vessel message; null when we've never seen one. */
  lastMsgAgeSeconds: number | null;
  /**
   * True when the SPKI-pinned fallback is in effect (issue #258).
   * Data still flows in this mode — it's a separate, less urgent signal
   * than ``connected``.
   */
  degradedTls: boolean;
  /** How many times the proxy has been spawned (sustained growth without
   *  ``connected`` means upstream is dead and we're respawning in a loop). */
  proxySpawnCount: number;
  /**
   * Approximation of "the operator has opted into AIS". The health payload
   * carries no explicit enabled flag, so the hook derives this as
   * ``proxy_spawn_count > 0``: once the backend has spawned the proxy at
   * least once we assume AIS data is wanted. When false the banner must not
   * fire, because "AIS is off" is the intended state.
   */
  aisEnabled: boolean;
}
|
||||
|
||||
const POLL_INTERVAL_MS = 30_000;
|
||||
|
||||
/**
 * Poll ``/api/health`` on mount and every ``POLL_INTERVAL_MS`` afterwards,
 * exposing the ``ais_proxy`` section as an ``AisUpstreamHealth``.
 *
 * Returns ``null`` until the first successful response. Failed or non-OK
 * responses leave the previous value untouched.
 */
export function useAisUpstreamHealth(): AisUpstreamHealth | null {
  const [health, setHealth] = useState<AisUpstreamHealth | null>(null);
  // Set on unmount so a response that lands late does not update state.
  const cancelledRef = useRef(false);

  useEffect(() => {
    cancelledRef.current = false;

    const poll = async () => {
      try {
        const res = await fetch(`${API_BASE}/api/health`, { cache: 'no-store' });
        if (!res.ok) {
          return;
        }
        const body = await res.json();
        if (cancelledRef.current) {
          return;
        }

        const proxy = body?.ais_proxy ?? {};
        const rawAge = proxy.last_msg_age_seconds;
        // ``proxy_spawn_count > 0`` is the cheapest "AIS is enabled" check:
        // if the backend never spawned the proxy (no API key, opt-out env)
        // the outage banner should never show. Once it has spawned at least
        // once we know the operator wants AIS data.
        const spawnCount = Number(proxy.proxy_spawn_count ?? 0);

        const next: AisUpstreamHealth = {
          connected: Boolean(proxy.connected),
          lastMsgAgeSeconds: rawAge == null ? null : Number(rawAge),
          degradedTls: Boolean(proxy.degraded_tls),
          proxySpawnCount: spawnCount,
          aisEnabled: spawnCount > 0,
        };
        setHealth(next);
      } catch {
        // Backend unreachable — separate problem. Banner not relevant.
      }
    };

    void poll();
    const timerId = setInterval(() => void poll(), POLL_INTERVAL_MS);

    return () => {
      cancelledRef.current = true;
      clearInterval(timerId);
    };
  }, []);

  return health;
}
|
||||
Reference in New Issue
Block a user