Files
Shadowbroker/backend/routers/health.py
T
BigBodyCobain 5e0b2c037e feat(ais): surface upstream outage instead of failing silently
On 2026-05-23, stream.aisstream.io went fully offline (TCP timeouts on port
443). The backend kept respawning the node WebSocket proxy every few
seconds with nothing arriving. From the operator's POV the ships layer
silently went empty — no banner, no log surfacing, no way to tell whether
it was their config / network / viewport filter / upstream.

Backend:
* ais_proxy_status() now also returns:
  - connected (bool): true when a vessel message arrived in last 60s
  - last_msg_age_seconds (int | None)
  - proxy_spawn_count (int): proxy respawns — sustained growth without
    connected means upstream is dead
* /api/health escalates top status to "degraded" when AIS_API_KEY is set
  but the proxy is currently disconnected. Existing degraded_tls signal
  preserved.

Frontend:
* useAisUpstreamHealth hook polls /api/health every 30s, derives the
  outage state. Defensively only reports outage once spawn_count > 0 so
  operators who haven't opted in don't see the banner.
* AisUpstreamBanner component renders a dismissible amber notice
  "Ship data temporarily unavailable — AISStream upstream is offline"
  mounted on the main app shell.

7 backend tests pin the status-shape contract and the /api/health
escalation behavior in both with-key and without-key configurations.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-23 06:38:05 -06:00

118 lines
4.3 KiB
Python

import time as _time_mod
from fastapi import APIRouter, Request, Depends
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from limiter import limiter
from auth import require_admin
from services.data_fetcher import get_latest_data
from services.schemas import HealthResponse
import os
APP_VERSION = os.environ.get("_HEALTH_APP_VERSION", "0.9.79")
router = APIRouter()
def _get_app_version() -> str:
# Import lazily to avoid circular import; main sets APP_VERSION before including routers
try:
import main as _main
return _main.APP_VERSION
except Exception:
return APP_VERSION
_start_time_ref: dict = {"value": None}
def _get_start_time() -> float:
if _start_time_ref["value"] is None:
try:
import main as _main
_start_time_ref["value"] = _main._start_time
except Exception:
_start_time_ref["value"] = _time_mod.time()
return _start_time_ref["value"]
@router.get("/api/health", response_model=HealthResponse)
@limiter.limit("30/minute")
async def health_check(request: Request):
from services.fetchers._store import get_source_timestamps_snapshot
from services.slo import compute_all_statuses, summarise_statuses
d = get_latest_data()
last = d.get("last_updated")
timestamps = get_source_timestamps_snapshot()
slo_statuses = compute_all_statuses(d, timestamps)
slo_summary = summarise_statuses(slo_statuses)
# Top-level status reflects worst SLO result — "degraded" if any
# yellow, "error" if any red, "ok" otherwise. This is the single
# field an external probe / pager can watch.
top_status = "ok"
if slo_summary.get("red", 0) > 0:
top_status = "error"
elif slo_summary.get("yellow", 0) > 0:
top_status = "degraded"
# Issue #258: surface AIS proxy degraded TLS state so operators can see
# when the SPKI-pinned fallback is in effect. The data plane keeps
# flowing (this is by design — see ais_proxy.js comments) but observers
# who care about MITM-protection posture deserve a visible signal.
#
# Plus connectivity health (added 2026-05-23 when stream.aisstream.io
# went fully offline): ``connected`` tells the frontend whether ship
# data is actually flowing. When false, a banner explains that ships
# are unavailable due to an upstream outage — better than the user
# silently seeing an empty ocean and assuming we broke something.
ais_status: dict = {}
try:
from services.ais_stream import ais_proxy_status
ais_status = ais_proxy_status() or {}
except Exception:
ais_status = {}
if ais_status.get("degraded_tls") and top_status == "ok":
# Don't override a worse top-level status if SLOs already failed,
# but escalate ok -> degraded so the field surfaces in dashboards.
top_status = "degraded"
# AIS_API_KEY not configured is "feature off", not "system broken" —
# so we only escalate when the operator opted into AIS (key set) AND
# the stream is currently offline.
if (
os.environ.get("AIS_API_KEY")
and ais_status.get("connected") is False
and top_status == "ok"
):
top_status = "degraded"
return {
"status": top_status,
"version": _get_app_version(),
"last_updated": last,
"sources": {
"flights": len(d.get("commercial_flights", [])),
"military": len(d.get("military_flights", [])),
"ships": len(d.get("ships", [])),
"satellites": len(d.get("satellites", [])),
"earthquakes": len(d.get("earthquakes", [])),
"cctv": len(d.get("cctv", [])),
"news": len(d.get("news", [])),
"uavs": len(d.get("uavs", [])),
"firms_fires": len(d.get("firms_fires", [])),
"liveuamap": len(d.get("liveuamap", [])),
"gdelt": len(d.get("gdelt", [])),
"uap_sightings": len(d.get("uap_sightings", [])),
},
"freshness": timestamps,
"uptime_seconds": round(_time_mod.time() - _get_start_time()),
"slo": slo_statuses,
"slo_summary": slo_summary,
"ais_proxy": ais_status,
}
@router.get("/api/debug-latest", dependencies=[Depends(require_admin)])
@limiter.limit("30/minute")
async def debug_latest_data(request: Request):
return list(get_latest_data().keys())